path: root/contrib/llvm/lib/CodeGen
Diffstat (limited to 'contrib/llvm/lib/CodeGen')
-rw-r--r--  contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp  998
-rw-r--r--  contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h  179
-rw-r--r--  contrib/llvm/lib/CodeGen/AllocationOrder.cpp  54
-rw-r--r--  contrib/llvm/lib/CodeGen/AllocationOrder.h  89
-rw-r--r--  contrib/llvm/lib/CodeGen/Analysis.cpp  731
-rw-r--r--  contrib/llvm/lib/CodeGen/AntiDepBreaker.h  67
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp  139
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp  45
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h  52
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp  2620
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp  290
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h  74
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp  578
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h  110
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp  2076
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h  310
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp  600
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp  514
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h  159
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp  246
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h  59
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp  230
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h  109
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h  184
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp  46
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h  193
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp  289
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h  256
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp  179
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp  833
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h  265
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp  1970
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h  571
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h  95
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp  285
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h  152
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp  180
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h  128
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp  74
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h  49
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp  1541
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h  391
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp  689
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h  137
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp  123
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp  182
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp  1273
-rw-r--r--  contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h  106
-rw-r--r--  contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp  1640
-rw-r--r--  contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp  38
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.cpp  1919
-rw-r--r--  contrib/llvm/lib/CodeGen/BranchFolding.h  160
-rw-r--r--  contrib/llvm/lib/CodeGen/BuiltinGCs.cpp  139
-rw-r--r--  contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp  236
-rw-r--r--  contrib/llvm/lib/CodeGen/CallingConvLower.cpp  287
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGen.cpp  94
-rw-r--r--  contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp  5656
-rw-r--r--  contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp  681
-rw-r--r--  contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h  106
-rw-r--r--  contrib/llvm/lib/CodeGen/DFAPacketizer.cpp  325
-rw-r--r--  contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp  181
-rw-r--r--  contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp  602
-rw-r--r--  contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp  264
-rw-r--r--  contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp  817
-rw-r--r--  contrib/llvm/lib/CodeGen/EdgeBundles.cpp  97
-rw-r--r--  contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp  811
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp  74
-rw-r--r--  contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp  223
-rw-r--r--  contrib/llvm/lib/CodeGen/FaultMaps.cpp  150
-rw-r--r--  contrib/llvm/lib/CodeGen/FuncletLayout.cpp  59
-rw-r--r--  contrib/llvm/lib/CodeGen/GCMetadata.cpp  177
-rw-r--r--  contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp  19
-rw-r--r--  contrib/llvm/lib/CodeGen/GCRootLowering.cpp  355
-rw-r--r--  contrib/llvm/lib/CodeGen/GCStrategy.cpp  22
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp  30
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp  164
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp  104
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp  897
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp  107
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp  663
-rw-r--r--  contrib/llvm/lib/CodeGen/GlobalMerge.cpp  605
-rw-r--r--  contrib/llvm/lib/CodeGen/IfConversion.cpp  1844
-rw-r--r--  contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp  576
-rw-r--r--  contrib/llvm/lib/CodeGen/InlineSpiller.cpp  1456
-rw-r--r--  contrib/llvm/lib/CodeGen/InterferenceCache.cpp  250
-rw-r--r--  contrib/llvm/lib/CodeGen/InterferenceCache.h  238
-rw-r--r--  contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp  386
-rw-r--r--  contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp  608
-rw-r--r--  contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp  311
-rw-r--r--  contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp  140
-rw-r--r--  contrib/llvm/lib/CodeGen/LexicalScopes.cpp  332
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugValues.cpp  516
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp  1047
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveDebugVariables.h  75
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveInterval.cpp  1250
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp  1575
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp  205
-rw-r--r--  contrib/llvm/lib/CodeGen/LivePhysRegs.cpp  189
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp  484
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeCalc.h  248
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp  449
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRangeUtils.h  62
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp  197
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp  88
-rw-r--r--  contrib/llvm/lib/CodeGen/LiveVariables.cpp  810
-rw-r--r--  contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp  420
-rw-r--r--  contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp  162
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp  612
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MILexer.h  196
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp  2094
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIParser.h  99
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp  789
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrinter.cpp  1005
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrinter.h  33
-rw-r--r--  contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp  71
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp  1289
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp  200
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp  1796
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp  93
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCSE.cpp  716
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCombiner.cpp  477
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp  371
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp  54
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineDominators.cpp  157
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunction.cpp  1002
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp  60
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp  89
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp  69
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineInstr.cpp  2236
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp  344
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLICM.cpp  1388
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp  84
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp  463
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp  44
-rw-r--r--  contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp  55
-rw-r--r--  contrib/llvm/lib/CodeGen/MachinePostDominators.cpp  55
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp  140
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp  529
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp  355
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineScheduler.cpp  3553
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineSink.cpp  857
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp  1328
-rw-r--r--  contrib/llvm/lib/CodeGen/MachineVerifier.cpp  2049
-rw-r--r--  contrib/llvm/lib/CodeGen/OptimizePHIs.cpp  196
-rw-r--r--  contrib/llvm/lib/CodeGen/PHIElimination.cpp  652
-rw-r--r--  contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp  59
-rw-r--r--  contrib/llvm/lib/CodeGen/PHIEliminationUtils.h  25
-rw-r--r--  contrib/llvm/lib/CodeGen/ParallelCG.cpp  99
-rw-r--r--  contrib/llvm/lib/CodeGen/PatchableFunction.cpp  88
-rw-r--r--  contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp  1947
-rw-r--r--  contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp  98
-rw-r--r--  contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp  705
-rw-r--r--  contrib/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp  94
-rw-r--r--  contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp  168
-rw-r--r--  contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp  1234
-rw-r--r--  contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp  139
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocBase.cpp  161
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocBase.h  122
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocBasic.cpp  298
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocFast.cpp  1122
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp  2619
-rw-r--r--  contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp  893
-rw-r--r--  contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp  142
-rw-r--r--  contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp  131
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp  181
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp  3058
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterCoalescer.h  116
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterPressure.cpp  1350
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterScavenging.cpp  458
-rw-r--r--  contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp  93
-rw-r--r--  contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp  388
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStack.cpp  851
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStackColoring.cpp  289
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStackColoring.h  149
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStackLayout.cpp  138
-rw-r--r--  contrib/llvm/lib/CodeGen/SafeStackLayout.h  68
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAG.cpp  641
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp  1711
-rw-r--r--  contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp  100
-rw-r--r--  contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp  240
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp  15009
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp  2243
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp  615
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp  1050
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h  145
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp  4422
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp  2139
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp  3391
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp  1177
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h  859
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp  555
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp  1068
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp  3846
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp  639
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h  127
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp  805
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp  3020
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp  913
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h  180
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp  279
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp  7306
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp  9127
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h  1019
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp  719
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp  3591
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp  307
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp  17
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp  958
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h  118
-rw-r--r--  contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp  3550
-rw-r--r--  contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp  468
-rw-r--r--  contrib/llvm/lib/CodeGen/ShrinkWrap.cpp  556
-rw-r--r--  contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp  480
-rw-r--r--  contrib/llvm/lib/CodeGen/SlotIndexes.cpp  245
-rw-r--r--  contrib/llvm/lib/CodeGen/SpillPlacement.cpp  375
-rw-r--r--  contrib/llvm/lib/CodeGen/SpillPlacement.h  170
-rw-r--r--  contrib/llvm/lib/CodeGen/Spiller.h  42
-rw-r--r--  contrib/llvm/lib/CodeGen/SplitKit.cpp  1525
-rw-r--r--  contrib/llvm/lib/CodeGen/SplitKit.h  503
-rw-r--r--  contrib/llvm/lib/CodeGen/StackColoring.cpp  1126
-rw-r--r--  contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp  172
-rw-r--r--  contrib/llvm/lib/CodeGen/StackMaps.cpp  552
-rw-r--r--  contrib/llvm/lib/CodeGen/StackProtector.cpp  471
-rw-r--r--  contrib/llvm/lib/CodeGen/StackSlotColoring.cpp  469
-rw-r--r--  contrib/llvm/lib/CodeGen/TailDuplication.cpp  65
-rw-r--r--  contrib/llvm/lib/CodeGen/TailDuplicator.cpp  932
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp  104
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp  1195
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp  1842
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp  1098
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp  49
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetPassConfig.cpp  888
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp  399
-rw-r--r--  contrib/llvm/lib/CodeGen/TargetSchedule.cpp  300
-rw-r--r--  contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp  1806
-rw-r--r--  contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp  220
-rw-r--r--  contrib/llvm/lib/CodeGen/VirtRegMap.cpp  467
-rw-r--r--  contrib/llvm/lib/CodeGen/WinEHPrepare.cpp  1235
-rw-r--r--  contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp  96
239 files changed, 184759 insertions, 0 deletions
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
new file mode 100644
index 000000000000..a736884be672
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -0,0 +1,998 @@
+//===----- AggressiveAntiDepBreaker.cpp - Anti-dep breaker ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+// If DebugDiv > 0 then only break antidep with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("agg-antidep-debugdiv",
+ cl::desc("Debug control for aggressive anti-dep breaker"),
+ cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("agg-antidep-debugmod",
+ cl::desc("Debug control for aggressive anti-dep breaker"),
+ cl::init(0), cl::Hidden);
+
+AggressiveAntiDepState::AggressiveAntiDepState(const unsigned TargetRegs,
+ MachineBasicBlock *BB) :
+ NumTargetRegs(TargetRegs), GroupNodes(TargetRegs, 0),
+ GroupNodeIndices(TargetRegs, 0),
+ KillIndices(TargetRegs, 0),
+ DefIndices(TargetRegs, 0)
+{
+ const unsigned BBSize = BB->size();
+ for (unsigned i = 0; i < NumTargetRegs; ++i) {
+ // Initialize all registers to be in their own group. Initially we
+ // assign the register to the same-indexed GroupNode.
+ GroupNodeIndices[i] = i;
+ // Initialize the indices to indicate that no registers are live.
+ KillIndices[i] = ~0u;
+ DefIndices[i] = BBSize;
+ }
+}
+
+unsigned AggressiveAntiDepState::GetGroup(unsigned Reg) {
+ unsigned Node = GroupNodeIndices[Reg];
+ while (GroupNodes[Node] != Node)
+ Node = GroupNodes[Node];
+
+ return Node;
+}
+
+void AggressiveAntiDepState::GetGroupRegs(
+ unsigned Group,
+ std::vector<unsigned> &Regs,
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference> *RegRefs)
+{
+ for (unsigned Reg = 0; Reg != NumTargetRegs; ++Reg) {
+ if ((GetGroup(Reg) == Group) && (RegRefs->count(Reg) > 0))
+ Regs.push_back(Reg);
+ }
+}
+
+unsigned AggressiveAntiDepState::UnionGroups(unsigned Reg1, unsigned Reg2)
+{
+ assert(GroupNodes[0] == 0 && "GroupNode 0 not parent!");
+ assert(GroupNodeIndices[0] == 0 && "Reg 0 not in Group 0!");
+
+ // find group for each register
+ unsigned Group1 = GetGroup(Reg1);
+ unsigned Group2 = GetGroup(Reg2);
+
+ // if either group is 0, then that must become the parent
+ unsigned Parent = (Group1 == 0) ? Group1 : Group2;
+ unsigned Other = (Parent == Group1) ? Group2 : Group1;
+ GroupNodes.at(Other) = Parent;
+ return Parent;
+}
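// Editorial aside, not part of this patch: GetGroup, UnionGroups and
// LeaveGroup (below) form a small union-find over GroupNodes. GroupNodeIndices
// maps a register to a node; each node either points to itself (it is the
// group representative) or to another node in the same group. A minimal
// standalone sketch of the same idea, with hypothetical names and no LLVM
// types:
#include <vector>

struct RegGroups {
  std::vector<unsigned> Nodes;     // Nodes[N] == N  =>  N is a group root.
  std::vector<unsigned> NodeOfReg; // Register -> node index.

  explicit RegGroups(unsigned NumRegs) : Nodes(NumRegs), NodeOfReg(NumRegs) {
    for (unsigned R = 0; R < NumRegs; ++R) { Nodes[R] = R; NodeOfReg[R] = R; }
  }
  // GetGroup: chase parent links (no path compression).
  unsigned find(unsigned Reg) {
    unsigned N = NodeOfReg[Reg];
    while (Nodes[N] != N) N = Nodes[N];
    return N;
  }
  // UnionGroups: group 0 always becomes the parent, so register 0's
  // "unrenameable" group absorbs anything unioned with it.
  unsigned unite(unsigned Reg1, unsigned Reg2) {
    unsigned G1 = find(Reg1), G2 = find(Reg2);
    unsigned Parent = (G1 == 0) ? G1 : G2;
    Nodes[(Parent == G1) ? G2 : G1] = Parent;
    return Parent;
  }
  // LeaveGroup: give Reg a brand-new singleton group.
  unsigned leave(unsigned Reg) {
    unsigned Idx = Nodes.size();
    Nodes.push_back(Idx);
    NodeOfReg[Reg] = Idx;
    return Idx;
  }
};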
+
+unsigned AggressiveAntiDepState::LeaveGroup(unsigned Reg)
+{
+ // Create a new GroupNode for Reg. Reg's existing GroupNode must
+ // stay as is because there could be other GroupNodes referring to
+ // it.
+ unsigned idx = GroupNodes.size();
+ GroupNodes.push_back(idx);
+ GroupNodeIndices[Reg] = idx;
+ return idx;
+}
+
+bool AggressiveAntiDepState::IsLive(unsigned Reg)
+{
+ // KillIndex must be defined and DefIndex not defined for a register
+ // to be live.
+ return((KillIndices[Reg] != ~0u) && (DefIndices[Reg] == ~0u));
+}
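// Editorial illustration (not part of this patch) of the encoding checked
// above. The scanner walks a block bottom-up while instructions keep their
// top-down indices, e.g. for
//
//   i0:  R1 = ...        ; def of R1
//   i2:  ... = use R1    ; last use (kill) of R1
//
// visiting i2 records KillIndices[R1] = 2 and DefIndices[R1] = ~0u, making
// IsLive(R1) true between the def and the kill; visiting i0 later sets
// DefIndices[R1] = 0, so R1 is no longer considered live above its def.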
+
+AggressiveAntiDepBreaker::AggressiveAntiDepBreaker(
+ MachineFunction &MFi, const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs)
+ : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
+ TII(MF.getSubtarget().getInstrInfo()),
+ TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+ State(nullptr) {
+ /* Collect a bitset of all registers that are only broken if they
+ are on the critical path. */
+ for (unsigned i = 0, e = CriticalPathRCs.size(); i < e; ++i) {
+ BitVector CPSet = TRI->getAllocatableSet(MF, CriticalPathRCs[i]);
+ if (CriticalPathSet.none())
+ CriticalPathSet = CPSet;
+ else
+ CriticalPathSet |= CPSet;
+ }
+
+ DEBUG(dbgs() << "AntiDep Critical-Path Registers:");
+ DEBUG(for (int r = CriticalPathSet.find_first(); r != -1;
+ r = CriticalPathSet.find_next(r))
+ dbgs() << " " << TRI->getName(r));
+ DEBUG(dbgs() << '\n');
+}
+
+AggressiveAntiDepBreaker::~AggressiveAntiDepBreaker() {
+ delete State;
+}
+
+void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+ assert(!State);
+ State = new AggressiveAntiDepState(TRI->getNumRegs(), BB);
+
+ bool IsReturnBlock = BB->isReturnBlock();
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+
+ // Examine the live-in regs of all successors.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ for (const auto &LI : (*SI)->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ State->UnionGroups(Reg, 0);
+ KillIndices[Reg] = BB->size();
+ DefIndices[Reg] = ~0u;
+ }
+ }
+
+ // Mark live-out callee-saved registers. In a return block this is
+ // all callee-saved registers. In non-return this is any
+ // callee-saved register that is not saved in the prolog.
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ BitVector Pristine = MFI->getPristineRegs(MF);
+ for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+ unsigned Reg = *I;
+ if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ State->UnionGroups(AliasReg, 0);
+ KillIndices[AliasReg] = BB->size();
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+}
+
+void AggressiveAntiDepBreaker::FinishBlock() {
+ delete State;
+ State = nullptr;
+}
+
+void AggressiveAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) {
+ assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+ std::set<unsigned> PassthruRegs;
+ GetPassthruRegs(MI, PassthruRegs);
+ PrescanInstruction(MI, Count, PassthruRegs);
+ ScanInstruction(MI, Count);
+
+ DEBUG(dbgs() << "Observe: ");
+ DEBUG(MI.dump());
+ DEBUG(dbgs() << "\tRegs:");
+
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+ // If Reg is currently live, then mark that it can't be renamed as
+ // we don't know the extent of its live-range anymore (now that it
+ // has been scheduled). If it is not live but was defined in the
+ // previous schedule region, then set its def index to the most
+ // conservative location (i.e. the beginning of the previous
+ // schedule region).
+ if (State->IsLive(Reg)) {
+ DEBUG(if (State->GetGroup(Reg) != 0)
+ dbgs() << " " << TRI->getName(Reg) << "=g" <<
+ State->GetGroup(Reg) << "->g0(region live-out)");
+ State->UnionGroups(Reg, 0);
+ } else if ((DefIndices[Reg] < InsertPosIndex)
+ && (DefIndices[Reg] >= Count)) {
+ DefIndices[Reg] = Count;
+ }
+ }
+ DEBUG(dbgs() << '\n');
+}
+
+bool AggressiveAntiDepBreaker::IsImplicitDefUse(MachineInstr &MI,
+ MachineOperand &MO) {
+ if (!MO.isReg() || !MO.isImplicit())
+ return false;
+
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ return false;
+
+ MachineOperand *Op = nullptr;
+ if (MO.isDef())
+ Op = MI.findRegisterUseOperand(Reg, true);
+ else
+ Op = MI.findRegisterDefOperand(Reg);
+
+ return(Op && Op->isImplicit());
+}
+
+void AggressiveAntiDepBreaker::GetPassthruRegs(
+ MachineInstr &MI, std::set<unsigned> &PassthruRegs) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ if ((MO.isDef() && MI.isRegTiedToUseOperand(i)) ||
+ IsImplicitDefUse(MI, MO)) {
+ const unsigned Reg = MO.getReg();
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ PassthruRegs.insert(*SubRegs);
+ }
+ }
+}
+
+/// AntiDepEdges - Return in Edges the anti- and output- dependencies
+/// in SU that we want to consider for breaking.
+static void AntiDepEdges(const SUnit *SU, std::vector<const SDep*>& Edges) {
+ SmallSet<unsigned, 4> RegSet;
+ for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+ P != PE; ++P) {
+ if ((P->getKind() == SDep::Anti) || (P->getKind() == SDep::Output)) {
+ if (RegSet.insert(P->getReg()).second)
+ Edges.push_back(&*P);
+ }
+ }
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SUnit *CriticalPathStep(const SUnit *SU) {
+ const SDep *Next = nullptr;
+ unsigned NextDepth = 0;
+ // Find the predecessor edge with the greatest depth.
+ if (SU) {
+ for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+ P != PE; ++P) {
+ const SUnit *PredSU = P->getSUnit();
+ unsigned PredLatency = P->getLatency();
+ unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+ // In the case of a latency tie, prefer an anti-dependency edge over
+ // other types of edges.
+ if (NextDepth < PredTotalLatency ||
+ (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+ NextDepth = PredTotalLatency;
+ Next = &*P;
+ }
+ }
+ }
+
+ return (Next) ? Next->getSUnit() : nullptr;
+}
+
+void AggressiveAntiDepBreaker::HandleLastUse(unsigned Reg, unsigned KillIdx,
+ const char *tag,
+ const char *header,
+ const char *footer) {
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // FIXME: We must leave subregisters of live super registers as live, so that
+ // we don't clear out the register tracking information for subregisters of
+ // super registers we're still tracking (and with which we're unioning
+ // subregister definitions).
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI)) {
+ DEBUG(if (!header && footer) dbgs() << footer);
+ return;
+ }
+
+ if (!State->IsLive(Reg)) {
+ KillIndices[Reg] = KillIdx;
+ DefIndices[Reg] = ~0u;
+ RegRefs.erase(Reg);
+ State->LeaveGroup(Reg);
+ DEBUG(if (header) {
+ dbgs() << header << TRI->getName(Reg); header = nullptr; });
+ DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << tag);
+ // Repeat for subregisters. Note that we only do this if the superregister
+ // was not live because otherwise, regardless of whether we have an explicit
+ // use of the subregister, the subregister's contents are needed for the
+ // uses of the superregister.
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubregReg = *SubRegs;
+ if (!State->IsLive(SubregReg)) {
+ KillIndices[SubregReg] = KillIdx;
+ DefIndices[SubregReg] = ~0u;
+ RegRefs.erase(SubregReg);
+ State->LeaveGroup(SubregReg);
+ DEBUG(if (header) {
+ dbgs() << header << TRI->getName(Reg); header = nullptr; });
+ DEBUG(dbgs() << " " << TRI->getName(SubregReg) << "->g" <<
+ State->GetGroup(SubregReg) << tag);
+ }
+ }
+ }
+
+ DEBUG(if (!header && footer) dbgs() << footer);
+}
+
+void AggressiveAntiDepBreaker::PrescanInstruction(
+ MachineInstr &MI, unsigned Count, std::set<unsigned> &PassthruRegs) {
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // Handle dead defs by simulating a last-use of the register just
+ // after the def. A dead def can occur because the def is truly
+ // dead, or because only a subregister is live at the def. If we
+ // don't do this the dead def will be incorrectly merged into the
+ // previous def.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ HandleLastUse(Reg, Count + 1, "", "\tDead Def: ", "\n");
+ }
+
+ DEBUG(dbgs() << "\tDef Groups:");
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" << State->GetGroup(Reg));
+
+ // If MI's defs have a special allocation requirement, don't allow
+ // any def registers to be changed. Also assume all registers
+ // defined in a call must not be changed (ABI). Inline assembly may
+ // reference either system calls or the register directly. Skip it until we
+ // can tell user specified registers from compiler-specified.
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI) ||
+ MI.isInlineAsm()) {
+ DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ State->UnionGroups(Reg, 0);
+ }
+
+ // Any aliases that are live at this point are completely or
+ // partially defined here, so group those aliases with Reg.
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (State->IsLive(AliasReg)) {
+ State->UnionGroups(Reg, AliasReg);
+ DEBUG(dbgs() << "->g" << State->GetGroup(Reg) << "(via " <<
+ TRI->getName(AliasReg) << ")");
+ }
+ }
+
+ // Note register reference...
+ const TargetRegisterClass *RC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+ AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+ RegRefs.insert(std::make_pair(Reg, RR));
+ }
+
+ DEBUG(dbgs() << '\n');
+
+ // Scan the register defs for this instruction and update
+ // live-ranges.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ // Ignore KILLs and passthru registers for liveness...
+ if (MI.isKill() || (PassthruRegs.count(Reg) != 0))
+ continue;
+
+ // Update def for Reg and aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ // We need to be careful here not to define already-live super registers.
+ // If the super register is already live, then this definition is not
+ // a definition of the whole super register (just a partial insertion
+ // into it). Earlier subregister definitions (which we've not yet visited
+ // because we're iterating bottom-up) need to be linked to the same group
+ // as this definition.
+ if (TRI->isSuperRegister(Reg, *AI) && State->IsLive(*AI))
+ continue;
+
+ DefIndices[*AI] = Count;
+ }
+ }
+}
+
+void AggressiveAntiDepBreaker::ScanInstruction(MachineInstr &MI,
+ unsigned Count) {
+ DEBUG(dbgs() << "\tUse Groups:");
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // If MI's uses have special allocation requirement, don't allow
+ // any use registers to be changed. Also assume all registers
+ // used in a call must not be changed (ABI).
+ // Inline Assembly register uses also cannot be safely changed.
+ // FIXME: The issue with predicated instructions is more complex. We are being
+ // conservative here because the kill markers cannot be trusted after
+ // if-conversion:
+ // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+ // ...
+ // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+ // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+ // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+ //
+ // The first R6 kill is not really a kill since it's killed by a predicated
+ // instruction which may not be executed. The second R6 def may or may not
+ // re-define R6 so it's not safe to change it since the last R6 use cannot be
+ // changed.
+ bool Special = MI.isCall() || MI.hasExtraSrcRegAllocReq() ||
+ TII->isPredicated(MI) || MI.isInlineAsm();
+
+ // Scan the register uses for this instruction and update
+ // live-ranges, groups and RegRefs.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ DEBUG(dbgs() << " " << TRI->getName(Reg) << "=g" <<
+ State->GetGroup(Reg));
+
+ // If it wasn't previously live but now it is, this is a kill. Forget
+ // the previous live-range information and start a new live-range
+ // for the register.
+ HandleLastUse(Reg, Count, "(last-use)");
+
+ if (Special) {
+ DEBUG(if (State->GetGroup(Reg) != 0) dbgs() << "->g0(alloc-req)");
+ State->UnionGroups(Reg, 0);
+ }
+
+ // Note register reference...
+ const TargetRegisterClass *RC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ RC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+ AggressiveAntiDepState::RegisterReference RR = { &MO, RC };
+ RegRefs.insert(std::make_pair(Reg, RR));
+ }
+
+ DEBUG(dbgs() << '\n');
+
+ // Form a group of all defs and uses of a KILL instruction to ensure
+ // that all registers are renamed as a group.
+ if (MI.isKill()) {
+ DEBUG(dbgs() << "\tKill Group:");
+
+ unsigned FirstReg = 0;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (FirstReg != 0) {
+ DEBUG(dbgs() << "=" << TRI->getName(Reg));
+ State->UnionGroups(FirstReg, Reg);
+ } else {
+ DEBUG(dbgs() << " " << TRI->getName(Reg));
+ FirstReg = Reg;
+ }
+ }
+
+ DEBUG(dbgs() << "->g" << State->GetGroup(FirstReg) << '\n');
+ }
+}
+
+BitVector AggressiveAntiDepBreaker::GetRenameRegisters(unsigned Reg) {
+ BitVector BV(TRI->getNumRegs(), false);
+ bool first = true;
+
+ // Check all references that need rewriting for Reg. For each, use
+ // the corresponding register class to narrow the set of registers
+ // that are appropriate for renaming.
+ for (const auto &Q : make_range(State->GetRegRefs().equal_range(Reg))) {
+ const TargetRegisterClass *RC = Q.second.RC;
+ if (!RC) continue;
+
+ BitVector RCBV = TRI->getAllocatableSet(MF, RC);
+ if (first) {
+ BV |= RCBV;
+ first = false;
+ } else {
+ BV &= RCBV;
+ }
+
+ DEBUG(dbgs() << " " << TRI->getRegClassName(RC));
+ }
+
+ return BV;
+}
+
+bool AggressiveAntiDepBreaker::FindSuitableFreeRegisters(
+ unsigned AntiDepGroupIndex,
+ RenameOrderType& RenameOrder,
+ std::map<unsigned, unsigned> &RenameMap) {
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // Collect all referenced registers in the same group as
+ // AntiDepReg. These all need to be renamed together if we are to
+ // break the anti-dependence.
+ std::vector<unsigned> Regs;
+ State->GetGroupRegs(AntiDepGroupIndex, Regs, &RegRefs);
+ assert(Regs.size() > 0 && "Empty register group!");
+ if (Regs.size() == 0)
+ return false;
+
+ // Find the "superest" register in the group. At the same time,
+ // collect the BitVector of registers that can be used to rename
+ // each register.
+ DEBUG(dbgs() << "\tRename Candidates for Group g" << AntiDepGroupIndex
+ << ":\n");
+ std::map<unsigned, BitVector> RenameRegisterMap;
+ unsigned SuperReg = 0;
+ for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+ unsigned Reg = Regs[i];
+ if ((SuperReg == 0) || TRI->isSuperRegister(SuperReg, Reg))
+ SuperReg = Reg;
+
+ // If Reg has any references, then collect possible rename regs
+ if (RegRefs.count(Reg) > 0) {
+ DEBUG(dbgs() << "\t\t" << TRI->getName(Reg) << ":");
+
+ BitVector &BV = RenameRegisterMap[Reg];
+ assert(BV.empty());
+ BV = GetRenameRegisters(Reg);
+
+ DEBUG({
+ dbgs() << " ::";
+ for (int r = BV.find_first(); r != -1; r = BV.find_next(r))
+ dbgs() << " " << TRI->getName(r);
+ dbgs() << "\n";
+ });
+ }
+ }
+
+ // All group registers should be a subreg of SuperReg.
+ for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+ unsigned Reg = Regs[i];
+ if (Reg == SuperReg) continue;
+ bool IsSub = TRI->isSubRegister(SuperReg, Reg);
+ // FIXME: remove this once PR18663 has been properly fixed. For now,
+ // return a conservative answer:
+ // assert(IsSub && "Expecting group subregister");
+ if (!IsSub)
+ return false;
+ }
+
+#ifndef NDEBUG
+ // If DebugDiv > 0 then only rename (renamecnt % DebugDiv) == DebugMod
+ if (DebugDiv > 0) {
+ static int renamecnt = 0;
+ if (renamecnt++ % DebugDiv != DebugMod)
+ return false;
+
+ dbgs() << "*** Performing rename " << TRI->getName(SuperReg) <<
+ " for debug ***\n";
+ }
+#endif
+
+ // Check each possible rename register for SuperReg in round-robin
+ // order. If that register is available, and the corresponding
+ // registers are available for the other group subregisters, then we
+ // can use those registers to rename.
+
+ // FIXME: Using getMinimalPhysRegClass is very conservative. We should
+ // check every use of the register and find the largest register class
+ // that can be used in all of them.
+ const TargetRegisterClass *SuperRC =
+ TRI->getMinimalPhysRegClass(SuperReg, MVT::Other);
+
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(SuperRC);
+ if (Order.empty()) {
+ DEBUG(dbgs() << "\tEmpty Super Regclass!!\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "\tFind Registers:");
+
+ RenameOrder.insert(RenameOrderType::value_type(SuperRC, Order.size()));
+
+ unsigned OrigR = RenameOrder[SuperRC];
+ unsigned EndR = ((OrigR == Order.size()) ? 0 : OrigR);
+ unsigned R = OrigR;
+ do {
+ if (R == 0) R = Order.size();
+ --R;
+ const unsigned NewSuperReg = Order[R];
+ // Don't consider non-allocatable registers
+ if (!MRI.isAllocatable(NewSuperReg)) continue;
+ // Don't replace a register with itself.
+ if (NewSuperReg == SuperReg) continue;
+
+ DEBUG(dbgs() << " [" << TRI->getName(NewSuperReg) << ':');
+ RenameMap.clear();
+
+ // For each referenced group register (which must be a SuperReg or
+ // a subregister of SuperReg), find the corresponding subregister
+ // of NewSuperReg and make sure it is free to be renamed.
+ for (unsigned i = 0, e = Regs.size(); i != e; ++i) {
+ unsigned Reg = Regs[i];
+ unsigned NewReg = 0;
+ if (Reg == SuperReg) {
+ NewReg = NewSuperReg;
+ } else {
+ unsigned NewSubRegIdx = TRI->getSubRegIndex(SuperReg, Reg);
+ if (NewSubRegIdx != 0)
+ NewReg = TRI->getSubReg(NewSuperReg, NewSubRegIdx);
+ }
+
+ DEBUG(dbgs() << " " << TRI->getName(NewReg));
+
+ // Check if Reg can be renamed to NewReg.
+ if (!RenameRegisterMap[Reg].test(NewReg)) {
+ DEBUG(dbgs() << "(no rename)");
+ goto next_super_reg;
+ }
+
+ // If NewReg is dead and NewReg's most recent def is not before
+ // Reg's kill, it's safe to replace Reg with NewReg. We
+ // must also check all aliases of NewReg, because we can't define a
+ // register when any sub or super is already live.
+ if (State->IsLive(NewReg) || (KillIndices[Reg] > DefIndices[NewReg])) {
+ DEBUG(dbgs() << "(live)");
+ goto next_super_reg;
+ } else {
+ bool found = false;
+ for (MCRegAliasIterator AI(NewReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (State->IsLive(AliasReg) ||
+ (KillIndices[Reg] > DefIndices[AliasReg])) {
+ DEBUG(dbgs() << "(alias " << TRI->getName(AliasReg) << " live)");
+ found = true;
+ break;
+ }
+ }
+ if (found)
+ goto next_super_reg;
+ }
+
+ // We cannot rename 'Reg' to 'NewReg' if one of the uses of 'Reg' also
+ // defines 'NewReg' via an early-clobber operand.
+ for (const auto &Q : make_range(RegRefs.equal_range(Reg))) {
+ MachineInstr *UseMI = Q.second.Operand->getParent();
+ int Idx = UseMI->findRegisterDefOperandIdx(NewReg, false, true, TRI);
+ if (Idx == -1)
+ continue;
+
+ if (UseMI->getOperand(Idx).isEarlyClobber()) {
+ DEBUG(dbgs() << "(ec)");
+ goto next_super_reg;
+ }
+ }
+
+ // Also, we cannot rename 'Reg' to 'NewReg' if the instruction defining
+ // 'Reg' is an early-clobber define and that instruction also uses
+ // 'NewReg'.
+ for (const auto &Q : make_range(RegRefs.equal_range(Reg))) {
+ if (!Q.second.Operand->isDef() || !Q.second.Operand->isEarlyClobber())
+ continue;
+
+ MachineInstr *DefMI = Q.second.Operand->getParent();
+ if (DefMI->readsRegister(NewReg, TRI)) {
+ DEBUG(dbgs() << "(ec)");
+ goto next_super_reg;
+ }
+ }
+
+ // Record that 'Reg' can be renamed to 'NewReg'.
+ RenameMap.insert(std::pair<unsigned, unsigned>(Reg, NewReg));
+ }
+
+ // If we fall-out here, then every register in the group can be
+ // renamed, as recorded in RenameMap.
+ RenameOrder.erase(SuperRC);
+ RenameOrder.insert(RenameOrderType::value_type(SuperRC, R));
+ DEBUG(dbgs() << "]\n");
+ return true;
+
+ next_super_reg:
+ DEBUG(dbgs() << ']');
+ } while (R != EndR);
+
+ DEBUG(dbgs() << '\n');
+
+ // No registers are free and available!
+ return false;
+}
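// Editorial aside, not part of this patch: the RenameOrder bookkeeping above
// walks each register class's allocation order as a wrapping, backwards
// round-robin, so successive renames are spread across the class instead of
// always reusing the first free register. A stripped-down sketch of just that
// cursor, with hypothetical names:
#include <vector>

template <typename UsablePred>
int roundRobinPick(const std::vector<unsigned> &Order, unsigned &SavedPos,
                   UsablePred IsUsable) {
  unsigned OrigR = SavedPos; // Initialized to Order.size() the first time.
  unsigned EndR = (OrigR == Order.size()) ? 0 : OrigR;
  unsigned R = OrigR;
  do {
    if (R == 0) R = Order.size();
    --R;
    if (IsUsable(Order[R])) {
      SavedPos = R;          // The next call resumes just below this position.
      return static_cast<int>(R);
    }
  } while (R != EndR);
  return -1;                 // Wrapped all the way around: nothing usable.
}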
+
+/// BreakAntiDependencies - Identify anti-dependencies within the
+/// ScheduleDAG and break them by renaming registers.
+///
+unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
+ const std::vector<SUnit>& SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) {
+
+ std::vector<unsigned> &KillIndices = State->GetKillIndices();
+ std::vector<unsigned> &DefIndices = State->GetDefIndices();
+ std::multimap<unsigned, AggressiveAntiDepState::RegisterReference>&
+ RegRefs = State->GetRegRefs();
+
+ // The code below assumes that there is at least one instruction,
+ // so just duck out immediately if the block is empty.
+ if (SUnits.empty()) return 0;
+
+ // For each regclass the next register to use for renaming.
+ RenameOrderType RenameOrder;
+
+ // ...need a map from MI to SUnit.
+ std::map<MachineInstr *, const SUnit *> MISUnitMap;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ const SUnit *SU = &SUnits[i];
+ MISUnitMap.insert(std::pair<MachineInstr *, const SUnit *>(SU->getInstr(),
+ SU));
+ }
+
+ // Track progress along the critical path through the SUnit graph as
+ // we walk the instructions. This is needed for regclasses that only
+ // break critical-path anti-dependencies.
+ const SUnit *CriticalPathSU = nullptr;
+ MachineInstr *CriticalPathMI = nullptr;
+ if (CriticalPathSet.any()) {
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ const SUnit *SU = &SUnits[i];
+ if (!CriticalPathSU ||
+ ((SU->getDepth() + SU->Latency) >
+ (CriticalPathSU->getDepth() + CriticalPathSU->Latency))) {
+ CriticalPathSU = SU;
+ }
+ }
+
+ CriticalPathMI = CriticalPathSU->getInstr();
+ }
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "\n===== Aggressive anti-dependency breaking\n");
+ DEBUG(dbgs() << "Available regs:");
+ for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+ if (!State->IsLive(Reg))
+ DEBUG(dbgs() << " " << TRI->getName(Reg));
+ }
+ DEBUG(dbgs() << '\n');
+#endif
+
+ BitVector RegAliases(TRI->getNumRegs());
+
+ // Attempt to break anti-dependence edges. Walk the instructions
+ // from the bottom up, tracking information about liveness as we go
+ // to help determine which registers are available.
+ unsigned Broken = 0;
+ unsigned Count = InsertPosIndex - 1;
+ for (MachineBasicBlock::iterator I = End, E = Begin;
+ I != E; --Count) {
+ MachineInstr &MI = *--I;
+
+ if (MI.isDebugValue())
+ continue;
+
+ DEBUG(dbgs() << "Anti: ");
+ DEBUG(MI.dump());
+
+ std::set<unsigned> PassthruRegs;
+ GetPassthruRegs(MI, PassthruRegs);
+
+ // Process the defs in MI...
+ PrescanInstruction(MI, Count, PassthruRegs);
+
+ // The dependence edges that represent anti- and output-
+ // dependencies that are candidates for breaking.
+ std::vector<const SDep *> Edges;
+ const SUnit *PathSU = MISUnitMap[&MI];
+ AntiDepEdges(PathSU, Edges);
+
+ // If MI is not on the critical path, then we don't rename
+ // registers in the CriticalPathSet.
+ BitVector *ExcludeRegs = nullptr;
+ if (&MI == CriticalPathMI) {
+ CriticalPathSU = CriticalPathStep(CriticalPathSU);
+ CriticalPathMI = (CriticalPathSU) ? CriticalPathSU->getInstr() : nullptr;
+ } else if (CriticalPathSet.any()) {
+ ExcludeRegs = &CriticalPathSet;
+ }
+
+ // Ignore KILL instructions (they form a group in ScanInstruction
+ // but don't cause any anti-dependence breaking themselves)
+ if (!MI.isKill()) {
+ // Attempt to break each anti-dependency...
+ for (unsigned i = 0, e = Edges.size(); i != e; ++i) {
+ const SDep *Edge = Edges[i];
+ SUnit *NextSU = Edge->getSUnit();
+
+ if ((Edge->getKind() != SDep::Anti) &&
+ (Edge->getKind() != SDep::Output)) continue;
+
+ unsigned AntiDepReg = Edge->getReg();
+ DEBUG(dbgs() << "\tAntidep reg: " << TRI->getName(AntiDepReg));
+ assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+
+ if (!MRI.isAllocatable(AntiDepReg)) {
+ // Don't break anti-dependencies on non-allocatable registers.
+ DEBUG(dbgs() << " (non-allocatable)\n");
+ continue;
+ } else if (ExcludeRegs && ExcludeRegs->test(AntiDepReg)) {
+ // Don't break anti-dependencies for critical path registers
+ // if not on the critical path
+ DEBUG(dbgs() << " (not critical-path)\n");
+ continue;
+ } else if (PassthruRegs.count(AntiDepReg) != 0) {
+ // If the anti-dep register liveness "passes-thru", then
+ // don't try to change it. It will be changed along with
+ // the use if required to break an earlier antidep.
+ DEBUG(dbgs() << " (passthru)\n");
+ continue;
+ } else {
+ // No anti-dep breaking for implicit deps
+ MachineOperand *AntiDepOp = MI.findRegisterDefOperand(AntiDepReg);
+ assert(AntiDepOp && "Can't find index for defined register operand");
+ if (!AntiDepOp || AntiDepOp->isImplicit()) {
+ DEBUG(dbgs() << " (implicit)\n");
+ continue;
+ }
+
+ // If the SUnit has other dependencies on the SUnit that
+ // it anti-depends on, don't bother breaking the
+ // anti-dependency since those edges would prevent such
+ // units from being scheduled past each other
+ // regardless.
+ //
+ // Also, if there are dependencies on other SUnits with the
+ // same register as the anti-dependency, don't attempt to
+ // break it.
+ for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+ PE = PathSU->Preds.end(); P != PE; ++P) {
+ if (P->getSUnit() == NextSU ?
+ (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+ (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+ for (SUnit::const_pred_iterator P = PathSU->Preds.begin(),
+ PE = PathSU->Preds.end(); P != PE; ++P) {
+ if ((P->getSUnit() == NextSU) && (P->getKind() != SDep::Anti) &&
+ (P->getKind() != SDep::Output)) {
+ DEBUG(dbgs() << " (real dependency)\n");
+ AntiDepReg = 0;
+ break;
+ } else if ((P->getSUnit() != NextSU) &&
+ (P->getKind() == SDep::Data) &&
+ (P->getReg() == AntiDepReg)) {
+ DEBUG(dbgs() << " (other dependency)\n");
+ AntiDepReg = 0;
+ break;
+ }
+ }
+
+ if (AntiDepReg == 0) continue;
+
+ // If the definition of the anti-dependency register does not start
+ // a new live range, bail out. This can happen if the anti-dep
+ // register is a sub-register of another register whose live range
+ // spans over PathSU. In such a case, PathSU defines only a part of
+ // the larger register.
+ RegAliases.reset();
+ for (MCRegAliasIterator AI(AntiDepReg, TRI, true); AI.isValid(); ++AI)
+ RegAliases.set(*AI);
+ for (SDep S : PathSU->Succs) {
+ SDep::Kind K = S.getKind();
+ if (K != SDep::Data && K != SDep::Output && K != SDep::Anti)
+ continue;
+ unsigned R = S.getReg();
+ if (!RegAliases[R])
+ continue;
+ if (R == AntiDepReg || TRI->isSubRegister(AntiDepReg, R))
+ continue;
+ AntiDepReg = 0;
+ break;
+ }
+
+ if (AntiDepReg == 0) continue;
+ }
+
+ assert(AntiDepReg != 0);
+ if (AntiDepReg == 0) continue;
+
+ // Determine AntiDepReg's register group.
+ const unsigned GroupIndex = State->GetGroup(AntiDepReg);
+ if (GroupIndex == 0) {
+ DEBUG(dbgs() << " (zero group)\n");
+ continue;
+ }
+
+ DEBUG(dbgs() << '\n');
+
+ // Look for a suitable register to use to break the anti-dependence.
+ std::map<unsigned, unsigned> RenameMap;
+ if (FindSuitableFreeRegisters(GroupIndex, RenameOrder, RenameMap)) {
+ DEBUG(dbgs() << "\tBreaking anti-dependence edge on "
+ << TRI->getName(AntiDepReg) << ":");
+
+ // Handle each group register...
+ for (std::map<unsigned, unsigned>::iterator
+ S = RenameMap.begin(), E = RenameMap.end(); S != E; ++S) {
+ unsigned CurrReg = S->first;
+ unsigned NewReg = S->second;
+
+ DEBUG(dbgs() << " " << TRI->getName(CurrReg) << "->" <<
+ TRI->getName(NewReg) << "(" <<
+ RegRefs.count(CurrReg) << " refs)");
+
+ // Update the references to the old register CurrReg to
+ // refer to the new register NewReg.
+ for (const auto &Q : make_range(RegRefs.equal_range(CurrReg))) {
+ Q.second.Operand->setReg(NewReg);
+ // If the SU for the instruction being updated has debug
+ // information related to the anti-dependency register, make
+ // sure to update that as well.
+ const SUnit *SU = MISUnitMap[Q.second.Operand->getParent()];
+ if (!SU) continue;
+ for (DbgValueVector::iterator DVI = DbgValues.begin(),
+ DVE = DbgValues.end(); DVI != DVE; ++DVI)
+ if (DVI->second == Q.second.Operand->getParent())
+ UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
+ }
+
+ // We just went back in time and modified history; the
+ // liveness information for CurrReg is now inconsistent. Set
+ // the state as if it were dead.
+ State->UnionGroups(NewReg, 0);
+ RegRefs.erase(NewReg);
+ DefIndices[NewReg] = DefIndices[CurrReg];
+ KillIndices[NewReg] = KillIndices[CurrReg];
+
+ State->UnionGroups(CurrReg, 0);
+ RegRefs.erase(CurrReg);
+ DefIndices[CurrReg] = KillIndices[CurrReg];
+ KillIndices[CurrReg] = ~0u;
+ assert(((KillIndices[CurrReg] == ~0u) !=
+ (DefIndices[CurrReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+ }
+
+ ++Broken;
+ DEBUG(dbgs() << '\n');
+ }
+ }
+ }
+
+ ScanInstruction(MI, Count);
+ }
+
+ return Broken;
+}
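For orientation, the four entry points implemented above (StartBlock, BreakAntiDependencies, Observe, FinishBlock) are driven by the post-RA list scheduler (PostRASchedulerList.cpp, also added in this commit). The following is only a hedged, schematic sketch of that driver loop; the surrounding names are simplified and the real pass keeps considerably more state:

#include "AggressiveAntiDepBreaker.h"
using namespace llvm;

// Schematic driver (editorial sketch, not the actual pass): run an
// AntiDepBreaker over one basic block whose instructions have already been
// turned into SUnits for scheduling.
static void breakAntiDepsInBlock(MachineBasicBlock &MBB, AntiDepBreaker &ADB,
                                 std::vector<SUnit> &SUnits,
                                 AntiDepBreaker::DbgValueVector &DbgValues) {
  ADB.StartBlock(&MBB);            // Seed liveness from successors' live-ins
                                   // and callee-saved registers.
  unsigned Broken = ADB.BreakAntiDependencies(
      SUnits, MBB.begin(), MBB.end(), /*InsertPosIndex=*/MBB.size(),
      DbgValues);                  // Rename registers bottom-up.
  (void)Broken;                    // The real pass rebuilds liveness and the
                                   // dependence graph when Broken != 0.
  // Instructions outside the scheduling region would be shown to the breaker
  // with ADB.Observe(MI, Count, InsertPosIndex) to keep liveness consistent.
  ADB.FinishBlock();               // Drop the per-block state.
}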
diff --git a/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
new file mode 100644
index 000000000000..f97e6666b219
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -0,0 +1,179 @@
+//=- llvm/CodeGen/AggressiveAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AggressiveAntiDepBreaker class, which
+// implements register anti-dependence breaking during post-RA
+// scheduling. It attempts to break all anti-dependencies within a
+// block.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <map>
+
+namespace llvm {
+class RegisterClassInfo;
+
+ /// Contains all the state necessary for anti-dep breaking.
+class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepState {
+ public:
+ /// Information about a register reference within a live range
+ typedef struct {
+ /// The register's operand
+ MachineOperand *Operand;
+ /// The register class
+ const TargetRegisterClass *RC;
+ } RegisterReference;
+
+ private:
+ /// Number of non-virtual target registers (i.e. TRI->getNumRegs()).
+ const unsigned NumTargetRegs;
+
+ /// Implements a disjoint-union data structure to
+ /// form register groups. A node is represented by an index into
+ /// the vector. A node can "point to" itself to indicate that it
+ /// is the parent of a group, or point to another node to indicate
+ /// that it is a member of the same group as that node.
+ std::vector<unsigned> GroupNodes;
+
+ /// For each register, the index of the GroupNode
+ /// currently representing the group that the register belongs to.
+ /// Register 0 is always represented by the 0 group, a group
+ /// composed of registers that are not eligible for anti-aliasing.
+ std::vector<unsigned> GroupNodeIndices;
+
+ /// Map registers to all their references within a live range.
+ std::multimap<unsigned, RegisterReference> RegRefs;
+
+ /// The index of the most recent kill (proceeding bottom-up),
+ /// or ~0u if the register is not live.
+ std::vector<unsigned> KillIndices;
+
+ /// The index of the most recent complete def (proceeding bottom
+ /// up), or ~0u if the register is live.
+ std::vector<unsigned> DefIndices;
+
+ public:
+ AggressiveAntiDepState(const unsigned TargetRegs, MachineBasicBlock *BB);
+
+ /// Return the kill indices.
+ std::vector<unsigned> &GetKillIndices() { return KillIndices; }
+
+ /// Return the define indices.
+ std::vector<unsigned> &GetDefIndices() { return DefIndices; }
+
+ /// Return the RegRefs map.
+ std::multimap<unsigned, RegisterReference>& GetRegRefs() { return RegRefs; }
+
+ // Get the group for a register. The returned value is
+ // the index of the GroupNode representing the group.
+ unsigned GetGroup(unsigned Reg);
+
+ // Return a vector of the registers belonging to a group.
+ // If RegRefs is non-NULL then only include referenced registers.
+ void GetGroupRegs(
+ unsigned Group,
+ std::vector<unsigned> &Regs,
+ std::multimap<unsigned,
+ AggressiveAntiDepState::RegisterReference> *RegRefs);
+
+ // Union Reg1's and Reg2's groups to form a new group.
+ // Return the index of the GroupNode representing the group.
+ unsigned UnionGroups(unsigned Reg1, unsigned Reg2);
+
+ // Remove a register from its current group and place
+ // it alone in its own group. Return the index of the GroupNode
+ // representing the register's new group.
+ unsigned LeaveGroup(unsigned Reg);
+
+ /// Return true if Reg is live.
+ bool IsLive(unsigned Reg);
+ };
+
+ class LLVM_LIBRARY_VISIBILITY AggressiveAntiDepBreaker
+ : public AntiDepBreaker {
+ MachineFunction& MF;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const RegisterClassInfo &RegClassInfo;
+
+ /// The set of registers that should only be
+ /// renamed if they are on the critical path.
+ BitVector CriticalPathSet;
+
+ /// The state used to identify and rename anti-dependence registers.
+ AggressiveAntiDepState *State;
+
+ public:
+ AggressiveAntiDepBreaker(MachineFunction& MFi,
+ const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector& CriticalPathRCs);
+ ~AggressiveAntiDepBreaker() override;
+
+ /// Initialize anti-dep breaking for a new basic block.
+ void StartBlock(MachineBasicBlock *BB) override;
+
+ /// Identify anti-dependencies along the critical path
+ /// of the ScheduleDAG and break them by renaming registers.
+ ///
+ unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) override;
+
+ /// Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ ///
+ void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) override;
+
+ /// Finish anti-dep breaking for a basic block.
+ void FinishBlock() override;
+
+ private:
+ /// Keep track of a position in the allocation order for each regclass.
+ typedef std::map<const TargetRegisterClass *, unsigned> RenameOrderType;
+
+ /// Return true if MO represents a register
+ /// that is both implicitly used and defined in MI
+ bool IsImplicitDefUse(MachineInstr &MI, MachineOperand &MO);
+
+ /// If MI implicitly def/uses a register, then
+ /// return that register and all subregisters.
+ void GetPassthruRegs(MachineInstr &MI, std::set<unsigned> &PassthruRegs);
+
+ void HandleLastUse(unsigned Reg, unsigned KillIdx, const char *tag,
+ const char *header = nullptr,
+ const char *footer = nullptr);
+
+ void PrescanInstruction(MachineInstr &MI, unsigned Count,
+ std::set<unsigned> &PassthruRegs);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
+ BitVector GetRenameRegisters(unsigned Reg);
+ bool FindSuitableFreeRegisters(unsigned AntiDepGroupIndex,
+ RenameOrderType& RenameOrder,
+ std::map<unsigned, unsigned> &RenameMap);
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.cpp b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
new file mode 100644
index 000000000000..40451c0d6c19
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AllocationOrder.cpp
@@ -0,0 +1,54 @@
+//===-- llvm/CodeGen/AllocationOrder.cpp - Allocation Order ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Compare VirtRegMap::getRegAllocPref().
+AllocationOrder::AllocationOrder(unsigned VirtReg,
+ const VirtRegMap &VRM,
+ const RegisterClassInfo &RegClassInfo,
+ const LiveRegMatrix *Matrix)
+ : Pos(0) {
+ const MachineFunction &MF = VRM.getMachineFunction();
+ const TargetRegisterInfo *TRI = &VRM.getTargetRegInfo();
+ Order = RegClassInfo.getOrder(MF.getRegInfo().getRegClass(VirtReg));
+ TRI->getRegAllocationHints(VirtReg, Order, Hints, MF, &VRM, Matrix);
+ rewind();
+
+ DEBUG({
+ if (!Hints.empty()) {
+ dbgs() << "hints:";
+ for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+ dbgs() << ' ' << PrintReg(Hints[I], TRI);
+ dbgs() << '\n';
+ }
+ });
+#ifndef NDEBUG
+ for (unsigned I = 0, E = Hints.size(); I != E; ++I)
+ assert(std::find(Order.begin(), Order.end(), Hints[I]) != Order.end() &&
+ "Target hint is outside allocation order.");
+#endif
+}
diff --git a/contrib/llvm/lib/CodeGen/AllocationOrder.h b/contrib/llvm/lib/CodeGen/AllocationOrder.h
new file mode 100644
index 000000000000..2aee3a63a2b1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AllocationOrder.h
@@ -0,0 +1,89 @@
+//===-- llvm/CodeGen/AllocationOrder.h - Allocation Order -*- C++ -*-------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an allocation order for virtual registers.
+//
+// The preferred allocation order for a virtual register depends on allocation
+// hints and target hooks. The AllocationOrder class encapsulates all of that.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+#define LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/MC/MCRegisterInfo.h"
+
+namespace llvm {
+
+class RegisterClassInfo;
+class VirtRegMap;
+class LiveRegMatrix;
+
+class LLVM_LIBRARY_VISIBILITY AllocationOrder {
+ SmallVector<MCPhysReg, 16> Hints;
+ ArrayRef<MCPhysReg> Order;
+ int Pos;
+
+public:
+ /// Create a new AllocationOrder for VirtReg.
+ /// @param VirtReg Virtual register to allocate for.
+ /// @param VRM Virtual register map for function.
+ /// @param RegClassInfo Information about reserved and allocatable registers.
+ AllocationOrder(unsigned VirtReg,
+ const VirtRegMap &VRM,
+ const RegisterClassInfo &RegClassInfo,
+ const LiveRegMatrix *Matrix);
+
+ /// Get the allocation order without reordered hints.
+ ArrayRef<MCPhysReg> getOrder() const { return Order; }
+
+ /// Return the next physical register in the allocation order, or 0.
+ /// It is safe to call next() again after it has returned 0; it will keep
+ /// returning 0 until rewind() is called.
+ unsigned next(unsigned Limit = 0) {
+ if (Pos < 0)
+ return Hints.end()[Pos++];
+ if (!Limit)
+ Limit = Order.size();
+ while (Pos < int(Limit)) {
+ unsigned Reg = Order[Pos++];
+ if (!isHint(Reg))
+ return Reg;
+ }
+ return 0;
+ }
+
+ /// As next(), but allow duplicates to be returned, and stop before the
+ /// Limit'th register in the RegisterClassInfo allocation order.
+ ///
+ /// This can produce more than Limit registers if there are hints.
+ unsigned nextWithDups(unsigned Limit) {
+ if (Pos < 0)
+ return Hints.end()[Pos++];
+ if (Pos < int(Limit))
+ return Order[Pos++];
+ return 0;
+ }
+
+ /// Start over from the beginning.
+ void rewind() { Pos = -int(Hints.size()); }
+
+ /// Return true if the last register returned from next() was a preferred register.
+ bool isHint() const { return Pos <= 0; }
+
+ /// Return true if PhysReg is a preferred register.
+ bool isHint(unsigned PhysReg) const {
+ return std::find(Hints.begin(), Hints.end(), PhysReg) != Hints.end();
+ }
+};
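+
+// A minimal usage sketch, assuming VirtReg, VRM, RegClassInfo and Matrix are
+// provided by the register allocator driving the query:
+//
+//   AllocationOrder Order(VirtReg, VRM, RegClassInfo, Matrix);
+//   while (unsigned PhysReg = Order.next()) {
+//     // Hinted registers come back first; Order.isHint() is true for them.
+//     // Try to assign PhysReg here, or keep iterating.
+//   }
+//   Order.rewind(); // start over, e.g. for a second round that may evict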
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/Analysis.cpp b/contrib/llvm/lib/CodeGen/Analysis.cpp
new file mode 100644
index 000000000000..d69073458cdf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Analysis.cpp
@@ -0,0 +1,731 @@
+//===-- Analysis.cpp - CodeGen LLVM IR Analysis Utilities -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines several CodeGen-specific LLVM IR analysis utilities.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/GlobalStatus.h"
+
+using namespace llvm;
+
+/// Compute the linearized index of a member in a nested aggregate/struct/array
+/// by recursing and accumulating CurIndex as long as there are indices in the
+/// index list.
+unsigned llvm::ComputeLinearIndex(Type *Ty,
+ const unsigned *Indices,
+ const unsigned *IndicesEnd,
+ unsigned CurIndex) {
+ // Base case: We're done.
+ if (Indices && Indices == IndicesEnd)
+ return CurIndex;
+
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ for (StructType::element_iterator EB = STy->element_begin(),
+ EI = EB,
+ EE = STy->element_end();
+ EI != EE; ++EI) {
+ if (Indices && *Indices == unsigned(EI - EB))
+ return ComputeLinearIndex(*EI, Indices+1, IndicesEnd, CurIndex);
+ CurIndex = ComputeLinearIndex(*EI, nullptr, nullptr, CurIndex);
+ }
+ assert(!Indices && "Unexpected out of bound");
+ return CurIndex;
+ }
+ // Given an array type, recursively traverse the elements.
+ else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
+ unsigned NumElts = ATy->getNumElements();
+ // Compute the Linear offset when jumping one element of the array
+ unsigned EltLinearOffset = ComputeLinearIndex(EltTy, nullptr, nullptr, 0);
+ if (Indices) {
+ assert(*Indices < NumElts && "Unexpected out of bound");
+ // If the index is inside the array, compute the index to the requested
+ // elt and recurse inside the element with the end of the indices list
+ CurIndex += EltLinearOffset* *Indices;
+ return ComputeLinearIndex(EltTy, Indices+1, IndicesEnd, CurIndex);
+ }
+ CurIndex += EltLinearOffset*NumElts;
+ return CurIndex;
+ }
+ // We haven't found the type we're looking for, so keep searching.
+ return CurIndex + 1;
+}
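+
+// Worked example (a sketch; STy stands for the struct type named here): for
+// { i32, [2 x float], i64 } the flattened scalar order is i32(0), float(1),
+// float(2), i64(3), so the index list {2}, naming the i64 member,
+// linearizes to 3:
+//
+//   unsigned Idx[] = {2};
+//   unsigned Linear = ComputeLinearIndex(STy, Idx, Idx + 1, 0); // == 3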
+
+/// ComputeValueVTs - Given an LLVM IR type, compute a sequence of
+/// EVTs that represent all the individual underlying
+/// non-aggregate types that comprise it.
+///
+/// If Offsets is non-null, it points to a vector to be filled in
+/// with the in-memory offsets of each of the individual values.
+///
+void llvm::ComputeValueVTs(const TargetLowering &TLI, const DataLayout &DL,
+ Type *Ty, SmallVectorImpl<EVT> &ValueVTs,
+ SmallVectorImpl<uint64_t> *Offsets,
+ uint64_t StartingOffset) {
+ // Given a struct type, recursively traverse the elements.
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ for (StructType::element_iterator EB = STy->element_begin(),
+ EI = EB,
+ EE = STy->element_end();
+ EI != EE; ++EI)
+ ComputeValueVTs(TLI, DL, *EI, ValueVTs, Offsets,
+ StartingOffset + SL->getElementOffset(EI - EB));
+ return;
+ }
+ // Given an array type, recursively traverse the elements.
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ Type *EltTy = ATy->getElementType();
+ uint64_t EltSize = DL.getTypeAllocSize(EltTy);
+ for (unsigned i = 0, e = ATy->getNumElements(); i != e; ++i)
+ ComputeValueVTs(TLI, DL, EltTy, ValueVTs, Offsets,
+ StartingOffset + i * EltSize);
+ return;
+ }
+ // Interpret void as zero return values.
+ if (Ty->isVoidTy())
+ return;
+ // Base case: we can get an EVT for this LLVM IR type.
+ ValueVTs.push_back(TLI.getValueType(DL, Ty));
+ if (Offsets)
+ Offsets->push_back(StartingOffset);
+}
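+
+// Worked example (a sketch, assuming a DataLayout in which i32 and float are
+// both 4 bytes and the struct needs no padding; TLI, DL and STy stand for the
+// caller's lowering, layout and the struct type { i32, [2 x float] }):
+//
+//   SmallVector<EVT, 4> VTs;
+//   SmallVector<uint64_t, 4> Offs;
+//   ComputeValueVTs(TLI, DL, STy, VTs, &Offs);
+//   // VTs  == { i32, f32, f32 }
+//   // Offs == { 0, 4, 8 }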
+
+/// ExtractTypeInfo - Returns the type info, possibly bitcast, encoded in V.
+GlobalValue *llvm::ExtractTypeInfo(Value *V) {
+ V = V->stripPointerCasts();
+ GlobalValue *GV = dyn_cast<GlobalValue>(V);
+ GlobalVariable *Var = dyn_cast<GlobalVariable>(V);
+
+ if (Var && Var->getName() == "llvm.eh.catch.all.value") {
+ assert(Var->hasInitializer() &&
+ "The EH catch-all value must have an initializer");
+ Value *Init = Var->getInitializer();
+ GV = dyn_cast<GlobalValue>(Init);
+ if (!GV) V = cast<ConstantPointerNull>(Init);
+ }
+
+ assert((GV || isa<ConstantPointerNull>(V)) &&
+ "TypeInfo must be a global variable or NULL");
+ return GV;
+}
+
+/// hasInlineAsmMemConstraint - Return true if the inline asm instruction being
+/// processed uses a memory 'm' constraint.
+bool
+llvm::hasInlineAsmMemConstraint(InlineAsm::ConstraintInfoVector &CInfos,
+ const TargetLowering &TLI) {
+ for (unsigned i = 0, e = CInfos.size(); i != e; ++i) {
+ InlineAsm::ConstraintInfo &CI = CInfos[i];
+ for (unsigned j = 0, ee = CI.Codes.size(); j != ee; ++j) {
+ TargetLowering::ConstraintType CType = TLI.getConstraintType(CI.Codes[j]);
+ if (CType == TargetLowering::C_Memory)
+ return true;
+ }
+
+ // Indirect operand accesses access memory.
+ if (CI.isIndirect)
+ return true;
+ }
+
+ return false;
+}
+
+/// getFCmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR floating-point condition code. This includes
+/// consideration of global floating-point math flags.
+///
+ISD::CondCode llvm::getFCmpCondCode(FCmpInst::Predicate Pred) {
+ switch (Pred) {
+ case FCmpInst::FCMP_FALSE: return ISD::SETFALSE;
+ case FCmpInst::FCMP_OEQ: return ISD::SETOEQ;
+ case FCmpInst::FCMP_OGT: return ISD::SETOGT;
+ case FCmpInst::FCMP_OGE: return ISD::SETOGE;
+ case FCmpInst::FCMP_OLT: return ISD::SETOLT;
+ case FCmpInst::FCMP_OLE: return ISD::SETOLE;
+ case FCmpInst::FCMP_ONE: return ISD::SETONE;
+ case FCmpInst::FCMP_ORD: return ISD::SETO;
+ case FCmpInst::FCMP_UNO: return ISD::SETUO;
+ case FCmpInst::FCMP_UEQ: return ISD::SETUEQ;
+ case FCmpInst::FCMP_UGT: return ISD::SETUGT;
+ case FCmpInst::FCMP_UGE: return ISD::SETUGE;
+ case FCmpInst::FCMP_ULT: return ISD::SETULT;
+ case FCmpInst::FCMP_ULE: return ISD::SETULE;
+ case FCmpInst::FCMP_UNE: return ISD::SETUNE;
+ case FCmpInst::FCMP_TRUE: return ISD::SETTRUE;
+ default: llvm_unreachable("Invalid FCmp predicate opcode!");
+ }
+}
+
+ISD::CondCode llvm::getFCmpCodeWithoutNaN(ISD::CondCode CC) {
+ switch (CC) {
+ case ISD::SETOEQ: case ISD::SETUEQ: return ISD::SETEQ;
+ case ISD::SETONE: case ISD::SETUNE: return ISD::SETNE;
+ case ISD::SETOLT: case ISD::SETULT: return ISD::SETLT;
+ case ISD::SETOLE: case ISD::SETULE: return ISD::SETLE;
+ case ISD::SETOGT: case ISD::SETUGT: return ISD::SETGT;
+ case ISD::SETOGE: case ISD::SETUGE: return ISD::SETGE;
+ default: return CC;
+ }
+}
+
+/// getICmpCondCode - Return the ISD condition code corresponding to
+/// the given LLVM IR integer condition code.
+///
+ISD::CondCode llvm::getICmpCondCode(ICmpInst::Predicate Pred) {
+ switch (Pred) {
+ case ICmpInst::ICMP_EQ: return ISD::SETEQ;
+ case ICmpInst::ICMP_NE: return ISD::SETNE;
+ case ICmpInst::ICMP_SLE: return ISD::SETLE;
+ case ICmpInst::ICMP_ULE: return ISD::SETULE;
+ case ICmpInst::ICMP_SGE: return ISD::SETGE;
+ case ICmpInst::ICMP_UGE: return ISD::SETUGE;
+ case ICmpInst::ICMP_SLT: return ISD::SETLT;
+ case ICmpInst::ICMP_ULT: return ISD::SETULT;
+ case ICmpInst::ICMP_SGT: return ISD::SETGT;
+ case ICmpInst::ICMP_UGT: return ISD::SETUGT;
+ default:
+ llvm_unreachable("Invalid ICmp predicate opcode!");
+ }
+}
+
+static bool isNoopBitcast(Type *T1, Type *T2,
+ const TargetLoweringBase& TLI) {
+ return T1 == T2 || (T1->isPointerTy() && T2->isPointerTy()) ||
+ (isa<VectorType>(T1) && isa<VectorType>(T2) &&
+ TLI.isTypeLegal(EVT::getEVT(T1)) && TLI.isTypeLegal(EVT::getEVT(T2)));
+}
+
+/// Look through operations that will be free to find the earliest source of
+/// this value.
+///
+/// @param ValLoc If V has aggregate type, we will be interested in a particular
+/// scalar component. This records its address; the reverse of this list gives a
+/// sequence of indices appropriate for an extractvalue to locate the important
+/// value. This value is updated during the function and on exit will indicate
+/// similar information for the Value returned.
+///
+/// @param DataBits If this function looks through truncate instructions, this
+/// will record the smallest size attained.
+static const Value *getNoopInput(const Value *V,
+ SmallVectorImpl<unsigned> &ValLoc,
+ unsigned &DataBits,
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
+ while (true) {
+ // Try to look through V; if V is not an instruction, it can't be looked
+ // through.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || I->getNumOperands() == 0) return V;
+ const Value *NoopInput = nullptr;
+
+ Value *Op = I->getOperand(0);
+ if (isa<BitCastInst>(I)) {
+ // Look through truly no-op bitcasts.
+ if (isNoopBitcast(Op->getType(), I->getType(), TLI))
+ NoopInput = Op;
+ } else if (isa<GetElementPtrInst>(I)) {
+ // Look through getelementptr
+ if (cast<GetElementPtrInst>(I)->hasAllZeroIndices())
+ NoopInput = Op;
+ } else if (isa<IntToPtrInst>(I)) {
+ // Look through inttoptr.
+ // Make sure this isn't a truncating or extending cast. We could
+ // support this eventually, but don't bother for now.
+ if (!isa<VectorType>(I->getType()) &&
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(Op->getType())->getBitWidth())
+ NoopInput = Op;
+ } else if (isa<PtrToIntInst>(I)) {
+ // Look through ptrtoint.
+ // Make sure this isn't a truncating or extending cast. We could
+ // support this eventually, but don't bother for now.
+ if (!isa<VectorType>(I->getType()) &&
+ DL.getPointerSizeInBits() ==
+ cast<IntegerType>(I->getType())->getBitWidth())
+ NoopInput = Op;
+ } else if (isa<TruncInst>(I) &&
+ TLI.allowTruncateForTailCall(Op->getType(), I->getType())) {
+ DataBits = std::min(DataBits, I->getType()->getPrimitiveSizeInBits());
+ NoopInput = Op;
+ } else if (isa<CallInst>(I)) {
+ // Look through call (skipping callee)
+ for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 1;
+ i != e; ++i) {
+ unsigned attrInd = i - I->op_begin() + 1;
+ if (cast<CallInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
+ isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
+ NoopInput = *i;
+ break;
+ }
+ }
+ } else if (isa<InvokeInst>(I)) {
+ // Look through invoke (skipping BB, BB, Callee)
+ for (User::const_op_iterator i = I->op_begin(), e = I->op_end() - 3;
+ i != e; ++i) {
+ unsigned attrInd = i - I->op_begin() + 1;
+ if (cast<InvokeInst>(I)->paramHasAttr(attrInd, Attribute::Returned) &&
+ isNoopBitcast((*i)->getType(), I->getType(), TLI)) {
+ NoopInput = *i;
+ break;
+ }
+ }
+ } else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
+ // Value may come from either the aggregate or the scalar
+ ArrayRef<unsigned> InsertLoc = IVI->getIndices();
+ if (ValLoc.size() >= InsertLoc.size() &&
+ std::equal(InsertLoc.begin(), InsertLoc.end(), ValLoc.rbegin())) {
+ // The type being inserted is a nested sub-type of the aggregate; we
+ // have to remove those initial indices to get the location we're
+ // interested in for the operand.
+ ValLoc.resize(ValLoc.size() - InsertLoc.size());
+ NoopInput = IVI->getInsertedValueOperand();
+ } else {
+ // The struct we're inserting into has the value we're interested in, no
+ // change of address.
+ NoopInput = Op;
+ }
+ } else if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ // The part we're interested in will inevitably be some sub-section of the
+ // previous aggregate. Combine the two paths to obtain the true address of
+ // our element.
+ ArrayRef<unsigned> ExtractLoc = EVI->getIndices();
+ ValLoc.append(ExtractLoc.rbegin(), ExtractLoc.rend());
+ NoopInput = Op;
+ }
+ // Terminate if we couldn't find anything to look through.
+ if (!NoopInput)
+ return V;
+
+ V = NoopInput;
+ }
+}
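+
+// Worked example of the ValLoc convention (a sketch; V, TLI and DL stand for
+// the caller's value, lowering and layout): if the scalar of interest sits at
+// extractvalue indices {1, 0} of the aggregate, the caller passes the
+// reversed path {0, 1}, and getNoopInput keeps that path consistent while it
+// strips no-op bitcasts, all-zero GEPs, insertvalues and extractvalues:
+//
+//   SmallVector<unsigned, 4> ValLoc = {0, 1}; // reversed form of {1, 0}
+//   unsigned Bits = UINT_MAX;
+//   const Value *Src = getNoopInput(V, ValLoc, Bits, TLI, DL);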
+
+/// Return true if this scalar return value only has bits discarded on its path
+/// from the "tail call" to the "ret". This includes the obvious noop
+/// instructions handled by getNoopInput above as well as free truncations (or
+/// extensions prior to the call).
+static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
+ SmallVectorImpl<unsigned> &RetIndices,
+ SmallVectorImpl<unsigned> &CallIndices,
+ bool AllowDifferingSizes,
+ const TargetLoweringBase &TLI,
+ const DataLayout &DL) {
+
+ // Trace the sub-value needed by the return value as far back up the graph as
+ // possible, in the hope that it will intersect with the value produced by the
+ // call. In the simple case with no "returned" attribute, the hope is actually
+ // that we end up back at the tail call instruction itself.
+ unsigned BitsRequired = UINT_MAX;
+ RetVal = getNoopInput(RetVal, RetIndices, BitsRequired, TLI, DL);
+
+ // If this slot in the value returned is undef, it doesn't matter what the
+ // call puts there, it'll be fine.
+ if (isa<UndefValue>(RetVal))
+ return true;
+
+ // Now do a similar search up through the graph to find where the value
+ // actually returned by the "tail call" comes from. In the simple case without
+ // a "returned" attribute, the search is blocked immediately and the loop is
+ // a no-op.
+ unsigned BitsProvided = UINT_MAX;
+ CallVal = getNoopInput(CallVal, CallIndices, BitsProvided, TLI, DL);
+
+ // There's no hope if we can't actually trace them to (the same part of!) the
+ // same value.
+ if (CallVal != RetVal || CallIndices != RetIndices)
+ return false;
+
+ // However, intervening truncates may have made the call non-tail. Make sure
+ // all the bits that are needed by the "ret" have been provided by the "tail
+ // call". FIXME: with sufficiently cunning bit-tracking, we could look through
+ // extensions too.
+ if (BitsProvided < BitsRequired ||
+ (!AllowDifferingSizes && BitsProvided != BitsRequired))
+ return false;
+
+ return true;
+}
+
+/// For an aggregate type, determine whether a given index is within bounds or
+/// not.
+static bool indexReallyValid(CompositeType *T, unsigned Idx) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return Idx < AT->getNumElements();
+
+ return Idx < cast<StructType>(T)->getNumElements();
+}
+
+/// Move the given iterators to the next leaf type in depth first traversal.
+///
+/// Performs a depth-first traversal of the type as specified by its arguments,
+/// stopping at the next leaf node (which may be a legitimate scalar type or an
+/// empty struct or array).
+///
+/// @param SubTypes List of the partial components making up the type from
+/// outermost to innermost non-empty aggregate. The element currently
+/// represented is SubTypes.back()->getTypeAtIndex(Path.back() - 1).
+///
+/// @param Path Set of extractvalue indices leading from the outermost type
+/// (SubTypes[0]) to the leaf node currently represented.
+///
+/// @returns true if a new type was found, false otherwise. Calling this
+/// function again on a finished iterator will repeatedly return
+/// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
+/// aggregate or a non-aggregate.
+static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ // First march back up the tree until we can successfully increment one of the
+ // coordinates in Path.
+ while (!Path.empty() && !indexReallyValid(SubTypes.back(), Path.back() + 1)) {
+ Path.pop_back();
+ SubTypes.pop_back();
+ }
+
+ // If we reached the top, then the iterator is done.
+ if (Path.empty())
+ return false;
+
+ // We know there's *some* valid leaf now, so march back down the tree picking
+ // out the left-most element at each node.
+ ++Path.back();
+ Type *DeeperType = SubTypes.back()->getTypeAtIndex(Path.back());
+ while (DeeperType->isAggregateType()) {
+ CompositeType *CT = cast<CompositeType>(DeeperType);
+ if (!indexReallyValid(CT, 0))
+ return true;
+
+ SubTypes.push_back(CT);
+ Path.push_back(0);
+
+ DeeperType = CT->getTypeAtIndex(0U);
+ }
+
+ return true;
+}
+
+/// Find the first non-empty, scalar-like type in Next and setup the iterator
+/// components.
+///
+/// Assuming Next is an aggregate of some kind, this function will traverse the
+/// tree from left to right (i.e. depth-first) looking for the first
+/// non-aggregate type which will play a role in function return.
+///
+/// For example, if Next was {[0 x i64], {{}, i32, {}}, i32} then we would setup
+/// Path as [1, 1] and SubTypes as [Next, {{}, i32, {}}] to represent the first
+/// i32 in that type.
+static bool firstRealType(Type *Next,
+ SmallVectorImpl<CompositeType *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ // First initialise the iterator components to the first "leaf" node
+ // (i.e. node with no valid sub-type at any index, so {} does count as a leaf
+ // despite nominally being an aggregate).
+ while (Next->isAggregateType() &&
+ indexReallyValid(cast<CompositeType>(Next), 0)) {
+ SubTypes.push_back(cast<CompositeType>(Next));
+ Path.push_back(0);
+ Next = cast<CompositeType>(Next)->getTypeAtIndex(0U);
+ }
+
+ // If there's no Path now, Next was originally scalar already (or empty
+ // leaf). We're done.
+ if (Path.empty())
+ return true;
+
+ // Otherwise, use normal iteration to keep looking through the tree until we
+ // find a non-aggregate type.
+ while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType()) {
+ if (!advanceToNextLeafType(SubTypes, Path))
+ return false;
+ }
+
+ return true;
+}
+
+/// Set the iterator data-structures to the next non-empty, non-aggregate
+/// subtype.
+static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
+ SmallVectorImpl<unsigned> &Path) {
+ do {
+ if (!advanceToNextLeafType(SubTypes, Path))
+ return false;
+
+ assert(!Path.empty() && "found a leaf but didn't set the path?");
+ } while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType());
+
+ return true;
+}
+
+
+/// Test if the given instruction is in a position to be optimized
+/// with a tail-call. This roughly means that it's in a block with
+/// a return and there's nothing that needs to be scheduled
+/// between it and the return.
+///
+/// This function only tests target-independent requirements.
+bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
+ const Instruction *I = CS.getInstruction();
+ const BasicBlock *ExitBB = I->getParent();
+ const TerminatorInst *Term = ExitBB->getTerminator();
+ const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
+
+ // The block must end in a return statement or unreachable.
+ //
+ // FIXME: Decline tailcall if it's not guaranteed and if the block ends in
+ // an unreachable, for now. The way tailcall optimization is currently
+ // implemented means it will add an epilogue followed by a jump. That is
+ // not profitable. Also, if the callee is a special function (e.g.
+ // longjmp on x86), it can end up causing miscompilation that has not
+ // been fully understood.
+ if (!Ret &&
+ (!TM.Options.GuaranteedTailCallOpt || !isa<UnreachableInst>(Term)))
+ return false;
+
+ // If I will have a chain, make sure no other instruction that will have a
+ // chain interposes between I and the return.
+ if (I->mayHaveSideEffects() || I->mayReadFromMemory() ||
+ !isSafeToSpeculativelyExecute(I))
+ for (BasicBlock::const_iterator BBI = std::prev(ExitBB->end(), 2);; --BBI) {
+ if (&*BBI == I)
+ break;
+ // Debug info intrinsics do not get in the way of tail call optimization.
+ if (isa<DbgInfoIntrinsic>(BBI))
+ continue;
+ if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
+ !isSafeToSpeculativelyExecute(&*BBI))
+ return false;
+ }
+
+ const Function *F = ExitBB->getParent();
+ return returnTypeIsEligibleForTailCall(
+ F, I, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
+}
+
+bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
+ const Instruction *I,
+ const ReturnInst *Ret,
+ const TargetLoweringBase &TLI) {
+ // If the block ends with a void return or unreachable, it doesn't matter
+ // what the call's return type is.
+ if (!Ret || Ret->getNumOperands() == 0) return true;
+
+ // If the return value is undef, it doesn't matter what the call's
+ // return type is.
+ if (isa<UndefValue>(Ret->getOperand(0))) return true;
+
+ // Make sure the attributes attached to each return are compatible.
+ AttrBuilder CallerAttrs(F->getAttributes(),
+ AttributeSet::ReturnIndex);
+ AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
+ AttributeSet::ReturnIndex);
+
+ // Noalias is completely benign as far as calling convention goes, it
+ // shouldn't affect whether the call is a tail call.
+ CallerAttrs = CallerAttrs.removeAttribute(Attribute::NoAlias);
+ CalleeAttrs = CalleeAttrs.removeAttribute(Attribute::NoAlias);
+
+ bool AllowDifferingSizes = true;
+ if (CallerAttrs.contains(Attribute::ZExt)) {
+ if (!CalleeAttrs.contains(Attribute::ZExt))
+ return false;
+
+ AllowDifferingSizes = false;
+ CallerAttrs.removeAttribute(Attribute::ZExt);
+ CalleeAttrs.removeAttribute(Attribute::ZExt);
+ } else if (CallerAttrs.contains(Attribute::SExt)) {
+ if (!CalleeAttrs.contains(Attribute::SExt))
+ return false;
+
+ AllowDifferingSizes = false;
+ CallerAttrs.removeAttribute(Attribute::SExt);
+ CalleeAttrs.removeAttribute(Attribute::SExt);
+ }
+
+ // If they're still different, there's some facet we don't understand
+ // (currently only "inreg", but in future who knows). It may be OK but the
+ // only safe option is to reject the tail call.
+ if (CallerAttrs != CalleeAttrs)
+ return false;
+
+ const Value *RetVal = Ret->getOperand(0), *CallVal = I;
+ SmallVector<unsigned, 4> RetPath, CallPath;
+ SmallVector<CompositeType *, 4> RetSubTypes, CallSubTypes;
+
+ bool RetEmpty = !firstRealType(RetVal->getType(), RetSubTypes, RetPath);
+ bool CallEmpty = !firstRealType(CallVal->getType(), CallSubTypes, CallPath);
+
+ // Nothing's actually returned; it doesn't matter what the callee put there,
+ // it's a valid tail call.
+ if (RetEmpty)
+ return true;
+
+ // Iterate pairwise through each of the value types making up the tail call
+ // and the corresponding return. For each one we want to know whether it's
+ // essentially going directly from the tail call to the ret, via operations
+ // that end up not generating any code.
+ //
+ // We allow a certain amount of covariance here. For example it's permitted
+ // for the tail call to define more bits than the ret actually cares about
+ // (e.g. via a truncate).
+ do {
+ if (CallEmpty) {
+ // We've exhausted the values produced by the tail call instruction; the
+ // rest are essentially undef. The type doesn't really matter, but we need
+ // *something*.
+ Type *SlotType = RetSubTypes.back()->getTypeAtIndex(RetPath.back());
+ CallVal = UndefValue::get(SlotType);
+ }
+
+ // The manipulations performed when we're looking through an insertvalue or
+ // an extractvalue would happen at the front of the RetPath list, so since
+ // we have to copy it anyway it's more efficient to create a reversed copy.
+ SmallVector<unsigned, 4> TmpRetPath(RetPath.rbegin(), RetPath.rend());
+ SmallVector<unsigned, 4> TmpCallPath(CallPath.rbegin(), CallPath.rend());
+
+ // Finally, we can check whether the value produced by the tail call at this
+ // index is compatible with the value we return.
+ if (!slotOnlyDiscardsData(RetVal, CallVal, TmpRetPath, TmpCallPath,
+ AllowDifferingSizes, TLI,
+ F->getParent()->getDataLayout()))
+ return false;
+
+ CallEmpty = !nextRealType(CallSubTypes, CallPath);
+ } while(nextRealType(RetSubTypes, RetPath));
+
+ return true;
+}
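+
+// Illustrative cases (a sketch of the attribute rules above): if exactly one
+// of the function's return and the call's return value carries zeroext (or
+// sext), the tail call is rejected; if both carry it, the value sizes must
+// match exactly. Without those attributes the ret may discard bits, e.g.
+// through a truncate that TLI.allowTruncateForTailCall reports as free.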
+
+bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
+ if (!GV->hasLinkOnceODRLinkage())
+ return false;
+
+ // We assume that anyone who sets global unnamed_addr on a non-constant knows
+ // what they're doing.
+ if (GV->hasGlobalUnnamedAddr())
+ return true;
+
+ // If it is a non-constant variable, it needs to be uniqued across shared
+ // objects.
+ if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
+ if (!Var->isConstant())
+ return false;
+ }
+
+ return GV->hasAtLeastLocalUnnamedAddr();
+}
+
+static void collectFuncletMembers(
+ DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
+ const MachineBasicBlock *MBB) {
+ SmallVector<const MachineBasicBlock *, 16> Worklist = {MBB};
+ while (!Worklist.empty()) {
+ const MachineBasicBlock *Visiting = Worklist.pop_back_val();
+ // Don't follow blocks which start new funclets.
+ if (Visiting->isEHPad() && Visiting != MBB)
+ continue;
+
+ // Add this MBB to our funclet.
+ auto P = FuncletMembership.insert(std::make_pair(Visiting, Funclet));
+
+ // Don't revisit blocks.
+ if (!P.second) {
+ assert(P.first->second == Funclet && "MBB is part of two funclets!");
+ continue;
+ }
+
+ // Returns are boundaries where funclet transfer can occur; don't follow
+ // successors.
+ if (Visiting->isReturnBlock())
+ continue;
+
+ for (const MachineBasicBlock *Succ : Visiting->successors())
+ Worklist.push_back(Succ);
+ }
+}
+
+DenseMap<const MachineBasicBlock *, int>
+llvm::getFuncletMembership(const MachineFunction &MF) {
+ DenseMap<const MachineBasicBlock *, int> FuncletMembership;
+
+ // We don't have anything to do if there aren't any EH pads.
+ if (!MF.getMMI().hasEHFunclets())
+ return FuncletMembership;
+
+ int EntryBBNumber = MF.front().getNumber();
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction()->getPersonalityFn()));
+
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<const MachineBasicBlock *, 16> FuncletBlocks;
+ SmallVector<const MachineBasicBlock *, 16> UnreachableBlocks;
+ SmallVector<const MachineBasicBlock *, 16> SEHCatchPads;
+ SmallVector<std::pair<const MachineBasicBlock *, int>, 16> CatchRetSuccessors;
+ for (const MachineBasicBlock &MBB : MF) {
+ if (MBB.isEHFuncletEntry()) {
+ FuncletBlocks.push_back(&MBB);
+ } else if (IsSEH && MBB.isEHPad()) {
+ SEHCatchPads.push_back(&MBB);
+ } else if (MBB.pred_empty()) {
+ UnreachableBlocks.push_back(&MBB);
+ }
+
+ MachineBasicBlock::const_iterator MBBI = MBB.getFirstTerminator();
+ // CatchPads are not funclets for SEH so do not consider CatchRet to
+ // transfer control to another funclet.
+ if (MBBI->getOpcode() != TII->getCatchReturnOpcode())
+ continue;
+
+ // FIXME: SEH CatchPads are not necessarily in the parent function:
+ // they could be inside a finally block.
+ const MachineBasicBlock *Successor = MBBI->getOperand(0).getMBB();
+ const MachineBasicBlock *SuccessorColor = MBBI->getOperand(1).getMBB();
+ CatchRetSuccessors.push_back(
+ {Successor, IsSEH ? EntryBBNumber : SuccessorColor->getNumber()});
+ }
+
+ // We don't have anything to do if there aren't any EH funclets.
+ if (FuncletBlocks.empty())
+ return FuncletMembership;
+
+ // Identify all the basic blocks reachable from the function entry.
+ collectFuncletMembers(FuncletMembership, EntryBBNumber, &MF.front());
+ // All blocks not part of a funclet are in the parent function.
+ for (const MachineBasicBlock *MBB : UnreachableBlocks)
+ collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB);
+ // Next, identify all the blocks inside the funclets.
+ for (const MachineBasicBlock *MBB : FuncletBlocks)
+ collectFuncletMembers(FuncletMembership, MBB->getNumber(), MBB);
+ // SEH CatchPads aren't really funclets, handle them separately.
+ for (const MachineBasicBlock *MBB : SEHCatchPads)
+ collectFuncletMembers(FuncletMembership, EntryBBNumber, MBB);
+ // Finally, identify all the targets of a catchret.
+ for (std::pair<const MachineBasicBlock *, int> CatchRetPair :
+ CatchRetSuccessors)
+ collectFuncletMembers(FuncletMembership, CatchRetPair.second,
+ CatchRetPair.first);
+ return FuncletMembership;
+}
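+
+// A minimal usage sketch, assuming MF is a MachineFunction that has been
+// through funclet-aware EH preparation:
+//
+//   DenseMap<const MachineBasicBlock *, int> Colors = getFuncletMembership(MF);
+//   for (const MachineBasicBlock &MBB : MF)
+//     if (Colors.count(&MBB))
+//       dbgs() << "BB#" << MBB.getNumber() << " -> funclet "
+//              << Colors.lookup(&MBB) << '\n';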
diff --git a/contrib/llvm/lib/CodeGen/AntiDepBreaker.h b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
new file mode 100644
index 000000000000..04f7f419f5ea
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AntiDepBreaker.h
@@ -0,0 +1,67 @@
+//=- llvm/CodeGen/AntiDepBreaker.h - Anti-Dependence Breaking -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AntiDepBreaker class, which implements
+// anti-dependence breaking heuristics for post-register-allocation scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <vector>
+
+namespace llvm {
+
+/// This class works in conjunction with the post-RA scheduler to rename
+/// registers to break register anti-dependencies (WAR hazards).
+class LLVM_LIBRARY_VISIBILITY AntiDepBreaker {
+public:
+ typedef std::vector<std::pair<MachineInstr *, MachineInstr *> >
+ DbgValueVector;
+
+ virtual ~AntiDepBreaker();
+
+ /// Initialize anti-dep breaking for a new basic block.
+ virtual void StartBlock(MachineBasicBlock *BB) = 0;
+
+ /// Identify anti-dependencies within a basic-block region and break them by
+ /// renaming registers. Return the number of anti-dependencies broken.
+ virtual unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) = 0;
+
+ /// Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ virtual void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) = 0;
+
+ /// Finish anti-dep breaking for a basic block.
+ virtual void FinishBlock() = 0;
+
+ /// Update DBG_VALUE if the dependency breaker is updating
+ /// another machine instruction to use NewReg.
+ void UpdateDbgValue(MachineInstr &MI, unsigned OldReg, unsigned NewReg) {
+ assert(MI.isDebugValue() && "MI is not DBG_VALUE!");
+ if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg)
+ MI.getOperand(0).setReg(NewReg);
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
new file mode 100644
index 000000000000..5294c98e314d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -0,0 +1,139 @@
+//===-- CodeGen/AsmPrinter/ARMException.cpp - ARM EHABI Exception Impl ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+ARMException::ARMException(AsmPrinter *A) : DwarfCFIExceptionBase(A) {}
+
+ARMException::~ARMException() {}
+
+ARMTargetStreamer &ARMException::getTargetStreamer() {
+ MCTargetStreamer &TS = *Asm->OutStreamer->getTargetStreamer();
+ return static_cast<ARMTargetStreamer &>(TS);
+}
+
+/// endModule - Emit all exception information that should come after the
+/// content.
+void ARMException::endModule() {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->EmitCFISections(false, true);
+}
+
+void ARMException::beginFunction(const MachineFunction *MF) {
+ if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ getTargetStreamer().emitFnStart();
+ // See if we need call frame info.
+ AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves();
+ assert(MoveType != AsmPrinter::CFI_M_EH &&
+ "non-EH CFI not yet supported in prologue with EHABI lowering");
+ if (MoveType == AsmPrinter::CFI_M_Debug) {
+ shouldEmitCFI = true;
+ Asm->OutStreamer->EmitCFIStartProc(false);
+ }
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void ARMException::endFunction(const MachineFunction *MF) {
+ ARMTargetStreamer &ATS = getTargetStreamer();
+ const Function *F = MF->getFunction();
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ bool forceEmitPersonality =
+ F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ F->needsUnwindTableEntry();
+ bool shouldEmitPersonality = forceEmitPersonality ||
+ !MMI->getLandingPads().empty();
+ if (!Asm->MF->getFunction()->needsUnwindTableEntry() &&
+ !shouldEmitPersonality)
+ ATS.emitCantUnwind();
+ else if (shouldEmitPersonality) {
+ // Emit references to personality.
+ if (Per) {
+ MCSymbol *PerSym = Asm->getSymbol(Per);
+ Asm->OutStreamer->EmitSymbolAttribute(PerSym, MCSA_Global);
+ ATS.emitPersonality(PerSym);
+ }
+
+ // Emit .handlerdata directive.
+ ATS.emitHandlerData();
+
+ // Emit actual exception table
+ emitExceptionTable();
+ }
+
+ if (Asm->MAI->getExceptionHandlingType() == ExceptionHandling::ARM)
+ ATS.emitFnEnd();
+}
+
+void ARMException::emitTypeInfos(unsigned TTypeEncoding) {
+ const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+
+ bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ int Entry = 0;
+ // Emit the Catch TypeInfos.
+ if (VerboseAsm && !TypeInfos.empty()) {
+ Asm->OutStreamer->AddComment(">> Catch TypeInfos <<");
+ Asm->OutStreamer->AddBlankLine();
+ Entry = TypeInfos.size();
+ }
+
+ for (const GlobalValue *GV : reverse(TypeInfos)) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
+ Asm->EmitTTypeReference(GV, TTypeEncoding);
+ }
+
+ // Emit the Exception Specifications.
+ if (VerboseAsm && !FilterIds.empty()) {
+ Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
+ Asm->OutStreamer->AddBlankLine();
+ Entry = 0;
+ }
+ for (std::vector<unsigned>::const_iterator
+ I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+ unsigned TypeID = *I;
+ if (VerboseAsm) {
+ --Entry;
+ if (TypeID != 0)
+ Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
+ }
+
+ Asm->EmitTTypeReference((TypeID == 0 ? nullptr : TypeInfos[TypeID - 1]),
+ TTypeEncoding);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
new file mode 100644
index 000000000000..8c6838394ac9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -0,0 +1,45 @@
+//===-- llvm/CodeGen/AddressPool.cpp - Dwarf Debug Framework ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AddressPool.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+class MCExpr;
+
+unsigned AddressPool::getIndex(const MCSymbol *Sym, bool TLS) {
+ HasBeenUsed = true;
+ auto IterBool =
+ Pool.insert(std::make_pair(Sym, AddressPoolEntry(Pool.size(), TLS)));
+ return IterBool.first->second.Number;
+}
+
+// Emit addresses into the section given.
+void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
+ if (Pool.empty())
+ return;
+
+ // Start the dwarf addr section.
+ Asm.OutStreamer->SwitchSection(AddrSection);
+
+ // Order the address pool entries by ID
+ SmallVector<const MCExpr *, 64> Entries(Pool.size());
+
+ for (const auto &I : Pool)
+ Entries[I.second.Number] =
+ I.second.TLS
+ ? Asm.getObjFileLowering().getDebugThreadLocalSymbol(I.first)
+ : MCSymbolRefExpr::create(I.first, Asm.OutContext);
+
+ for (const MCExpr *Entry : Entries)
+ Asm.OutStreamer->EmitValue(Entry, Asm.getDataLayout().getPointerSize());
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
new file mode 100644
index 000000000000..ba3e3b7c315d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AddressPool.h
@@ -0,0 +1,52 @@
+//===-- llvm/CodeGen/AddressPool.h - Dwarf Debug Framework -----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ADDRESSPOOL_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/MC/MCSymbol.h"
+
+namespace llvm {
+class MCSection;
+class AsmPrinter;
+// Collection of addresses for this unit and assorted labels.
+// A Symbol->unsigned mapping of addresses used by indirect
+// references.
+class AddressPool {
+ struct AddressPoolEntry {
+ unsigned Number;
+ bool TLS;
+ AddressPoolEntry(unsigned Number, bool TLS) : Number(Number), TLS(TLS) {}
+ };
+ DenseMap<const MCSymbol *, AddressPoolEntry> Pool;
+
+ /// Record whether the AddressPool has been queried for an address index since
+ /// the last "resetUsedFlag" call. Used to implement type unit fallback - a
+ /// type that references addresses cannot be placed in a type unit when using
+ /// fission.
+ bool HasBeenUsed;
+
+public:
+ AddressPool() : HasBeenUsed(false) {}
+
+ /// \brief Returns the index into the address pool with the given
+ /// label/symbol.
+ unsigned getIndex(const MCSymbol *Sym, bool TLS = false);
+
+ void emit(AsmPrinter &Asm, MCSection *AddrSection);
+
+ bool isEmpty() { return Pool.empty(); }
+
+ bool hasBeenUsed() const { return HasBeenUsed; }
+
+ void resetUsedFlag() { HasBeenUsed = false; }
+};
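+
+// A minimal usage sketch, assuming Asm is the active AsmPrinter and Sym is an
+// MCSymbol whose address will be referenced indirectly (e.g. through
+// DW_FORM_GNU_addr_index):
+//
+//   AddressPool Pool;
+//   unsigned Idx = Pool.getIndex(Sym); // stable, insertion-ordered index
+//   ...
+//   Pool.emit(Asm, Asm.getObjFileLowering().getDwarfAddrSection());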
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
new file mode 100644
index 000000000000..272baceeed89
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -0,0 +1,2620 @@
+//===-- AsmPrinter.cpp - Common AsmPrinter code ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "CodeViewDebug.h"
+#include "DwarfDebug.h"
+#include "DwarfException.h"
+#include "WinException.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "Debug Info Emission";
+static const char *const EHTimerName = "DWARF Exception Writer";
+static const char *const CodeViewLineTablesGroupName = "CodeView Line Tables";
+
+STATISTIC(EmittedInsts, "Number of machine instrs printed");
+
+char AsmPrinter::ID = 0;
+
+typedef DenseMap<GCStrategy*, std::unique_ptr<GCMetadataPrinter>> gcp_map_type;
+static gcp_map_type &getGCMap(void *&P) {
+ if (!P)
+ P = new gcp_map_type();
+ return *(gcp_map_type*)P;
+}
+
+
+/// getGVAlignmentLog2 - Return the alignment to use for the specified global
+/// value in log2 form. This rounds up to the preferred alignment if possible
+/// and legal.
+static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
+ unsigned InBits = 0) {
+ unsigned NumBits = 0;
+ if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
+ NumBits = DL.getPreferredAlignmentLog(GVar);
+
+ // If InBits is specified, round up to it.
+ if (InBits > NumBits)
+ NumBits = InBits;
+
+ // If the GV has a specified alignment, take it into account.
+ if (GV->getAlignment() == 0)
+ return NumBits;
+
+ unsigned GVAlign = Log2_32(GV->getAlignment());
+
+ // If the GVAlign is larger than NumBits, or if we are required to obey
+ // NumBits because the GV has an assigned section, obey it.
+ if (GVAlign > NumBits || GV->hasSection())
+ NumBits = GVAlign;
+ return NumBits;
+}
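+
+// Worked example (a sketch; GV and SectionedGV are hypothetical globals with
+// the stated properties): for a global whose preferred alignment is 8
+// (log2 == 3) but which carries an explicit `align 16`, the result is 4,
+// i.e. 16 bytes. If it instead has an explicit `align 1` and a user-specified
+// section, the explicit alignment wins and 0 (1 byte) is returned:
+//
+//   getGVAlignmentLog2(GV, DL);          // == 4: explicit align 16 wins
+//   getGVAlignmentLog2(SectionedGV, DL); // == 0: align 1 + section, obey it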
+
+AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
+ : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
+ OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
+ LastMI(nullptr), LastFn(0), Counter(~0U) {
+ DD = nullptr;
+ MMI = nullptr;
+ LI = nullptr;
+ MF = nullptr;
+ CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr;
+ CurrentFnBegin = nullptr;
+ CurrentFnEnd = nullptr;
+ GCMetadataPrinters = nullptr;
+ VerboseAsm = OutStreamer->isVerboseAsm();
+}
+
+AsmPrinter::~AsmPrinter() {
+ assert(!DD && Handlers.empty() && "Debug/EH info didn't get finalized");
+
+ if (GCMetadataPrinters) {
+ gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
+
+ delete &GCMap;
+ GCMetadataPrinters = nullptr;
+ }
+}
+
+bool AsmPrinter::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
+
+/// getFunctionNumber - Return a unique ID for the current function.
+///
+unsigned AsmPrinter::getFunctionNumber() const {
+ return MF->getFunctionNumber();
+}
+
+const TargetLoweringObjectFile &AsmPrinter::getObjFileLowering() const {
+ return *TM.getObjFileLowering();
+}
+
+const DataLayout &AsmPrinter::getDataLayout() const {
+ return MMI->getModule()->getDataLayout();
+}
+
+// Do not use the cached DataLayout because some clients use it without a Module
+// (llvm-dsymutil, llvm-dwarfdump).
+unsigned AsmPrinter::getPointerSize() const { return TM.getPointerSize(); }
+
+const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
+ assert(MF && "getSubtargetInfo requires a valid MachineFunction!");
+ return MF->getSubtarget<MCSubtargetInfo>();
+}
+
+void AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
+ S.EmitInstruction(Inst, getSubtargetInfo());
+}
+
+StringRef AsmPrinter::getTargetTriple() const {
+ return TM.getTargetTriple().str();
+}
+
+/// getCurrentSection() - Return the current section we are emitting to.
+const MCSection *AsmPrinter::getCurrentSection() const {
+ return OutStreamer->getCurrentSection().first;
+}
+
+
+
+void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<GCModuleInfo>();
+ if (isVerbose())
+ AU.addRequired<MachineLoopInfo>();
+}
+
+bool AsmPrinter::doInitialization(Module &M) {
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+
+ // Initialize TargetLoweringObjectFile.
+ const_cast<TargetLoweringObjectFile&>(getObjFileLowering())
+ .Initialize(OutContext, TM);
+
+ OutStreamer->InitSections(false);
+
+ Mang = new Mangler();
+
+ // Emit the version-min deployment target directive if needed.
+ //
+ // FIXME: If we end up with a collection of these sorts of Darwin-specific
+ // or ELF-specific things, it may make sense to have a platform helper class
+ // that will work with the target helper class. For now keep it here, as the
+ // alternative is duplicated code in each of the target asm printers that
+ // use the directive, where it would need the same conditionalization
+ // anyway.
+ Triple TT(getTargetTriple());
+ // If there is a version specified, Major will be non-zero.
+ if (TT.isOSDarwin() && TT.getOSMajorVersion() != 0) {
+ unsigned Major, Minor, Update;
+ MCVersionMinType VersionType;
+ if (TT.isWatchOS()) {
+ VersionType = MCVM_WatchOSVersionMin;
+ TT.getWatchOSVersion(Major, Minor, Update);
+ } else if (TT.isTvOS()) {
+ VersionType = MCVM_TvOSVersionMin;
+ TT.getiOSVersion(Major, Minor, Update);
+ } else if (TT.isMacOSX()) {
+ VersionType = MCVM_OSXVersionMin;
+ if (!TT.getMacOSXVersion(Major, Minor, Update))
+ Major = 0;
+ } else {
+ VersionType = MCVM_IOSVersionMin;
+ TT.getiOSVersion(Major, Minor, Update);
+ }
+ if (Major != 0)
+ OutStreamer->EmitVersionMin(VersionType, Major, Minor, Update);
+ }
+
+ // Allow the target to emit any magic that it wants at the start of the file.
+ EmitStartOfAsmFile(M);
+
+ // Very minimal debug info. It is ignored if we emit actual debug info. If we
+ // don't, this at least helps the user find where a global came from.
+ if (MAI->hasSingleParameterDotFile()) {
+ // .file "foo.c"
+ OutStreamer->EmitFileDirective(M.getModuleIdentifier());
+ }
+
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ for (auto &I : *MI)
+ if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(*I))
+ MP->beginAssembly(M, *MI, *this);
+
+ // Emit module-level inline asm if it exists.
+ if (!M.getModuleInlineAsm().empty()) {
+ // We're at the module level. Construct MCSubtarget from the default CPU
+ // and target triple.
+ std::unique_ptr<MCSubtargetInfo> STI(TM.getTarget().createMCSubtargetInfo(
+ TM.getTargetTriple().str(), TM.getTargetCPU(),
+ TM.getTargetFeatureString()));
+ OutStreamer->AddComment("Start of file scope inline assembly");
+ OutStreamer->AddBlankLine();
+ EmitInlineAsm(M.getModuleInlineAsm()+"\n",
+ OutContext.getSubtargetCopy(*STI), TM.Options.MCOptions);
+ OutStreamer->AddComment("End of file scope inline assembly");
+ OutStreamer->AddBlankLine();
+ }
+
+ if (MAI->doesSupportDebugInformation()) {
+ bool EmitCodeView = MMI->getModule()->getCodeViewFlag();
+ if (EmitCodeView && TM.getTargetTriple().isKnownWindowsMSVCEnvironment()) {
+ Handlers.push_back(HandlerInfo(new CodeViewDebug(this),
+ DbgTimerName,
+ CodeViewLineTablesGroupName));
+ }
+ if (!EmitCodeView || MMI->getModule()->getDwarfVersion()) {
+ DD = new DwarfDebug(this, &M);
+ DD->beginModule();
+ Handlers.push_back(HandlerInfo(DD, DbgTimerName, DWARFGroupName));
+ }
+ }
+
+ EHStreamer *ES = nullptr;
+ switch (MAI->getExceptionHandlingType()) {
+ case ExceptionHandling::None:
+ break;
+ case ExceptionHandling::SjLj:
+ case ExceptionHandling::DwarfCFI:
+ ES = new DwarfCFIException(this);
+ break;
+ case ExceptionHandling::ARM:
+ ES = new ARMException(this);
+ break;
+ case ExceptionHandling::WinEH:
+ switch (MAI->getWinEHEncodingType()) {
+ default: llvm_unreachable("unsupported unwinding information encoding");
+ case WinEH::EncodingType::Invalid:
+ break;
+ case WinEH::EncodingType::X86:
+ case WinEH::EncodingType::Itanium:
+ ES = new WinException(this);
+ break;
+ }
+ break;
+ }
+ if (ES)
+ Handlers.push_back(HandlerInfo(ES, EHTimerName, DWARFGroupName));
+ return false;
+}
+
+static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
+ if (!MAI.hasWeakDefCanBeHiddenDirective())
+ return false;
+
+ return canBeOmittedFromSymbolTable(GV);
+}
+
+void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+ GlobalValue::LinkageTypes Linkage = GV->getLinkage();
+ switch (Linkage) {
+ case GlobalValue::CommonLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ if (MAI->hasWeakDefDirective()) {
+ // .globl _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+
+ if (!canBeHidden(GV, *MAI))
+ // .weak_definition _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
+ else
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate);
+ } else if (MAI->hasLinkOnceDirective()) {
+ // .globl _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ // NOTE: linkonce is handled by the section the symbol was assigned to.
+ } else {
+ // .weak _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
+ }
+ return;
+ case GlobalValue::ExternalLinkage:
+ // If external, declare as a global symbol: .globl _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ return;
+ case GlobalValue::PrivateLinkage:
+ case GlobalValue::InternalLinkage:
+ return;
+ case GlobalValue::AppendingLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
+ case GlobalValue::ExternalWeakLinkage:
+ llvm_unreachable("Should never emit this");
+ }
+ llvm_unreachable("Unknown linkage type!");
+}
+
+void AsmPrinter::getNameWithPrefix(SmallVectorImpl<char> &Name,
+ const GlobalValue *GV) const {
+ TM.getNameWithPrefix(Name, GV, *Mang);
+}
+
+MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
+ return TM.getSymbol(GV, *Mang);
+}
+
+/// EmitGlobalVariable - Emit the specified global variable to the .s file.
+void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+ bool IsEmuTLSVar = TM.Options.EmulatedTLS && GV->isThreadLocal();
+ assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) &&
+ "No emulated TLS variables in the common section");
+
+ // Never emit TLS variable xyz in emulated TLS model.
+ // The initialization value is in __emutls_t.xyz instead of xyz.
+ if (IsEmuTLSVar)
+ return;
+
+ if (GV->hasInitializer()) {
+ // Check to see if this is a special global used by LLVM, if so, emit it.
+ if (EmitSpecialLLVMGlobal(GV))
+ return;
+
+ // Skip the emission of global equivalents. The symbol can be emitted later
+ // on by emitGlobalGOTEquivs in case it turns out to be needed.
+ if (GlobalGOTEquivs.count(getSymbol(GV)))
+ return;
+
+ if (isVerbose()) {
+ // When printing the control variable __emutls_v.*,
+ // we don't need to print the original TLS variable name.
+ GV->printAsOperand(OutStreamer->GetCommentOS(),
+ /*PrintType=*/false, GV->getParent());
+ OutStreamer->GetCommentOS() << '\n';
+ }
+ }
+
+ MCSymbol *GVSym = getSymbol(GV);
+ MCSymbol *EmittedSym = GVSym;
+
+ // getOrCreateEmuTLSControlSym only creates the symbol with name and default
+ // attributes.
+ // GV's or GVSym's attributes will be used for the EmittedSym.
+ EmitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration());
+
+ if (!GV->hasInitializer()) // External globals require no extra code.
+ return;
+
+ GVSym->redefineIfPossible();
+ if (GVSym->isDefined() || GVSym->isVariable())
+ report_fatal_error("symbol '" + Twine(GVSym->getName()) +
+ "' is already defined");
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer->EmitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject);
+
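+  // Classify the global (common, BSS, thread-local, plain data, ...); the
+  // kind determines which directives and which section are used below.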
+ SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
+
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
+
+ // If the alignment is specified, we *must* obey it. Overaligning a global
+ // with a specified alignment is a prompt way to break globals emitted to
+ // sections and expected to be contiguous (e.g. ObjC metadata).
+ unsigned AlignLog = getGVAlignmentLog2(GV, DL);
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled);
+ HI.Handler->setSymbolSize(GVSym, Size);
+ }
+
+ // Handle common symbols
+ if (GVKind.isCommon()) {
+ if (Size == 0) Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ unsigned Align = 1 << AlignLog;
+ if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
+ Align = 0;
+
+ // .comm _foo, 42, 4
+ OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ return;
+ }
+
+ // Determine to which section this global should be emitted.
+ MCSection *TheSection =
+ getObjFileLowering().SectionForGlobal(GV, GVKind, *Mang, TM);
+
+ // If we have a bss global going to a section that supports the
+ // zerofill directive, do so here.
+ if (GVKind.isBSS() && MAI->hasMachoZeroFillDirective() &&
+ TheSection->isVirtualSection()) {
+ if (Size == 0)
+ Size = 1; // zerofill of 0 bytes is undefined.
+ unsigned Align = 1 << AlignLog;
+ EmitLinkage(GV, GVSym);
+ // .zerofill __DATA, __bss, _foo, 400, 5
+ OutStreamer->EmitZerofill(TheSection, GVSym, Size, Align);
+ return;
+ }
+
+  // If this is a BSS local symbol and we are emitting in the BSS
+  // section, use the .lcomm/.comm directive.
+ if (GVKind.isBSSLocal() &&
+ getObjFileLowering().getBSSSection() == TheSection) {
+ if (Size == 0)
+ Size = 1; // .comm Foo, 0 is undefined, avoid it.
+ unsigned Align = 1 << AlignLog;
+
+ // Use .lcomm only if it supports user-specified alignment.
+ // Otherwise, while it would still be correct to use .lcomm in some
+    // cases (e.g. when Align == 1), the external assembler might enforce
+ // some -unknown- default alignment behavior, which could cause
+ // spurious differences between external and integrated assembler.
+ // Prefer to simply fall back to .local / .comm in this case.
+ if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
+ // .lcomm _foo, 42
+ OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Align);
+ return;
+ }
+
+ if (!getObjFileLowering().getCommDirectiveSupportsAlignment())
+ Align = 0;
+
+ // .local _foo
+ OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Local);
+ // .comm _foo, 42, 4
+ OutStreamer->EmitCommonSymbol(GVSym, Size, Align);
+ return;
+ }
+
+  // Handle thread local data for mach-o, which requires us to output an
+ // additional structure of data and mangle the original symbol so that we
+ // can reference it later.
+ //
+ // TODO: This should become an "emit thread local global" method on TLOF.
+ // All of this macho specific stuff should be sunk down into TLOFMachO and
+ // stuff like "TLSExtraDataSection" should no longer be part of the parent
+ // TLOF class. This will also make it more obvious that stuff like
+ // MCStreamer::EmitTBSSSymbol is macho specific and only called from macho
+ // specific code.
+ if (GVKind.isThreadLocal() && MAI->hasMachoTBSSDirective()) {
+ // Emit the .tbss symbol
+ MCSymbol *MangSym =
+ OutContext.getOrCreateSymbol(GVSym->getName() + Twine("$tlv$init"));
+
+ if (GVKind.isThreadBSS()) {
+ TheSection = getObjFileLowering().getTLSBSSSection();
+ OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, 1 << AlignLog);
+ } else if (GVKind.isThreadData()) {
+ OutStreamer->SwitchSection(TheSection);
+
+ EmitAlignment(AlignLog, GV);
+ OutStreamer->EmitLabel(MangSym);
+
+ EmitGlobalConstant(GV->getParent()->getDataLayout(),
+ GV->getInitializer());
+ }
+
+ OutStreamer->AddBlankLine();
+
+ // Emit the variable struct for the runtime.
+ MCSection *TLVSect = getObjFileLowering().getTLSExtraDataSection();
+
+ OutStreamer->SwitchSection(TLVSect);
+ // Emit the linkage here.
+ EmitLinkage(GV, GVSym);
+ OutStreamer->EmitLabel(GVSym);
+
+ // Three pointers in size:
+ // - __tlv_bootstrap - used to make sure support exists
+ // - spare pointer, used when mapped by the runtime
+ // - pointer to mangled symbol above with initializer
+ unsigned PtrSize = DL.getPointerTypeSize(GV->getType());
+ OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
+ PtrSize);
+ OutStreamer->EmitIntValue(0, PtrSize);
+ OutStreamer->EmitSymbolValue(MangSym, PtrSize);
+
+ OutStreamer->AddBlankLine();
+ return;
+ }
+
+ MCSymbol *EmittedInitSym = GVSym;
+
+ OutStreamer->SwitchSection(TheSection);
+
+ EmitLinkage(GV, EmittedInitSym);
+ EmitAlignment(AlignLog, GV);
+
+ OutStreamer->EmitLabel(EmittedInitSym);
+
+ EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ // .size foo, 42
+ OutStreamer->emitELFSize(cast<MCSymbolELF>(EmittedInitSym),
+ MCConstantExpr::create(Size, OutContext));
+
+ OutStreamer->AddBlankLine();
+}
+
+/// EmitFunctionHeader - This method emits the header for the current
+/// function.
+void AsmPrinter::EmitFunctionHeader() {
+ // Print out constants referenced by the function
+ EmitConstantPool();
+
+ // Print the 'header' of function.
+ const Function *F = MF->getFunction();
+
+ OutStreamer->SwitchSection(
+ getObjFileLowering().SectionForGlobal(F, *Mang, TM));
+ EmitVisibility(CurrentFnSym, F->getVisibility());
+
+ EmitLinkage(F, CurrentFnSym);
+ if (MAI->hasFunctionAlignment())
+ EmitAlignment(MF->getAlignment(), F);
+
+ if (MAI->hasDotTypeDotSizeDirective())
+ OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_ELF_TypeFunction);
+
+ if (isVerbose()) {
+ F->printAsOperand(OutStreamer->GetCommentOS(),
+ /*PrintType=*/false, F->getParent());
+ OutStreamer->GetCommentOS() << '\n';
+ }
+
+ // Emit the prefix data.
+ if (F->hasPrefixData())
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+
+ // Emit the CurrentFnSym. This is a virtual function to allow targets to
+ // do their wild and crazy things as required.
+ EmitFunctionEntryLabel();
+
+ // If the function had address-taken blocks that got deleted, then we have
+ // references to the dangling symbols. Emit them at the start of the function
+ // so that we don't get references to undefined symbols.
+ std::vector<MCSymbol*> DeadBlockSyms;
+ MMI->takeDeletedSymbolsForFunction(F, DeadBlockSyms);
+ for (unsigned i = 0, e = DeadBlockSyms.size(); i != e; ++i) {
+ OutStreamer->AddComment("Address taken block that was later removed");
+ OutStreamer->EmitLabel(DeadBlockSyms[i]);
+ }
+
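+  // Emit the function-begin label (or an equivalent assignment when the
+  // target requires it) so EH and debug info can reference the start of the
+  // function.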
+ if (CurrentFnBegin) {
+ if (MAI->useAssignmentForEHBegin()) {
+ MCSymbol *CurPos = OutContext.createTempSymbol();
+ OutStreamer->EmitLabel(CurPos);
+ OutStreamer->EmitAssignment(CurrentFnBegin,
+ MCSymbolRefExpr::create(CurPos, OutContext));
+ } else {
+ OutStreamer->EmitLabel(CurrentFnBegin);
+ }
+ }
+
+ // Emit pre-function debug and/or EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled);
+ HI.Handler->beginFunction(MF);
+ }
+
+ // Emit the prologue data.
+ if (F->hasPrologueData())
+ EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrologueData());
+}
+
+/// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
+/// function. This can be overridden by targets as required to do custom stuff.
+void AsmPrinter::EmitFunctionEntryLabel() {
+ CurrentFnSym->redefineIfPossible();
+
+ // The function label could have already been emitted if two symbols end up
+ // conflicting due to asm renaming. Detect this and emit an error.
+ if (CurrentFnSym->isVariable())
+ report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+ "' is a protected alias");
+ if (CurrentFnSym->isDefined())
+ report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
+ "' label emitted multiple times to assembly file");
+
+ return OutStreamer->EmitLabel(CurrentFnSym);
+}
+
+/// emitComments - Pretty-print comments for instructions.
+static void emitComments(const MachineInstr &MI, raw_ostream &CommentOS) {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ // Check for spills and reloads
+ int FI;
+
+ const MachineFrameInfo *FrameInfo = MF->getFrameInfo();
+
+ // We assume a single instruction only has a spill or reload, not
+ // both.
+ const MachineMemOperand *MMO;
+ if (TII->isLoadFromStackSlotPostFE(MI, FI)) {
+ if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+ MMO = *MI.memoperands_begin();
+ CommentOS << MMO->getSize() << "-byte Reload\n";
+ }
+ } else if (TII->hasLoadFromStackSlot(MI, MMO, FI)) {
+ if (FrameInfo->isSpillSlotObjectIndex(FI))
+ CommentOS << MMO->getSize() << "-byte Folded Reload\n";
+ } else if (TII->isStoreToStackSlotPostFE(MI, FI)) {
+ if (FrameInfo->isSpillSlotObjectIndex(FI)) {
+ MMO = *MI.memoperands_begin();
+ CommentOS << MMO->getSize() << "-byte Spill\n";
+ }
+ } else if (TII->hasStoreToStackSlot(MI, MMO, FI)) {
+ if (FrameInfo->isSpillSlotObjectIndex(FI))
+ CommentOS << MMO->getSize() << "-byte Folded Spill\n";
+ }
+
+ // Check for spill-induced copies
+ if (MI.getAsmPrinterFlag(MachineInstr::ReloadReuse))
+ CommentOS << " Reload Reuse\n";
+}
+
+/// emitImplicitDef - This method emits the specified machine instruction
+/// that is an implicit def.
+void AsmPrinter::emitImplicitDef(const MachineInstr *MI) const {
+ unsigned RegNo = MI->getOperand(0).getReg();
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "implicit-def: "
+ << PrintReg(RegNo, MF->getSubtarget().getRegisterInfo());
+
+ OutStreamer->AddComment(OS.str());
+ OutStreamer->AddBlankLine();
+}
+
+static void emitKill(const MachineInstr *MI, AsmPrinter &AP) {
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << "kill:";
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI->getOperand(i);
+ assert(Op.isReg() && "KILL instruction must have only register operands");
+ OS << ' '
+ << PrintReg(Op.getReg(),
+ AP.MF->getSubtarget().getRegisterInfo())
+ << (Op.isDef() ? "<def>" : "<kill>");
+ }
+ AP.OutStreamer->AddComment(OS.str());
+ AP.OutStreamer->AddBlankLine();
+}
+
+/// emitDebugValueComment - This method handles the target-independent form
+/// of DBG_VALUE, returning true if it was able to do so. A false return
+/// means the target will need to handle MI in EmitInstruction.
+static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
+ // This code handles only the 4-operand target-independent form.
+ if (MI->getNumOperands() != 4)
+ return false;
+
+ SmallString<128> Str;
+ raw_svector_ostream OS(Str);
+ OS << "DEBUG_VALUE: ";
+
+ const DILocalVariable *V = MI->getDebugVariable();
+ if (auto *SP = dyn_cast<DISubprogram>(V->getScope())) {
+ StringRef Name = SP->getDisplayName();
+ if (!Name.empty())
+ OS << Name << ":";
+ }
+ OS << V->getName();
+
+ const DIExpression *Expr = MI->getDebugExpression();
+ if (Expr->isBitPiece())
+ OS << " [bit_piece offset=" << Expr->getBitPieceOffset()
+ << " size=" << Expr->getBitPieceSize() << "]";
+ OS << " <- ";
+
+ // The second operand is only an offset if it's an immediate.
+ bool Deref = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
+ int64_t Offset = Deref ? MI->getOperand(1).getImm() : 0;
+
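+  // Walk the DIExpression, folding simple DW_OP_plus/DW_OP_minus elements
+  // into Offset; anything more involved is just summarized as a complex
+  // expression in the comment.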
+ for (unsigned i = 0; i < Expr->getNumElements(); ++i) {
+ if (Deref) {
+ // We currently don't support extra Offsets or derefs after the first
+ // one. Bail out early instead of emitting an incorrect comment
+ OS << " [complex expression]";
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+ }
+ uint64_t Op = Expr->getElement(i);
+ if (Op == dwarf::DW_OP_deref) {
+ Deref = true;
+ continue;
+ } else if (Op == dwarf::DW_OP_bit_piece) {
+ // There can't be any operands after this in a valid expression
+ break;
+ }
+ uint64_t ExtraOffset = Expr->getElement(i++);
+ if (Op == dwarf::DW_OP_plus)
+ Offset += ExtraOffset;
+ else {
+ assert(Op == dwarf::DW_OP_minus);
+ Offset -= ExtraOffset;
+ }
+ }
+
+ // Register or immediate value. Register 0 means undef.
+ if (MI->getOperand(0).isFPImm()) {
+ APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF());
+ if (MI->getOperand(0).getFPImm()->getType()->isFloatTy()) {
+ OS << (double)APF.convertToFloat();
+ } else if (MI->getOperand(0).getFPImm()->getType()->isDoubleTy()) {
+ OS << APF.convertToDouble();
+ } else {
+ // There is no good way to print long double. Convert a copy to
+ // double. Ah well, it's only a comment.
+ bool ignored;
+ APF.convert(APFloat::IEEEdouble, APFloat::rmNearestTiesToEven,
+ &ignored);
+ OS << "(long double) " << APF.convertToDouble();
+ }
+ } else if (MI->getOperand(0).isImm()) {
+ OS << MI->getOperand(0).getImm();
+ } else if (MI->getOperand(0).isCImm()) {
+ MI->getOperand(0).getCImm()->getValue().print(OS, false /*isSigned*/);
+ } else {
+ unsigned Reg;
+ if (MI->getOperand(0).isReg()) {
+ Reg = MI->getOperand(0).getReg();
+ } else {
+ assert(MI->getOperand(0).isFI() && "Unknown operand type");
+ const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
+ Offset += TFI->getFrameIndexReference(*AP.MF,
+ MI->getOperand(0).getIndex(), Reg);
+ Deref = true;
+ }
+ if (Reg == 0) {
+ // Suppress offset, it is not meaningful here.
+ OS << "undef";
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+ }
+ if (Deref)
+ OS << '[';
+ OS << PrintReg(Reg, AP.MF->getSubtarget().getRegisterInfo());
+ }
+
+ if (Deref)
+ OS << '+' << Offset << ']';
+
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ AP.OutStreamer->emitRawComment(OS.str());
+ return true;
+}
+
+AsmPrinter::CFIMoveType AsmPrinter::needsCFIMoves() {
+ if (MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI &&
+ MF->getFunction()->needsUnwindTableEntry())
+ return CFI_M_EH;
+
+ if (MMI->hasDebugInfo())
+ return CFI_M_Debug;
+
+ return CFI_M_None;
+}
+
+bool AsmPrinter::needsSEHMoves() {
+ return MAI->usesWindowsCFI() && MF->getFunction()->needsUnwindTableEntry();
+}
+
+void AsmPrinter::emitCFIInstruction(const MachineInstr &MI) {
+ ExceptionHandling ExceptionHandlingType = MAI->getExceptionHandlingType();
+ if (ExceptionHandlingType != ExceptionHandling::DwarfCFI &&
+ ExceptionHandlingType != ExceptionHandling::ARM)
+ return;
+
+ if (needsCFIMoves() == CFI_M_None)
+ return;
+
+ const MachineModuleInfo &MMI = MF->getMMI();
+ const std::vector<MCCFIInstruction> &Instrs = MMI.getFrameInstructions();
+ unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
+ const MCCFIInstruction &CFI = Instrs[CFIIndex];
+ emitCFIInstruction(CFI);
+}
+
+void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
+ // The operands are the MCSymbol and the frame offset of the allocation.
+ MCSymbol *FrameAllocSym = MI.getOperand(0).getMCSymbol();
+ int FrameOffset = MI.getOperand(1).getImm();
+
+ // Emit a symbol assignment.
+ OutStreamer->EmitAssignment(FrameAllocSym,
+ MCConstantExpr::create(FrameOffset, OutContext));
+}
+
+/// EmitFunctionBody - This method emits the body and trailer for a
+/// function.
+void AsmPrinter::EmitFunctionBody() {
+ EmitFunctionHeader();
+
+ // Emit target-specific gunk before the function body.
+ EmitFunctionBodyStart();
+
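+  // Per-instruction debug handler callbacks are only needed when the module
+  // actually has debug info.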
+ bool ShouldPrintDebugScopes = MMI->hasDebugInfo();
+
+ // Print out code for the function.
+ bool HasAnyRealCode = false;
+ for (auto &MBB : *MF) {
+ // Print a label for the basic block.
+ EmitBasicBlockStart(MBB);
+ for (auto &MI : MBB) {
+
+ // Print the assembly for the instruction.
+ if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
+ !MI.isDebugValue()) {
+ HasAnyRealCode = true;
+ ++EmittedInsts;
+ }
+
+ if (ShouldPrintDebugScopes) {
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName,
+ TimePassesIsEnabled);
+ HI.Handler->beginInstruction(&MI);
+ }
+ }
+
+ if (isVerbose())
+ emitComments(MI, OutStreamer->GetCommentOS());
+
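+      // Pseudo-instructions with no machine encoding are expanded here;
+      // everything else is handed to the target's EmitInstruction.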
+ switch (MI.getOpcode()) {
+ case TargetOpcode::CFI_INSTRUCTION:
+ emitCFIInstruction(MI);
+ break;
+
+ case TargetOpcode::LOCAL_ESCAPE:
+ emitFrameAlloc(MI);
+ break;
+
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ OutStreamer->EmitLabel(MI.getOperand(0).getMCSymbol());
+ break;
+ case TargetOpcode::INLINEASM:
+ EmitInlineAsm(&MI);
+ break;
+ case TargetOpcode::DBG_VALUE:
+ if (isVerbose()) {
+ if (!emitDebugValueComment(&MI, *this))
+ EmitInstruction(&MI);
+ }
+ break;
+ case TargetOpcode::IMPLICIT_DEF:
+ if (isVerbose()) emitImplicitDef(&MI);
+ break;
+ case TargetOpcode::KILL:
+ if (isVerbose()) emitKill(&MI, *this);
+ break;
+ default:
+ EmitInstruction(&MI);
+ break;
+ }
+
+ if (ShouldPrintDebugScopes) {
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName,
+ TimePassesIsEnabled);
+ HI.Handler->endInstruction();
+ }
+ }
+ }
+
+ EmitBasicBlockEnd(MBB);
+ }
+
+ // If the function is empty and the object file uses .subsections_via_symbols,
+ // then we need to emit *something* to the function body to prevent the
+ // labels from collapsing together. Just emit a noop.
+  if (MAI->hasSubsectionsViaSymbols() && !HasAnyRealCode) {
+ MCInst Noop;
+ MF->getSubtarget().getInstrInfo()->getNoopForMachoTarget(Noop);
+ OutStreamer->AddComment("avoids zero-length function");
+
+    // Targets can opt out of emitting the noop here by leaving the opcode
+ // unspecified.
+ if (Noop.getOpcode())
+ OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+ }
+
+ const Function *F = MF->getFunction();
+ for (const auto &BB : *F) {
+ if (!BB.hasAddressTaken())
+ continue;
+ MCSymbol *Sym = GetBlockAddressSymbol(&BB);
+ if (Sym->isDefined())
+ continue;
+ OutStreamer->AddComment("Address of block that was removed by CodeGen");
+ OutStreamer->EmitLabel(Sym);
+ }
+
+ // Emit target-specific gunk after the function body.
+ EmitFunctionBodyEnd();
+
+ if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() ||
+ MMI->hasEHFunclets() || MAI->hasDotTypeDotSizeDirective()) {
+ // Create a symbol for the end of function.
+ CurrentFnEnd = createTempSymbol("func_end");
+ OutStreamer->EmitLabel(CurrentFnEnd);
+ }
+
+ // If the target wants a .size directive for the size of the function, emit
+ // it.
+ if (MAI->hasDotTypeDotSizeDirective()) {
+    // We can get the size as the difference between the function label and
+    // the temp label.
+ const MCExpr *SizeExp = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurrentFnEnd, OutContext),
+ MCSymbolRefExpr::create(CurrentFnSymForSize, OutContext), OutContext);
+ if (auto Sym = dyn_cast<MCSymbolELF>(CurrentFnSym))
+ OutStreamer->emitELFSize(Sym, SizeExp);
+ }
+
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled);
+ HI.Handler->markFunctionEnd();
+ }
+
+ // Print out jump tables referenced by the function.
+ EmitJumpTableInfo();
+
+ // Emit post-function debug and/or EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName, TimePassesIsEnabled);
+ HI.Handler->endFunction(MF);
+ }
+ MMI->EndFunction();
+
+ OutStreamer->AddBlankLine();
+}
+
+/// \brief Compute the number of global variables that use a Constant.
+static unsigned getNumGlobalVariableUses(const Constant *C) {
+ if (!C)
+ return 0;
+
+ if (isa<GlobalVariable>(C))
+ return 1;
+
+ unsigned NumUses = 0;
+ for (auto *CU : C->users())
+ NumUses += getNumGlobalVariableUses(dyn_cast<Constant>(CU));
+
+ return NumUses;
+}
+
+/// \brief Only consider global GOT equivalents if at least one user is a
+/// constant expression inside an initializer of another global variable.
+/// Constant expressions inside instructions are not handled. During global
+/// variable emission, candidates are skipped and are emitted later in case
+/// at least one constant expression isn't replaced by a PC-relative GOT
+/// entry access.
+static bool isGOTEquivalentCandidate(const GlobalVariable *GV,
+ unsigned &NumGOTEquivUsers) {
+ // Global GOT equivalents are unnamed private globals with a constant
+ // pointer initializer to another global symbol. They must point to a
+ // GlobalVariable or Function, i.e., as GlobalValue.
+ if (!GV->hasGlobalUnnamedAddr() || !GV->hasInitializer() ||
+ !GV->isConstant() || !GV->isDiscardableIfUnused() ||
+ !dyn_cast<GlobalValue>(GV->getOperand(0)))
+ return false;
+
+  // To be a GOT equivalent, at least one of its users needs to be a constant
+  // expression used by another global variable.
+ for (auto *U : GV->users())
+ NumGOTEquivUsers += getNumGlobalVariableUses(dyn_cast<Constant>(U));
+
+ return NumGOTEquivUsers > 0;
+}
+
+/// \brief An unnamed constant global variable that solely contains a pointer
+/// to another global variable is equivalent to a GOT table entry; it holds
+/// the address of another symbol. Optimize it and replace accesses to these
+/// "GOT equivalents" by using the GOT entry for the final global instead.
+/// Compute GOT equivalent candidates among all global variables so that, if
+/// possible, their emission can be avoided later on once each use is replaced
+/// by a GOT entry access.
+void AsmPrinter::computeGlobalGOTEquivs(Module &M) {
+ if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ return;
+
+ for (const auto &G : M.globals()) {
+ unsigned NumGOTEquivUsers = 0;
+ if (!isGOTEquivalentCandidate(&G, NumGOTEquivUsers))
+ continue;
+
+ const MCSymbol *GOTEquivSym = getSymbol(&G);
+ GlobalGOTEquivs[GOTEquivSym] = std::make_pair(&G, NumGOTEquivUsers);
+ }
+}
+
+/// \brief Constant expressions using GOT equivalent globals may not be
+/// eligible for PC-relative GOT entry conversion; in such cases we need to
+/// emit the globals we previously omitted in EmitGlobalVariable.
+void AsmPrinter::emitGlobalGOTEquivs() {
+ if (!getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ return;
+
+ SmallVector<const GlobalVariable *, 8> FailedCandidates;
+ for (auto &I : GlobalGOTEquivs) {
+ const GlobalVariable *GV = I.second.first;
+ unsigned Cnt = I.second.second;
+ if (Cnt)
+ FailedCandidates.push_back(GV);
+ }
+ GlobalGOTEquivs.clear();
+
+ for (auto *GV : FailedCandidates)
+ EmitGlobalVariable(GV);
+}
+
+void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
+ const GlobalIndirectSymbol& GIS) {
+ MCSymbol *Name = getSymbol(&GIS);
+
+ if (GIS.hasExternalLinkage() || !MAI->getWeakRefDirective())
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_Global);
+ else if (GIS.hasWeakLinkage() || GIS.hasLinkOnceLinkage())
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_WeakReference);
+ else
+ assert(GIS.hasLocalLinkage() && "Invalid alias or ifunc linkage");
+
+ // Set the symbol type to function if the alias has a function type.
+ // This affects codegen when the aliasee is not a function.
+ if (GIS.getType()->getPointerElementType()->isFunctionTy()) {
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeFunction);
+ if (isa<GlobalIFunc>(GIS))
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_ELF_TypeIndFunction);
+ }
+
+ EmitVisibility(Name, GIS.getVisibility());
+
+ const MCExpr *Expr = lowerConstant(GIS.getIndirectSymbol());
+
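+  // On Mach-O, an alias defined by a non-trivial expression (e.g. symbol plus
+  // offset) is marked as an alternative entry point.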
+ if (isa<GlobalAlias>(&GIS) && MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
+ OutStreamer->EmitSymbolAttribute(Name, MCSA_AltEntry);
+
+ // Emit the directives as assignments aka .set:
+ OutStreamer->EmitAssignment(Name, Expr);
+
+ if (auto *GA = dyn_cast<GlobalAlias>(&GIS)) {
+ // If the aliasee does not correspond to a symbol in the output, i.e. the
+ // alias is not of an object or the aliased object is private, then set the
+ // size of the alias symbol from the type of the alias. We don't do this in
+ // other situations as the alias and aliasee having differing types but same
+ // size may be intentional.
+ const GlobalObject *BaseObject = GA->getBaseObject();
+ if (MAI->hasDotTypeDotSizeDirective() && GA->getValueType()->isSized() &&
+ (!BaseObject || BaseObject->hasPrivateLinkage())) {
+ const DataLayout &DL = M.getDataLayout();
+ uint64_t Size = DL.getTypeAllocSize(GA->getValueType());
+ OutStreamer->emitELFSize(cast<MCSymbolELF>(Name),
+ MCConstantExpr::create(Size, OutContext));
+ }
+ }
+}
+
+bool AsmPrinter::doFinalization(Module &M) {
+ // Set the MachineFunction to nullptr so that we can catch attempted
+ // accesses to MF specific features at the module level and so that
+ // we can conditionalize accesses based on whether or not it is nullptr.
+ MF = nullptr;
+
+  // Gather all GOT equivalent globals in the module. We really need two
+  // passes over the globals: one to compute the candidates and another to
+  // avoid their emission in EmitGlobalVariable; otherwise we would not be
+  // able to handle cases where the GOT equivalent shows up before its use.
+ computeGlobalGOTEquivs(M);
+
+ // Emit global variables.
+ for (const auto &G : M.globals())
+ EmitGlobalVariable(&G);
+
+ // Emit remaining GOT equivalent globals.
+ emitGlobalGOTEquivs();
+
+ // Emit visibility info for declarations
+ for (const Function &F : M) {
+ if (!F.isDeclarationForLinker())
+ continue;
+ GlobalValue::VisibilityTypes V = F.getVisibility();
+ if (V == GlobalValue::DefaultVisibility)
+ continue;
+
+ MCSymbol *Name = getSymbol(&F);
+ EmitVisibility(Name, V, false);
+ }
+
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+ // Emit module flags.
+ SmallVector<Module::ModuleFlagEntry, 8> ModuleFlags;
+ M.getModuleFlagsMetadata(ModuleFlags);
+ if (!ModuleFlags.empty())
+ TLOF.emitModuleFlags(*OutStreamer, ModuleFlags, *Mang, TM);
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ MachineModuleInfoELF &MMIELF = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ // Output stubs for external and common global variables.
+ MachineModuleInfoELF::SymbolListTy Stubs = MMIELF.GetGVStubList();
+ if (!Stubs.empty()) {
+ OutStreamer->SwitchSection(TLOF.getDataSection());
+ const DataLayout &DL = M.getDataLayout();
+
+ for (const auto &Stub : Stubs) {
+ OutStreamer->EmitLabel(Stub.first);
+ OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
+ DL.getPointerSize());
+ }
+ }
+ }
+
+ // Finalize debug and EH information.
+ for (const HandlerInfo &HI : Handlers) {
+ NamedRegionTimer T(HI.TimerName, HI.TimerGroupName,
+ TimePassesIsEnabled);
+ HI.Handler->endModule();
+ delete HI.Handler;
+ }
+ Handlers.clear();
+ DD = nullptr;
+
+ // If the target wants to know about weak references, print them all.
+ if (MAI->getWeakRefDirective()) {
+ // FIXME: This is not lazy, it would be nice to only print weak references
+ // to stuff that is actually used. Note that doing so would require targets
+ // to notice uses in operands (due to constant exprs etc). This should
+ // happen with the MC stuff eventually.
+
+ // Print out module-level global objects here.
+ for (const auto &GO : M.global_objects()) {
+ if (!GO.hasExternalWeakLinkage())
+ continue;
+ OutStreamer->EmitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference);
+ }
+ }
+
+ OutStreamer->AddBlankLine();
+
+ // Print aliases in topological order, that is, for each alias a = b,
+ // b must be printed before a.
+  // This is because on some targets (e.g. PowerPC) the linker expects aliases
+  // in such an order to generate correct TOC information.
+ SmallVector<const GlobalAlias *, 16> AliasStack;
+ SmallPtrSet<const GlobalAlias *, 16> AliasVisited;
+ for (const auto &Alias : M.aliases()) {
+ for (const GlobalAlias *Cur = &Alias; Cur;
+ Cur = dyn_cast<GlobalAlias>(Cur->getAliasee())) {
+ if (!AliasVisited.insert(Cur).second)
+ break;
+ AliasStack.push_back(Cur);
+ }
+ for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+ emitGlobalIndirectSymbol(M, *AncestorAlias);
+ AliasStack.clear();
+ }
+ for (const auto &IFunc : M.ifuncs())
+ emitGlobalIndirectSymbol(M, IFunc);
+
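+  // Let each garbage collection strategy's metadata printer emit its
+  // module-level tables.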
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "AsmPrinter didn't require GCModuleInfo?");
+ for (GCModuleInfo::iterator I = MI->end(), E = MI->begin(); I != E; )
+ if (GCMetadataPrinter *MP = GetOrCreateGCPrinter(**--I))
+ MP->finishAssembly(M, *MI, *this);
+
+ // Emit llvm.ident metadata in an '.ident' directive.
+ EmitModuleIdents(M);
+
+ // Emit __morestack address if needed for indirect calls.
+ if (MMI->usesMorestackAddr()) {
+ unsigned Align = 1;
+ MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant(
+ getDataLayout(), SectionKind::getReadOnly(),
+ /*C=*/nullptr, Align);
+ OutStreamer->SwitchSection(ReadOnlySection);
+
+ MCSymbol *AddrSymbol =
+ OutContext.getOrCreateSymbol(StringRef("__morestack_addr"));
+ OutStreamer->EmitLabel(AddrSymbol);
+
+ unsigned PtrSize = M.getDataLayout().getPointerSize(0);
+ OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+ PtrSize);
+ }
+
+ // If we don't have any trampolines, then we don't require stack memory
+ // to be executable. Some targets have a directive to declare this.
+ Function *InitTrampolineIntrinsic = M.getFunction("llvm.init.trampoline");
+ if (!InitTrampolineIntrinsic || InitTrampolineIntrinsic->use_empty())
+ if (MCSection *S = MAI->getNonexecutableStackSection(OutContext))
+ OutStreamer->SwitchSection(S);
+
+ // Allow the target to emit any magic that it wants at the end of the file,
+ // after everything else has gone out.
+ EmitEndOfAsmFile(M);
+
+ delete Mang; Mang = nullptr;
+ MMI = nullptr;
+
+ OutStreamer->Finish();
+ OutStreamer->reset();
+
+ return false;
+}
+
+MCSymbol *AsmPrinter::getCurExceptionSym() {
+ if (!CurExceptionSym)
+ CurExceptionSym = createTempSymbol("exception");
+ return CurExceptionSym;
+}
+
+void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
+ this->MF = &MF;
+ // Get the function symbol.
+ CurrentFnSym = getSymbol(MF.getFunction());
+ CurrentFnSymForSize = CurrentFnSym;
+ CurrentFnBegin = nullptr;
+ CurExceptionSym = nullptr;
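+  // A function-begin label is only needed when EH tables or debug info will
+  // reference it, or when the target wants a local symbol for computing the
+  // function's size.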
+ bool NeedsLocalForSize = MAI->needsLocalForSize();
+ if (!MMI->getLandingPads().empty() || MMI->hasDebugInfo() ||
+ MMI->hasEHFunclets() || NeedsLocalForSize) {
+ CurrentFnBegin = createTempSymbol("func_begin");
+ if (NeedsLocalForSize)
+ CurrentFnSymForSize = CurrentFnBegin;
+ }
+
+ if (isVerbose())
+ LI = &getAnalysis<MachineLoopInfo>();
+}
+
+namespace {
+// Keep track of the alignment and constant pool entries per section.
+struct SectionCPs {
+  MCSection *S;
+  unsigned Alignment;
+  SmallVector<unsigned, 4> CPEs;
+  SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
+};
+}
+
+/// EmitConstantPool - Print to the current output stream assembly
+/// representations of the constants in the constant pool MCP. This is
+/// used to print out constants which have been "spilled to memory" by
+/// the code generator.
+///
+void AsmPrinter::EmitConstantPool() {
+ const MachineConstantPool *MCP = MF->getConstantPool();
+ const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
+ if (CP.empty()) return;
+
+  // Calculate sections for constant pool entries. We collect entries that go
+  // into the same section together to reduce the amount of section switching.
+ SmallVector<SectionCPs, 4> CPSections;
+ for (unsigned i = 0, e = CP.size(); i != e; ++i) {
+ const MachineConstantPoolEntry &CPE = CP[i];
+ unsigned Align = CPE.getAlignment();
+
+ SectionKind Kind = CPE.getSectionKind(&getDataLayout());
+
+ const Constant *C = nullptr;
+ if (!CPE.isMachineConstantPoolEntry())
+ C = CPE.Val.ConstVal;
+
+ MCSection *S = getObjFileLowering().getSectionForConstant(getDataLayout(),
+ Kind, C, Align);
+
+    // The number of sections is small, so just do a linear search from the
+    // last section to the first.
+ bool Found = false;
+ unsigned SecIdx = CPSections.size();
+ while (SecIdx != 0) {
+ if (CPSections[--SecIdx].S == S) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ SecIdx = CPSections.size();
+ CPSections.push_back(SectionCPs(S, Align));
+ }
+
+ if (Align > CPSections[SecIdx].Alignment)
+ CPSections[SecIdx].Alignment = Align;
+ CPSections[SecIdx].CPEs.push_back(i);
+ }
+
+ // Now print stuff into the calculated sections.
+ const MCSection *CurSection = nullptr;
+ unsigned Offset = 0;
+ for (unsigned i = 0, e = CPSections.size(); i != e; ++i) {
+ for (unsigned j = 0, ee = CPSections[i].CPEs.size(); j != ee; ++j) {
+ unsigned CPI = CPSections[i].CPEs[j];
+ MCSymbol *Sym = GetCPISymbol(CPI);
+ if (!Sym->isUndefined())
+ continue;
+
+ if (CurSection != CPSections[i].S) {
+ OutStreamer->SwitchSection(CPSections[i].S);
+ EmitAlignment(Log2_32(CPSections[i].Alignment));
+ CurSection = CPSections[i].S;
+ Offset = 0;
+ }
+
+ MachineConstantPoolEntry CPE = CP[CPI];
+
+ // Emit inter-object padding for alignment.
+ unsigned AlignMask = CPE.getAlignment() - 1;
+ unsigned NewOffset = (Offset + AlignMask) & ~AlignMask;
+ OutStreamer->EmitZeros(NewOffset - Offset);
+
+ Type *Ty = CPE.getType();
+ Offset = NewOffset + getDataLayout().getTypeAllocSize(Ty);
+
+ OutStreamer->EmitLabel(Sym);
+ if (CPE.isMachineConstantPoolEntry())
+ EmitMachineConstantPoolValue(CPE.Val.MachineCPVal);
+ else
+ EmitGlobalConstant(getDataLayout(), CPE.Val.ConstVal);
+ }
+ }
+}
+
+/// EmitJumpTableInfo - Print assembly representations of the jump tables used
+/// by the current function to the current output stream.
+///
+void AsmPrinter::EmitJumpTableInfo() {
+ const DataLayout &DL = MF->getDataLayout();
+ const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
+ if (!MJTI) return;
+ if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_Inline) return;
+ const std::vector<MachineJumpTableEntry> &JT = MJTI->getJumpTables();
+ if (JT.empty()) return;
+
+ // Pick the directive to use to print the jump table entries, and switch to
+ // the appropriate section.
+ const Function *F = MF->getFunction();
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+ bool JTInDiffSection = !TLOF.shouldPutJumpTableInFunctionSection(
+ MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32,
+ *F);
+ if (JTInDiffSection) {
+ // Drop it in the readonly section.
+ MCSection *ReadOnlySection = TLOF.getSectionForJumpTable(*F, *Mang, TM);
+ OutStreamer->SwitchSection(ReadOnlySection);
+ }
+
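+  // Align the jump table to the natural alignment of its entries.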
+ EmitAlignment(Log2_32(MJTI->getEntryAlignment(DL)));
+
+ // Jump tables in code sections are marked with a data_region directive
+ // where that's supported.
+ if (!JTInDiffSection)
+ OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+
+ for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
+ const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
+
+ // If this jump table was deleted, ignore it.
+ if (JTBBs.empty()) continue;
+
+    // For the EK_LabelDifference32 entry, if using .set avoids a relocation,
+    // emit a .set directive for each unique entry.
+ if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 &&
+ MAI->doesSetDirectiveSuppressReloc()) {
+ SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets;
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext);
+ for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii) {
+ const MachineBasicBlock *MBB = JTBBs[ii];
+ if (!EmittedSets.insert(MBB).second)
+ continue;
+
+ // .set LJTSet, LBB32-base
+ const MCExpr *LHS =
+ MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ OutStreamer->EmitAssignment(GetJTSetSymbol(JTI, MBB->getNumber()),
+ MCBinaryExpr::createSub(LHS, Base,
+ OutContext));
+ }
+ }
+
+ // On some targets (e.g. Darwin) we want to emit two consecutive labels
+ // before each jump table. The first label is never referenced, but tells
+ // the assembler and linker the extents of the jump table object. The
+ // second label is actually referenced by the code.
+ if (JTInDiffSection && DL.hasLinkerPrivateGlobalPrefix())
+ // FIXME: This doesn't have to have any specific name, just any randomly
+ // named and numbered 'l' label would work. Simplify GetJTISymbol.
+ OutStreamer->EmitLabel(GetJTISymbol(JTI, true));
+
+ OutStreamer->EmitLabel(GetJTISymbol(JTI));
+
+ for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii)
+ EmitJumpTableEntry(MJTI, JTBBs[ii], JTI);
+ }
+ if (!JTInDiffSection)
+ OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+}
+
+/// EmitJumpTableEntry - Emit a jump table entry for the specified MBB to the
+/// current stream.
+void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+ const MachineBasicBlock *MBB,
+ unsigned UID) const {
+ assert(MBB && MBB->getNumber() >= 0 && "Invalid basic block");
+ const MCExpr *Value = nullptr;
+ switch (MJTI->getEntryKind()) {
+ case MachineJumpTableInfo::EK_Inline:
+ llvm_unreachable("Cannot emit EK_Inline jump table entry");
+ case MachineJumpTableInfo::EK_Custom32:
+ Value = MF->getSubtarget().getTargetLowering()->LowerCustomJumpTableEntry(
+ MJTI, MBB, UID, OutContext);
+ break;
+ case MachineJumpTableInfo::EK_BlockAddress:
+    // EK_BlockAddress - Each entry is the plain address of a block, e.g.:
+ // .word LBB123
+ Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ break;
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress: {
+    // EK_GPRel32BlockAddress - Each entry is the address of a block, encoded
+ // with a relocation as gp-relative, e.g.:
+ // .gprel32 LBB123
+ MCSymbol *MBBSym = MBB->getSymbol();
+ OutStreamer->EmitGPRel32Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ return;
+ }
+
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress: {
+    // EK_GPRel64BlockAddress - Each entry is the address of a block, encoded
+ // with a relocation as gp-relative, e.g.:
+ // .gpdword LBB123
+ MCSymbol *MBBSym = MBB->getSymbol();
+ OutStreamer->EmitGPRel64Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ return;
+ }
+
+ case MachineJumpTableInfo::EK_LabelDifference32: {
+ // Each entry is the address of the block minus the address of the jump
+ // table. This is used for PIC jump tables where gprel32 is not supported.
+ // e.g.:
+ // .word LBB123 - LJTI1_2
+ // If the .set directive avoids relocations, this is emitted as:
+ // .set L4_5_set_123, LBB123 - LJTI1_2
+ // .word L4_5_set_123
+ if (MAI->doesSetDirectiveSuppressReloc()) {
+ Value = MCSymbolRefExpr::create(GetJTSetSymbol(UID, MBB->getNumber()),
+ OutContext);
+ break;
+ }
+ Value = MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF, UID, OutContext);
+ Value = MCBinaryExpr::createSub(Value, Base, OutContext);
+ break;
+ }
+ }
+
+ assert(Value && "Unknown entry kind!");
+
+ unsigned EntrySize = MJTI->getEntrySize(getDataLayout());
+ OutStreamer->EmitValue(Value, EntrySize);
+}
+
+
+/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
+/// special global used by LLVM. If so, emit it and return true; otherwise,
+/// do nothing and return false.
+bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
+ if (GV->getName() == "llvm.used") {
+ if (MAI->hasNoDeadStrip()) // No need to emit this at all.
+ EmitLLVMUsedList(cast<ConstantArray>(GV->getInitializer()));
+ return true;
+ }
+
+ // Ignore debug and non-emitted data. This handles llvm.compiler.used.
+ if (GV->getSection() == "llvm.metadata" ||
+ GV->hasAvailableExternallyLinkage())
+ return true;
+
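+  // Everything below handles the appending-linkage structor lists
+  // (llvm.global_ctors and llvm.global_dtors).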
+ if (!GV->hasAppendingLinkage()) return false;
+
+ assert(GV->hasInitializer() && "Not a special LLVM global!");
+
+ if (GV->getName() == "llvm.global_ctors") {
+ EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ /* isCtor */ true);
+
+ if (TM.getRelocationModel() == Reloc::Static &&
+ MAI->hasStaticCtorDtorReferenceInStaticMode()) {
+ StringRef Sym(".constructors_used");
+ OutStreamer->EmitSymbolAttribute(OutContext.getOrCreateSymbol(Sym),
+ MCSA_Reference);
+ }
+ return true;
+ }
+
+ if (GV->getName() == "llvm.global_dtors") {
+ EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ /* isCtor */ false);
+
+ if (TM.getRelocationModel() == Reloc::Static &&
+ MAI->hasStaticCtorDtorReferenceInStaticMode()) {
+ StringRef Sym(".destructors_used");
+ OutStreamer->EmitSymbolAttribute(OutContext.getOrCreateSymbol(Sym),
+ MCSA_Reference);
+ }
+ return true;
+ }
+
+ report_fatal_error("unknown special variable");
+}
+
+/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
+/// global in the specified llvm.used list for which emitUsedDirectiveFor
+/// is true, as being used with this directive.
+void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
+ // Should be an array of 'i8*'.
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
+ const GlobalValue *GV =
+ dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
+ if (GV)
+ OutStreamer->EmitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
+ }
+}
+
+namespace {
+struct Structor {
+ Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {}
+ int Priority;
+ llvm::Constant *Func;
+ llvm::GlobalValue *ComdatKey;
+};
+} // end namespace
+
+/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
+/// priority.
+void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
+ bool isCtor) {
+ // Should be an array of '{ int, void ()* }' structs. The first value is the
+ // init priority.
+ if (!isa<ConstantArray>(List)) return;
+
+ // Sanity check the structors list.
+ const ConstantArray *InitList = dyn_cast<ConstantArray>(List);
+ if (!InitList) return; // Not an array!
+ StructType *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
+ // FIXME: Only allow the 3-field form in LLVM 4.0.
+ if (!ETy || ETy->getNumElements() < 2 || ETy->getNumElements() > 3)
+ return; // Not an array of two or three elements!
+ if (!isa<IntegerType>(ETy->getTypeAtIndex(0U)) ||
+ !isa<PointerType>(ETy->getTypeAtIndex(1U))) return; // Not (int, ptr).
+ if (ETy->getNumElements() == 3 && !isa<PointerType>(ETy->getTypeAtIndex(2U)))
+ return; // Not (int, ptr, ptr).
+
+ // Gather the structors in a form that's convenient for sorting by priority.
+ SmallVector<Structor, 8> Structors;
+ for (Value *O : InitList->operands()) {
+ ConstantStruct *CS = dyn_cast<ConstantStruct>(O);
+ if (!CS) continue; // Malformed.
+ if (CS->getOperand(1)->isNullValue())
+ break; // Found a null terminator, skip the rest.
+ ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
+ if (!Priority) continue; // Malformed.
+ Structors.push_back(Structor());
+ Structor &S = Structors.back();
+ S.Priority = Priority->getLimitedValue(65535);
+ S.Func = CS->getOperand(1);
+ if (ETy->getNumElements() == 3 && !CS->getOperand(2)->isNullValue())
+ S.ComdatKey =
+ dyn_cast<GlobalValue>(CS->getOperand(2)->stripPointerCasts());
+ }
+
+ // Emit the function pointers in the target-specific order
+ unsigned Align = Log2_32(DL.getPointerPrefAlignment());
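+  // Sort by priority; a stable sort preserves the original IR order among
+  // structors with equal priority.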
+ std::stable_sort(Structors.begin(), Structors.end(),
+ [](const Structor &L,
+ const Structor &R) { return L.Priority < R.Priority; });
+ for (Structor &S : Structors) {
+ const TargetLoweringObjectFile &Obj = getObjFileLowering();
+ const MCSymbol *KeySym = nullptr;
+ if (GlobalValue *GV = S.ComdatKey) {
+ if (GV->hasAvailableExternallyLinkage())
+ // If the associated variable is available_externally, some other TU
+ // will provide its dynamic initializer.
+ continue;
+
+ KeySym = getSymbol(GV);
+ }
+ MCSection *OutputSection =
+ (isCtor ? Obj.getStaticCtorSection(S.Priority, KeySym)
+ : Obj.getStaticDtorSection(S.Priority, KeySym));
+ OutStreamer->SwitchSection(OutputSection);
+ if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection())
+ EmitAlignment(Align);
+ EmitXXStructor(DL, S.Func);
+ }
+}
+
+void AsmPrinter::EmitModuleIdents(Module &M) {
+ if (!MAI->hasIdentDirective())
+ return;
+
+ if (const NamedMDNode *NMD = M.getNamedMetadata("llvm.ident")) {
+ for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
+ const MDNode *N = NMD->getOperand(i);
+ assert(N->getNumOperands() == 1 &&
+ "llvm.ident metadata entry can have only one operand");
+ const MDString *S = cast<MDString>(N->getOperand(0));
+ OutStreamer->EmitIdent(S->getString());
+ }
+ }
+}
+
+//===--------------------------------------------------------------------===//
+// Emission and print routines
+//
+
+/// EmitInt8 - Emit a byte directive and value.
+///
+void AsmPrinter::EmitInt8(int Value) const {
+ OutStreamer->EmitIntValue(Value, 1);
+}
+
+/// EmitInt16 - Emit a short directive and value.
+///
+void AsmPrinter::EmitInt16(int Value) const {
+ OutStreamer->EmitIntValue(Value, 2);
+}
+
+/// EmitInt32 - Emit a long directive and value.
+///
+void AsmPrinter::EmitInt32(int Value) const {
+ OutStreamer->EmitIntValue(Value, 4);
+}
+
+/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
+/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
+/// .set if it avoids relocations.
+void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
+ unsigned Size) const {
+ OutStreamer->emitAbsoluteSymbolDiff(Hi, Lo, Size);
+}
+
+/// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
+/// where the size in bytes of the directive is specified by Size and Label
+/// specifies the label. This implicitly uses .set if it is available.
+void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
+ unsigned Size,
+ bool IsSectionRelative) const {
+ if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
+ OutStreamer->EmitCOFFSecRel32(Label);
+ return;
+ }
+
+ // Emit Label+Offset (or just Label if Offset is zero)
+ const MCExpr *Expr = MCSymbolRefExpr::create(Label, OutContext);
+ if (Offset)
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
+
+ OutStreamer->EmitValue(Expr, Size);
+}
+
+//===----------------------------------------------------------------------===//
+
+// EmitAlignment - Emit an alignment directive to the specified power of
+// two boundary. For example, if you pass in 3 here, you will get an 8
+// byte alignment. If a global value is specified, and if that global has
+// an explicit alignment requested, it will override the alignment request
+// if required for correctness.
+//
+void AsmPrinter::EmitAlignment(unsigned NumBits, const GlobalObject *GV) const {
+ if (GV)
+ NumBits = getGVAlignmentLog2(GV, GV->getParent()->getDataLayout(), NumBits);
+
+ if (NumBits == 0) return; // 1-byte aligned: no need to emit alignment.
+
+ assert(NumBits <
+ static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+ "undefined behavior");
+ if (getCurrentSection()->getKind().isText())
+ OutStreamer->EmitCodeAlignment(1u << NumBits);
+ else
+ OutStreamer->EmitValueToAlignment(1u << NumBits);
+}
+
+//===----------------------------------------------------------------------===//
+// Constant emission.
+//===----------------------------------------------------------------------===//
+
+const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
+ MCContext &Ctx = OutContext;
+
+ if (CV->isNullValue() || isa<UndefValue>(CV))
+ return MCConstantExpr::create(0, Ctx);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV))
+ return MCConstantExpr::create(CI->getZExtValue(), Ctx);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV))
+ return MCSymbolRefExpr::create(getSymbol(GV), Ctx);
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(CV))
+ return MCSymbolRefExpr::create(GetBlockAddressSymbol(BA), Ctx);
+
+ const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV);
+ if (!CE) {
+ llvm_unreachable("Unknown constant value to lower!");
+ }
+
+ switch (CE->getOpcode()) {
+ default:
+ // If the code isn't optimized, there may be outstanding folding
+ // opportunities. Attempt to fold the expression using DataLayout as a
+ // last resort before giving up.
+ if (Constant *C = ConstantFoldConstantExpression(CE, getDataLayout()))
+ if (C != CE)
+ return lowerConstant(C);
+
+ // Otherwise report the problem to the user.
+ {
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/false,
+ !MF ? nullptr : MF->getFunction()->getParent());
+ report_fatal_error(OS.str());
+ }
+ case Instruction::GetElementPtr: {
+ // Generate a symbolic expression for the byte address
+ APInt OffsetAI(getDataLayout().getPointerTypeSizeInBits(CE->getType()), 0);
+ cast<GEPOperator>(CE)->accumulateConstantOffset(getDataLayout(), OffsetAI);
+
+ const MCExpr *Base = lowerConstant(CE->getOperand(0));
+ if (!OffsetAI)
+ return Base;
+
+ int64_t Offset = OffsetAI.getSExtValue();
+ return MCBinaryExpr::createAdd(Base, MCConstantExpr::create(Offset, Ctx),
+ Ctx);
+ }
+
+ case Instruction::Trunc:
+ // We emit the value and depend on the assembler to truncate the generated
+ // expression properly. This is important for differences between
+ // blockaddress labels. Since the two labels are in the same function, it
+ // is reasonable to treat their delta as a 32-bit value.
+ // FALL THROUGH.
+ case Instruction::BitCast:
+ return lowerConstant(CE->getOperand(0));
+
+ case Instruction::IntToPtr: {
+ const DataLayout &DL = getDataLayout();
+
+ // Handle casts to pointers by changing them into casts to the appropriate
+ // integer type. This promotes constant folding and simplifies this code.
+ Constant *Op = CE->getOperand(0);
+ Op = ConstantExpr::getIntegerCast(Op, DL.getIntPtrType(CV->getType()),
+ false/*ZExt*/);
+ return lowerConstant(Op);
+ }
+
+ case Instruction::PtrToInt: {
+ const DataLayout &DL = getDataLayout();
+
+ // Support only foldable casts to/from pointers that can be eliminated by
+ // changing the pointer to the appropriately sized integer type.
+ Constant *Op = CE->getOperand(0);
+ Type *Ty = CE->getType();
+
+ const MCExpr *OpExpr = lowerConstant(Op);
+
+ // We can emit the pointer value into this slot if the slot is an
+ // integer slot equal to the size of the pointer.
+ if (DL.getTypeAllocSize(Ty) == DL.getTypeAllocSize(Op->getType()))
+ return OpExpr;
+
+ // Otherwise the pointer is smaller than the resultant integer, mask off
+ // the high bits so we are sure to get a proper truncation if the input is
+ // a constant expr.
+ unsigned InBits = DL.getTypeAllocSizeInBits(Op->getType());
+ const MCExpr *MaskExpr = MCConstantExpr::create(~0ULL >> (64-InBits), Ctx);
+ return MCBinaryExpr::createAnd(OpExpr, MaskExpr, Ctx);
+ }
+
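+  // A difference of two global addresses can sometimes be lowered to a
+  // target-specific relative reference; otherwise fall back to a plain
+  // subtraction of the two symbols (plus any constant addend).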
+ case Instruction::Sub: {
+ GlobalValue *LHSGV;
+ APInt LHSOffset;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(0), LHSGV, LHSOffset,
+ getDataLayout())) {
+ GlobalValue *RHSGV;
+ APInt RHSOffset;
+ if (IsConstantOffsetFromGlobal(CE->getOperand(1), RHSGV, RHSOffset,
+ getDataLayout())) {
+ const MCExpr *RelocExpr = getObjFileLowering().lowerRelativeReference(
+ LHSGV, RHSGV, *Mang, TM);
+ if (!RelocExpr)
+ RelocExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(getSymbol(LHSGV), Ctx),
+ MCSymbolRefExpr::create(getSymbol(RHSGV), Ctx), Ctx);
+ int64_t Addend = (LHSOffset - RHSOffset).getSExtValue();
+ if (Addend != 0)
+ RelocExpr = MCBinaryExpr::createAdd(
+ RelocExpr, MCConstantExpr::create(Addend, Ctx), Ctx);
+ return RelocExpr;
+ }
+ }
+ }
+ // else fallthrough
+
+ // The MC library also has a right-shift operator, but it isn't consistently
+ // signed or unsigned between different targets.
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::SDiv:
+ case Instruction::SRem:
+ case Instruction::Shl:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ const MCExpr *LHS = lowerConstant(CE->getOperand(0));
+ const MCExpr *RHS = lowerConstant(CE->getOperand(1));
+ switch (CE->getOpcode()) {
+ default: llvm_unreachable("Unknown binary operator constant cast expr");
+ case Instruction::Add: return MCBinaryExpr::createAdd(LHS, RHS, Ctx);
+ case Instruction::Sub: return MCBinaryExpr::createSub(LHS, RHS, Ctx);
+ case Instruction::Mul: return MCBinaryExpr::createMul(LHS, RHS, Ctx);
+ case Instruction::SDiv: return MCBinaryExpr::createDiv(LHS, RHS, Ctx);
+ case Instruction::SRem: return MCBinaryExpr::createMod(LHS, RHS, Ctx);
+ case Instruction::Shl: return MCBinaryExpr::createShl(LHS, RHS, Ctx);
+ case Instruction::And: return MCBinaryExpr::createAnd(LHS, RHS, Ctx);
+ case Instruction::Or: return MCBinaryExpr::createOr (LHS, RHS, Ctx);
+ case Instruction::Xor: return MCBinaryExpr::createXor(LHS, RHS, Ctx);
+ }
+ }
+ }
+}
+
+static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *C,
+ AsmPrinter &AP,
+ const Constant *BaseCV = nullptr,
+ uint64_t Offset = 0);
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP);
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value. If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const ConstantDataSequential *V) {
+ StringRef Data = V->getRawDataValues();
+ assert(!Data.empty() && "Empty aggregates should be CAZ node");
+ char C = Data[0];
+ for (unsigned i = 1, e = Data.size(); i != e; ++i)
+ if (Data[i] != C) return -1;
+ return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
+}
+
+
+/// isRepeatedByteSequence - Determine whether the given value is
+/// composed of a repeated sequence of identical bytes and return the
+/// byte value. If it is not a repeated sequence, return -1.
+static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ uint64_t Size = DL.getTypeAllocSizeInBits(V->getType());
+ assert(Size % 8 == 0);
+
+ // Extend the element to take zero padding into account.
+ APInt Value = CI->getValue().zextOrSelf(Size);
+ if (!Value.isSplat(8))
+ return -1;
+
+ return Value.zextOrTrunc(8).getZExtValue();
+ }
+ if (const ConstantArray *CA = dyn_cast<ConstantArray>(V)) {
+ // Make sure all array elements are sequences of the same repeated
+ // byte.
+ assert(CA->getNumOperands() != 0 && "Should be a CAZ");
+ Constant *Op0 = CA->getOperand(0);
+ int Byte = isRepeatedByteSequence(Op0, DL);
+ if (Byte == -1)
+ return -1;
+
+ // All array elements must be equal.
+ for (unsigned i = 1, e = CA->getNumOperands(); i != e; ++i)
+ if (CA->getOperand(i) != Op0)
+ return -1;
+ return Byte;
+ }
+
+ if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V))
+ return isRepeatedByteSequence(CDS);
+
+ return -1;
+}
+
+static void emitGlobalConstantDataSequential(const DataLayout &DL,
+ const ConstantDataSequential *CDS,
+ AsmPrinter &AP) {
+
+ // See if we can aggregate this into a .fill, if so, emit it as such.
+ int Value = isRepeatedByteSequence(CDS, DL);
+ if (Value != -1) {
+ uint64_t Bytes = DL.getTypeAllocSize(CDS->getType());
+ // Don't emit a 1-byte object as a .fill.
+ if (Bytes > 1)
+ return AP.OutStreamer->emitFill(Bytes, Value);
+ }
+
+ // If this can be emitted with .ascii/.asciz, emit it as such.
+ if (CDS->isString())
+ return AP.OutStreamer->EmitBytes(CDS->getAsString());
+
+ // Otherwise, emit the values in successive locations.
+ unsigned ElementByteSize = CDS->getElementByteSize();
+ if (isa<IntegerType>(CDS->getElementType())) {
+ for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+ if (AP.isVerbose())
+ AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n",
+ CDS->getElementAsInteger(i));
+ AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i),
+ ElementByteSize);
+ }
+ } else {
+ for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I)
+ emitGlobalConstantFP(cast<ConstantFP>(CDS->getElementAsConstant(I)), AP);
+ }
+
+ unsigned Size = DL.getTypeAllocSize(CDS->getType());
+ unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
+ CDS->getNumElements();
+ if (unsigned Padding = Size - EmittedSize)
+ AP.OutStreamer->EmitZeros(Padding);
+
+}
+
+static void emitGlobalConstantArray(const DataLayout &DL,
+ const ConstantArray *CA, AsmPrinter &AP,
+ const Constant *BaseCV, uint64_t Offset) {
+ // See if we can aggregate some values. Make sure it can be
+ // represented as a series of bytes of the constant value.
+ int Value = isRepeatedByteSequence(CA, DL);
+
+ if (Value != -1) {
+ uint64_t Bytes = DL.getTypeAllocSize(CA->getType());
+ AP.OutStreamer->emitFill(Bytes, Value);
+  } else {
+ for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) {
+ emitGlobalConstantImpl(DL, CA->getOperand(i), AP, BaseCV, Offset);
+ Offset += DL.getTypeAllocSize(CA->getOperand(i)->getType());
+ }
+ }
+}
+
+static void emitGlobalConstantVector(const DataLayout &DL,
+ const ConstantVector *CV, AsmPrinter &AP) {
+ for (unsigned i = 0, e = CV->getType()->getNumElements(); i != e; ++i)
+ emitGlobalConstantImpl(DL, CV->getOperand(i), AP);
+
+ unsigned Size = DL.getTypeAllocSize(CV->getType());
+ unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
+ CV->getType()->getNumElements();
+ if (unsigned Padding = Size - EmittedSize)
+ AP.OutStreamer->EmitZeros(Padding);
+}
+
+static void emitGlobalConstantStruct(const DataLayout &DL,
+ const ConstantStruct *CS, AsmPrinter &AP,
+ const Constant *BaseCV, uint64_t Offset) {
+ // Print the fields in successive locations. Pad to align if needed!
+ unsigned Size = DL.getTypeAllocSize(CS->getType());
+ const StructLayout *Layout = DL.getStructLayout(CS->getType());
+ uint64_t SizeSoFar = 0;
+ for (unsigned i = 0, e = CS->getNumOperands(); i != e; ++i) {
+ const Constant *Field = CS->getOperand(i);
+
+ // Print the actual field value.
+ emitGlobalConstantImpl(DL, Field, AP, BaseCV, Offset + SizeSoFar);
+
+ // Check if padding is needed and insert one or more 0s.
+ uint64_t FieldSize = DL.getTypeAllocSize(Field->getType());
+ uint64_t PadSize = ((i == e-1 ? Size : Layout->getElementOffset(i+1))
+ - Layout->getElementOffset(i)) - FieldSize;
+ SizeSoFar += FieldSize + PadSize;
+
+ // Insert padding - this may include padding to increase the size of the
+ // current field up to the ABI size (if the struct is not packed) as well
+ // as padding to ensure that the next field starts at the right offset.
+ AP.OutStreamer->EmitZeros(PadSize);
+ }
+ assert(SizeSoFar == Layout->getSizeInBytes() &&
+ "Layout of constant struct may be incorrect!");
+}
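+// Example: with a typical data layout, a { i8, i32 } struct is emitted as one
+// byte for the i8 field, three zero bytes of padding, then four bytes for the
+// i32, so SizeSoFar ends up matching the 8-byte struct layout size.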
+
+static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
+ APInt API = CFP->getValueAPF().bitcastToAPInt();
+
+ // First print a comment with what we think the original floating-point value
+ // should have been.
+ if (AP.isVerbose()) {
+ SmallString<8> StrVal;
+ CFP->getValueAPF().toString(StrVal);
+
+ if (CFP->getType())
+ CFP->getType()->print(AP.OutStreamer->GetCommentOS());
+ else
+ AP.OutStreamer->GetCommentOS() << "Printing <null> Type";
+ AP.OutStreamer->GetCommentOS() << ' ' << StrVal << '\n';
+ }
+
+ // Now iterate through the APInt chunks, emitting them in endian-correct
+ // order, possibly with a smaller chunk at beginning/end (e.g. for x87 80-bit
+ // floats).
+ unsigned NumBytes = API.getBitWidth() / 8;
+ unsigned TrailingBytes = NumBytes % sizeof(uint64_t);
+ const uint64_t *p = API.getRawData();
+
+ // PPC's long double has odd notions of endianness compared to how LLVM
+ // handles it: p[0] goes first for *big* endian on PPC.
+ if (AP.getDataLayout().isBigEndian() && !CFP->getType()->isPPC_FP128Ty()) {
+ int Chunk = API.getNumWords() - 1;
+
+ if (TrailingBytes)
+ AP.OutStreamer->EmitIntValue(p[Chunk--], TrailingBytes);
+
+ for (; Chunk >= 0; --Chunk)
+ AP.OutStreamer->EmitIntValue(p[Chunk], sizeof(uint64_t));
+ } else {
+ unsigned Chunk;
+ for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk)
+ AP.OutStreamer->EmitIntValue(p[Chunk], sizeof(uint64_t));
+
+ if (TrailingBytes)
+ AP.OutStreamer->EmitIntValue(p[Chunk], TrailingBytes);
+ }
+
+ // Emit the tail padding for the long double.
+ const DataLayout &DL = AP.getDataLayout();
+ AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(CFP->getType()) -
+ DL.getTypeStoreSize(CFP->getType()));
+}
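+// Example: an x86 80-bit long double bitcasts to an 80-bit APInt, so NumBytes
+// is 10 and TrailingBytes is 2; a little-endian target emits one 8-byte chunk
+// followed by a 2-byte chunk, then zero-fills the tail up to the alloc size
+// (typically 12 or 16 bytes, depending on the ABI alignment).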
+
+static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
+ const DataLayout &DL = AP.getDataLayout();
+ unsigned BitWidth = CI->getBitWidth();
+
+ // Copy the value as we may massage the layout for constants whose bit width
+ // is not a multiple of 64-bits.
+ APInt Realigned(CI->getValue());
+ uint64_t ExtraBits = 0;
+ unsigned ExtraBitsSize = BitWidth & 63;
+
+ if (ExtraBitsSize) {
+ // The bit width of the data is not a multiple of 64-bits.
+ // The extra bits are expected to be at the end of the chunk of the memory.
+ // Little endian:
+ // * Nothing to be done, just record the extra bits to emit.
+ // Big endian:
+ // * Record the extra bits to emit.
+ // * Realign the raw data to emit the chunks of 64-bits.
+ if (DL.isBigEndian()) {
+ // Basically the structure of the raw data is a chunk of 64-bits cells:
+ // 0 1 BitWidth / 64
+ // [chunk1][chunk2] ... [chunkN].
+ // The most significant chunk is chunkN and it should be emitted first.
+ // However, due to the alignment issue chunkN contains useless bits.
+ // Realign the chunks so that they contain only useful information:
+ // ExtraBits 0 1 (BitWidth / 64) - 1
+ // chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
+ ExtraBits = Realigned.getRawData()[0] &
+ (((uint64_t)-1) >> (64 - ExtraBitsSize));
+ Realigned = Realigned.lshr(ExtraBitsSize);
+ } else
+ ExtraBits = Realigned.getRawData()[BitWidth / 64];
+ }
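+ // Example: a 68-bit constant has one full 64-bit chunk plus 4 extra bits.
+ // On a little-endian target the extra bits already sit in the top word; on
+ // a big-endian target the low 4 bits are saved into ExtraBits and the value
+ // is shifted right by 4 so the emitted 64-bit chunk holds the high 64 bits.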
+
+ // We don't expect assemblers to support integer data directives
+ // for more than 64 bits, so we emit the data in at most 64-bit
+ // quantities at a time.
+ const uint64_t *RawData = Realigned.getRawData();
+ for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
+ uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i];
+ AP.OutStreamer->EmitIntValue(Val, 8);
+ }
+
+ if (ExtraBitsSize) {
+ // Emit the extra bits after the 64-bit chunks.
+
+ // Emit a directive that fills the expected size.
+ uint64_t Size = AP.getDataLayout().getTypeAllocSize(CI->getType());
+ Size -= (BitWidth / 64) * 8;
+ assert(Size && Size * 8 >= ExtraBitsSize &&
+ (ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
+ == ExtraBits && "Directive too small for extra bits.");
+ AP.OutStreamer->EmitIntValue(ExtraBits, Size);
+ }
+}
+
+/// \brief Transform a non-absolute MCExpr containing a reference to a GOT
+/// equivalent global into a target-specific GOT pc-relative access to the
+/// final symbol.
+static void handleIndirectSymViaGOTPCRel(AsmPrinter &AP, const MCExpr **ME,
+ const Constant *BaseCst,
+ uint64_t Offset) {
+ // The global @foo below illustrates a global that uses a got equivalent.
+ //
+ // @bar = global i32 42
+ // @gotequiv = private unnamed_addr constant i32* @bar
+ // @foo = i32 trunc (i64 sub (i64 ptrtoint (i32** @gotequiv to i64),
+ // i64 ptrtoint (i32* @foo to i64))
+ // to i32)
+ //
+ // The cstexpr in @foo is converted into the MCExpr `ME`, where we actually
+ // check whether @foo is suitable to use a GOTPCREL. `ME` is usually in the
+ // form:
+ //
+ // foo = cstexpr, where
+ // cstexpr := <gotequiv> - "." + <cst>
+ // cstexpr := <gotequiv> - (<foo> - <offset from @foo base>) + <cst>
+ //
+ // After canonicalization by evaluateAsRelocatable `ME` turns into:
+ //
+ // cstexpr := <gotequiv> - <foo> + gotpcrelcst, where
+ // gotpcrelcst := <offset from @foo base> + <cst>
+ //
+ MCValue MV;
+ if (!(*ME)->evaluateAsRelocatable(MV, nullptr, nullptr) || MV.isAbsolute())
+ return;
+ const MCSymbolRefExpr *SymA = MV.getSymA();
+ if (!SymA)
+ return;
+
+ // Check that GOT equivalent symbol is cached.
+ const MCSymbol *GOTEquivSym = &SymA->getSymbol();
+ if (!AP.GlobalGOTEquivs.count(GOTEquivSym))
+ return;
+
+ const GlobalValue *BaseGV = dyn_cast_or_null<GlobalValue>(BaseCst);
+ if (!BaseGV)
+ return;
+
+ // Check for a valid base symbol
+ const MCSymbol *BaseSym = AP.getSymbol(BaseGV);
+ const MCSymbolRefExpr *SymB = MV.getSymB();
+
+ if (!SymB || BaseSym != &SymB->getSymbol())
+ return;
+
+ // Make sure to match:
+ //
+ // gotpcrelcst := <offset from @foo base> + <cst>
+ //
+ // If gotpcrelcst is positive it means that we can safely fold the pc rel
+ // displacement into the GOTPCREL. We can also have an extra offset <cst>
+ // if the target knows how to encode it.
+ //
+ int64_t GOTPCRelCst = Offset + MV.getConstant();
+ if (GOTPCRelCst < 0)
+ return;
+ if (!AP.getObjFileLowering().supportGOTPCRelWithOffset() && GOTPCRelCst != 0)
+ return;
+
+ // Emit the GOT PC relative to replace the got equivalent global, i.e.:
+ //
+ // bar:
+ // .long 42
+ // gotequiv:
+ // .quad bar
+ // foo:
+ // .long gotequiv - "." + <cst>
+ //
+ // is replaced by the target specific equivalent to:
+ //
+ // bar:
+ // .long 42
+ // foo:
+ // .long bar@GOTPCREL+<gotpcrelcst>
+ //
+ AsmPrinter::GOTEquivUsePair Result = AP.GlobalGOTEquivs[GOTEquivSym];
+ const GlobalVariable *GV = Result.first;
+ int NumUses = (int)Result.second;
+ const GlobalValue *FinalGV = dyn_cast<GlobalValue>(GV->getOperand(0));
+ const MCSymbol *FinalSym = AP.getSymbol(FinalGV);
+ *ME = AP.getObjFileLowering().getIndirectSymViaGOTPCRel(
+ FinalSym, MV, Offset, AP.MMI, *AP.OutStreamer);
+
+ // Update GOT equivalent usage information
+ --NumUses;
+ if (NumUses >= 0)
+ AP.GlobalGOTEquivs[GOTEquivSym] = std::make_pair(GV, NumUses);
+}
+
+static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
+ AsmPrinter &AP, const Constant *BaseCV,
+ uint64_t Offset) {
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+
+ // Globals with sub-elements such as combinations of arrays and structs
+ // are handled recursively by emitGlobalConstantImpl. Keep track of the
+ // constant symbol base and the current position with BaseCV and Offset.
+ if (!BaseCV && CV->hasOneUse())
+ BaseCV = dyn_cast<Constant>(CV->user_back());
+
+ if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
+ return AP.OutStreamer->EmitZeros(Size);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
+ switch (Size) {
+ case 1:
+ case 2:
+ case 4:
+ case 8:
+ if (AP.isVerbose())
+ AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n",
+ CI->getZExtValue());
+ AP.OutStreamer->EmitIntValue(CI->getZExtValue(), Size);
+ return;
+ default:
+ emitGlobalConstantLargeInt(CI, AP);
+ return;
+ }
+ }
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
+ return emitGlobalConstantFP(CFP, AP);
+
+ if (isa<ConstantPointerNull>(CV)) {
+ AP.OutStreamer->EmitIntValue(0, Size);
+ return;
+ }
+
+ if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(CV))
+ return emitGlobalConstantDataSequential(DL, CDS, AP);
+
+ if (const ConstantArray *CVA = dyn_cast<ConstantArray>(CV))
+ return emitGlobalConstantArray(DL, CVA, AP, BaseCV, Offset);
+
+ if (const ConstantStruct *CVS = dyn_cast<ConstantStruct>(CV))
+ return emitGlobalConstantStruct(DL, CVS, AP, BaseCV, Offset);
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(CV)) {
+ // Look through bitcasts, which might not be able to be MCExpr'ized (e.g. of
+ // vectors).
+ if (CE->getOpcode() == Instruction::BitCast)
+ return emitGlobalConstantImpl(DL, CE->getOperand(0), AP);
+
+ if (Size > 8) {
+ // If the constant expression's size is greater than 64-bits, then we have
+ // to emit the value in chunks. Try to constant fold the value and emit it
+ // that way.
+ Constant *New = ConstantFoldConstantExpression(CE, DL);
+ if (New && New != CE)
+ return emitGlobalConstantImpl(DL, New, AP);
+ }
+ }
+
+ if (const ConstantVector *V = dyn_cast<ConstantVector>(CV))
+ return emitGlobalConstantVector(DL, V, AP);
+
+ // Otherwise, it must be a ConstantExpr. Lower it to an MCExpr, then emit it
+ // through the streamer with EmitValue.
+ const MCExpr *ME = AP.lowerConstant(CV);
+
+ // Since lowerConstant already folded and got rid of all IR pointer and
+ // integer casts, detect GOT equivalent accesses by looking into the MCExpr
+ // directly.
+ if (AP.getObjFileLowering().supportIndirectSymViaGOTPCRel())
+ handleIndirectSymViaGOTPCRel(AP, &ME, BaseCV, Offset);
+
+ AP.OutStreamer->EmitValue(ME, Size);
+}
+
+/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
+void AsmPrinter::EmitGlobalConstant(const DataLayout &DL, const Constant *CV) {
+ uint64_t Size = DL.getTypeAllocSize(CV->getType());
+ if (Size)
+ emitGlobalConstantImpl(DL, CV, *this);
+ else if (MAI->hasSubsectionsViaSymbols()) {
+ // If the global has zero size, emit a single byte so that two labels don't
+ // look like they are at the same location.
+ OutStreamer->EmitIntValue(0, 1);
+ }
+}
+
+void AsmPrinter::EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+ // Target doesn't support this yet!
+ llvm_unreachable("Target does not support EmitMachineConstantPoolValue");
+}
+
+void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
+ if (Offset > 0)
+ OS << '+' << Offset;
+ else if (Offset < 0)
+ OS << Offset;
+}
+
+//===----------------------------------------------------------------------===//
+// Symbol Lowering Routines.
+//===----------------------------------------------------------------------===//
+
+MCSymbol *AsmPrinter::createTempSymbol(const Twine &Name) const {
+ return OutContext.createTempSymbol(Name, true);
+}
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BlockAddress *BA) const {
+ return MMI->getAddrLabelSymbol(BA->getBasicBlock());
+}
+
+MCSymbol *AsmPrinter::GetBlockAddressSymbol(const BasicBlock *BB) const {
+ return MMI->getAddrLabelSymbol(BB);
+}
+
+/// GetCPISymbol - Return the symbol for the specified constant pool entry.
+MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ "CPI" + Twine(getFunctionNumber()) + "_" +
+ Twine(CPID));
+}
+
+/// GetJTISymbol - Return the symbol for the specified jump table entry.
+MCSymbol *AsmPrinter::GetJTISymbol(unsigned JTID, bool isLinkerPrivate) const {
+ return MF->getJTISymbol(JTID, OutContext, isLinkerPrivate);
+}
+
+/// GetJTSetSymbol - Return the symbol for the specified jump table .set entry.
+/// FIXME: privatize to AsmPrinter.
+MCSymbol *AsmPrinter::GetJTSetSymbol(unsigned UID, unsigned MBBID) const {
+ const DataLayout &DL = getDataLayout();
+ return OutContext.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ Twine(getFunctionNumber()) + "_" +
+ Twine(UID) + "_set_" + Twine(MBBID));
+}
+
+MCSymbol *AsmPrinter::getSymbolWithGlobalValueBase(const GlobalValue *GV,
+ StringRef Suffix) const {
+ return getObjFileLowering().getSymbolWithGlobalValueBase(GV, Suffix, *Mang,
+ TM);
+}
+
+/// Return the MCSymbol for the specified ExternalSymbol.
+MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
+ SmallString<60> NameStr;
+ Mangler::getNameWithPrefix(NameStr, Sym, getDataLayout());
+ return OutContext.getOrCreateSymbol(NameStr);
+}
+
+
+
+/// PrintParentLoopComment - Print comments about parent loops of this one.
+static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+ unsigned FunctionNumber) {
+ if (!Loop) return;
+ PrintParentLoopComment(OS, Loop->getParentLoop(), FunctionNumber);
+ OS.indent(Loop->getLoopDepth()*2)
+ << "Parent Loop BB" << FunctionNumber << "_"
+ << Loop->getHeader()->getNumber()
+ << " Depth=" << Loop->getLoopDepth() << '\n';
+}
+
+
+/// PrintChildLoopComment - Print comments about child loops within
+/// the loop for this basic block, with nesting.
+static void PrintChildLoopComment(raw_ostream &OS, const MachineLoop *Loop,
+ unsigned FunctionNumber) {
+ // Add child loop information
+ for (const MachineLoop *CL : *Loop) {
+ OS.indent(CL->getLoopDepth()*2)
+ << "Child Loop BB" << FunctionNumber << "_"
+ << CL->getHeader()->getNumber() << " Depth " << CL->getLoopDepth()
+ << '\n';
+ PrintChildLoopComment(OS, CL, FunctionNumber);
+ }
+}
+
+/// emitBasicBlockLoopComments - Pretty-print comments for basic blocks.
+static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
+ const MachineLoopInfo *LI,
+ const AsmPrinter &AP) {
+ // Add loop depth information
+ const MachineLoop *Loop = LI->getLoopFor(&MBB);
+ if (!Loop) return;
+
+ MachineBasicBlock *Header = Loop->getHeader();
+ assert(Header && "No header for loop");
+
+ // If this block is not a loop header, just print out which block is the
+ // loop header and return.
+ if (Header != &MBB) {
+ AP.OutStreamer->AddComment(" in Loop: Header=BB" +
+ Twine(AP.getFunctionNumber())+"_" +
+ Twine(Loop->getHeader()->getNumber())+
+ " Depth="+Twine(Loop->getLoopDepth()));
+ return;
+ }
+
+ // Otherwise, it is a loop header. Print out information about child and
+ // parent loops.
+ raw_ostream &OS = AP.OutStreamer->GetCommentOS();
+
+ PrintParentLoopComment(OS, Loop->getParentLoop(), AP.getFunctionNumber());
+
+ OS << "=>";
+ OS.indent(Loop->getLoopDepth()*2-2);
+
+ OS << "This ";
+ if (Loop->empty())
+ OS << "Inner ";
+ OS << "Loop Header: Depth=" + Twine(Loop->getLoopDepth()) << '\n';
+
+ PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
+}
+
+
+/// EmitBasicBlockStart - This method prints the label for the specified
+/// MachineBasicBlock, an alignment (if present) and a comment describing
+/// it if appropriate.
+void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) const {
+ // End the previous funclet and start a new one.
+ if (MBB.isEHFuncletEntry()) {
+ for (const HandlerInfo &HI : Handlers) {
+ HI.Handler->endFunclet();
+ HI.Handler->beginFunclet(MBB);
+ }
+ }
+
+ // Emit an alignment directive for this block, if needed.
+ if (unsigned Align = MBB.getAlignment())
+ EmitAlignment(Align);
+
+ // If the block has its address taken, emit any labels that were used to
+ // reference the block. It is possible that there is more than one label
+ // here, because multiple LLVM BB's may have been RAUW'd to this block after
+ // the references were generated.
+ if (MBB.hasAddressTaken()) {
+ const BasicBlock *BB = MBB.getBasicBlock();
+ if (isVerbose())
+ OutStreamer->AddComment("Block address taken");
+
+ // MBBs can have their address taken as part of CodeGen without having
+ // their corresponding BB's address taken in IR
+ if (BB->hasAddressTaken())
+ for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB))
+ OutStreamer->EmitLabel(Sym);
+ }
+
+ // Print some verbose block comments.
+ if (isVerbose()) {
+ if (const BasicBlock *BB = MBB.getBasicBlock()) {
+ if (BB->hasName()) {
+ BB->printAsOperand(OutStreamer->GetCommentOS(),
+ /*PrintType=*/false, BB->getModule());
+ OutStreamer->GetCommentOS() << '\n';
+ }
+ }
+ emitBasicBlockLoopComments(MBB, LI, *this);
+ }
+
+ // Print the main label for the block.
+ if (MBB.pred_empty() ||
+ (isBlockOnlyReachableByFallthrough(&MBB) && !MBB.isEHFuncletEntry())) {
+ if (isVerbose()) {
+ // NOTE: Want this comment at start of line, don't emit with AddComment.
+ OutStreamer->emitRawComment(" BB#" + Twine(MBB.getNumber()) + ":", false);
+ }
+ } else {
+ OutStreamer->EmitLabel(MBB.getSymbol());
+ }
+}
+
+void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility,
+ bool IsDefinition) const {
+ MCSymbolAttr Attr = MCSA_Invalid;
+
+ switch (Visibility) {
+ default: break;
+ case GlobalValue::HiddenVisibility:
+ if (IsDefinition)
+ Attr = MAI->getHiddenVisibilityAttr();
+ else
+ Attr = MAI->getHiddenDeclarationVisibilityAttr();
+ break;
+ case GlobalValue::ProtectedVisibility:
+ Attr = MAI->getProtectedVisibilityAttr();
+ break;
+ }
+
+ if (Attr != MCSA_Invalid)
+ OutStreamer->EmitSymbolAttribute(Sym, Attr);
+}
+
+/// isBlockOnlyReachableByFallthrough - Return true if the basic block has
+/// exactly one predecessor and the control transfer mechanism between
+/// the predecessor and this block is a fall-through.
+bool AsmPrinter::
+isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+ // If this is a landing pad, it isn't a fall through. If it has no preds,
+ // then nothing falls through to it.
+ if (MBB->isEHPad() || MBB->pred_empty())
+ return false;
+
+ // If there isn't exactly one predecessor, it can't be a fall through.
+ if (MBB->pred_size() > 1)
+ return false;
+
+ // The predecessor has to be immediately before this block.
+ MachineBasicBlock *Pred = *MBB->pred_begin();
+ if (!Pred->isLayoutSuccessor(MBB))
+ return false;
+
+ // If the block is completely empty, then it definitely does fall through.
+ if (Pred->empty())
+ return true;
+
+ // Check the terminators in the previous blocks
+ for (const auto &MI : Pred->terminators()) {
+ // If it is not a simple branch, we are in a table somewhere.
+ if (!MI.isBranch() || MI.isIndirectBranch())
+ return false;
+
+ // If this block is an operand of one of the branches, this is not a fall
+ // through. Note that targets with delay slots will usually bundle
+ // terminators with the delay slot instruction.
+ for (ConstMIBundleOperands OP(MI); OP.isValid(); ++OP) {
+ if (OP->isJTI())
+ return false;
+ if (OP->isMBB() && OP->getMBB() == MBB)
+ return false;
+ }
+ }
+
+ return true;
+}
+
+
+
+GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
+ if (!S.usesMetadata())
+ return nullptr;
+
+ assert(!S.useStatepoints() && "statepoints do not currently support custom"
+ " stackmap formats, please see the documentation for a description of"
+ " the default format. If you really need a custom serialized format,"
+ " please file a bug");
+
+ gcp_map_type &GCMap = getGCMap(GCMetadataPrinters);
+ gcp_map_type::iterator GCPI = GCMap.find(&S);
+ if (GCPI != GCMap.end())
+ return GCPI->second.get();
+
+ const char *Name = S.getName().c_str();
+
+ for (GCMetadataPrinterRegistry::iterator
+ I = GCMetadataPrinterRegistry::begin(),
+ E = GCMetadataPrinterRegistry::end(); I != E; ++I)
+ if (strcmp(Name, I->getName()) == 0) {
+ std::unique_ptr<GCMetadataPrinter> GMP = I->instantiate();
+ GMP->S = &S;
+ auto IterBool = GCMap.insert(std::make_pair(&S, std::move(GMP)));
+ return IterBool.first->second.get();
+ }
+
+ report_fatal_error("no GCMetadataPrinter registered for GC: " + Twine(Name));
+}
+
+/// Pin vtable to this file.
+AsmPrinterHandler::~AsmPrinterHandler() {}
+
+void AsmPrinterHandler::markFunctionEnd() {}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
new file mode 100644
index 000000000000..60f40d063cc8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -0,0 +1,290 @@
+//===-- AsmPrinterDwarf.cpp - AsmPrinter Dwarf Support --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Dwarf emissions parts of AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ByteStreamer.h"
+#include "DwarfDebug.h"
+#include "DwarfExpression.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+//===----------------------------------------------------------------------===//
+// Dwarf Emission Helper Routines
+//===----------------------------------------------------------------------===//
+
+/// EmitSLEB128 - emit the specified signed leb128 value.
+void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const {
+ if (isVerbose() && Desc)
+ OutStreamer->AddComment(Desc);
+
+ OutStreamer->EmitSLEB128IntValue(Value);
+}
+
+/// EmitULEB128 - emit the specified unsigned leb128 value.
+void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc,
+ unsigned PadTo) const {
+ if (isVerbose() && Desc)
+ OutStreamer->AddComment(Desc);
+
+ OutStreamer->EmitULEB128IntValue(Value, PadTo);
+}
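+// For reference, LEB128 packs 7 bits per byte with the high bit used as a
+// continuation flag: the unsigned value 624485 encodes to the bytes
+// 0xE5 0x8E 0x26, and the signed value -2 encodes to the single byte 0x7E.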
+
+static const char *DecodeDWARFEncoding(unsigned Encoding) {
+ switch (Encoding) {
+ case dwarf::DW_EH_PE_absptr:
+ return "absptr";
+ case dwarf::DW_EH_PE_omit:
+ return "omit";
+ case dwarf::DW_EH_PE_pcrel:
+ return "pcrel";
+ case dwarf::DW_EH_PE_udata4:
+ return "udata4";
+ case dwarf::DW_EH_PE_udata8:
+ return "udata8";
+ case dwarf::DW_EH_PE_sdata4:
+ return "sdata4";
+ case dwarf::DW_EH_PE_sdata8:
+ return "sdata8";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata4:
+ return "pcrel udata4";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4:
+ return "pcrel sdata4";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_udata8:
+ return "pcrel udata8";
+ case dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata8:
+ return "pcrel sdata8";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_udata4:
+ return "indirect pcrel udata4";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata4:
+ return "indirect pcrel sdata4";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_udata8:
+ return "indirect pcrel udata8";
+ case dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel |
+ dwarf::DW_EH_PE_sdata8:
+ return "indirect pcrel sdata8";
+ }
+
+ return "<unknown encoding>";
+}
+
+/// EmitEncodingByte - Emit a .byte 42 directive that corresponds to an
+/// encoding. If verbose assembly output is enabled, we output comments
+/// describing the encoding. Desc is an optional string saying what the
+/// encoding is specifying (e.g. "LSDA").
+void AsmPrinter::EmitEncodingByte(unsigned Val, const char *Desc) const {
+ if (isVerbose()) {
+ if (Desc)
+ OutStreamer->AddComment(Twine(Desc) + " Encoding = " +
+ Twine(DecodeDWARFEncoding(Val)));
+ else
+ OutStreamer->AddComment(Twine("Encoding = ") + DecodeDWARFEncoding(Val));
+ }
+
+ OutStreamer->EmitIntValue(Val, 1);
+}
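+// Example: an LSDA encoding of 0x9b (DW_EH_PE_indirect | DW_EH_PE_pcrel |
+// DW_EH_PE_sdata4) is emitted as a single byte, annotated in verbose mode
+// with the comment "LSDA Encoding = indirect pcrel sdata4".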
+
+/// GetSizeOfEncodedValue - Return the size of the encoding in bytes.
+unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
+ if (Encoding == dwarf::DW_EH_PE_omit)
+ return 0;
+
+ switch (Encoding & 0x07) {
+ default:
+ llvm_unreachable("Invalid encoded value.");
+ case dwarf::DW_EH_PE_absptr:
+ return MF->getDataLayout().getPointerSize();
+ case dwarf::DW_EH_PE_udata2:
+ return 2;
+ case dwarf::DW_EH_PE_udata4:
+ return 4;
+ case dwarf::DW_EH_PE_udata8:
+ return 8;
+ }
+}
+
+void AsmPrinter::EmitTTypeReference(const GlobalValue *GV,
+ unsigned Encoding) const {
+ if (GV) {
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+ const MCExpr *Exp =
+ TLOF.getTTypeGlobalReference(GV, Encoding, *Mang, TM, MMI,
+ *OutStreamer);
+ OutStreamer->EmitValue(Exp, GetSizeOfEncodedValue(Encoding));
+ } else
+ OutStreamer->EmitIntValue(0, GetSizeOfEncodedValue(Encoding));
+}
+
+void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
+ bool ForceOffset) const {
+ if (!ForceOffset) {
+ // On COFF targets, we have to emit the special .secrel32 directive.
+ if (MAI->needsDwarfSectionOffsetDirective()) {
+ OutStreamer->EmitCOFFSecRel32(Label);
+ return;
+ }
+
+ // If the format uses relocations with dwarf, refer to the symbol directly.
+ if (MAI->doesDwarfUseRelocationsAcrossSections()) {
+ OutStreamer->EmitSymbolValue(Label, 4);
+ return;
+ }
+ }
+
+ // Otherwise, emit it as a label difference from the start of the section.
+ EmitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4);
+}
+
+void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntryRef S) const {
+ if (MAI->doesDwarfUseRelocationsAcrossSections()) {
+ emitDwarfSymbolReference(S.getSymbol());
+ return;
+ }
+
+ // Just emit the offset directly; no need for symbol math.
+ EmitInt32(S.getOffset());
+}
+
+/// EmitDwarfRegOp - Emit dwarf register operation.
+void AsmPrinter::EmitDwarfRegOp(ByteStreamer &Streamer,
+ const MachineLocation &MLoc) const {
+ DebugLocDwarfExpression Expr(getDwarfDebug()->getDwarfVersion(), Streamer);
+ const MCRegisterInfo *MRI = MMI->getContext().getRegisterInfo();
+ int Reg = MRI->getDwarfRegNum(MLoc.getReg(), false);
+ if (Reg < 0) {
+ // We assume that pointers are always in an addressable register.
+ if (MLoc.isIndirect())
+ // FIXME: We have no reasonable way of handling errors in here. The
+ // caller might be in the middle of a dwarf expression. We should
+ // probably assert that Reg >= 0 once debug info generation is more
+ // mature.
+ return Expr.EmitOp(dwarf::DW_OP_nop,
+ "nop (could not find a dwarf register number)");
+
+ // Attempt to find a valid super- or sub-register.
+ if (!Expr.AddMachineRegPiece(*MF->getSubtarget().getRegisterInfo(),
+ MLoc.getReg()))
+ Expr.EmitOp(dwarf::DW_OP_nop,
+ "nop (could not find a dwarf register number)");
+ return;
+ }
+
+ if (MLoc.isIndirect())
+ Expr.AddRegIndirect(Reg, MLoc.getOffset());
+ else
+ Expr.AddReg(Reg);
+}
+
+//===----------------------------------------------------------------------===//
+// Dwarf Lowering Routines
+//===----------------------------------------------------------------------===//
+
+void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
+ switch (Inst.getOperation()) {
+ default:
+ llvm_unreachable("Unexpected instruction");
+ case MCCFIInstruction::OpDefCfaOffset:
+ OutStreamer->EmitCFIDefCfaOffset(Inst.getOffset());
+ break;
+ case MCCFIInstruction::OpAdjustCfaOffset:
+ OutStreamer->EmitCFIAdjustCfaOffset(Inst.getOffset());
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ OutStreamer->EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
+ break;
+ case MCCFIInstruction::OpDefCfaRegister:
+ OutStreamer->EmitCFIDefCfaRegister(Inst.getRegister());
+ break;
+ case MCCFIInstruction::OpOffset:
+ OutStreamer->EmitCFIOffset(Inst.getRegister(), Inst.getOffset());
+ break;
+ case MCCFIInstruction::OpRegister:
+ OutStreamer->EmitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+ break;
+ case MCCFIInstruction::OpWindowSave:
+ OutStreamer->EmitCFIWindowSave();
+ break;
+ case MCCFIInstruction::OpSameValue:
+ OutStreamer->EmitCFISameValue(Inst.getRegister());
+ break;
+ case MCCFIInstruction::OpGnuArgsSize:
+ OutStreamer->EmitCFIGnuArgsSize(Inst.getOffset());
+ break;
+ case MCCFIInstruction::OpEscape:
+ OutStreamer->EmitCFIEscape(Inst.getValues());
+ break;
+ }
+}
+
+void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
+ // Emit the code (index) for the abbreviation.
+ if (isVerbose())
+ OutStreamer->AddComment("Abbrev [" + Twine(Die.getAbbrevNumber()) + "] 0x" +
+ Twine::utohexstr(Die.getOffset()) + ":0x" +
+ Twine::utohexstr(Die.getSize()) + " " +
+ dwarf::TagString(Die.getTag()));
+ EmitULEB128(Die.getAbbrevNumber());
+
+ // Emit the DIE attribute values.
+ for (const auto &V : Die.values()) {
+ dwarf::Attribute Attr = V.getAttribute();
+ assert(V.getForm() && "Too many attributes for DIE (check abbreviation)");
+
+ if (isVerbose()) {
+ OutStreamer->AddComment(dwarf::AttributeString(Attr));
+ if (Attr == dwarf::DW_AT_accessibility)
+ OutStreamer->AddComment(
+ dwarf::AccessibilityString(V.getDIEInteger().getValue()));
+ }
+
+ // Emit an attribute using the defined form.
+ V.EmitValue(this);
+ }
+
+ // Emit the DIE children if any.
+ if (Die.hasChildren()) {
+ for (auto &Child : Die.children())
+ emitDwarfDIE(Child);
+
+ OutStreamer->AddComment("End Of Children Mark");
+ EmitInt8(0);
+ }
+}
+
+void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const {
+ // Emit the abbreviation code (a base-1 index).
+ EmitULEB128(Abbrev.getNumber(), "Abbreviation Code");
+
+ // Emit the abbreviations data.
+ Abbrev.Emit(this);
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
new file mode 100644
index 000000000000..638226e90a7a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterHandler.h
@@ -0,0 +1,74 @@
+//===-- lib/CodeGen/AsmPrinter/AsmPrinterHandler.h -------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a generic interface for AsmPrinter handlers,
+// like debug and EH info emitters.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_ASMPRINTERHANDLER_H
+
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class MCSymbol;
+
+typedef MCSymbol *ExceptionSymbolProvider(AsmPrinter *Asm);
+
+/// \brief Collects and handles AsmPrinter objects required to build debug
+/// or EH information.
+class AsmPrinterHandler {
+public:
+ virtual ~AsmPrinterHandler();
+
+ /// \brief For symbols that have a size designated (e.g. common symbols),
+ /// this tracks that size.
+ virtual void setSymbolSize(const MCSymbol *Sym, uint64_t Size) = 0;
+
+ /// \brief Emit all sections that should come after the content.
+ virtual void endModule() = 0;
+
+ /// \brief Gather pre-function debug information.
+ /// Every beginFunction(MF) call should be followed by an endFunction(MF)
+ /// call.
+ virtual void beginFunction(const MachineFunction *MF) = 0;
+
+ /// \brief Emit any end-of-function markers (like .cfi_endproc). This is
+ /// called before endFunction and cannot switch sections.
+ virtual void markFunctionEnd();
+
+ /// \brief Gather post-function debug information.
+ /// Please note that some AsmPrinter implementations may not call
+ /// beginFunction at all.
+ virtual void endFunction(const MachineFunction *MF) = 0;
+
+ virtual void beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) {}
+ virtual void endFragment() {}
+
+ /// \brief Emit target-specific EH funclet machinery.
+ virtual void beginFunclet(const MachineBasicBlock &MBB,
+ MCSymbol *Sym = nullptr) {}
+ virtual void endFunclet() {}
+
+ /// \brief Process beginning of an instruction.
+ virtual void beginInstruction(const MachineInstr *MI) = 0;
+
+ /// \brief Process end of an instruction.
+ virtual void endInstruction() = 0;
+};
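+// Usage sketch (illustrative): a concrete handler such as a debug-info or EH
+// emitter overrides the pure virtual hooks above and is registered in the
+// AsmPrinter's handler list, so its callbacks fire at module, function and
+// instruction emission boundaries.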
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
new file mode 100644
index 000000000000..2ce6c182235f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -0,0 +1,578 @@
+//===-- AsmPrinterInlineAsm.cpp - AsmPrinter Inline Asm Handling ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the inline assembler pieces of the AsmPrinter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "asm-printer"
+
+namespace {
+ struct SrcMgrDiagInfo {
+ const MDNode *LocInfo;
+ LLVMContext::InlineAsmDiagHandlerTy DiagHandler;
+ void *DiagContext;
+ };
+}
+
+/// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
+/// inline asm has an error in it. diagInfo is a pointer to the SrcMgrDiagInfo
+/// struct above.
+static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
+ SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
+ assert(DiagInfo && "Diagnostic context not passed down?");
+
+ // If the inline asm had metadata associated with it, pull out a location
+ // cookie corresponding to which line the error occurred on.
+ unsigned LocCookie = 0;
+ if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+ unsigned ErrorLine = Diag.getLineNo()-1;
+ if (ErrorLine >= LocInfo->getNumOperands())
+ ErrorLine = 0;
+
+ if (LocInfo->getNumOperands() != 0)
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocInfo->getOperand(ErrorLine)))
+ LocCookie = CI->getZExtValue();
+ }
+
+ DiagInfo->DiagHandler(Diag, DiagInfo->DiagContext, LocCookie);
+}
+
+/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
+void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
+ const MCTargetOptions &MCOptions,
+ const MDNode *LocMDNode,
+ InlineAsm::AsmDialect Dialect) const {
+ assert(!Str.empty() && "Can't emit empty inline asm block");
+
+ // Remember if the buffer is nul terminated or not so we can avoid a copy.
+ bool isNullTerminated = Str.back() == 0;
+ if (isNullTerminated)
+ Str = Str.substr(0, Str.size()-1);
+
+ // If the output streamer does not have mature MC support or the integrated
+ // assembler has been disabled, just emit the blob textually.
+ // Otherwise parse the asm and emit it via MC support.
+ // This is useful in case the asm parser doesn't handle something but the
+ // system assembler does.
+ const MCAsmInfo *MCAI = TM.getMCAsmInfo();
+ assert(MCAI && "No MCAsmInfo");
+ if (!MCAI->useIntegratedAssembler() &&
+ !OutStreamer->isIntegratedAssemblerRequired()) {
+ emitInlineAsmStart();
+ OutStreamer->EmitRawText(Str);
+ emitInlineAsmEnd(STI, nullptr);
+ return;
+ }
+
+ SourceMgr SrcMgr;
+ SrcMgrDiagInfo DiagInfo;
+
+ // If the current LLVMContext has an inline asm handler, set it in SourceMgr.
+ LLVMContext &LLVMCtx = MMI->getModule()->getContext();
+ bool HasDiagHandler = false;
+ if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) {
+ // If the source manager has an issue, we arrange for srcMgrDiagHandler
+ // to be invoked, getting DiagInfo passed into it.
+ DiagInfo.LocInfo = LocMDNode;
+ DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
+ DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
+ SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
+ HasDiagHandler = true;
+ }
+
+ std::unique_ptr<MemoryBuffer> Buffer;
+ if (isNullTerminated)
+ Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>");
+ else
+ Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
+
+ // Tell SrcMgr about this buffer, it takes ownership of the buffer.
+ SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+
+ std::unique_ptr<MCAsmParser> Parser(
+ createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI));
+
+ // We create a new MCInstrInfo here since we might be at the module level
+ // and not have a MachineFunction to initialize the TargetInstrInfo from and
+ // we only need MCInstrInfo for asm parsing. We create one unconditionally
+ // because it's not subtarget dependent.
+ std::unique_ptr<MCInstrInfo> MII(TM.getTarget().createMCInstrInfo());
+ std::unique_ptr<MCTargetAsmParser> TAP(TM.getTarget().createMCAsmParser(
+ STI, *Parser, *MII, MCOptions));
+ if (!TAP)
+ report_fatal_error("Inline asm not supported by this streamer because"
+ " we don't have an asm parser for this target\n");
+ Parser->setAssemblerDialect(Dialect);
+ Parser->setTargetParser(*TAP.get());
+ if (MF) {
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ TAP->SetFrameRegister(TRI->getFrameRegister(*MF));
+ }
+
+ emitInlineAsmStart();
+ // Don't implicitly switch to the text section before the asm.
+ int Res = Parser->Run(/*NoInitialTextSection*/ true,
+ /*NoFinalize*/ true);
+ emitInlineAsmEnd(STI, &TAP->getSTI());
+ if (Res && !HasDiagHandler)
+ report_fatal_error("Error parsing inline asm\n");
+}
+
+static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ AsmPrinter *AP, unsigned LocCookie,
+ raw_ostream &OS) {
+ // Switch to the inline assembly variant.
+ OS << "\t.intel_syntax\n\t";
+
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
+ unsigned NumOperands = MI->getNumOperands();
+
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
+
+ OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ OS << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
+
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$':
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ }
+ if (Done) break;
+
+ const char *IDStart = LastEmitted;
+ const char *IDEnd = IDStart;
+ while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
+
+ unsigned Val;
+ if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+ report_fatal_error("Bad $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ LastEmitted = IDEnd;
+
+ if (Val >= NumOperands-1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+
+ bool Error = false;
+
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands()) break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
+
+ // We may have a location metadata attached to the end of the
+ // instruction, and at no point should see metadata at any
+ // other point while processing. It's an error if so.
+ if (OpNo >= MI->getNumOperands() ||
+ MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ nullptr, OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ nullptr, OS);
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+ }
+ break;
+ }
+ }
+ }
+ OS << "\n\t.att_syntax\n" << (char)0; // null terminate string.
+}
+
+static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ int AsmPrinterVariant, AsmPrinter *AP,
+ unsigned LocCookie, raw_ostream &OS) {
+ int CurVariant = -1; // The number of the {.|.|.} region we are in.
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
+ unsigned NumOperands = MI->getNumOperands();
+
+ OS << '\t';
+
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ OS << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
+
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$': // $$ -> $
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant)
+ OS << '$';
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ case '(': // $( -> same as GCC's { character.
+ ++LastEmitted; // Consume '(' character.
+ if (CurVariant != -1)
+ report_fatal_error("Nested variants found in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ CurVariant = 0; // We're in the first variant now.
+ break;
+ case '|':
+ ++LastEmitted; // consume '|' character.
+ if (CurVariant == -1)
+ OS << '|'; // this is gcc's behavior for | outside a variant
+ else
+ ++CurVariant; // We're in the next variant.
+ break;
+ case ')': // $) -> same as GCC's } char.
+ ++LastEmitted; // consume ')' character.
+ if (CurVariant == -1)
+ OS << '}'; // this is gcc's behavior for } outside a variant
+ else
+ CurVariant = -1;
+ break;
+ }
+ if (Done) break;
+
+ bool HasCurlyBraces = false;
+ if (*LastEmitted == '{') { // ${variable}
+ ++LastEmitted; // Consume '{' character.
+ HasCurlyBraces = true;
+ }
+
+ // If we have ${:foo}, then this is not a real operand reference, it is a
+ // "magic" string reference, just like in .td files. Arrange to call
+ // PrintSpecial.
+ if (HasCurlyBraces && *LastEmitted == ':') {
+ ++LastEmitted;
+ const char *StrStart = LastEmitted;
+ const char *StrEnd = strchr(StrStart, '}');
+ if (!StrEnd)
+ report_fatal_error("Unterminated ${:foo} operand in inline asm"
+ " string: '" + Twine(AsmStr) + "'");
+
+ std::string Val(StrStart, StrEnd);
+ AP->PrintSpecial(MI, OS, Val.c_str());
+ LastEmitted = StrEnd+1;
+ break;
+ }
+
+ const char *IDStart = LastEmitted;
+ const char *IDEnd = IDStart;
+ while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
+
+ unsigned Val;
+ if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+ report_fatal_error("Bad $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ LastEmitted = IDEnd;
+
+ char Modifier[2] = { 0, 0 };
+
+ if (HasCurlyBraces) {
+ // If we have curly braces, check for a modifier character. This
+ // supports syntax like ${0:u}, which corresponds to "%u0" in GCC asm.
+ if (*LastEmitted == ':') {
+ ++LastEmitted; // Consume ':' character.
+ if (*LastEmitted == 0)
+ report_fatal_error("Bad ${:} expression in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
+ Modifier[0] = *LastEmitted;
+ ++LastEmitted; // Consume modifier character.
+ }
+
+ if (*LastEmitted != '}')
+ report_fatal_error("Bad ${} expression in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ ++LastEmitted; // Consume '}' character.
+ }
+
+ if (Val >= NumOperands-1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ if (CurVariant == -1 || CurVariant == AsmPrinterVariant) {
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+
+ bool Error = false;
+
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands()) break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
+
+ // We may have a location metadata attached to the end of the
+ // instruction, and at no point should see metadata at any
+ // other point while processing. It's an error if so.
+ if (OpNo >= MI->getNumOperands() ||
+ MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ if (Modifier[0] == 'l') { // Labels are target independent.
+ // FIXME: What if the operand isn't an MBB, report error?
+ const MCSymbol *Sym = MI->getOperand(OpNo).getMBB()->getSymbol();
+ Sym->print(OS, AP->MAI);
+ } else {
+ if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+ Modifier[0] ? Modifier : nullptr,
+ OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+ Modifier[0] ? Modifier : nullptr, OS);
+ }
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+ }
+ }
+ break;
+ }
+ }
+ }
+ OS << '\n' << (char)0; // null terminate string.
+}
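+// Example (GCC/AT&T dialect): with AsmPrinterVariant 0, the asm string
+// "mov$( l$| q$) $0" keeps only the text of the first $(...$)-delimited
+// alternative, "$$" prints a literal '$', and a "${0:c}"-style reference
+// forwards the modifier character to the target's PrintAsmOperand.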
+
+/// EmitInlineAsm - This method formats and emits the specified machine
+/// instruction that is an inline asm.
+void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
+ assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
+
+ // Count the number of register definitions to find the asm string.
+ unsigned NumDefs = 0;
+ for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
+
+ assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+
+ // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+
+ // If this asm string is empty, just print the #APP/#NOAPP markers.
+ // These are useful to see where empty asm blocks wound up.
+ if (AsmStr[0] == 0) {
+ OutStreamer->emitRawComment(MAI->getInlineAsmStart());
+ OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
+ return;
+ }
+
+ // Emit the #APP start marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use emitRawComment.
+ OutStreamer->emitRawComment(MAI->getInlineAsmStart());
+
+ // Get the !srcloc metadata node if we have it, and decode the loc cookie from
+ // it.
+ unsigned LocCookie = 0;
+ const MDNode *LocMD = nullptr;
+ for (unsigned i = MI->getNumOperands(); i != 0; --i) {
+ if (MI->getOperand(i-1).isMetadata() &&
+ (LocMD = MI->getOperand(i-1).getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ // Emit the inline asm to a temporary string so we can emit it through
+ // EmitInlineAsm.
+ SmallString<256> StringData;
+ raw_svector_ostream OS(StringData);
+
+ // The variant of the current asmprinter.
+ int AsmPrinterVariant = MAI->getAssemblerDialect();
+ InlineAsm::AsmDialect InlineAsmVariant = MI->getInlineAsmDialect();
+ AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+ if (InlineAsmVariant == InlineAsm::AD_ATT)
+ EmitGCCInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AsmPrinterVariant,
+ AP, LocCookie, OS);
+ else
+ EmitMSInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AP, LocCookie, OS);
+
+ // Reset SanitizeAddress based on the function's attribute.
+ MCTargetOptions MCOptions = TM.Options.MCOptions;
+ MCOptions.SanitizeAddress =
+ MF->getFunction()->hasFnAttribute(Attribute::SanitizeAddress);
+
+ EmitInlineAsm(OS.str(), getSubtargetInfo(), MCOptions, LocMD,
+ MI->getInlineAsmDialect());
+
+ // Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use emitRawComment.
+ OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
+}
+
+
+/// PrintSpecial - Print information related to the specified machine instr
+/// that is independent of the operand, and may be independent of the instr
+/// itself. This can be useful for portably encoding the comment character
+/// or other bits of target-specific knowledge into the asmstrings. The
+/// syntax used is ${:comment}. Targets can override this to add support
+/// for their own strange codes.
+void AsmPrinter::PrintSpecial(const MachineInstr *MI, raw_ostream &OS,
+ const char *Code) const {
+ if (!strcmp(Code, "private")) {
+ const DataLayout &DL = MF->getDataLayout();
+ OS << DL.getPrivateGlobalPrefix();
+ } else if (!strcmp(Code, "comment")) {
+ OS << MAI->getCommentString();
+ } else if (!strcmp(Code, "uid")) {
+ // Comparing the address of MI isn't sufficient, because machineinstrs may
+ // be allocated to the same address across functions.
+
+ // If this is a new LastFn instruction, bump the counter.
+ if (LastMI != MI || LastFn != getFunctionNumber()) {
+ ++Counter;
+ LastMI = MI;
+ LastFn = getFunctionNumber();
+ }
+ OS << Counter;
+ } else {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Unknown special formatter '" << Code
+ << "' for machine instr: " << *MI;
+ report_fatal_error(Msg.str());
+ }
+}
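+// Example: an asm string containing "${:comment} spill" expands to the
+// target's comment marker (e.g. '#' or ';') followed by " spill", and
+// "${:uid}" expands to a counter that is unique per inline-asm instruction
+// within the current function.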
+
+/// PrintAsmOperand - Print the specified operand of MI, an INLINEASM
+/// instruction, using the specified assembler variant. Targets should
+/// override this to format as appropriate.
+bool AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant, const char *ExtraCode,
+ raw_ostream &O) {
+ // Does this asm operand have a single letter operand modifier?
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ switch (ExtraCode[0]) {
+ default:
+ return true; // Unknown modifier.
+ case 'c': // Substitute immediate value without immediate syntax
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ O << MO.getImm();
+ return false;
+ case 'n': // Negate the immediate constant.
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ O << -MO.getImm();
+ return false;
+ case 's': // The GCC deprecated s modifier
+ if (MO.getType() != MachineOperand::MO_Immediate)
+ return true;
+ O << ((32 - MO.getImm()) & 31);
+ return false;
+ }
+ }
+ return true;
+}
+
+bool AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+ unsigned AsmVariant,
+ const char *ExtraCode, raw_ostream &O) {
+ // Target doesn't support this yet!
+ return true;
+}
+
+void AsmPrinter::emitInlineAsmStart() const {}
+
+void AsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
+ const MCSubtargetInfo *EndInfo) const {}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
new file mode 100644
index 000000000000..aaf6180c9404
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -0,0 +1,110 @@
+//===-- llvm/CodeGen/ByteStreamer.h - ByteStreamer class --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a class that can take bytes that would normally be
+// streamed via the AsmPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_BYTESTREAMER_H
+
+#include "DIEHash.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/LEB128.h"
+#include <string>
+
+namespace llvm {
+class ByteStreamer {
+ protected:
+ ~ByteStreamer() = default;
+ ByteStreamer(const ByteStreamer&) = default;
+ ByteStreamer() = default;
+
+ public:
+ // For now we're just handling the calls we need for dwarf emission/hashing.
+ virtual void EmitInt8(uint8_t Byte, const Twine &Comment = "") = 0;
+ virtual void EmitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0;
+ virtual void EmitULEB128(uint64_t DWord, const Twine &Comment = "") = 0;
+};
+
+class APByteStreamer final : public ByteStreamer {
+private:
+ AsmPrinter &AP;
+
+public:
+ APByteStreamer(AsmPrinter &Asm) : AP(Asm) {}
+ void EmitInt8(uint8_t Byte, const Twine &Comment) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.EmitInt8(Byte);
+ }
+ void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.EmitSLEB128(DWord);
+ }
+ void EmitULEB128(uint64_t DWord, const Twine &Comment) override {
+ AP.OutStreamer->AddComment(Comment);
+ AP.EmitULEB128(DWord);
+ }
+};
+
+class HashingByteStreamer final : public ByteStreamer {
+ private:
+ DIEHash &Hash;
+ public:
+ HashingByteStreamer(DIEHash &H) : Hash(H) {}
+ void EmitInt8(uint8_t Byte, const Twine &Comment) override {
+ Hash.update(Byte);
+ }
+ void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ Hash.addSLEB128(DWord);
+ }
+ void EmitULEB128(uint64_t DWord, const Twine &Comment) override {
+ Hash.addULEB128(DWord);
+ }
+};
+
+class BufferByteStreamer final : public ByteStreamer {
+private:
+ SmallVectorImpl<char> &Buffer;
+ SmallVectorImpl<std::string> &Comments;
+
+ /// \brief Only verbose textual output needs comments. This will be set to
+ /// true for that case, and false otherwise. If false, comments passed in to
+ /// the emit methods will be ignored.
+ bool GenerateComments;
+
+public:
+ BufferByteStreamer(SmallVectorImpl<char> &Buffer,
+ SmallVectorImpl<std::string> &Comments,
+ bool GenerateComments)
+ : Buffer(Buffer), Comments(Comments), GenerateComments(GenerateComments) {}
+ void EmitInt8(uint8_t Byte, const Twine &Comment) override {
+ Buffer.push_back(Byte);
+ if (GenerateComments)
+ Comments.push_back(Comment.str());
+ }
+ void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ raw_svector_ostream OSE(Buffer);
+ encodeSLEB128(DWord, OSE);
+ if (GenerateComments)
+ Comments.push_back(Comment.str());
+ }
+ void EmitULEB128(uint64_t DWord, const Twine &Comment) override {
+ raw_svector_ostream OSE(Buffer);
+ encodeULEB128(DWord, OSE);
+ if (GenerateComments)
+ Comments.push_back(Comment.str());
+ }
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
new file mode 100644
index 000000000000..b0ba57122206
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -0,0 +1,2076 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp --*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CodeViewDebug.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/DebugInfo/CodeView/FieldListRecordBuilder.h"
+#include "llvm/DebugInfo/CodeView/Line.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeDumper.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
+ : DebugHandlerBase(AP), OS(*Asm->OutStreamer), CurFn(nullptr) {
+ // If the module doesn't have named metadata anchors or the COFF debug
+ // section is not available, skip any debug-info-related work.
+ if (!MMI->getModule()->getNamedMetadata("llvm.dbg.cu") ||
+ !AP->getObjFileLowering().getCOFFDebugSymbolsSection()) {
+ Asm = nullptr;
+ return;
+ }
+
+ // Tell MMI that we have debug info.
+ MMI->setDebugInfoAvailability(true);
+}
+
+StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
+ std::string &Filepath = FileToFilepathMap[File];
+ if (!Filepath.empty())
+ return Filepath;
+
+ StringRef Dir = File->getDirectory(), Filename = File->getFilename();
+
+ // Clang emits directory and relative filename info into the IR, but CodeView
+ // operates on full paths. We could change Clang to emit full paths too, but
+ // that would increase the IR size and is probably not needed for other
+ // users. For now, just concatenate and canonicalize the path here.
+ if (Filename.find(':') == 1)
+ Filepath = Filename;
+ else
+ Filepath = (Dir + "\\" + Filename).str();
+
+ // Canonicalize the path. We have to do it textually because we may no longer
+ // have access to the file in the filesystem.
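+ // e.g. "C:/foo/./bar/../baz.cpp" canonicalizes to "C:\foo\baz.cpp".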
+ // First, replace all slashes with backslashes.
+ std::replace(Filepath.begin(), Filepath.end(), '/', '\\');
+
+ // Replace all "\.\" with "\".
+ size_t Cursor = 0;
+ while ((Cursor = Filepath.find("\\.\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 2);
+
+ // Replace all "\XXX\..\" with "\". Don't try too hard though as the original
+ // path should be well-formatted, e.g. start with a drive letter, etc.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\..\\", Cursor)) != std::string::npos) {
+ // Something's wrong if the path starts with "\..\", abort.
+ if (Cursor == 0)
+ break;
+
+ size_t PrevSlash = Filepath.rfind('\\', Cursor - 1);
+ if (PrevSlash == std::string::npos)
+ // Something's wrong, abort.
+ break;
+
+ Filepath.erase(PrevSlash, Cursor + 3 - PrevSlash);
+ // The next ".." might be following the one we've just erased.
+ Cursor = PrevSlash;
+ }
+
+ // Remove all duplicate backslashes.
+ Cursor = 0;
+ while ((Cursor = Filepath.find("\\\\", Cursor)) != std::string::npos)
+ Filepath.erase(Cursor, 1);
+
+ return Filepath;
+}
+
+unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) {
+ unsigned NextId = FileIdMap.size() + 1;
+ auto Insertion = FileIdMap.insert(std::make_pair(F, NextId));
+ if (Insertion.second) {
+ // We have to compute the full filepath and emit a .cv_file directive.
+ StringRef FullPath = getFullFilepath(F);
+ NextId = OS.EmitCVFileDirective(NextId, FullPath);
+ assert(NextId == FileIdMap.size() && ".cv_file directive failed");
+ }
+ return Insertion.first->second;
+}
+
+CodeViewDebug::InlineSite &
+CodeViewDebug::getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee) {
+ auto SiteInsertion = CurFn->InlineSites.insert({InlinedAt, InlineSite()});
+ InlineSite *Site = &SiteInsertion.first->second;
+ if (SiteInsertion.second) {
+ Site->SiteFuncId = NextFuncId++;
+ Site->Inlinee = Inlinee;
+ InlinedSubprograms.insert(Inlinee);
+ getFuncIdForSubprogram(Inlinee);
+ }
+ return *Site;
+}
+
+static StringRef getPrettyScopeName(const DIScope *Scope) {
+ StringRef ScopeName = Scope->getName();
+ if (!ScopeName.empty())
+ return ScopeName;
+
+ switch (Scope->getTag()) {
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ return "<unnamed-tag>";
+ case dwarf::DW_TAG_namespace:
+ return "`anonymous namespace'";
+ }
+
+ return StringRef();
+}
+
+static const DISubprogram *getQualifiedNameComponents(
+ const DIScope *Scope, SmallVectorImpl<StringRef> &QualifiedNameComponents) {
+ const DISubprogram *ClosestSubprogram = nullptr;
+ while (Scope != nullptr) {
+ if (ClosestSubprogram == nullptr)
+ ClosestSubprogram = dyn_cast<DISubprogram>(Scope);
+ StringRef ScopeName = getPrettyScopeName(Scope);
+ if (!ScopeName.empty())
+ QualifiedNameComponents.push_back(ScopeName);
+ Scope = Scope->getScope().resolve();
+ }
+ return ClosestSubprogram;
+}
+
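+// Join the scope components outermost-first (hence the reverse below) with
+// "::" and append the type name; e.g. a type T declared inside
+// 'namespace N { struct S { ... }; }' yields "N::S::T".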
+static std::string getQualifiedName(ArrayRef<StringRef> QualifiedNameComponents,
+ StringRef TypeName) {
+ std::string FullyQualifiedName;
+ for (StringRef QualifiedNameComponent : reverse(QualifiedNameComponents)) {
+ FullyQualifiedName.append(QualifiedNameComponent);
+ FullyQualifiedName.append("::");
+ }
+ FullyQualifiedName.append(TypeName);
+ return FullyQualifiedName;
+}
+
+static std::string getFullyQualifiedName(const DIScope *Scope, StringRef Name) {
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ getQualifiedNameComponents(Scope, QualifiedNameComponents);
+ return getQualifiedName(QualifiedNameComponents, Name);
+}
+
+struct CodeViewDebug::TypeLoweringScope {
+ TypeLoweringScope(CodeViewDebug &CVD) : CVD(CVD) { ++CVD.TypeEmissionLevel; }
+ ~TypeLoweringScope() {
+ // Don't decrement TypeEmissionLevel until after emitting deferred types, so
+ // inner TypeLoweringScopes don't attempt to emit deferred types.
+ if (CVD.TypeEmissionLevel == 1)
+ CVD.emitDeferredCompleteTypes();
+ --CVD.TypeEmissionLevel;
+ }
+ CodeViewDebug &CVD;
+};
+
+static std::string getFullyQualifiedName(const DIScope *Ty) {
+ const DIScope *Scope = Ty->getScope().resolve();
+ return getFullyQualifiedName(Scope, getPrettyScopeName(Ty));
+}
+
+TypeIndex CodeViewDebug::getScopeIndex(const DIScope *Scope) {
+ // No scope means global scope and that uses the zero index.
+ if (!Scope || isa<DIFile>(Scope))
+ return TypeIndex();
+
+ assert(!isa<DIType>(Scope) && "shouldn't make a namespace scope for a type");
+
+ // Check if we've already translated this scope.
+ auto I = TypeIndices.find({Scope, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Build the fully qualified name of the scope.
+ std::string ScopeName = getFullyQualifiedName(Scope);
+ TypeIndex TI =
+ TypeTable.writeStringId(StringIdRecord(TypeIndex(), ScopeName));
+ return recordTypeIndexForDINode(Scope, TI);
+}
+
+TypeIndex CodeViewDebug::getFuncIdForSubprogram(const DISubprogram *SP) {
+ // It's possible to ask for the FuncId of a function which doesn't have a
+ // subprogram: inlining a function with debug info into a function with none.
+ if (!SP)
+ return TypeIndex::None();
+
+ // Check if we've already translated this subprogram.
+ auto I = TypeIndices.find({SP, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // The display name includes function template arguments. Drop them to match
+ // MSVC.
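+ // e.g. "f<int>" is recorded simply as "f".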
+ StringRef DisplayName = SP->getDisplayName().split('<').first;
+
+ const DIScope *Scope = SP->getScope().resolve();
+ TypeIndex TI;
+ if (const auto *Class = dyn_cast_or_null<DICompositeType>(Scope)) {
+ // If the scope is a DICompositeType, then this must be a method. Member
+ // function types take some special handling, and require access to the
+ // subprogram.
+ TypeIndex ClassType = getTypeIndex(Class);
+ MemberFuncIdRecord MFuncId(ClassType, getMemberFunctionType(SP, Class),
+ DisplayName);
+ TI = TypeTable.writeMemberFuncId(MFuncId);
+ } else {
+ // Otherwise, this must be a free function.
+ TypeIndex ParentScope = getScopeIndex(Scope);
+ FuncIdRecord FuncId(ParentScope, getTypeIndex(SP->getType()), DisplayName);
+ TI = TypeTable.writeFuncId(FuncId);
+ }
+
+ return recordTypeIndexForDINode(SP, TI);
+}
+
+TypeIndex CodeViewDebug::getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class) {
+ // Always use the method declaration as the key for the function type. The
+ // method declaration contains the this adjustment.
+ if (SP->getDeclaration())
+ SP = SP->getDeclaration();
+ assert(!SP->getDeclaration() && "should use declaration as key");
+
+ // Key the MemberFunctionRecord into the map as {SP, Class}. It won't collide
+ // with the MemberFuncIdRecord, which is keyed in as {SP, nullptr}.
+ auto I = TypeIndices.find({SP, Class});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Make sure complete type info for the class is emitted *after* the member
+ // function type, as the complete class type is likely to reference this
+ // member function type.
+ TypeLoweringScope S(*this);
+ TypeIndex TI =
+ lowerTypeMemberFunction(SP->getType(), Class, SP->getThisAdjustment());
+ return recordTypeIndexForDINode(SP, TI, Class);
+}
+
+TypeIndex CodeViewDebug::recordTypeIndexForDINode(const DINode *Node,
+ TypeIndex TI,
+ const DIType *ClassTy) {
+ auto InsertResult = TypeIndices.insert({{Node, ClassTy}, TI});
+ (void)InsertResult;
+ assert(InsertResult.second && "DINode was already assigned a type index");
+ return TI;
+}
+
+unsigned CodeViewDebug::getPointerSizeInBytes() {
+ return MMI->getModule()->getDataLayout().getPointerSizeInBits() / 8;
+}
+
+void CodeViewDebug::recordLocalVariable(LocalVariable &&Var,
+ const DILocation *InlinedAt) {
+ if (InlinedAt) {
+ // This variable was inlined. Associate it with the InlineSite.
+ const DISubprogram *Inlinee = Var.DIVar->getScope()->getSubprogram();
+ InlineSite &Site = getInlineSite(InlinedAt, Inlinee);
+ Site.InlinedLocals.emplace_back(Var);
+ } else {
+ // This variable goes in the main ProcSym.
+ CurFn->Locals.emplace_back(Var);
+ }
+}
+
+static void addLocIfNotPresent(SmallVectorImpl<const DILocation *> &Locs,
+ const DILocation *Loc) {
+ auto B = Locs.begin(), E = Locs.end();
+ if (std::find(B, E, Loc) == E)
+ Locs.push_back(Loc);
+}
+
+void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
+ const MachineFunction *MF) {
+ // Skip this instruction if it has the same location as the previous one.
+ if (DL == CurFn->LastLoc)
+ return;
+
+ const DIScope *Scope = DL.get()->getScope();
+ if (!Scope)
+ return;
+
+ // Skip this line if its number is larger than the maximum we can record.
+ LineInfo LI(DL.getLine(), DL.getLine(), /*IsStatement=*/true);
+ if (LI.getStartLine() != DL.getLine() || LI.isAlwaysStepInto() ||
+ LI.isNeverStepInto())
+ return;
+
+ ColumnInfo CI(DL.getCol(), /*EndColumn=*/0);
+ if (CI.getStartColumn() != DL.getCol())
+ return;
+
+ if (!CurFn->HaveLineInfo)
+ CurFn->HaveLineInfo = true;
+ unsigned FileId = 0;
+ if (CurFn->LastLoc.get() && CurFn->LastLoc->getFile() == DL->getFile())
+ FileId = CurFn->LastFileId;
+ else
+ FileId = CurFn->LastFileId = maybeRecordFile(DL->getFile());
+ CurFn->LastLoc = DL;
+
+ unsigned FuncId = CurFn->FuncId;
+ if (const DILocation *SiteLoc = DL->getInlinedAt()) {
+ const DILocation *Loc = DL.get();
+
+ // If this location was actually inlined from somewhere else, give it the ID
+ // of the inline call site.
+ FuncId =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram()).SiteFuncId;
+
+ // Ensure we have links in the tree of inline call sites.
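+ // For example, if A was inlined into B and B into this function, B's call
+ // site becomes a direct child of the function and A's call site a child of
+ // B's inline site.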
+ bool FirstLoc = true;
+ while ((SiteLoc = Loc->getInlinedAt())) {
+ InlineSite &Site =
+ getInlineSite(SiteLoc, Loc->getScope()->getSubprogram());
+ if (!FirstLoc)
+ addLocIfNotPresent(Site.ChildSites, Loc);
+ FirstLoc = false;
+ Loc = SiteLoc;
+ }
+ addLocIfNotPresent(CurFn->ChildSites, Loc);
+ }
+
+ OS.EmitCVLocDirective(FuncId, FileId, DL.getLine(), DL.getCol(),
+ /*PrologueEnd=*/false,
+ /*IsStmt=*/false, DL->getFilename());
+}
+
+void CodeViewDebug::emitCodeViewMagicVersion() {
+ OS.EmitValueToAlignment(4);
+ OS.AddComment("Debug section magic");
+ OS.EmitIntValue(COFF::DEBUG_SECTION_MAGIC, 4);
+}
+
+void CodeViewDebug::endModule() {
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ assert(Asm != nullptr);
+
+ // The COFF .debug$S section consists of several subsections, each starting
+ // with a 4-byte control code (e.g. 0xF1, 0xF2, etc) and then a 4-byte length
+ // of the payload followed by the payload itself. The subsections are 4-byte
+ // aligned.
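+ // Schematically:
+ //   0x4                                     (debug section magic, once)
+ //   [kind: 4 bytes][length: 4 bytes][payload...]
+ //   <pad to 4-byte alignment>
+ //   [kind][length][payload...] ...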
+
+ // Use the generic .debug$S section, and make a subsection for all the inlined
+ // subprograms.
+ switchToDebugSectionForSymbol(nullptr);
+ emitInlineeLinesSubsection();
+
+ // Emit per-function debug information.
+ for (auto &P : FnDebugInfo)
+ if (!P.first->isDeclarationForLinker())
+ emitDebugInfoForFunction(P.first, P.second);
+
+ // Emit global variable debug information.
+ setCurrentSubprogram(nullptr);
+ emitDebugInfoForGlobals();
+
+ // Emit retained types.
+ emitDebugInfoForRetainedTypes();
+
+ // Switch back to the generic .debug$S section after potentially processing
+ // comdat symbol sections.
+ switchToDebugSectionForSymbol(nullptr);
+
+ // Emit UDT records for any types used by global variables.
+ if (!GlobalUDTs.empty()) {
+ MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ emitDebugInfoForUDTs(GlobalUDTs);
+ endCVSubsection(SymbolsEnd);
+ }
+
+ // This subsection holds the mapping from file index to offset in the string
+ // table.
+ OS.AddComment("File index to string table offset subsection");
+ OS.EmitCVFileChecksumsDirective();
+
+ // This subsection holds the string table.
+ OS.AddComment("String table");
+ OS.EmitCVStringTableDirective();
+
+ // Emit type information last, so that any types we translate while emitting
+ // function info are included.
+ emitTypeInformation();
+
+ clear();
+}
+
+static void emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S) {
+ // Microsoft's linker seems to have trouble with symbol names longer than
+ // 0xffd8 bytes.
+ S = S.substr(0, 0xffd8);
+ SmallString<32> NullTerminatedString(S);
+ NullTerminatedString.push_back('\0');
+ OS.EmitBytes(NullTerminatedString);
+}
+
+void CodeViewDebug::emitTypeInformation() {
+ // Do nothing if we have no debug info or if no non-trivial types were emitted
+ // to TypeTable during codegen.
+ NamedMDNode *CU_Nodes = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ if (!CU_Nodes)
+ return;
+ if (TypeTable.empty())
+ return;
+
+ // Start the .debug$T section with 0x4.
+ OS.SwitchSection(Asm->getObjFileLowering().getCOFFDebugTypesSection());
+ emitCodeViewMagicVersion();
+
+ SmallString<8> CommentPrefix;
+ if (OS.isVerboseAsm()) {
+ CommentPrefix += '\t';
+ CommentPrefix += Asm->MAI->getCommentString();
+ CommentPrefix += ' ';
+ }
+
+ CVTypeDumper CVTD(nullptr, /*PrintRecordBytes=*/false);
+ TypeTable.ForEachRecord(
+ [&](TypeIndex Index, StringRef Record) {
+ if (OS.isVerboseAsm()) {
+ // Emit a block comment describing the type record for readability.
+ SmallString<512> CommentBlock;
+ raw_svector_ostream CommentOS(CommentBlock);
+ ScopedPrinter SP(CommentOS);
+ SP.setPrefix(CommentPrefix);
+ CVTD.setPrinter(&SP);
+ Error E = CVTD.dump({Record.bytes_begin(), Record.bytes_end()});
+ if (E) {
+ logAllUnhandledErrors(std::move(E), errs(), "error: ");
+ llvm_unreachable("produced malformed type record");
+ }
+ // emitRawComment will insert its own tab and comment string before
+ // the first line, so strip off our first one. It also prints its own
+ // newline.
+ OS.emitRawComment(
+ CommentOS.str().drop_front(CommentPrefix.size() - 1).rtrim());
+ } else {
+#ifndef NDEBUG
+ // Assert that the type data is valid even if we aren't dumping
+ // comments. The MSVC linker doesn't do much type record validation,
+ // so the first link of an invalid type record can succeed while
+ // subsequent links will fail with LNK1285.
+ ByteStream<> Stream({Record.bytes_begin(), Record.bytes_end()});
+ CVTypeArray Types;
+ StreamReader Reader(Stream);
+ Error E = Reader.readArray(Types, Reader.getLength());
+ if (!E) {
+ TypeVisitorCallbacks C;
+ E = CVTypeVisitor(C).visitTypeStream(Types);
+ }
+ if (E) {
+ logAllUnhandledErrors(std::move(E), errs(), "error: ");
+ llvm_unreachable("produced malformed type record");
+ }
+#endif
+ }
+ OS.EmitBinaryData(Record);
+ });
+}
+
+void CodeViewDebug::emitInlineeLinesSubsection() {
+ if (InlinedSubprograms.empty())
+ return;
+
+ OS.AddComment("Inlinee lines subsection");
+ MCSymbol *InlineEnd = beginCVSubsection(ModuleSubstreamKind::InlineeLines);
+
+ // We don't provide any extra file info.
+ // FIXME: Find out if debuggers use this info.
+ OS.AddComment("Inlinee lines signature");
+ OS.EmitIntValue(unsigned(InlineeLinesSignature::Normal), 4);
+
+ for (const DISubprogram *SP : InlinedSubprograms) {
+ assert(TypeIndices.count({SP, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{SP, nullptr}];
+
+ OS.AddBlankLine();
+ unsigned FileId = maybeRecordFile(SP->getFile());
+ OS.AddComment("Inlined function " + SP->getDisplayName() + " starts at " +
+ SP->getFilename() + Twine(':') + Twine(SP->getLine()));
+ OS.AddBlankLine();
+ // The filechecksum table uses 8 byte entries for now, and file ids start at
+ // 1.
+ unsigned FileOffset = (FileId - 1) * 8;
+ OS.AddComment("Type index of inlined function");
+ OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+ OS.AddComment("Offset into filechecksum table");
+ OS.EmitIntValue(FileOffset, 4);
+ OS.AddComment("Starting line number");
+ OS.EmitIntValue(SP->getLine(), 4);
+ }
+
+ endCVSubsection(InlineEnd);
+}
+
+void CodeViewDebug::collectInlineSiteChildren(
+ SmallVectorImpl<unsigned> &Children, const FunctionInfo &FI,
+ const InlineSite &Site) {
+ for (const DILocation *ChildSiteLoc : Site.ChildSites) {
+ auto I = FI.InlineSites.find(ChildSiteLoc);
+ const InlineSite &ChildSite = I->second;
+ Children.push_back(ChildSite.SiteFuncId);
+ collectInlineSiteChildren(Children, FI, ChildSite);
+ }
+}
+
+void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
+ const DILocation *InlinedAt,
+ const InlineSite &Site) {
+ MCSymbol *InlineBegin = MMI->getContext().createTempSymbol(),
+ *InlineEnd = MMI->getContext().createTempSymbol();
+
+ assert(TypeIndices.count({Site.Inlinee, nullptr}));
+ TypeIndex InlineeIdx = TypeIndices[{Site.Inlinee, nullptr}];
+
+ // SymbolRecord
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(InlineEnd, InlineBegin, 2); // RecordLength
+ OS.EmitLabel(InlineBegin);
+ OS.AddComment("Record kind: S_INLINESITE");
+ OS.EmitIntValue(SymbolKind::S_INLINESITE, 2); // RecordKind
+
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Inlinee type index");
+ OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+
+ unsigned FileId = maybeRecordFile(Site.Inlinee->getFile());
+ unsigned StartLineNum = Site.Inlinee->getLine();
+ SmallVector<unsigned, 3> SecondaryFuncIds;
+ collectInlineSiteChildren(SecondaryFuncIds, FI, Site);
+
+ OS.EmitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
+ FI.Begin, FI.End, SecondaryFuncIds);
+
+ OS.EmitLabel(InlineEnd);
+
+ emitLocalVariableList(Site.InlinedLocals);
+
+ // Recurse on child inlined call sites before closing the scope.
+ for (const DILocation *ChildSite : Site.ChildSites) {
+ auto I = FI.InlineSites.find(ChildSite);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, ChildSite, I->second);
+ }
+
+ // Close the scope.
+ OS.AddComment("Record length");
+ OS.EmitIntValue(2, 2); // RecordLength
+ OS.AddComment("Record kind: S_INLINESITE_END");
+ OS.EmitIntValue(SymbolKind::S_INLINESITE_END, 2); // RecordKind
+}
+
+void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
+ // If we have a symbol, it may be in a section that is COMDAT. If so, find the
+ // comdat key. A section may be comdat because of -ffunction-sections or
+ // because it is comdat in the IR.
+ MCSectionCOFF *GVSec =
+ GVSym ? dyn_cast<MCSectionCOFF>(&GVSym->getSection()) : nullptr;
+ const MCSymbol *KeySym = GVSec ? GVSec->getCOMDATSymbol() : nullptr;
+
+ MCSectionCOFF *DebugSec = cast<MCSectionCOFF>(
+ Asm->getObjFileLowering().getCOFFDebugSymbolsSection());
+ DebugSec = OS.getContext().getAssociativeCOFFSection(DebugSec, KeySym);
+
+ OS.SwitchSection(DebugSec);
+
+ // Emit the magic version number if this is the first time we've switched to
+ // this section.
+ if (ComdatDebugSections.insert(DebugSec).second)
+ emitCodeViewMagicVersion();
+}
+
+void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
+ FunctionInfo &FI) {
+ // For each function there is a separate subsection
+ // which holds the PC to file:line table.
+ const MCSymbol *Fn = Asm->getSymbol(GV);
+ assert(Fn);
+
+ // Switch to a comdat section, if appropriate.
+ switchToDebugSectionForSymbol(Fn);
+
+ std::string FuncName;
+ auto *SP = GV->getSubprogram();
+ setCurrentSubprogram(SP);
+
+ // If we have a display name, build the fully qualified name by walking the
+ // chain of scopes.
+ if (SP != nullptr && !SP->getDisplayName().empty())
+ FuncName =
+ getFullyQualifiedName(SP->getScope().resolve(), SP->getDisplayName());
+
+ // If our DISubprogram name is empty, use the mangled name.
+ if (FuncName.empty())
+ FuncName = GlobalValue::getRealLinkageName(GV->getName());
+
+ // Emit a symbol subsection, required by VS2012+ to find function boundaries.
+ OS.AddComment("Symbol subsection for " + Twine(FuncName));
+ MCSymbol *SymbolsEnd = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ {
+ MCSymbol *ProcRecordBegin = MMI->getContext().createTempSymbol(),
+ *ProcRecordEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(ProcRecordEnd, ProcRecordBegin, 2);
+ OS.EmitLabel(ProcRecordBegin);
+
+ if (GV->hasLocalLinkage()) {
+ OS.AddComment("Record kind: S_LPROC32_ID");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LPROC32_ID), 2);
+ } else {
+ OS.AddComment("Record kind: S_GPROC32_ID");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GPROC32_ID), 2);
+ }
+
+ // These fields are filled in by tools like CVPACK which run after the fact.
+ OS.AddComment("PtrParent");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrEnd");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("PtrNext");
+ OS.EmitIntValue(0, 4);
+ // This is the important bit that tells the debugger where the function
+ // code is located and how large it is:
+ OS.AddComment("Code size");
+ OS.emitAbsoluteSymbolDiff(FI.End, Fn, 4);
+ OS.AddComment("Offset after prologue");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Offset before epilogue");
+ OS.EmitIntValue(0, 4);
+ OS.AddComment("Function type index");
+ OS.EmitIntValue(getFuncIdForSubprogram(GV->getSubprogram()).getIndex(), 4);
+ OS.AddComment("Function section relative address");
+ OS.EmitCOFFSecRel32(Fn);
+ OS.AddComment("Function section index");
+ OS.EmitCOFFSectionIndex(Fn);
+ OS.AddComment("Flags");
+ OS.EmitIntValue(0, 1);
+ // Emit the function display name as a null-terminated string.
+ OS.AddComment("Function name");
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, FuncName);
+ OS.EmitLabel(ProcRecordEnd);
+
+ emitLocalVariableList(FI.Locals);
+
+ // Emit inlined call site information. Only emit functions inlined directly
+ // into the parent function. We'll emit the other sites recursively as part
+ // of their parent inline site.
+ for (const DILocation *InlinedAt : FI.ChildSites) {
+ auto I = FI.InlineSites.find(InlinedAt);
+ assert(I != FI.InlineSites.end() &&
+ "child site not in function inline site map");
+ emitInlinedCallSite(FI, InlinedAt, I->second);
+ }
+
+ if (SP != nullptr)
+ emitDebugInfoForUDTs(LocalUDTs);
+
+ // We're done with this function.
+ OS.AddComment("Record length");
+ OS.EmitIntValue(0x0002, 2);
+ OS.AddComment("Record kind: S_PROC_ID_END");
+ OS.EmitIntValue(unsigned(SymbolKind::S_PROC_ID_END), 2);
+ }
+ endCVSubsection(SymbolsEnd);
+
+ // We have an assembler directive that takes care of the whole line table.
+ OS.EmitCVLinetableDirective(FI.FuncId, Fn, FI.End);
+}
+
+CodeViewDebug::LocalVarDefRange
+CodeViewDebug::createDefRangeMem(uint16_t CVRegister, int Offset) {
+ LocalVarDefRange DR;
+ DR.InMemory = -1;
+ DR.DataOffset = Offset;
+ assert(DR.DataOffset == Offset && "truncation");
+ DR.StructOffset = 0;
+ DR.CVRegister = CVRegister;
+ return DR;
+}
+
+CodeViewDebug::LocalVarDefRange
+CodeViewDebug::createDefRangeReg(uint16_t CVRegister) {
+ LocalVarDefRange DR;
+ DR.InMemory = 0;
+ DR.DataOffset = 0;
+ DR.StructOffset = 0;
+ DR.CVRegister = CVRegister;
+ return DR;
+}
+
+void CodeViewDebug::collectVariableInfoFromMMITable(
+ DenseSet<InlinedVariable> &Processed) {
+ const TargetSubtargetInfo &TSI = Asm->MF->getSubtarget();
+ const TargetFrameLowering *TFI = TSI.getFrameLowering();
+ const TargetRegisterInfo *TRI = TSI.getRegisterInfo();
+
+ for (const MachineModuleInfo::VariableDbgInfo &VI :
+ MMI->getVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
+ "Expected inlined-at fields to agree");
+
+ Processed.insert(InlinedVariable(VI.Var, VI.Loc->getInlinedAt()));
+ LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
+
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ // Get the frame register used and the offset.
+ unsigned FrameReg = 0;
+ int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
+ uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
+
+ // Calculate the label ranges.
+ LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset);
+ for (const InsnRange &Range : Scope->getRanges()) {
+ const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
+ const MCSymbol *End = getLabelAfterInsn(Range.second);
+ End = End ? End : Asm->getFunctionEnd();
+ DefRange.Ranges.emplace_back(Begin, End);
+ }
+
+ LocalVariable Var;
+ Var.DIVar = VI.Var;
+ Var.DefRanges.emplace_back(std::move(DefRange));
+ recordLocalVariable(std::move(Var), VI.Loc->getInlinedAt());
+ }
+}
+
+void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
+ DenseSet<InlinedVariable> Processed;
+ // Grab the variable info that was squirreled away in the MMI side-table.
+ collectVariableInfoFromMMITable(Processed);
+
+ const TargetRegisterInfo *TRI = Asm->MF->getSubtarget().getRegisterInfo();
+
+ for (const auto &I : DbgValues) {
+ InlinedVariable IV = I.first;
+ if (Processed.count(IV))
+ continue;
+ const DILocalVariable *DIVar = IV.first;
+ const DILocation *InlinedAt = IV.second;
+
+ // Instruction ranges, specifying where IV is accessible.
+ const auto &Ranges = I.second;
+
+ LexicalScope *Scope = nullptr;
+ if (InlinedAt)
+ Scope = LScopes.findInlinedScope(DIVar->getScope(), InlinedAt);
+ else
+ Scope = LScopes.findLexicalScope(DIVar->getScope());
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ LocalVariable Var;
+ Var.DIVar = DIVar;
+
+ // Calculate the definition ranges.
+ for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
+ const InsnRange &Range = *I;
+ const MachineInstr *DVInst = Range.first;
+ assert(DVInst->isDebugValue() && "Invalid History entry");
+ const DIExpression *DIExpr = DVInst->getDebugExpression();
+
+ // For now, bail if there is a complex DWARF expression.
+ if (DIExpr && DIExpr->getNumElements() > 0)
+ continue;
+
+ // Bail if operand 0 is not a valid register. This means the variable is a
+ // simple constant, or is described by a complex expression.
+ // FIXME: Find a way to represent constant variables, since they are
+ // relatively common.
+ unsigned Reg =
+ DVInst->getOperand(0).isReg() ? DVInst->getOperand(0).getReg() : 0;
+ if (Reg == 0)
+ continue;
+
+ // Handle the two cases we support: indirect in memory and in register.
+ bool IsIndirect = DVInst->getOperand(1).isImm();
+ unsigned CVReg = TRI->getCodeViewRegNum(DVInst->getOperand(0).getReg());
+ {
+ LocalVarDefRange DefRange;
+ if (IsIndirect) {
+ int64_t Offset = DVInst->getOperand(1).getImm();
+ DefRange = createDefRangeMem(CVReg, Offset);
+ } else {
+ DefRange = createDefRangeReg(CVReg);
+ }
+ if (Var.DefRanges.empty() ||
+ Var.DefRanges.back().isDifferentLocation(DefRange)) {
+ Var.DefRanges.emplace_back(std::move(DefRange));
+ }
+ }
+
+ // Compute the label range.
+ const MCSymbol *Begin = getLabelBeforeInsn(Range.first);
+ const MCSymbol *End = getLabelAfterInsn(Range.second);
+ if (!End) {
+ if (std::next(I) != E)
+ End = getLabelBeforeInsn(std::next(I)->first);
+ else
+ End = Asm->getFunctionEnd();
+ }
+
+ // If the last range end is our begin, just extend the last range.
+ // Otherwise make a new range.
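+ // e.g. ranges [B0,E0] and [E0,E1] are merged into the single range [B0,E1].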
+ SmallVectorImpl<std::pair<const MCSymbol *, const MCSymbol *>> &Ranges =
+ Var.DefRanges.back().Ranges;
+ if (!Ranges.empty() && Ranges.back().second == Begin)
+ Ranges.back().second = End;
+ else
+ Ranges.emplace_back(Begin, End);
+
+ // FIXME: Do more range combining.
+ }
+
+ recordLocalVariable(std::move(Var), InlinedAt);
+ }
+}
+
+void CodeViewDebug::beginFunction(const MachineFunction *MF) {
+ assert(!CurFn && "Can't process two functions at once!");
+
+ if (!Asm || !MMI->hasDebugInfo())
+ return;
+
+ DebugHandlerBase::beginFunction(MF);
+
+ const Function *GV = MF->getFunction();
+ assert(FnDebugInfo.count(GV) == false);
+ CurFn = &FnDebugInfo[GV];
+ CurFn->FuncId = NextFuncId++;
+ CurFn->Begin = Asm->getFunctionBegin();
+
+ // Find the end of the function prolog. The first known non-DBG_VALUE and
+ // non-frame-setup location marks the beginning of the function body.
+ // FIXME: is there a simpler way to do this? Can we just search
+ // for the first instruction of the function, not the last of the prolog?
+ DebugLoc PrologEndLoc;
+ bool EmptyPrologue = true;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ MI.getDebugLoc()) {
+ PrologEndLoc = MI.getDebugLoc();
+ break;
+ } else if (!MI.isDebugValue()) {
+ EmptyPrologue = false;
+ }
+ }
+ }
+
+ // Record beginning of function if we have a non-empty prologue.
+ if (PrologEndLoc && !EmptyPrologue) {
+ DebugLoc FnStartDL = PrologEndLoc.getFnDebugLoc();
+ maybeRecordLocation(FnStartDL, MF);
+ }
+}
+
+void CodeViewDebug::addToUDTs(const DIType *Ty, TypeIndex TI) {
+ // Don't record empty UDTs.
+ if (Ty->getName().empty())
+ return;
+
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ const DISubprogram *ClosestSubprogram = getQualifiedNameComponents(
+ Ty->getScope().resolve(), QualifiedNameComponents);
+
+ std::string FullyQualifiedName =
+ getQualifiedName(QualifiedNameComponents, getPrettyScopeName(Ty));
+
+ if (ClosestSubprogram == nullptr)
+ GlobalUDTs.emplace_back(std::move(FullyQualifiedName), TI);
+ else if (ClosestSubprogram == CurrentSubprogram)
+ LocalUDTs.emplace_back(std::move(FullyQualifiedName), TI);
+
+ // TODO: What if the ClosestSubprogram is neither null nor the current
+ // subprogram? Currently, the UDT just gets dropped on the floor.
+ //
+ // The current behavior is not desirable. To get maximal fidelity, we would
+ // need to perform all type translation before beginning emission of .debug$S
+ // and then make LocalUDTs a member of FunctionInfo.
+}
+
+TypeIndex CodeViewDebug::lowerType(const DIType *Ty, const DIType *ClassTy) {
+ // Generic dispatch for lowering an unknown type.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_array_type:
+ return lowerTypeArray(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_typedef:
+ return lowerTypeAlias(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_base_type:
+ return lowerTypeBasic(cast<DIBasicType>(Ty));
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_rvalue_reference_type:
+ return lowerTypePointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_ptr_to_member_type:
+ return lowerTypeMemberPointer(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_volatile_type:
+ return lowerTypeModifier(cast<DIDerivedType>(Ty));
+ case dwarf::DW_TAG_subroutine_type:
+ if (ClassTy) {
+ // The member function type of a member function pointer has no
+ // ThisAdjustment.
+ return lowerTypeMemberFunction(cast<DISubroutineType>(Ty), ClassTy,
+ /*ThisAdjustment=*/0);
+ }
+ return lowerTypeFunction(cast<DISubroutineType>(Ty));
+ case dwarf::DW_TAG_enumeration_type:
+ return lowerTypeEnum(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ return lowerTypeClass(cast<DICompositeType>(Ty));
+ case dwarf::DW_TAG_union_type:
+ return lowerTypeUnion(cast<DICompositeType>(Ty));
+ default:
+ // Use the null type index.
+ return TypeIndex();
+ }
+}
+
+TypeIndex CodeViewDebug::lowerTypeAlias(const DIDerivedType *Ty) {
+ DITypeRef UnderlyingTypeRef = Ty->getBaseType();
+ TypeIndex UnderlyingTypeIndex = getTypeIndex(UnderlyingTypeRef);
+ StringRef TypeName = Ty->getName();
+
+ addToUDTs(Ty, UnderlyingTypeIndex);
+
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::Int32Long) &&
+ TypeName == "HRESULT")
+ return TypeIndex(SimpleTypeKind::HResult);
+ if (UnderlyingTypeIndex == TypeIndex(SimpleTypeKind::UInt16Short) &&
+ TypeName == "wchar_t")
+ return TypeIndex(SimpleTypeKind::WideCharacter);
+
+ return UnderlyingTypeIndex;
+}
+
+TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
+ DITypeRef ElementTypeRef = Ty->getBaseType();
+ TypeIndex ElementTypeIndex = getTypeIndex(ElementTypeRef);
+ // IndexType is size_t, which depends on the bitness of the target.
+ TypeIndex IndexType = Asm->MAI->getPointerSize() == 8
+ ? TypeIndex(SimpleTypeKind::UInt64Quad)
+ : TypeIndex(SimpleTypeKind::UInt32Long);
+
+ uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8;
+
+ bool UndefinedSubrange = false;
+
+ // FIXME:
+ // There is a bug in the front-end where an array of a structure, which was
+ // first declared as an incomplete structure, ends up not getting a size
+ // assigned to it (PR28303).
+ // Example:
+ // struct A(*p)[3];
+ // struct A { int f; } a[3];
+ //
+ // This needs to be fixed in the front-end, but in the meantime we don't want
+ // to trigger an assertion because of this.
+ if (Ty->getSizeInBits() == 0) {
+ UndefinedSubrange = true;
+ }
+
+ // Add subranges to array type.
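+ // For example, 'int a[2][3]' (with a 4-byte int) first produces a 12-byte
+ // ArrayRecord for the inner [3] dimension and then a 24-byte ArrayRecord
+ // wrapping it for the full [2][3] array.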
+ DINodeArray Elements = Ty->getElements();
+ for (int i = Elements.size() - 1; i >= 0; --i) {
+ const DINode *Element = Elements[i];
+ assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
+
+ const DISubrange *Subrange = cast<DISubrange>(Element);
+ assert(Subrange->getLowerBound() == 0 &&
+ "codeview doesn't support subranges with lower bounds");
+ int64_t Count = Subrange->getCount();
+
+ // A variable-length array (VLA) has Count equal to -1.
+ // Replace it with Count 1, assuming that is the minimum VLA length.
+ // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU.
+ if (Count == -1) {
+ Count = 1;
+ UndefinedSubrange = true;
+ }
+
+ StringRef Name = (i == 0) ? Ty->getName() : "";
+ // Update the element size and element type index for subsequent subranges.
+ ElementSize *= Count;
+ ElementTypeIndex = TypeTable.writeArray(
+ ArrayRecord(ElementTypeIndex, IndexType, ElementSize, Name));
+ }
+
+ (void)UndefinedSubrange;
+ assert(UndefinedSubrange || ElementSize == (Ty->getSizeInBits() / 8));
+
+ return ElementTypeIndex;
+}
+
+TypeIndex CodeViewDebug::lowerTypeBasic(const DIBasicType *Ty) {
+ TypeIndex Index;
+ dwarf::TypeKind Kind;
+ uint32_t ByteSize;
+
+ Kind = static_cast<dwarf::TypeKind>(Ty->getEncoding());
+ ByteSize = Ty->getSizeInBits() / 8;
+
+ SimpleTypeKind STK = SimpleTypeKind::None;
+ switch (Kind) {
+ case dwarf::DW_ATE_address:
+ // FIXME: Translate
+ break;
+ case dwarf::DW_ATE_boolean:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Boolean8; break;
+ case 2: STK = SimpleTypeKind::Boolean16; break;
+ case 4: STK = SimpleTypeKind::Boolean32; break;
+ case 8: STK = SimpleTypeKind::Boolean64; break;
+ case 16: STK = SimpleTypeKind::Boolean128; break;
+ }
+ break;
+ case dwarf::DW_ATE_complex_float:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Complex16; break;
+ case 4: STK = SimpleTypeKind::Complex32; break;
+ case 8: STK = SimpleTypeKind::Complex64; break;
+ case 10: STK = SimpleTypeKind::Complex80; break;
+ case 16: STK = SimpleTypeKind::Complex128; break;
+ }
+ break;
+ case dwarf::DW_ATE_float:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Float16; break;
+ case 4: STK = SimpleTypeKind::Float32; break;
+ case 6: STK = SimpleTypeKind::Float48; break;
+ case 8: STK = SimpleTypeKind::Float64; break;
+ case 10: STK = SimpleTypeKind::Float80; break;
+ case 16: STK = SimpleTypeKind::Float128; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::SByte; break;
+ case 2: STK = SimpleTypeKind::Int16Short; break;
+ case 4: STK = SimpleTypeKind::Int32; break;
+ case 8: STK = SimpleTypeKind::Int64Quad; break;
+ case 16: STK = SimpleTypeKind::Int128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_unsigned:
+ switch (ByteSize) {
+ case 1: STK = SimpleTypeKind::Byte; break;
+ case 2: STK = SimpleTypeKind::UInt16Short; break;
+ case 4: STK = SimpleTypeKind::UInt32; break;
+ case 8: STK = SimpleTypeKind::UInt64Quad; break;
+ case 16: STK = SimpleTypeKind::UInt128Oct; break;
+ }
+ break;
+ case dwarf::DW_ATE_UTF:
+ switch (ByteSize) {
+ case 2: STK = SimpleTypeKind::Character16; break;
+ case 4: STK = SimpleTypeKind::Character32; break;
+ }
+ break;
+ case dwarf::DW_ATE_signed_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::SignedCharacter;
+ break;
+ case dwarf::DW_ATE_unsigned_char:
+ if (ByteSize == 1)
+ STK = SimpleTypeKind::UnsignedCharacter;
+ break;
+ default:
+ break;
+ }
+
+ // Apply some fixups based on the source-level type name.
+ if (STK == SimpleTypeKind::Int32 && Ty->getName() == "long int")
+ STK = SimpleTypeKind::Int32Long;
+ if (STK == SimpleTypeKind::UInt32 && Ty->getName() == "long unsigned int")
+ STK = SimpleTypeKind::UInt32Long;
+ if (STK == SimpleTypeKind::UInt16Short &&
+ (Ty->getName() == "wchar_t" || Ty->getName() == "__wchar_t"))
+ STK = SimpleTypeKind::WideCharacter;
+ if ((STK == SimpleTypeKind::SignedCharacter ||
+ STK == SimpleTypeKind::UnsignedCharacter) &&
+ Ty->getName() == "char")
+ STK = SimpleTypeKind::NarrowCharacter;
+
+ return TypeIndex(STK);
+}
+
+TypeIndex CodeViewDebug::lowerTypePointer(const DIDerivedType *Ty) {
+ TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType());
+
+ // While processing the type being pointed to, it is possible we already
+ // created this pointer type. If so, we check here and return the existing
+ // pointer type.
+ auto I = TypeIndices.find({Ty, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ // Pointers to simple types can use SimpleTypeMode, rather than having a
+ // dedicated pointer type record.
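+ // e.g. 'int *' on a 64-bit target is encoded as
+ // TypeIndex(SimpleTypeKind::Int32, SimpleTypeMode::NearPointer64) rather
+ // than as a separate PointerRecord.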
+ if (PointeeTI.isSimple() &&
+ PointeeTI.getSimpleMode() == SimpleTypeMode::Direct &&
+ Ty->getTag() == dwarf::DW_TAG_pointer_type) {
+ SimpleTypeMode Mode = Ty->getSizeInBits() == 64
+ ? SimpleTypeMode::NearPointer64
+ : SimpleTypeMode::NearPointer32;
+ return TypeIndex(PointeeTI.getSimpleKind(), Mode);
+ }
+
+ PointerKind PK =
+ Ty->getSizeInBits() == 64 ? PointerKind::Near64 : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ switch (Ty->getTag()) {
+ default: llvm_unreachable("not a pointer tag type");
+ case dwarf::DW_TAG_pointer_type:
+ PM = PointerMode::Pointer;
+ break;
+ case dwarf::DW_TAG_reference_type:
+ PM = PointerMode::LValueReference;
+ break;
+ case dwarf::DW_TAG_rvalue_reference_type:
+ PM = PointerMode::RValueReference;
+ break;
+ }
+ // FIXME: MSVC folds qualifiers into PointerOptions in the context of a method
+ // 'this' pointer, but not in normal contexts. Figure out what we're supposed to
+ // do.
+ PointerOptions PO = PointerOptions::None;
+ PointerRecord PR(PointeeTI, PK, PM, PO, Ty->getSizeInBits() / 8);
+ return TypeTable.writePointer(PR);
+}
+
+static PointerToMemberRepresentation
+translatePtrToMemberRep(unsigned SizeInBytes, bool IsPMF, unsigned Flags) {
+ // SizeInBytes being zero generally implies that the member pointer type was
+ // incomplete, which can happen if it is part of a function prototype. In this
+ // case, use the unknown model instead of the general model.
+ if (IsPMF) {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralFunction;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceFunction;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceFunction;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceFunction;
+ }
+ } else {
+ switch (Flags & DINode::FlagPtrToMemberRep) {
+ case 0:
+ return SizeInBytes == 0 ? PointerToMemberRepresentation::Unknown
+ : PointerToMemberRepresentation::GeneralData;
+ case DINode::FlagSingleInheritance:
+ return PointerToMemberRepresentation::SingleInheritanceData;
+ case DINode::FlagMultipleInheritance:
+ return PointerToMemberRepresentation::MultipleInheritanceData;
+ case DINode::FlagVirtualInheritance:
+ return PointerToMemberRepresentation::VirtualInheritanceData;
+ }
+ }
+ llvm_unreachable("invalid ptr to member representation");
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty) {
+ assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
+ TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
+ TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType());
+ PointerKind PK = Asm->MAI->getPointerSize() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
+ PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
+ : PointerMode::PointerToDataMember;
+ PointerOptions PO = PointerOptions::None; // FIXME
+ assert(Ty->getSizeInBits() / 8 <= 0xff && "pointer size too big");
+ uint8_t SizeInBytes = Ty->getSizeInBits() / 8;
+ MemberPointerInfo MPI(
+ ClassTI, translatePtrToMemberRep(SizeInBytes, IsPMF, Ty->getFlags()));
+ PointerRecord PR(PointeeTI, PK, PM, PO, SizeInBytes, MPI);
+ return TypeTable.writePointer(PR);
+}
+
+/// Given a DWARF calling convention, get the CodeView equivalent. If we don't
+/// have a translation, use the NearC convention.
+static CallingConvention dwarfCCToCodeView(unsigned DwarfCC) {
+ switch (DwarfCC) {
+ case dwarf::DW_CC_normal: return CallingConvention::NearC;
+ case dwarf::DW_CC_BORLAND_msfastcall: return CallingConvention::NearFast;
+ case dwarf::DW_CC_BORLAND_thiscall: return CallingConvention::ThisCall;
+ case dwarf::DW_CC_BORLAND_stdcall: return CallingConvention::NearStdCall;
+ case dwarf::DW_CC_BORLAND_pascal: return CallingConvention::NearPascal;
+ case dwarf::DW_CC_LLVM_vectorcall: return CallingConvention::NearVector;
+ }
+ return CallingConvention::NearC;
+}
+
+TypeIndex CodeViewDebug::lowerTypeModifier(const DIDerivedType *Ty) {
+ ModifierOptions Mods = ModifierOptions::None;
+ bool IsModifier = true;
+ const DIType *BaseTy = Ty;
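+ // Fold a chain of const/volatile nodes into a single modifier record, e.g.
+ // 'const volatile int' becomes one ModifierRecord(Const | Volatile) wrapping
+ // the type index for 'int'.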
+ while (IsModifier && BaseTy) {
+ // FIXME: Need to add DWARF tag for __unaligned.
+ switch (BaseTy->getTag()) {
+ case dwarf::DW_TAG_const_type:
+ Mods |= ModifierOptions::Const;
+ break;
+ case dwarf::DW_TAG_volatile_type:
+ Mods |= ModifierOptions::Volatile;
+ break;
+ default:
+ IsModifier = false;
+ break;
+ }
+ if (IsModifier)
+ BaseTy = cast<DIDerivedType>(BaseTy)->getBaseType().resolve();
+ }
+ TypeIndex ModifiedTI = getTypeIndex(BaseTy);
+
+ // While processing the type being modified, it is possible we already
+ // created this modifier type. If so, we check here and return the existing
+ // modifier type.
+ auto I = TypeIndices.find({Ty, nullptr});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ ModifierRecord MR(ModifiedTI, Mods);
+ return TypeTable.writeModifier(MR);
+}
+
+TypeIndex CodeViewDebug::lowerTypeFunction(const DISubroutineType *Ty) {
+ SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
+ for (DITypeRef ArgTypeRef : Ty->getTypeArray())
+ ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ ArrayRef<TypeIndex> ArgTypeIndices = None;
+ if (!ReturnAndArgTypeIndices.empty()) {
+ auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices);
+ ReturnTypeIndex = ReturnAndArgTypesRef.front();
+ ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
+ }
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeArgList(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ ProcedureRecord Procedure(ReturnTypeIndex, CC, FunctionOptions::None,
+ ArgTypeIndices.size(), ArgListIndex);
+ return TypeTable.writeProcedure(Procedure);
+}
+
+TypeIndex CodeViewDebug::lowerTypeMemberFunction(const DISubroutineType *Ty,
+ const DIType *ClassTy,
+ int ThisAdjustment) {
+ // Lower the containing class type.
+ TypeIndex ClassType = getTypeIndex(ClassTy);
+
+ SmallVector<TypeIndex, 8> ReturnAndArgTypeIndices;
+ for (DITypeRef ArgTypeRef : Ty->getTypeArray())
+ ReturnAndArgTypeIndices.push_back(getTypeIndex(ArgTypeRef));
+
+ TypeIndex ReturnTypeIndex = TypeIndex::Void();
+ ArrayRef<TypeIndex> ArgTypeIndices = None;
+ if (!ReturnAndArgTypeIndices.empty()) {
+ auto ReturnAndArgTypesRef = makeArrayRef(ReturnAndArgTypeIndices);
+ ReturnTypeIndex = ReturnAndArgTypesRef.front();
+ ArgTypeIndices = ReturnAndArgTypesRef.drop_front();
+ }
+ TypeIndex ThisTypeIndex = TypeIndex::Void();
+ if (!ArgTypeIndices.empty()) {
+ ThisTypeIndex = ArgTypeIndices.front();
+ ArgTypeIndices = ArgTypeIndices.drop_front();
+ }
+
+ ArgListRecord ArgListRec(TypeRecordKind::ArgList, ArgTypeIndices);
+ TypeIndex ArgListIndex = TypeTable.writeArgList(ArgListRec);
+
+ CallingConvention CC = dwarfCCToCodeView(Ty->getCC());
+
+ // TODO: Need to use the correct values for:
+ // FunctionOptions
+ // ThisPointerAdjustment.
+ TypeIndex TI = TypeTable.writeMemberFunction(MemberFunctionRecord(
+ ReturnTypeIndex, ClassType, ThisTypeIndex, CC, FunctionOptions::None,
+ ArgTypeIndices.size(), ArgListIndex, ThisAdjustment));
+
+ return TI;
+}
+
+static MemberAccess translateAccessFlags(unsigned RecordTag, unsigned Flags) {
+ switch (Flags & DINode::FlagAccessibility) {
+ case DINode::FlagPrivate: return MemberAccess::Private;
+ case DINode::FlagPublic: return MemberAccess::Public;
+ case DINode::FlagProtected: return MemberAccess::Protected;
+ case 0:
+ // If there was no explicit access control, provide the default for the tag.
+ return RecordTag == dwarf::DW_TAG_class_type ? MemberAccess::Private
+ : MemberAccess::Public;
+ }
+ llvm_unreachable("access flags are exclusive");
+}
+
+static MethodOptions translateMethodOptionFlags(const DISubprogram *SP) {
+ if (SP->isArtificial())
+ return MethodOptions::CompilerGenerated;
+
+ // FIXME: Handle other MethodOptions.
+
+ return MethodOptions::None;
+}
+
+static MethodKind translateMethodKindFlags(const DISubprogram *SP,
+ bool Introduced) {
+ switch (SP->getVirtuality()) {
+ case dwarf::DW_VIRTUALITY_none:
+ break;
+ case dwarf::DW_VIRTUALITY_virtual:
+ return Introduced ? MethodKind::IntroducingVirtual : MethodKind::Virtual;
+ case dwarf::DW_VIRTUALITY_pure_virtual:
+ return Introduced ? MethodKind::PureIntroducingVirtual
+ : MethodKind::PureVirtual;
+ default:
+ llvm_unreachable("unhandled virtuality case");
+ }
+
+ // FIXME: Get Clang to mark DISubprogram as static and do something with it.
+
+ return MethodKind::Vanilla;
+}
+
+static TypeRecordKind getRecordKind(const DICompositeType *Ty) {
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type: return TypeRecordKind::Class;
+ case dwarf::DW_TAG_structure_type: return TypeRecordKind::Struct;
+ }
+ llvm_unreachable("unexpected tag");
+}
+
+/// Return ClassOptions that should be present on both the forward declaration
+/// and the definition of a tag type.
+static ClassOptions getCommonClassOptions(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::None;
+
+ // MSVC always sets this flag, even for local types. Clang doesn't always
+ // appear to give every type a linkage name, which may be problematic for us.
+ // FIXME: Investigate the consequences of not following them here.
+ if (!Ty->getIdentifier().empty())
+ CO |= ClassOptions::HasUniqueName;
+
+ // Put the Nested flag on a type if it appears immediately inside a tag type.
+ // Do not walk the scope chain. Do not attempt to compute ContainsNestedClass
+ // here. That flag is only set on definitions, and not forward declarations.
+ const DIScope *ImmediateScope = Ty->getScope().resolve();
+ if (ImmediateScope && isa<DICompositeType>(ImmediateScope))
+ CO |= ClassOptions::Nested;
+
+ // Put the Scoped flag on function-local types.
+ for (const DIScope *Scope = ImmediateScope; Scope != nullptr;
+ Scope = Scope->getScope().resolve()) {
+ if (isa<DISubprogram>(Scope)) {
+ CO |= ClassOptions::Scoped;
+ break;
+ }
+ }
+
+ return CO;
+}
+
+TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FTI;
+ unsigned EnumeratorCount = 0;
+
+ if (Ty->isForwardDecl()) {
+ CO |= ClassOptions::ForwardReference;
+ } else {
+ FieldListRecordBuilder Fields;
+ for (const DINode *Element : Ty->getElements()) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (auto *Enumerator = dyn_cast_or_null<DIEnumerator>(Element)) {
+ Fields.writeEnumerator(EnumeratorRecord(
+ MemberAccess::Public, APSInt::getUnsigned(Enumerator->getValue()),
+ Enumerator->getName()));
+ EnumeratorCount++;
+ }
+ }
+ FTI = TypeTable.writeFieldList(Fields);
+ }
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ return TypeTable.writeEnum(EnumRecord(EnumeratorCount, CO, FTI, FullName,
+ Ty->getIdentifier(),
+ getTypeIndex(Ty->getBaseType())));
+}
+
+//===----------------------------------------------------------------------===//
+// ClassInfo
+//===----------------------------------------------------------------------===//
+
+struct llvm::ClassInfo {
+ struct MemberInfo {
+ const DIDerivedType *MemberTypeNode;
+ uint64_t BaseOffset;
+ };
+ // [MemberInfo]
+ typedef std::vector<MemberInfo> MemberList;
+
+ typedef TinyPtrVector<const DISubprogram *> MethodsList;
+ // MethodName -> MethodsList
+ typedef MapVector<MDString *, MethodsList> MethodsMap;
+
+ /// Base classes.
+ std::vector<const DIDerivedType *> Inheritance;
+
+ /// Direct members.
+ MemberList Members;
+ // Direct overloaded methods gathered by name.
+ MethodsMap Methods;
+
+ std::vector<const DICompositeType *> NestedClasses;
+};
+
+void CodeViewDebug::clear() {
+ assert(CurFn == nullptr);
+ FileIdMap.clear();
+ FnDebugInfo.clear();
+ FileToFilepathMap.clear();
+ LocalUDTs.clear();
+ GlobalUDTs.clear();
+ TypeIndices.clear();
+ CompleteTypeIndices.clear();
+}
+
+void CodeViewDebug::collectMemberInfo(ClassInfo &Info,
+ const DIDerivedType *DDTy) {
+ if (!DDTy->getName().empty()) {
+ Info.Members.push_back({DDTy, 0});
+ return;
+ }
+ // An unnamed member must represent a nested struct or union. Add all the
+ // indirect fields to the current record.
+ assert((DDTy->getOffsetInBits() % 8) == 0 && "Unnamed bitfield member!");
+ uint64_t Offset = DDTy->getOffsetInBits();
+ const DIType *Ty = DDTy->getBaseType().resolve();
+ const DICompositeType *DCTy = cast<DICompositeType>(Ty);
+ ClassInfo NestedInfo = collectClassInfo(DCTy);
+ for (const ClassInfo::MemberInfo &IndirectField : NestedInfo.Members)
+ Info.Members.push_back(
+ {IndirectField.MemberTypeNode, IndirectField.BaseOffset + Offset});
+}
+
+ClassInfo CodeViewDebug::collectClassInfo(const DICompositeType *Ty) {
+ ClassInfo Info;
+ // Add elements to structure type.
+ DINodeArray Elements = Ty->getElements();
+ for (auto *Element : Elements) {
+ // We assume that the frontend provides all members in source declaration
+ // order, which is what MSVC does.
+ if (!Element)
+ continue;
+ if (auto *SP = dyn_cast<DISubprogram>(Element)) {
+ Info.Methods[SP->getRawName()].push_back(SP);
+ } else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
+ if (DDTy->getTag() == dwarf::DW_TAG_member) {
+ collectMemberInfo(Info, DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_inheritance) {
+ Info.Inheritance.push_back(DDTy);
+ } else if (DDTy->getTag() == dwarf::DW_TAG_friend) {
+ // Ignore friend members. It appears that MSVC emitted info about
+ // friends in the past, but modern versions do not.
+ }
+ // FIXME: Get Clang to emit function virtual table here and handle it.
+ } else if (auto *Composite = dyn_cast<DICompositeType>(Element)) {
+ Info.NestedClasses.push_back(Composite);
+ }
+ // Skip other unrecognized kinds of elements.
+ }
+ return Info;
+}
+
+TypeIndex CodeViewDebug::lowerTypeClass(const DICompositeType *Ty) {
+ // First, construct the forward decl. Don't look into Ty to compute the
+ // forward decl options, since it might not be available in all TUs.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ TypeIndex FwdDeclTI = TypeTable.writeClass(ClassRecord(
+ Kind, 0, CO, HfaKind::None, WindowsRTClassKind::None, TypeIndex(),
+ TypeIndex(), TypeIndex(), 0, FullName, Ty->getIdentifier()));
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
+ // Construct the field list and complete type record.
+ TypeRecordKind Kind = getRecordKind(Ty);
+ ClassOptions CO = getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ TypeIndex VShapeTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, VShapeTI, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+
+ TypeIndex ClassTI = TypeTable.writeClass(ClassRecord(
+ Kind, FieldCount, CO, HfaKind::None, WindowsRTClassKind::None, FieldTI,
+ TypeIndex(), VShapeTI, SizeInBytes, FullName, Ty->getIdentifier()));
+
+ TypeTable.writeUdtSourceLine(UdtSourceLineRecord(
+ ClassTI, TypeTable.writeStringId(StringIdRecord(
+ TypeIndex(0x0), getFullFilepath(Ty->getFile()))),
+ Ty->getLine()));
+
+ addToUDTs(Ty, ClassTI);
+
+ return ClassTI;
+}
+
+TypeIndex CodeViewDebug::lowerTypeUnion(const DICompositeType *Ty) {
+ ClassOptions CO =
+ ClassOptions::ForwardReference | getCommonClassOptions(Ty);
+ std::string FullName = getFullyQualifiedName(Ty);
+ TypeIndex FwdDeclTI =
+ TypeTable.writeUnion(UnionRecord(0, CO, HfaKind::None, TypeIndex(), 0,
+ FullName, Ty->getIdentifier()));
+ if (!Ty->isForwardDecl())
+ DeferredCompleteTypes.push_back(Ty);
+ return FwdDeclTI;
+}
+
+TypeIndex CodeViewDebug::lowerCompleteTypeUnion(const DICompositeType *Ty) {
+ ClassOptions CO = ClassOptions::Sealed | getCommonClassOptions(Ty);
+ TypeIndex FieldTI;
+ unsigned FieldCount;
+ bool ContainsNestedClass;
+ std::tie(FieldTI, std::ignore, FieldCount, ContainsNestedClass) =
+ lowerRecordFieldList(Ty);
+
+ if (ContainsNestedClass)
+ CO |= ClassOptions::ContainsNestedClass;
+
+ uint64_t SizeInBytes = Ty->getSizeInBits() / 8;
+ std::string FullName = getFullyQualifiedName(Ty);
+
+ TypeIndex UnionTI = TypeTable.writeUnion(
+ UnionRecord(FieldCount, CO, HfaKind::None, FieldTI, SizeInBytes, FullName,
+ Ty->getIdentifier()));
+
+ TypeTable.writeUdtSourceLine(UdtSourceLineRecord(
+ UnionTI, TypeTable.writeStringId(StringIdRecord(
+ TypeIndex(0x0), getFullFilepath(Ty->getFile()))),
+ Ty->getLine()));
+
+ addToUDTs(Ty, UnionTI);
+
+ return UnionTI;
+}
+
+std::tuple<TypeIndex, TypeIndex, unsigned, bool>
+CodeViewDebug::lowerRecordFieldList(const DICompositeType *Ty) {
+ // Manually count members. MSVC appears to count everything that generates a
+ // field list record. Each individual overload in a method overload group
+ // contributes to this count, even though the overload group is a single field
+ // list record.
+ unsigned MemberCount = 0;
+ ClassInfo Info = collectClassInfo(Ty);
+ FieldListRecordBuilder Fields;
+
+ // Create base classes.
+ for (const DIDerivedType *I : Info.Inheritance) {
+ if (I->getFlags() & DINode::FlagVirtual) {
+ // Virtual base.
+ // FIXME: Emit VBPtrOffset when the frontend provides it.
+ unsigned VBPtrOffset = 0;
+ // FIXME: Despite the accessor name, the offset is really in bytes.
+ unsigned VBTableIndex = I->getOffsetInBits() / 4;
+ Fields.writeVirtualBaseClass(VirtualBaseClassRecord(
+ translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()), getVBPTypeIndex(), VBPtrOffset,
+ VBTableIndex));
+ } else {
+ assert(I->getOffsetInBits() % 8 == 0 &&
+ "bases must be on byte boundaries");
+ Fields.writeBaseClass(BaseClassRecord(
+ translateAccessFlags(Ty->getTag(), I->getFlags()),
+ getTypeIndex(I->getBaseType()), I->getOffsetInBits() / 8));
+ }
+ }
+
+ // Create members.
+ for (ClassInfo::MemberInfo &MemberInfo : Info.Members) {
+ const DIDerivedType *Member = MemberInfo.MemberTypeNode;
+ TypeIndex MemberBaseType = getTypeIndex(Member->getBaseType());
+ StringRef MemberName = Member->getName();
+ MemberAccess Access =
+ translateAccessFlags(Ty->getTag(), Member->getFlags());
+
+ if (Member->isStaticMember()) {
+ Fields.writeStaticDataMember(
+ StaticDataMemberRecord(Access, MemberBaseType, MemberName));
+ MemberCount++;
+ continue;
+ }
+
+ // Data member.
+ uint64_t MemberOffsetInBits =
+ Member->getOffsetInBits() + MemberInfo.BaseOffset;
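+ // For bitfields, getOffsetInBits() is the offset of the field itself, while
+ // the extra storage offset (when present) is the offset of the allocation
+ // unit holding it; the difference below is the start bit within that unit,
+ // which is what the LF_BITFIELD record carries.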
+ if (Member->isBitField()) {
+ uint64_t StartBitOffset = MemberOffsetInBits;
+ if (const auto *CI =
+ dyn_cast_or_null<ConstantInt>(Member->getStorageOffsetInBits())) {
+ MemberOffsetInBits = CI->getZExtValue() + MemberInfo.BaseOffset;
+ }
+ StartBitOffset -= MemberOffsetInBits;
+ MemberBaseType = TypeTable.writeBitField(BitFieldRecord(
+ MemberBaseType, Member->getSizeInBits(), StartBitOffset));
+ }
+ uint64_t MemberOffsetInBytes = MemberOffsetInBits / 8;
+ Fields.writeDataMember(DataMemberRecord(Access, MemberBaseType,
+ MemberOffsetInBytes, MemberName));
+ MemberCount++;
+ }
+
+ // Create methods
+ for (auto &MethodItr : Info.Methods) {
+ StringRef Name = MethodItr.first->getString();
+
+ std::vector<OneMethodRecord> Methods;
+ for (const DISubprogram *SP : MethodItr.second) {
+ TypeIndex MethodType = getMemberFunctionType(SP, Ty);
+ bool Introduced = SP->getFlags() & DINode::FlagIntroducedVirtual;
+
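+ // Only introduced virtual methods carry a vftable offset (in bytes from the
+ // start of the vftable); -1 serves as the "no offset" placeholder for all
+ // other methods.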
+ unsigned VFTableOffset = -1;
+ if (Introduced)
+ VFTableOffset = SP->getVirtualIndex() * getPointerSizeInBytes();
+
+ Methods.push_back(
+ OneMethodRecord(MethodType, translateMethodKindFlags(SP, Introduced),
+ translateMethodOptionFlags(SP),
+ translateAccessFlags(Ty->getTag(), SP->getFlags()),
+ VFTableOffset, Name));
+ MemberCount++;
+ }
+ assert(Methods.size() > 0 && "Empty methods map entry");
+ if (Methods.size() == 1)
+ Fields.writeOneMethod(Methods[0]);
+ else {
+ TypeIndex MethodList =
+ TypeTable.writeMethodOverloadList(MethodOverloadListRecord(Methods));
+ Fields.writeOverloadedMethod(
+ OverloadedMethodRecord(Methods.size(), MethodList, Name));
+ }
+ }
+
+ // Create nested classes.
+ for (const DICompositeType *Nested : Info.NestedClasses) {
+ NestedTypeRecord R(getTypeIndex(DITypeRef(Nested)), Nested->getName());
+ Fields.writeNestedType(R);
+ MemberCount++;
+ }
+
+ TypeIndex FieldTI = TypeTable.writeFieldList(Fields);
+ return std::make_tuple(FieldTI, TypeIndex(), MemberCount,
+ !Info.NestedClasses.empty());
+}
+
+TypeIndex CodeViewDebug::getVBPTypeIndex() {
+ if (!VBPType.getIndex()) {
+ // Make a 'const int *' type.
+ ModifierRecord MR(TypeIndex::Int32(), ModifierOptions::Const);
+ TypeIndex ModifiedTI = TypeTable.writeModifier(MR);
+
+ PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
+ : PointerKind::Near32;
+ PointerMode PM = PointerMode::Pointer;
+ PointerOptions PO = PointerOptions::None;
+ PointerRecord PR(ModifiedTI, PK, PM, PO, getPointerSizeInBytes());
+
+ VBPType = TypeTable.writePointer(PR);
+ }
+
+ return VBPType;
+}
+
+TypeIndex CodeViewDebug::getTypeIndex(DITypeRef TypeRef, DITypeRef ClassTyRef) {
+ const DIType *Ty = TypeRef.resolve();
+ const DIType *ClassTy = ClassTyRef.resolve();
+
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // Check if we've already translated this type. Don't try to do a
+ // get-or-create style insertion that caches the hash lookup across the
+ // lowerType call, since lowering the type may itself update the TypeIndices
+ // map.
+ auto I = TypeIndices.find({Ty, ClassTy});
+ if (I != TypeIndices.end())
+ return I->second;
+
+ TypeLoweringScope S(*this);
+ TypeIndex TI = lowerType(Ty, ClassTy);
+ return recordTypeIndexForDINode(Ty, TI, ClassTy);
+}
+
+TypeIndex CodeViewDebug::getCompleteTypeIndex(DITypeRef TypeRef) {
+ const DIType *Ty = TypeRef.resolve();
+
+ // The null DIType is the void type. Don't try to hash it.
+ if (!Ty)
+ return TypeIndex::Void();
+
+ // If this is a non-record type, the complete type index is the same as the
+ // normal type index. Just call getTypeIndex.
+ switch (Ty->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ break;
+ default:
+ return getTypeIndex(Ty);
+ }
+
+ // Check if we've already translated the complete record type. Lowering a
+ // complete type should never trigger lowering another complete type, so we
+ // can reuse the hash table lookup result.
+ const auto *CTy = cast<DICompositeType>(Ty);
+ auto InsertResult = CompleteTypeIndices.insert({CTy, TypeIndex()});
+ if (!InsertResult.second)
+ return InsertResult.first->second;
+
+ TypeLoweringScope S(*this);
+
+ // Make sure the forward declaration is emitted first. It's unclear if this
+ // is necessary, but MSVC does it, and we should follow suit until we can show
+ // otherwise.
+ TypeIndex FwdDeclTI = getTypeIndex(CTy);
+
+ // Just use the forward decl if we don't have complete type info. This might
+ // happen if the frontend is using modules and expects the complete definition
+ // to be emitted elsewhere.
+ if (CTy->isForwardDecl())
+ return FwdDeclTI;
+
+ TypeIndex TI;
+ switch (CTy->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ TI = lowerCompleteTypeClass(CTy);
+ break;
+ case dwarf::DW_TAG_union_type:
+ TI = lowerCompleteTypeUnion(CTy);
+ break;
+ default:
+ llvm_unreachable("not a record");
+ }
+
+ InsertResult.first->second = TI;
+ return TI;
+}
+
+/// Emit all the deferred complete record types. Try to do this in FIFO order,
+/// and do this until fixpoint, as each complete record type typically
+/// references many other record types.
+void CodeViewDebug::emitDeferredCompleteTypes() {
+ SmallVector<const DICompositeType *, 4> TypesToEmit;
+ while (!DeferredCompleteTypes.empty()) {
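+ // Swap out the current batch so that any types deferred while lowering it
+ // land back in DeferredCompleteTypes and are drained on the next pass.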
+ std::swap(DeferredCompleteTypes, TypesToEmit);
+ for (const DICompositeType *RecordTy : TypesToEmit)
+ getCompleteTypeIndex(RecordTy);
+ TypesToEmit.clear();
+ }
+}
+
+void CodeViewDebug::emitLocalVariableList(ArrayRef<LocalVariable> Locals) {
+ // Get the sorted list of parameters and emit them first.
+ SmallVector<const LocalVariable *, 6> Params;
+ for (const LocalVariable &L : Locals)
+ if (L.DIVar->isParameter())
+ Params.push_back(&L);
+ std::sort(Params.begin(), Params.end(),
+ [](const LocalVariable *L, const LocalVariable *R) {
+ return L->DIVar->getArg() < R->DIVar->getArg();
+ });
+ for (const LocalVariable *L : Params)
+ emitLocalVariable(*L);
+
+ // Next emit all non-parameters in the order that we found them.
+ for (const LocalVariable &L : Locals)
+ if (!L.DIVar->isParameter())
+ emitLocalVariable(L);
+}
+
+void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
+ // LocalSym record, see SymbolRecord.h for more info.
+ MCSymbol *LocalBegin = MMI->getContext().createTempSymbol(),
+ *LocalEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(LocalEnd, LocalBegin, 2);
+ OS.EmitLabel(LocalBegin);
+
+ OS.AddComment("Record kind: S_LOCAL");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LOCAL), 2);
+
+ LocalSymFlags Flags = LocalSymFlags::None;
+ if (Var.DIVar->isParameter())
+ Flags |= LocalSymFlags::IsParameter;
+ if (Var.DefRanges.empty())
+ Flags |= LocalSymFlags::IsOptimizedOut;
+
+ OS.AddComment("TypeIndex");
+ TypeIndex TI = getCompleteTypeIndex(Var.DIVar->getType());
+ OS.EmitIntValue(TI.getIndex(), 4);
+ OS.AddComment("Flags");
+ OS.EmitIntValue(static_cast<uint16_t>(Flags), 2);
+ // Truncate the name so we won't overflow the record length field.
+ emitNullTerminatedSymbolName(OS, Var.DIVar->getName());
+ OS.EmitLabel(LocalEnd);
+
+ // Calculate the on-disk prefix of the appropriate def range record. The
+ // records and on-disk formats are described in SymbolRecord.h. BytePrefix
+ // should be big enough to hold all forms without memory allocation.
+ SmallString<20> BytePrefix;
+ for (const LocalVarDefRange &DefRange : Var.DefRanges) {
+ BytePrefix.clear();
+ // FIXME: Handle bitpieces.
+ if (DefRange.StructOffset != 0)
+ continue;
+
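+ // Two layouts are used: S_DEFRANGE_REGISTER_REL when the variable lives in
+ // memory at [CVRegister + DataOffset], and S_DEFRANGE_REGISTER when it lives
+ // directly in CVRegister. In both cases only the fixed-size header (minus
+ // the trailing LocalVariableAddrRange) goes into BytePrefix; the address
+ // ranges themselves are supplied by the .cv_def_range directive below.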
+ if (DefRange.InMemory) {
+ DefRangeRegisterRelSym Sym(DefRange.CVRegister, 0, DefRange.DataOffset, 0,
+ 0, 0, ArrayRef<LocalVariableAddrGap>());
+ ulittle16_t SymKind = ulittle16_t(S_DEFRANGE_REGISTER_REL);
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&SymKind), sizeof(SymKind));
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&Sym.Header),
+ sizeof(Sym.Header) - sizeof(LocalVariableAddrRange));
+ } else {
+ assert(DefRange.DataOffset == 0 && "unexpected offset into register");
+ // Unclear what matters here.
+ DefRangeRegisterSym Sym(DefRange.CVRegister, 0, 0, 0, 0,
+ ArrayRef<LocalVariableAddrGap>());
+ ulittle16_t SymKind = ulittle16_t(S_DEFRANGE_REGISTER);
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&SymKind), sizeof(SymKind));
+ BytePrefix +=
+ StringRef(reinterpret_cast<const char *>(&Sym.Header),
+ sizeof(Sym.Header) - sizeof(LocalVariableAddrRange));
+ }
+ OS.EmitCVDefRangeDirective(DefRange.Ranges, BytePrefix);
+ }
+}
+
+void CodeViewDebug::endFunction(const MachineFunction *MF) {
+ if (!Asm || !CurFn) // We haven't created any debug info for this function.
+ return;
+
+ const Function *GV = MF->getFunction();
+ assert(FnDebugInfo.count(GV));
+ assert(CurFn == &FnDebugInfo[GV]);
+
+ collectVariableInfo(GV->getSubprogram());
+
+ DebugHandlerBase::endFunction(MF);
+
+ // Don't emit anything if we don't have any line tables.
+ if (!CurFn->HaveLineInfo) {
+ FnDebugInfo.erase(GV);
+ CurFn = nullptr;
+ return;
+ }
+
+ CurFn->End = Asm->getFunctionEnd();
+
+ CurFn = nullptr;
+}
+
+void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
+ DebugHandlerBase::beginInstruction(MI);
+
+ // Ignore DBG_VALUE locations and function prologue.
+ if (!Asm || MI->isDebugValue() || MI->getFlag(MachineInstr::FrameSetup))
+ return;
+ DebugLoc DL = MI->getDebugLoc();
+ if (DL == PrevInstLoc || !DL)
+ return;
+ maybeRecordLocation(DL, Asm->MF);
+}
+
+MCSymbol *CodeViewDebug::beginCVSubsection(ModuleSubstreamKind Kind) {
+ MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
+ *EndLabel = MMI->getContext().createTempSymbol();
+ OS.EmitIntValue(unsigned(Kind), 4);
+ OS.AddComment("Subsection size");
+ OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
+ OS.EmitLabel(BeginLabel);
+ return EndLabel;
+}
+
+void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) {
+ OS.EmitLabel(EndLabel);
+ // Every subsection must be aligned to a 4-byte boundary.
+ OS.EmitValueToAlignment(4);
+}
+
+void CodeViewDebug::emitDebugInfoForUDTs(
+ ArrayRef<std::pair<std::string, TypeIndex>> UDTs) {
+ for (const std::pair<std::string, codeview::TypeIndex> &UDT : UDTs) {
+ MCSymbol *UDTRecordBegin = MMI->getContext().createTempSymbol(),
+ *UDTRecordEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(UDTRecordEnd, UDTRecordBegin, 2);
+ OS.EmitLabel(UDTRecordBegin);
+
+ OS.AddComment("Record kind: S_UDT");
+ OS.EmitIntValue(unsigned(SymbolKind::S_UDT), 2);
+
+ OS.AddComment("Type");
+ OS.EmitIntValue(UDT.second.getIndex(), 4);
+
+ emitNullTerminatedSymbolName(OS, UDT.first);
+ OS.EmitLabel(UDTRecordEnd);
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForGlobals() {
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ const auto *CU = cast<DICompileUnit>(Node);
+
+ // First, emit all globals that are not in a comdat in a single symbol
+ // substream. MSVC doesn't like it if the substream is empty, so only open
+ // it if we have at least one global to emit.
+ switchToDebugSectionForSymbol(nullptr);
+ MCSymbol *EndLabel = nullptr;
+ for (const DIGlobalVariable *G : CU->getGlobalVariables()) {
+ if (const auto *GV = dyn_cast_or_null<GlobalVariable>(G->getVariable())) {
+ if (!GV->hasComdat() && !GV->isDeclarationForLinker()) {
+ if (!EndLabel) {
+ OS.AddComment("Symbol subsection for globals");
+ EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ }
+ emitDebugInfoForGlobal(G, Asm->getSymbol(GV));
+ }
+ }
+ }
+ if (EndLabel)
+ endCVSubsection(EndLabel);
+
+ // Second, emit each global that is in a comdat into its own .debug$S
+ // section along with its own symbol substream.
+ for (const DIGlobalVariable *G : CU->getGlobalVariables()) {
+ if (const auto *GV = dyn_cast_or_null<GlobalVariable>(G->getVariable())) {
+ if (GV->hasComdat()) {
+ MCSymbol *GVSym = Asm->getSymbol(GV);
+ OS.AddComment("Symbol subsection for " +
+ Twine(GlobalValue::getRealLinkageName(GV->getName())));
+ switchToDebugSectionForSymbol(GVSym);
+ EndLabel = beginCVSubsection(ModuleSubstreamKind::Symbols);
+ emitDebugInfoForGlobal(G, GVSym);
+ endCVSubsection(EndLabel);
+ }
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForRetainedTypes() {
+ NamedMDNode *CUs = MMI->getModule()->getNamedMetadata("llvm.dbg.cu");
+ for (const MDNode *Node : CUs->operands()) {
+ for (auto *Ty : cast<DICompileUnit>(Node)->getRetainedTypes()) {
+ if (DIType *RT = dyn_cast<DIType>(Ty)) {
+ getTypeIndex(RT);
+ // FIXME: Add to global/local DTU list.
+ }
+ }
+ }
+}
+
+void CodeViewDebug::emitDebugInfoForGlobal(const DIGlobalVariable *DIGV,
+ MCSymbol *GVSym) {
+ // DataSym record, see SymbolRecord.h for more info.
+ // FIXME: Thread local data, etc
+ MCSymbol *DataBegin = MMI->getContext().createTempSymbol(),
+ *DataEnd = MMI->getContext().createTempSymbol();
+ OS.AddComment("Record length");
+ OS.emitAbsoluteSymbolDiff(DataEnd, DataBegin, 2);
+ OS.EmitLabel(DataBegin);
+ const auto *GV = cast<GlobalVariable>(DIGV->getVariable());
+ if (DIGV->isLocalToUnit()) {
+ if (GV->isThreadLocal()) {
+ OS.AddComment("Record kind: S_LTHREAD32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LTHREAD32), 2);
+ } else {
+ OS.AddComment("Record kind: S_LDATA32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_LDATA32), 2);
+ }
+ } else {
+ if (GV->isThreadLocal()) {
+ OS.AddComment("Record kind: S_GTHREAD32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GTHREAD32), 2);
+ } else {
+ OS.AddComment("Record kind: S_GDATA32");
+ OS.EmitIntValue(unsigned(SymbolKind::S_GDATA32), 2);
+ }
+ }
+ OS.AddComment("Type");
+ OS.EmitIntValue(getCompleteTypeIndex(DIGV->getType()).getIndex(), 4);
+ OS.AddComment("DataOffset");
+ OS.EmitCOFFSecRel32(GVSym);
+ OS.AddComment("Segment");
+ OS.EmitCOFFSectionIndex(GVSym);
+ OS.AddComment("Name");
+ emitNullTerminatedSymbolName(OS, DIGV->getName());
+ OS.EmitLabel(DataEnd);
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
new file mode 100644
index 000000000000..e4bbd61d4ce0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -0,0 +1,310 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Microsoft CodeView debug info.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_CODEVIEWDEBUG_H
+
+#include "DebugHandlerBase.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/DebugInfo/CodeView/MemoryTypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+
+class StringRef;
+class LexicalScope;
+struct ClassInfo;
+
+/// \brief Collects and handles line tables information in a CodeView format.
+class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
+ MCStreamer &OS;
+ codeview::MemoryTypeTableBuilder TypeTable;
+
+ /// Represents the most general definition range.
+ struct LocalVarDefRange {
+ /// Indicates that variable data is stored in memory relative to the
+ /// specified register.
+ int InMemory : 1;
+
+ /// Offset of variable data in memory.
+ int DataOffset : 31;
+
+ /// Offset of the data into the user level struct. If zero, no splitting
+ /// occurred.
+ uint16_t StructOffset;
+
+ /// Register containing the data or the register base of the memory
+ /// location containing the data.
+ uint16_t CVRegister;
+
+ /// Compares all location fields. This includes all fields except the label
+ /// ranges.
+ bool isDifferentLocation(LocalVarDefRange &O) {
+ return InMemory != O.InMemory || DataOffset != O.DataOffset ||
+ StructOffset != O.StructOffset || CVRegister != O.CVRegister;
+ }
+
+ SmallVector<std::pair<const MCSymbol *, const MCSymbol *>, 1> Ranges;
+ };
+
+ static LocalVarDefRange createDefRangeMem(uint16_t CVRegister, int Offset);
+ static LocalVarDefRange createDefRangeReg(uint16_t CVRegister);
+
+ /// Similar to DbgVariable in DwarfDebug, but not dwarf-specific.
+ struct LocalVariable {
+ const DILocalVariable *DIVar = nullptr;
+ SmallVector<LocalVarDefRange, 1> DefRanges;
+ };
+
+ struct InlineSite {
+ SmallVector<LocalVariable, 1> InlinedLocals;
+ SmallVector<const DILocation *, 1> ChildSites;
+ const DISubprogram *Inlinee = nullptr;
+
+ /// The ID of the inline site or function used with .cv_loc. Not a type
+ /// index.
+ unsigned SiteFuncId = 0;
+ };
+
+ // Debug information recorded for each function: its inline sites, local
+ // variables, and the labels delimiting its instructions.
+ struct FunctionInfo {
+ /// Map from inlined call site to inlined instructions and child inlined
+ /// call sites. Listed in program order.
+ std::unordered_map<const DILocation *, InlineSite> InlineSites;
+
+ /// Ordered list of top-level inlined call sites.
+ SmallVector<const DILocation *, 1> ChildSites;
+
+ SmallVector<LocalVariable, 1> Locals;
+
+ DebugLoc LastLoc;
+ const MCSymbol *Begin = nullptr;
+ const MCSymbol *End = nullptr;
+ unsigned FuncId = 0;
+ unsigned LastFileId = 0;
+ bool HaveLineInfo = false;
+ };
+ FunctionInfo *CurFn;
+
+ /// The set of comdat .debug$S sections that we've seen so far. Each section
+ /// must start with a magic version number that must only be emitted once.
+ /// This set tracks which sections we've already opened.
+ DenseSet<MCSectionCOFF *> ComdatDebugSections;
+
+ /// Switch to the appropriate .debug$S section for GVSym. If GVSym, the symbol
+ /// of an emitted global value, is in a comdat COFF section, this will switch
+ /// to a new .debug$S section in that comdat. This method ensures that the
+ /// section starts with the magic version number on first use. If GVSym is
+ /// null, uses the main .debug$S section.
+ void switchToDebugSectionForSymbol(const MCSymbol *GVSym);
+
+ /// The next available function index for use with our .cv_* directives. Not
+ /// to be confused with type indices for LF_FUNC_ID records.
+ unsigned NextFuncId = 0;
+
+ InlineSite &getInlineSite(const DILocation *InlinedAt,
+ const DISubprogram *Inlinee);
+
+ codeview::TypeIndex getFuncIdForSubprogram(const DISubprogram *SP);
+
+ static void collectInlineSiteChildren(SmallVectorImpl<unsigned> &Children,
+ const FunctionInfo &FI,
+ const InlineSite &Site);
+
+ /// Remember some debug info about each function. Keep it in a stable order to
+ /// emit at the end of the TU.
+ MapVector<const Function *, FunctionInfo> FnDebugInfo;
+
+ /// Map from DIFile to .cv_file id.
+ DenseMap<const DIFile *, unsigned> FileIdMap;
+
+ /// All inlined subprograms in the order they should be emitted.
+ SmallSetVector<const DISubprogram *, 4> InlinedSubprograms;
+
+ /// Map from a pair of DI metadata nodes (and its DI type or scope, which may
+ /// be nullptr) to CodeView type indices. Primarily indexed by
+ /// {DIType*, DIType*} and {DISubprogram*, DIType*}.
+ ///
+ /// The second entry in the key is needed for methods, because the
+ /// DISubroutineType representing a static method's type is shared with the
+ /// equivalent non-method function type.
+ DenseMap<std::pair<const DINode *, const DIType *>, codeview::TypeIndex>
+ TypeIndices;
+
+ /// Map from DICompositeType* to complete type index. Non-record types are
+ /// always looked up in the normal TypeIndices map.
+ DenseMap<const DICompositeType *, codeview::TypeIndex> CompleteTypeIndices;
+
+ /// Complete record types to emit after all active type lowerings are
+ /// finished.
+ SmallVector<const DICompositeType *, 4> DeferredCompleteTypes;
+
+ /// Number of type lowering frames active on the stack.
+ unsigned TypeEmissionLevel = 0;
+
+ codeview::TypeIndex VBPType;
+
+ const DISubprogram *CurrentSubprogram = nullptr;
+
+ // The UDTs we have seen while processing types; each entry is a pair of type
+ // name and type index.
+ std::vector<std::pair<std::string, codeview::TypeIndex>> LocalUDTs,
+ GlobalUDTs;
+
+ typedef std::map<const DIFile *, std::string> FileToFilepathMapTy;
+ FileToFilepathMapTy FileToFilepathMap;
+ StringRef getFullFilepath(const DIFile *S);
+
+ unsigned maybeRecordFile(const DIFile *F);
+
+ void maybeRecordLocation(const DebugLoc &DL, const MachineFunction *MF);
+
+ void clear();
+
+ void setCurrentSubprogram(const DISubprogram *SP) {
+ CurrentSubprogram = SP;
+ LocalUDTs.clear();
+ }
+
+ /// Emit the magic version number at the start of a CodeView type or symbol
+ /// section. Appears at the front of every .debug$S or .debug$T section.
+ void emitCodeViewMagicVersion();
+
+ void emitTypeInformation();
+
+ void emitInlineeLinesSubsection();
+
+ void emitDebugInfoForFunction(const Function *GV, FunctionInfo &FI);
+
+ void emitDebugInfoForGlobals();
+
+ void emitDebugInfoForRetainedTypes();
+
+ void emitDebugInfoForUDTs(
+ ArrayRef<std::pair<std::string, codeview::TypeIndex>> UDTs);
+
+ void emitDebugInfoForGlobal(const DIGlobalVariable *DIGV, MCSymbol *GVSym);
+
+ /// Opens a subsection of the given kind in a .debug$S codeview section.
+ /// Returns an end label for use with endCVSubsection when the subsection is
+ /// finished.
+ MCSymbol *beginCVSubsection(codeview::ModuleSubstreamKind Kind);
+
+ void endCVSubsection(MCSymbol *EndLabel);
+
+ void emitInlinedCallSite(const FunctionInfo &FI, const DILocation *InlinedAt,
+ const InlineSite &Site);
+
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+
+ void collectVariableInfo(const DISubprogram *SP);
+
+ void collectVariableInfoFromMMITable(DenseSet<InlinedVariable> &Processed);
+
+ /// Records information about a local variable in the appropriate scope. In
+ /// particular, locals from inlined code live inside the inlining site.
+ void recordLocalVariable(LocalVariable &&Var, const DILocation *Loc);
+
+ /// Emits local variables in the appropriate order.
+ void emitLocalVariableList(ArrayRef<LocalVariable> Locals);
+
+ /// Emits an S_LOCAL record and its associated defined ranges.
+ void emitLocalVariable(const LocalVariable &Var);
+
+ /// Translates the DIType to codeview if necessary and returns a type index
+ /// for it.
+ codeview::TypeIndex getTypeIndex(DITypeRef TypeRef,
+ DITypeRef ClassTyRef = DITypeRef());
+
+ codeview::TypeIndex getMemberFunctionType(const DISubprogram *SP,
+ const DICompositeType *Class);
+
+ codeview::TypeIndex getScopeIndex(const DIScope *Scope);
+
+ codeview::TypeIndex getVBPTypeIndex();
+
+ void addToUDTs(const DIType *Ty, codeview::TypeIndex TI);
+
+ codeview::TypeIndex lowerType(const DIType *Ty, const DIType *ClassTy);
+ codeview::TypeIndex lowerTypeAlias(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeArray(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeBasic(const DIBasicType *Ty);
+ codeview::TypeIndex lowerTypePointer(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeMemberPointer(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeModifier(const DIDerivedType *Ty);
+ codeview::TypeIndex lowerTypeFunction(const DISubroutineType *Ty);
+ codeview::TypeIndex lowerTypeMemberFunction(const DISubroutineType *Ty,
+ const DIType *ClassTy,
+ int ThisAdjustment);
+ codeview::TypeIndex lowerTypeEnum(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerTypeUnion(const DICompositeType *Ty);
+
+ /// Symbol records should point to complete types, but type records should
+ /// always point to incomplete types to avoid cycles in the type graph. Only
+ /// use this entry point when generating symbol records. The complete and
+ /// incomplete type indices only differ for record types. All other types use
+ /// the same index.
+ codeview::TypeIndex getCompleteTypeIndex(DITypeRef TypeRef);
+
+ codeview::TypeIndex lowerCompleteTypeClass(const DICompositeType *Ty);
+ codeview::TypeIndex lowerCompleteTypeUnion(const DICompositeType *Ty);
+
+ struct TypeLoweringScope;
+
+ void emitDeferredCompleteTypes();
+
+ void collectMemberInfo(ClassInfo &Info, const DIDerivedType *DDTy);
+ ClassInfo collectClassInfo(const DICompositeType *Ty);
+
+ /// Common record member lowering functionality for record types, which are
+ /// structs, classes, and unions. Returns the field list index, a vtable
+ /// shape index, the member count, and whether the record contains a nested
+ /// class.
+ std::tuple<codeview::TypeIndex, codeview::TypeIndex, unsigned, bool>
+ lowerRecordFieldList(const DICompositeType *Ty);
+
+ /// Inserts {{Node, ClassTy}, TI} into TypeIndices and checks for duplicates.
+ codeview::TypeIndex recordTypeIndexForDINode(const DINode *Node,
+ codeview::TypeIndex TI,
+ const DIType *ClassTy = nullptr);
+
+ unsigned getPointerSizeInBytes();
+
+public:
+ CodeViewDebug(AsmPrinter *Asm);
+
+ void setSymbolSize(const llvm::MCSymbol *, uint64_t) override {}
+
+ /// \brief Emit the COFF section that holds the line table information.
+ void endModule() override;
+
+ /// \brief Gather pre-function debug information.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// \brief Gather post-function debug information.
+ void endFunction(const MachineFunction *) override;
+
+ /// \brief Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override;
+};
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
new file mode 100644
index 000000000000..2aaa85a58094
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -0,0 +1,600 @@
+//===--- lib/CodeGen/DIE.cpp - DWARF Info Entries -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Data structures for DWARF info entries.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/DIE.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "DwarfUnit.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrevData Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
+ // Explicitly cast to an integer type for which FoldingSetNodeID has
+ // overloads. Otherwise MSVC 2010 thinks this call is ambiguous.
+ ID.AddInteger(unsigned(Attribute));
+ ID.AddInteger(unsigned(Form));
+}
+
+//===----------------------------------------------------------------------===//
+// DIEAbbrev Implementation
+//===----------------------------------------------------------------------===//
+
+/// Profile - Used to gather unique data for the abbreviation folding set.
+///
+void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(unsigned(Tag));
+ ID.AddInteger(unsigned(Children));
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i)
+ Data[i].Profile(ID);
+}
+
+/// Emit - Print the abbreviation using the specified asm printer.
+///
+void DIEAbbrev::Emit(const AsmPrinter *AP) const {
+ // Emit its Dwarf tag type.
+ AP->EmitULEB128(Tag, dwarf::TagString(Tag));
+
+ // Emit whether it has children DIEs.
+ AP->EmitULEB128((unsigned)Children, dwarf::ChildrenString(Children));
+
+ // For each attribute description.
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ const DIEAbbrevData &AttrData = Data[i];
+
+ // Emit attribute type.
+ AP->EmitULEB128(AttrData.getAttribute(),
+ dwarf::AttributeString(AttrData.getAttribute()));
+
+ // Emit form type.
+ AP->EmitULEB128(AttrData.getForm(),
+ dwarf::FormEncodingString(AttrData.getForm()));
+ }
+
+ // Mark end of abbreviation.
+ AP->EmitULEB128(0, "EOM(1)");
+ AP->EmitULEB128(0, "EOM(2)");
+}
+
+LLVM_DUMP_METHOD
+void DIEAbbrev::print(raw_ostream &O) {
+ O << "Abbreviation @"
+ << format("0x%lx", (long)(intptr_t)this)
+ << " "
+ << dwarf::TagString(Tag)
+ << " "
+ << dwarf::ChildrenString(Children)
+ << '\n';
+
+ for (unsigned i = 0, N = Data.size(); i < N; ++i) {
+ O << " "
+ << dwarf::AttributeString(Data[i].getAttribute())
+ << " "
+ << dwarf::FormEncodingString(Data[i].getForm())
+ << '\n';
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEAbbrev::dump() { print(dbgs()); }
+
+DIEAbbrev DIE::generateAbbrev() const {
+ DIEAbbrev Abbrev(Tag, hasChildren());
+ for (const DIEValue &V : values())
+ Abbrev.AddAttribute(V.getAttribute(), V.getForm());
+ return Abbrev;
+}
+
+/// Climb up the parent chain to get the unit DIE to which this DIE
+/// belongs.
+const DIE *DIE::getUnit() const {
+ const DIE *Cu = getUnitOrNull();
+ assert(Cu && "We should not have orphaned DIEs.");
+ return Cu;
+}
+
+/// Climb up the parent chain to get the unit DIE this DIE belongs
+/// to. Return NULL if DIE is not added to an owner yet.
+const DIE *DIE::getUnitOrNull() const {
+ const DIE *p = this;
+ while (p) {
+ if (p->getTag() == dwarf::DW_TAG_compile_unit ||
+ p->getTag() == dwarf::DW_TAG_type_unit)
+ return p;
+ p = p->getParent();
+ }
+ return nullptr;
+}
+
+DIEValue DIE::findAttribute(dwarf::Attribute Attribute) const {
+ // Iterate through all the attributes until we find the one we're
+ // looking for; if we can't find it, return an empty DIEValue.
+ for (const auto &V : values())
+ if (V.getAttribute() == Attribute)
+ return V;
+ return DIEValue();
+}
+
+LLVM_DUMP_METHOD
+static void printValues(raw_ostream &O, const DIEValueList &Values,
+ StringRef Type, unsigned Size, unsigned IndentCount) {
+ O << Type << ": Size: " << Size << "\n";
+
+ unsigned I = 0;
+ const std::string Indent(IndentCount, ' ');
+ for (const auto &V : Values.values()) {
+ O << Indent;
+ O << "Blk[" << I++ << "]";
+ O << " " << dwarf::FormEncodingString(V.getForm()) << " ";
+ V.print(O);
+ O << "\n";
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIE::print(raw_ostream &O, unsigned IndentCount) const {
+ const std::string Indent(IndentCount, ' ');
+ O << Indent << "Die: " << format("0x%lx", (long)(intptr_t) this)
+ << ", Offset: " << Offset << ", Size: " << Size << "\n";
+
+ O << Indent << dwarf::TagString(getTag()) << " "
+ << dwarf::ChildrenString(hasChildren()) << "\n";
+
+ IndentCount += 2;
+ for (const auto &V : values()) {
+ O << Indent;
+ O << dwarf::AttributeString(V.getAttribute());
+ O << " " << dwarf::FormEncodingString(V.getForm()) << " ";
+ V.print(O);
+ O << "\n";
+ }
+ IndentCount -= 2;
+
+ for (const auto &Child : children())
+ Child.print(O, IndentCount + 4);
+
+ O << "\n";
+}
+
+LLVM_DUMP_METHOD
+void DIE::dump() {
+ print(dbgs());
+}
+
+void DIEValue::EmitValue(const AsmPrinter *AP) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ getDIE##T().EmitValue(AP, Form); \
+ break;
+#include "llvm/CodeGen/DIEValue.def"
+ }
+}
+
+unsigned DIEValue::SizeOf(const AsmPrinter *AP) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ return getDIE##T().SizeOf(AP, Form);
+#include "llvm/CodeGen/DIEValue.def"
+ }
+ llvm_unreachable("Unknown DIE kind");
+}
+
+LLVM_DUMP_METHOD
+void DIEValue::print(raw_ostream &O) const {
+ switch (Ty) {
+ case isNone:
+ llvm_unreachable("Expected valid DIEValue");
+#define HANDLE_DIEVALUE(T) \
+ case is##T: \
+ getDIE##T().print(O); \
+ break;
+#include "llvm/CodeGen/DIEValue.def"
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEValue::dump() const {
+ print(dbgs());
+}
+
+//===----------------------------------------------------------------------===//
+// DIEInteger Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit integer of appropriate size.
+///
+void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ unsigned Size = ~0U;
+ switch (Form) {
+ case dwarf::DW_FORM_flag_present:
+ // Emit something to keep the lines and comments in sync.
+ // FIXME: Is there a better way to do this?
+ Asm->OutStreamer->AddBlankLine();
+ return;
+ case dwarf::DW_FORM_flag: // Fall thru
+ case dwarf::DW_FORM_ref1: // Fall thru
+ case dwarf::DW_FORM_data1: Size = 1; break;
+ case dwarf::DW_FORM_ref2: // Fall thru
+ case dwarf::DW_FORM_data2: Size = 2; break;
+ case dwarf::DW_FORM_sec_offset: // Fall thru
+ case dwarf::DW_FORM_strp: // Fall thru
+ case dwarf::DW_FORM_ref4: // Fall thru
+ case dwarf::DW_FORM_data4: Size = 4; break;
+ case dwarf::DW_FORM_ref8: // Fall thru
+ case dwarf::DW_FORM_ref_sig8: // Fall thru
+ case dwarf::DW_FORM_data8: Size = 8; break;
+ case dwarf::DW_FORM_GNU_str_index: Asm->EmitULEB128(Integer); return;
+ case dwarf::DW_FORM_GNU_addr_index: Asm->EmitULEB128(Integer); return;
+ case dwarf::DW_FORM_udata: Asm->EmitULEB128(Integer); return;
+ case dwarf::DW_FORM_sdata: Asm->EmitSLEB128(Integer); return;
+ case dwarf::DW_FORM_addr:
+ Size = Asm->getPointerSize();
+ break;
+ case dwarf::DW_FORM_ref_addr:
+ Size = SizeOf(Asm, dwarf::DW_FORM_ref_addr);
+ break;
+ default: llvm_unreachable("DIE Value form not supported yet");
+ }
+ Asm->OutStreamer->EmitIntValue(Integer, Size);
+}
+
+/// SizeOf - Determine size of integer value in bytes.
+///
+unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_flag_present: return 0;
+ case dwarf::DW_FORM_flag: // Fall thru
+ case dwarf::DW_FORM_ref1: // Fall thru
+ case dwarf::DW_FORM_data1: return sizeof(int8_t);
+ case dwarf::DW_FORM_ref2: // Fall thru
+ case dwarf::DW_FORM_data2: return sizeof(int16_t);
+ case dwarf::DW_FORM_sec_offset: // Fall thru
+ case dwarf::DW_FORM_strp: // Fall thru
+ case dwarf::DW_FORM_ref4: // Fall thru
+ case dwarf::DW_FORM_data4: return sizeof(int32_t);
+ case dwarf::DW_FORM_ref8: // Fall thru
+ case dwarf::DW_FORM_ref_sig8: // Fall thru
+ case dwarf::DW_FORM_data8: return sizeof(int64_t);
+ case dwarf::DW_FORM_GNU_str_index: return getULEB128Size(Integer);
+ case dwarf::DW_FORM_GNU_addr_index: return getULEB128Size(Integer);
+ case dwarf::DW_FORM_udata: return getULEB128Size(Integer);
+ case dwarf::DW_FORM_sdata: return getSLEB128Size(Integer);
+ case dwarf::DW_FORM_addr:
+ return AP->getPointerSize();
+ case dwarf::DW_FORM_ref_addr:
+ if (AP->OutStreamer->getContext().getDwarfVersion() == 2)
+ return AP->getPointerSize();
+ return sizeof(int32_t);
+ default: llvm_unreachable("DIE Value form not supported yet");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEInteger::print(raw_ostream &O) const {
+ O << "Int: " << (int64_t)Integer << " 0x";
+ O.write_hex(Integer);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEExpr Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit expression value.
+///
+void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->OutStreamer->EmitValue(Expr, SizeOf(AP, Form));
+}
+
+/// SizeOf - Determine size of expression value in bytes.
+///
+unsigned DIEExpr::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ if (Form == dwarf::DW_FORM_sec_offset) return 4;
+ if (Form == dwarf::DW_FORM_strp) return 4;
+ return AP->getPointerSize();
+}
+
+LLVM_DUMP_METHOD
+void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; }
+
+//===----------------------------------------------------------------------===//
+// DIELabel Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit label value.
+///
+void DIELabel::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->EmitLabelReference(Label, SizeOf(AP, Form),
+ Form == dwarf::DW_FORM_strp ||
+ Form == dwarf::DW_FORM_sec_offset ||
+ Form == dwarf::DW_FORM_ref_addr);
+}
+
+/// SizeOf - Determine size of label value in bytes.
+///
+unsigned DIELabel::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ if (Form == dwarf::DW_FORM_sec_offset) return 4;
+ if (Form == dwarf::DW_FORM_strp) return 4;
+ return AP->getPointerSize();
+}
+
+LLVM_DUMP_METHOD
+void DIELabel::print(raw_ostream &O) const { O << "Lbl: " << Label->getName(); }
+
+//===----------------------------------------------------------------------===//
+// DIEDelta Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit delta value.
+///
+void DIEDelta::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEDelta::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_data4) return 4;
+ if (Form == dwarf::DW_FORM_sec_offset) return 4;
+ if (Form == dwarf::DW_FORM_strp) return 4;
+ return AP->getPointerSize();
+}
+
+LLVM_DUMP_METHOD
+void DIEDelta::print(raw_ostream &O) const {
+ O << "Del: " << LabelHi->getName() << "-" << LabelLo->getName();
+}
+
+//===----------------------------------------------------------------------===//
+// DIEString Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit string value.
+///
+void DIEString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ assert(
+ (Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_GNU_str_index) &&
+ "Expected valid string form");
+
+ // Index of string in symbol table.
+ if (Form == dwarf::DW_FORM_GNU_str_index) {
+ DIEInteger(S.getIndex()).EmitValue(AP, Form);
+ return;
+ }
+
+ // Relocatable symbol.
+ assert(Form == dwarf::DW_FORM_strp);
+ if (AP->MAI->doesDwarfUseRelocationsAcrossSections()) {
+ DIELabel(S.getSymbol()).EmitValue(AP, Form);
+ return;
+ }
+
+ // Offset into symbol table.
+ DIEInteger(S.getOffset()).EmitValue(AP, Form);
+}
+
+/// SizeOf - Determine size of delta value in bytes.
+///
+unsigned DIEString::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ assert(
+ (Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_GNU_str_index) &&
+ "Expected valid string form");
+
+ // Index of string in symbol table.
+ if (Form == dwarf::DW_FORM_GNU_str_index)
+ return DIEInteger(S.getIndex()).SizeOf(AP, Form);
+
+ // Relocatable symbol.
+ if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
+ return DIELabel(S.getSymbol()).SizeOf(AP, Form);
+
+ // Offset into symbol table.
+ return DIEInteger(S.getOffset()).SizeOf(AP, Form);
+}
+
+LLVM_DUMP_METHOD
+void DIEString::print(raw_ostream &O) const {
+ O << "String: " << S.getString();
+}
+
+//===----------------------------------------------------------------------===//
+// DIEEntry Implementation
+//===----------------------------------------------------------------------===//
+
+/// EmitValue - Emit debug information entry offset.
+///
+void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+
+ if (Form == dwarf::DW_FORM_ref_addr) {
+ const DwarfDebug *DD = AP->getDwarfDebug();
+ unsigned Addr = Entry->getOffset();
+ assert(!DD->useSplitDwarf() && "TODO: dwo files can't have relocations.");
+ // For DW_FORM_ref_addr, output the offset from beginning of debug info
+ // section. Entry->getOffset() returns the offset from start of the
+ // compile unit.
+ DwarfCompileUnit *CU = DD->lookupUnit(Entry->getUnit());
+ assert(CU && "CUDie should belong to a CU.");
+ Addr += CU->getDebugInfoOffset();
+ if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
+ AP->EmitLabelPlusOffset(CU->getSectionSym(), Addr,
+ DIEEntry::getRefAddrSize(AP));
+ else
+ AP->OutStreamer->EmitIntValue(Addr, DIEEntry::getRefAddrSize(AP));
+ } else
+ AP->EmitInt32(Entry->getOffset());
+}
+
+unsigned DIEEntry::getRefAddrSize(const AsmPrinter *AP) {
+ // DWARF4: References that use the attribute form DW_FORM_ref_addr are
+ // specified to be four bytes in the DWARF 32-bit format and eight bytes
+ // in the DWARF 64-bit format, while DWARF Version 2 specifies that such
+ // references have the same size as an address on the target system.
+ const DwarfDebug *DD = AP->getDwarfDebug();
+ assert(DD && "Expected Dwarf Debug info to be available");
+ if (DD->getDwarfVersion() == 2)
+ return AP->getPointerSize();
+ return sizeof(int32_t);
+}
+
+LLVM_DUMP_METHOD
+void DIEEntry::print(raw_ostream &O) const {
+ O << format("Die: 0x%lx", (long)(intptr_t)&Entry);
+}
+
+//===----------------------------------------------------------------------===//
+// DIELoc Implementation
+//===----------------------------------------------------------------------===//
+
+/// ComputeSize - calculate the size of the location expression.
+///
+unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const {
+ if (!Size) {
+ for (const auto &V : values())
+ Size += V.SizeOf(AP);
+ }
+
+ return Size;
+}
+
+/// EmitValue - Emit location data.
+///
+void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ switch (Form) {
+ default: llvm_unreachable("Improper form for block");
+ case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->EmitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->EmitInt32(Size); break;
+ case dwarf::DW_FORM_block:
+ case dwarf::DW_FORM_exprloc:
+ Asm->EmitULEB128(Size); break;
+ }
+
+ for (const auto &V : values())
+ V.EmitValue(Asm);
+}
+
+/// SizeOf - Determine size of location data in bytes.
+///
+unsigned DIELoc::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+ case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+ case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+ case dwarf::DW_FORM_block:
+ case dwarf::DW_FORM_exprloc:
+ return Size + getULEB128Size(Size);
+ default: llvm_unreachable("Improper form for block");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIELoc::print(raw_ostream &O) const {
+ printValues(O, *this, "ExprLoc", Size, 5);
+}
+
+//===----------------------------------------------------------------------===//
+// DIEBlock Implementation
+//===----------------------------------------------------------------------===//
+
+/// ComputeSize - calculate the size of the block.
+///
+unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const {
+ if (!Size) {
+ for (const auto &V : values())
+ Size += V.SizeOf(AP);
+ }
+
+ return Size;
+}
+
+/// EmitValue - Emit block data.
+///
+void DIEBlock::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+ switch (Form) {
+ default: llvm_unreachable("Improper form for block");
+ case dwarf::DW_FORM_block1: Asm->EmitInt8(Size); break;
+ case dwarf::DW_FORM_block2: Asm->EmitInt16(Size); break;
+ case dwarf::DW_FORM_block4: Asm->EmitInt32(Size); break;
+ case dwarf::DW_FORM_block: Asm->EmitULEB128(Size); break;
+ }
+
+ for (const auto &V : values())
+ V.EmitValue(Asm);
+}
+
+/// SizeOf - Determine size of block data in bytes.
+///
+unsigned DIEBlock::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ switch (Form) {
+ case dwarf::DW_FORM_block1: return Size + sizeof(int8_t);
+ case dwarf::DW_FORM_block2: return Size + sizeof(int16_t);
+ case dwarf::DW_FORM_block4: return Size + sizeof(int32_t);
+ case dwarf::DW_FORM_block: return Size + getULEB128Size(Size);
+ default: llvm_unreachable("Improper form for block");
+ }
+}
+
+LLVM_DUMP_METHOD
+void DIEBlock::print(raw_ostream &O) const {
+ printValues(O, *this, "Blk", Size, 5);
+}
+
+//===----------------------------------------------------------------------===//
+// DIELocList Implementation
+//===----------------------------------------------------------------------===//
+
+unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
+ if (Form == dwarf::DW_FORM_data4)
+ return 4;
+ if (Form == dwarf::DW_FORM_sec_offset)
+ return 4;
+ return AP->getPointerSize();
+}
+
+/// EmitValue - Emit label value.
+///
+void DIELocList::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ DwarfDebug *DD = AP->getDwarfDebug();
+ MCSymbol *Label = DD->getDebugLocs().getList(Index).Label;
+ AP->emitDwarfSymbolReference(Label, /*ForceOffset*/ DD->useSplitDwarf());
+}
+
+LLVM_DUMP_METHOD
+void DIELocList::print(raw_ostream &O) const { O << "LocList: " << Index; }
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
new file mode 100644
index 000000000000..74c47d151c62
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -0,0 +1,514 @@
+//===-- llvm/CodeGen/DIEHash.cpp - Dwarf Hashing Framework ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ByteStreamer.h"
+#include "DIEHash.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+/// \brief Grabs the string in whichever attribute is passed in and returns
+/// a reference to it.
+static StringRef getDIEStringAttr(const DIE &Die, uint16_t Attr) {
+ // Iterate through all the attributes until we find the one we're
+ // looking for; if we can't find it, return an empty string.
+ for (const auto &V : Die.values())
+ if (V.getAttribute() == Attr)
+ return V.getDIEString().getString();
+
+ return StringRef("");
+}
+
+/// \brief Adds the string in \p Str to the hash. This also hashes
+/// a trailing NULL with the string.
+void DIEHash::addString(StringRef Str) {
+ DEBUG(dbgs() << "Adding string " << Str << " to hash.\n");
+ Hash.update(Str);
+ Hash.update(makeArrayRef((uint8_t)'\0'));
+}
+
+// FIXME: The LEB128 routines are copied and only slightly modified out of
+// LEB128.h.
+
+/// \brief Adds the unsigned in \p Value to the hash encoded as a ULEB128.
+void DIEHash::addULEB128(uint64_t Value) {
+ DEBUG(dbgs() << "Adding ULEB128 " << Value << " to hash.\n");
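+ // Standard ULEB128: emit the low seven bits of the value at a time, setting
+ // the high bit on every byte except the last (e.g. 300 == 0x12C encodes as
+ // 0xAC 0x02).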
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ if (Value != 0)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (Value != 0);
+}
+
+void DIEHash::addSLEB128(int64_t Value) {
+ DEBUG(dbgs() << "Adding SLEB128 " << Value << " to hash.\n");
+ bool More;
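+ // Signed LEB128: keep emitting groups of seven bits until the remaining
+ // value is all sign bits and the sign is already captured in bit 6 of the
+ // last byte (e.g. -2 encodes as the single byte 0x7E).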
+ do {
+ uint8_t Byte = Value & 0x7f;
+ Value >>= 7;
+ More = !((((Value == 0) && ((Byte & 0x40) == 0)) ||
+ ((Value == -1) && ((Byte & 0x40) != 0))));
+ if (More)
+ Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+ Hash.update(Byte);
+ } while (More);
+}
+
+/// \brief Adds \p Parent and all of its enclosing types and namespaces to the
+/// hash, outermost first.
+void DIEHash::addParentContext(const DIE &Parent) {
+
+ DEBUG(dbgs() << "Adding parent context to hash...\n");
+
+ // [7.27.2] For each surrounding type or namespace beginning with the
+ // outermost such construct...
+ SmallVector<const DIE *, 1> Parents;
+ const DIE *Cur = &Parent;
+ while (Cur->getParent()) {
+ Parents.push_back(Cur);
+ Cur = Cur->getParent();
+ }
+ assert(Cur->getTag() == dwarf::DW_TAG_compile_unit ||
+ Cur->getTag() == dwarf::DW_TAG_type_unit);
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (SmallVectorImpl<const DIE *>::reverse_iterator I = Parents.rbegin(),
+ E = Parents.rend();
+ I != E; ++I) {
+ const DIE &Die = **I;
+
+ // ... Append the letter "C" to the sequence...
+ addULEB128('C');
+
+ // ... Followed by the DWARF tag of the construct...
+ addULEB128(Die.getTag());
+
+ // ... Then the name, taken from the DW_AT_name attribute.
+ StringRef Name = getDIEStringAttr(Die, dwarf::DW_AT_name);
+ DEBUG(dbgs() << "... adding context: " << Name << "\n");
+ if (!Name.empty())
+ addString(Name);
+ }
+}
+
+// Collect all of the attributes for a particular DIE in single structure.
+void DIEHash::collectAttributes(const DIE &Die, DIEAttrs &Attrs) {
+#define COLLECT_ATTR(NAME) \
+ case dwarf::NAME: \
+ Attrs.NAME = V; \
+ break
+
+ for (const auto &V : Die.values()) {
+ DEBUG(dbgs() << "Attribute: "
+ << dwarf::AttributeString(V.getAttribute())
+ << " added.\n");
+ switch (V.getAttribute()) {
+ COLLECT_ATTR(DW_AT_name);
+ COLLECT_ATTR(DW_AT_accessibility);
+ COLLECT_ATTR(DW_AT_address_class);
+ COLLECT_ATTR(DW_AT_allocated);
+ COLLECT_ATTR(DW_AT_artificial);
+ COLLECT_ATTR(DW_AT_associated);
+ COLLECT_ATTR(DW_AT_binary_scale);
+ COLLECT_ATTR(DW_AT_bit_offset);
+ COLLECT_ATTR(DW_AT_bit_size);
+ COLLECT_ATTR(DW_AT_bit_stride);
+ COLLECT_ATTR(DW_AT_byte_size);
+ COLLECT_ATTR(DW_AT_byte_stride);
+ COLLECT_ATTR(DW_AT_const_expr);
+ COLLECT_ATTR(DW_AT_const_value);
+ COLLECT_ATTR(DW_AT_containing_type);
+ COLLECT_ATTR(DW_AT_count);
+ COLLECT_ATTR(DW_AT_data_bit_offset);
+ COLLECT_ATTR(DW_AT_data_location);
+ COLLECT_ATTR(DW_AT_data_member_location);
+ COLLECT_ATTR(DW_AT_decimal_scale);
+ COLLECT_ATTR(DW_AT_decimal_sign);
+ COLLECT_ATTR(DW_AT_default_value);
+ COLLECT_ATTR(DW_AT_digit_count);
+ COLLECT_ATTR(DW_AT_discr);
+ COLLECT_ATTR(DW_AT_discr_list);
+ COLLECT_ATTR(DW_AT_discr_value);
+ COLLECT_ATTR(DW_AT_encoding);
+ COLLECT_ATTR(DW_AT_enum_class);
+ COLLECT_ATTR(DW_AT_endianity);
+ COLLECT_ATTR(DW_AT_explicit);
+ COLLECT_ATTR(DW_AT_is_optional);
+ COLLECT_ATTR(DW_AT_location);
+ COLLECT_ATTR(DW_AT_lower_bound);
+ COLLECT_ATTR(DW_AT_mutable);
+ COLLECT_ATTR(DW_AT_ordering);
+ COLLECT_ATTR(DW_AT_picture_string);
+ COLLECT_ATTR(DW_AT_prototyped);
+ COLLECT_ATTR(DW_AT_small);
+ COLLECT_ATTR(DW_AT_segment);
+ COLLECT_ATTR(DW_AT_string_length);
+ COLLECT_ATTR(DW_AT_threads_scaled);
+ COLLECT_ATTR(DW_AT_upper_bound);
+ COLLECT_ATTR(DW_AT_use_location);
+ COLLECT_ATTR(DW_AT_use_UTF8);
+ COLLECT_ATTR(DW_AT_variable_parameter);
+ COLLECT_ATTR(DW_AT_virtuality);
+ COLLECT_ATTR(DW_AT_visibility);
+ COLLECT_ATTR(DW_AT_vtable_elem_location);
+ COLLECT_ATTR(DW_AT_type);
+ default:
+ break;
+ }
+ }
+}
+
+void DIEHash::hashShallowTypeReference(dwarf::Attribute Attribute,
+ const DIE &Entry, StringRef Name) {
+ // append the letter 'N'
+ addULEB128('N');
+
+ // the DWARF attribute code (DW_AT_type or DW_AT_friend),
+ addULEB128(Attribute);
+
+ // the context of the tag,
+ if (const DIE *Parent = Entry.getParent())
+ addParentContext(*Parent);
+
+ // the letter 'E',
+ addULEB128('E');
+
+ // and the name of the type.
+ addString(Name);
+
+ // Currently DW_TAG_friends are not used by Clang, but if they do become so,
+ // here's the relevant spec text to implement:
+ //
+ // For DW_TAG_friend, if the referenced entry is the DW_TAG_subprogram,
+ // the context is omitted and the name to be used is the ABI-specific name
+ // of the subprogram (e.g., the mangled linker name).
+}
+
+void DIEHash::hashRepeatedTypeReference(dwarf::Attribute Attribute,
+ unsigned DieNumber) {
+ // a) If T is in the list of [previously hashed types], use the letter
+ // 'R' as the marker
+ addULEB128('R');
+
+ addULEB128(Attribute);
+
+ // and use the unsigned LEB128 encoding of [the index of T in the
+ // list] as the attribute value;
+ addULEB128(DieNumber);
+}
+
+void DIEHash::hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry) {
+ assert(Tag != dwarf::DW_TAG_friend && "No current LLVM clients emit friend "
+ "tags. Add support here when there's "
+ "a use case");
+ // Step 5
+ // If the tag in Step 3 is one of [the below tags]
+ if ((Tag == dwarf::DW_TAG_pointer_type ||
+ Tag == dwarf::DW_TAG_reference_type ||
+ Tag == dwarf::DW_TAG_rvalue_reference_type ||
+ Tag == dwarf::DW_TAG_ptr_to_member_type) &&
+ // and the referenced type (via the [below attributes])
+ // FIXME: This seems overly restrictive, and causes hash mismatches when
+ // there's a decl/def difference in the containing type of a
+ // ptr_to_member_type, but it's what DWARF says, for some reason.
+ Attribute == dwarf::DW_AT_type) {
+ // ... has a DW_AT_name attribute,
+ StringRef Name = getDIEStringAttr(Entry, dwarf::DW_AT_name);
+ if (!Name.empty()) {
+ hashShallowTypeReference(Attribute, Entry, Name);
+ return;
+ }
+ }
+
+ unsigned &DieNumber = Numbering[&Entry];
+ if (DieNumber) {
+ hashRepeatedTypeReference(Attribute, DieNumber);
+ return;
+ }
+
+ // otherwise, b) use the letter 'T' as the marker, ...
+ addULEB128('T');
+
+ addULEB128(Attribute);
+
+ // ... process the type T recursively by performing Steps 2 through 7, and
+ // use the result as the attribute value.
+ DieNumber = Numbering.size();
+ computeHash(Entry);
+}
+
+// Hash all of the values in a block-like set of values. This assumes that
+// all of the data is going to be added as integers.
+void DIEHash::hashBlockData(const DIE::const_value_range &Values) {
+ for (const auto &V : Values)
+ Hash.update((uint64_t)V.getDIEInteger().getValue());
+}
+
+// Hash the contents of a loclistptr class.
+void DIEHash::hashLocList(const DIELocList &LocList) {
+ HashingByteStreamer Streamer(*this);
+ DwarfDebug &DD = *AP->getDwarfDebug();
+ const DebugLocStream &Locs = DD.getDebugLocs();
+ for (const auto &Entry : Locs.getEntries(Locs.getList(LocList.getValue())))
+ DD.emitDebugLocEntry(Streamer, Entry);
+}
+
+// Hash an individual attribute \param Value based on the type of attribute and
+// its form.
+void DIEHash::hashAttribute(const DIEValue &Value, dwarf::Tag Tag) {
+ dwarf::Attribute Attribute = Value.getAttribute();
+
+ // Other attribute values use the letter 'A' as the marker, and the value
+ // consists of the form code (encoded as an unsigned LEB128 value) followed by
+ // the encoding of the value according to the form code. To ensure
+ // reproducibility of the signature, the set of forms used in the signature
+ // computation is limited to the following: DW_FORM_sdata, DW_FORM_flag,
+ // DW_FORM_string, and DW_FORM_block.
+
+ switch (Value.getType()) {
+ case DIEValue::isNone:
+ llvm_unreachable("Expected valid DIEValue");
+
+ // 7.27 Step 3
+ // ... An attribute that refers to another type entry T is processed as
+ // follows:
+ case DIEValue::isEntry:
+ hashDIEEntry(Attribute, Tag, Value.getDIEEntry().getEntry());
+ break;
+ case DIEValue::isInteger: {
+ addULEB128('A');
+ addULEB128(Attribute);
+ switch (Value.getForm()) {
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_udata:
+ case dwarf::DW_FORM_sdata:
+ addULEB128(dwarf::DW_FORM_sdata);
+ addSLEB128((int64_t)Value.getDIEInteger().getValue());
+ break;
+ // DW_FORM_flag_present is just a flag with a value of one. We still give it
+ // a value, so just use that value.
+ case dwarf::DW_FORM_flag_present:
+ case dwarf::DW_FORM_flag:
+ addULEB128(dwarf::DW_FORM_flag);
+ addULEB128((int64_t)Value.getDIEInteger().getValue());
+ break;
+ default:
+ llvm_unreachable("Unknown integer form!");
+ }
+ break;
+ }
+ case DIEValue::isString:
+ addULEB128('A');
+ addULEB128(Attribute);
+ addULEB128(dwarf::DW_FORM_string);
+ addString(Value.getDIEString().getString());
+ break;
+ case DIEValue::isBlock:
+ case DIEValue::isLoc:
+ case DIEValue::isLocList:
+ addULEB128('A');
+ addULEB128(Attribute);
+ addULEB128(dwarf::DW_FORM_block);
+ if (Value.getType() == DIEValue::isBlock) {
+ addULEB128(Value.getDIEBlock().ComputeSize(AP));
+ hashBlockData(Value.getDIEBlock().values());
+ } else if (Value.getType() == DIEValue::isLoc) {
+ addULEB128(Value.getDIELoc().ComputeSize(AP));
+ hashBlockData(Value.getDIELoc().values());
+ } else {
+ // We could add the block length, but that would take a bit of work and
+ // would not add much uniqueness to the hash in any way we could test.
+ hashLocList(Value.getDIELocList());
+ }
+ break;
+ // FIXME: It's uncertain whether or not we should handle this at the moment.
+ case DIEValue::isExpr:
+ case DIEValue::isLabel:
+ case DIEValue::isDelta:
+ llvm_unreachable("Add support for additional value types.");
+ }
+}
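+
+// Worked example (values assumed): a DW_AT_byte_size of 4 stored with
+// DW_FORM_data1 hashes exactly like one stored with DW_FORM_udata, because
+// both are canonicalized above to
+//
+//   ULEB('A'), ULEB(DW_AT_byte_size), ULEB(DW_FORM_sdata), SLEB(4)
+//
+// so the signature does not depend on which data form the producer picked.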
+
+// Go through the attributes from \param Attrs in the order specified in 7.27.4
+// and hash them.
+void DIEHash::hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag) {
+#define ADD_ATTR(ATTR) \
+ { \
+ if (ATTR) \
+ hashAttribute(ATTR, Tag); \
+ }
+
+ ADD_ATTR(Attrs.DW_AT_name);
+ ADD_ATTR(Attrs.DW_AT_accessibility);
+ ADD_ATTR(Attrs.DW_AT_address_class);
+ ADD_ATTR(Attrs.DW_AT_allocated);
+ ADD_ATTR(Attrs.DW_AT_artificial);
+ ADD_ATTR(Attrs.DW_AT_associated);
+ ADD_ATTR(Attrs.DW_AT_binary_scale);
+ ADD_ATTR(Attrs.DW_AT_bit_offset);
+ ADD_ATTR(Attrs.DW_AT_bit_size);
+ ADD_ATTR(Attrs.DW_AT_bit_stride);
+ ADD_ATTR(Attrs.DW_AT_byte_size);
+ ADD_ATTR(Attrs.DW_AT_byte_stride);
+ ADD_ATTR(Attrs.DW_AT_const_expr);
+ ADD_ATTR(Attrs.DW_AT_const_value);
+ ADD_ATTR(Attrs.DW_AT_containing_type);
+ ADD_ATTR(Attrs.DW_AT_count);
+ ADD_ATTR(Attrs.DW_AT_data_bit_offset);
+ ADD_ATTR(Attrs.DW_AT_data_location);
+ ADD_ATTR(Attrs.DW_AT_data_member_location);
+ ADD_ATTR(Attrs.DW_AT_decimal_scale);
+ ADD_ATTR(Attrs.DW_AT_decimal_sign);
+ ADD_ATTR(Attrs.DW_AT_default_value);
+ ADD_ATTR(Attrs.DW_AT_digit_count);
+ ADD_ATTR(Attrs.DW_AT_discr);
+ ADD_ATTR(Attrs.DW_AT_discr_list);
+ ADD_ATTR(Attrs.DW_AT_discr_value);
+ ADD_ATTR(Attrs.DW_AT_encoding);
+ ADD_ATTR(Attrs.DW_AT_enum_class);
+ ADD_ATTR(Attrs.DW_AT_endianity);
+ ADD_ATTR(Attrs.DW_AT_explicit);
+ ADD_ATTR(Attrs.DW_AT_is_optional);
+ ADD_ATTR(Attrs.DW_AT_location);
+ ADD_ATTR(Attrs.DW_AT_lower_bound);
+ ADD_ATTR(Attrs.DW_AT_mutable);
+ ADD_ATTR(Attrs.DW_AT_ordering);
+ ADD_ATTR(Attrs.DW_AT_picture_string);
+ ADD_ATTR(Attrs.DW_AT_prototyped);
+ ADD_ATTR(Attrs.DW_AT_small);
+ ADD_ATTR(Attrs.DW_AT_segment);
+ ADD_ATTR(Attrs.DW_AT_string_length);
+ ADD_ATTR(Attrs.DW_AT_threads_scaled);
+ ADD_ATTR(Attrs.DW_AT_upper_bound);
+ ADD_ATTR(Attrs.DW_AT_use_location);
+ ADD_ATTR(Attrs.DW_AT_use_UTF8);
+ ADD_ATTR(Attrs.DW_AT_variable_parameter);
+ ADD_ATTR(Attrs.DW_AT_virtuality);
+ ADD_ATTR(Attrs.DW_AT_visibility);
+ ADD_ATTR(Attrs.DW_AT_vtable_elem_location);
+ ADD_ATTR(Attrs.DW_AT_type);
+
+ // FIXME: Add the extended attributes.
+}
+
+// Add all of the attributes for \param Die to the hash.
+void DIEHash::addAttributes(const DIE &Die) {
+ DIEAttrs Attrs = {};
+ collectAttributes(Die, Attrs);
+ hashAttributes(Attrs, Die.getTag());
+}
+
+void DIEHash::hashNestedType(const DIE &Die, StringRef Name) {
+ // 7.27 Step 7
+ // ... append the letter 'S',
+ addULEB128('S');
+
+ // the tag of C,
+ addULEB128(Die.getTag());
+
+ // and the name.
+ addString(Name);
+}
+
+// Compute the hash of a DIE. This is based on the type signature computation
+// given in section 7.27 of the DWARF4 standard. It is the md5 hash of a
+// flattened description of the DIE.
+void DIEHash::computeHash(const DIE &Die) {
+ // Append the letter 'D', followed by the DWARF tag of the DIE.
+ addULEB128('D');
+ addULEB128(Die.getTag());
+
+ // Add each of the attributes of the DIE.
+ addAttributes(Die);
+
+ // Then hash each of the children of the DIE.
+ for (auto &C : Die.children()) {
+ // 7.27 Step 7
+ // If C is a nested type entry or a member function entry, ...
+ if (isType(C.getTag()) || C.getTag() == dwarf::DW_TAG_subprogram) {
+ StringRef Name = getDIEStringAttr(C, dwarf::DW_AT_name);
+ // ... and has a DW_AT_name attribute
+ if (!Name.empty()) {
+ hashNestedType(C, Name);
+ continue;
+ }
+ }
+ computeHash(C);
+ }
+
+ // Following the last child (or if there are no children), append a zero byte.
+ Hash.update(makeArrayRef((uint8_t)'\0'));
+}
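+
+// Editorial illustration (assumption): a structure DIE with a single data
+// member and no other children flattens roughly as
+//
+//   'D', DW_TAG_structure_type, <its attributes>,
+//     'D', DW_TAG_member, <its attributes>, 0,  // member, then its terminator
+//   0                                           // terminator for the struct
+//
+// Named nested types and member functions would instead contribute only the
+// short 'S' form produced by hashNestedType() above.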
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of the full CU and all top level CU entities.
+// TODO: Initialize the type chain at 0 instead of 1 for CU signatures.
+uint64_t DIEHash::computeCUSignature(const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so swap bytes
+ // appropriately.
+ return support::endian::read64le(Result + 8);
+}
+
+/// This is based on the type signature computation given in section 7.27 of the
+/// DWARF4 standard. It is an md5 hash of the flattened description of the DIE
+/// with the inclusion of additional forms not specifically called out in the
+/// standard.
+uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
+ Numbering.clear();
+ Numbering[&Die] = 1;
+
+ if (const DIE *Parent = Die.getParent())
+ addParentContext(*Parent);
+
+ // Hash the DIE.
+ computeHash(Die);
+
+ // Now return the result.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little endian, so swap bytes
+ // appropriately.
+ return support::endian::read64le(Result + 8);
+}
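+
+// Hypothetical usage sketch (names assumed, not taken from this patch):
+//
+//   DIEHash Hasher(Asm);
+//   uint64_t Signature = Hasher.computeTypeSignature(TypeDie);
+//
+// The 64-bit result is suitable for use as a DWARF type signature, e.g. for
+// DW_FORM_ref_sig8 references to a type unit.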
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
new file mode 100644
index 000000000000..996cd7ef3d2e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -0,0 +1,159 @@
+//===-- llvm/CodeGen/DIEHash.h - Dwarf Hashing Framework -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for DWARF4 hashing of DIEs.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DIEHASH_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/Support/MD5.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class CompileUnit;
+
+/// \brief An object containing the capability of hashing and adding hash
+/// attributes onto a DIE.
+class DIEHash {
+ // Collection of all attributes used in hashing a particular DIE.
+ struct DIEAttrs {
+ DIEValue DW_AT_name;
+ DIEValue DW_AT_accessibility;
+ DIEValue DW_AT_address_class;
+ DIEValue DW_AT_allocated;
+ DIEValue DW_AT_artificial;
+ DIEValue DW_AT_associated;
+ DIEValue DW_AT_binary_scale;
+ DIEValue DW_AT_bit_offset;
+ DIEValue DW_AT_bit_size;
+ DIEValue DW_AT_bit_stride;
+ DIEValue DW_AT_byte_size;
+ DIEValue DW_AT_byte_stride;
+ DIEValue DW_AT_const_expr;
+ DIEValue DW_AT_const_value;
+ DIEValue DW_AT_containing_type;
+ DIEValue DW_AT_count;
+ DIEValue DW_AT_data_bit_offset;
+ DIEValue DW_AT_data_location;
+ DIEValue DW_AT_data_member_location;
+ DIEValue DW_AT_decimal_scale;
+ DIEValue DW_AT_decimal_sign;
+ DIEValue DW_AT_default_value;
+ DIEValue DW_AT_digit_count;
+ DIEValue DW_AT_discr;
+ DIEValue DW_AT_discr_list;
+ DIEValue DW_AT_discr_value;
+ DIEValue DW_AT_encoding;
+ DIEValue DW_AT_enum_class;
+ DIEValue DW_AT_endianity;
+ DIEValue DW_AT_explicit;
+ DIEValue DW_AT_is_optional;
+ DIEValue DW_AT_location;
+ DIEValue DW_AT_lower_bound;
+ DIEValue DW_AT_mutable;
+ DIEValue DW_AT_ordering;
+ DIEValue DW_AT_picture_string;
+ DIEValue DW_AT_prototyped;
+ DIEValue DW_AT_small;
+ DIEValue DW_AT_segment;
+ DIEValue DW_AT_string_length;
+ DIEValue DW_AT_threads_scaled;
+ DIEValue DW_AT_upper_bound;
+ DIEValue DW_AT_use_location;
+ DIEValue DW_AT_use_UTF8;
+ DIEValue DW_AT_variable_parameter;
+ DIEValue DW_AT_virtuality;
+ DIEValue DW_AT_visibility;
+ DIEValue DW_AT_vtable_elem_location;
+ DIEValue DW_AT_type;
+
+ // Insert any additional ones here...
+ };
+
+public:
+ DIEHash(AsmPrinter *A = nullptr) : AP(A) {}
+
+ /// \brief Computes the CU signature.
+ uint64_t computeCUSignature(const DIE &Die);
+
+ /// \brief Computes the type signature.
+ uint64_t computeTypeSignature(const DIE &Die);
+
+ // Helper routines to process parts of a DIE.
+private:
+ /// \brief Adds the parent context of \param Die to the hash.
+ void addParentContext(const DIE &Die);
+
+ /// \brief Adds the attributes of \param Die to the hash.
+ void addAttributes(const DIE &Die);
+
+ /// \brief Computes the full DWARF4 7.27 hash of the DIE.
+ void computeHash(const DIE &Die);
+
+ // Routines that add DIEValues to the hash.
+public:
+ /// \brief Adds \param Value to the hash.
+ void update(uint8_t Value) { Hash.update(Value); }
+
+ /// \brief Encodes and adds \param Value to the hash as a ULEB128.
+ void addULEB128(uint64_t Value);
+
+ /// \brief Encodes and adds \param Value to the hash as a SLEB128.
+ void addSLEB128(int64_t Value);
+
+private:
+ /// \brief Adds \param Str to the hash and includes a NULL byte.
+ void addString(StringRef Str);
+
+ /// \brief Collects the attributes of DIE \param Die into the \param Attrs
+ /// structure.
+ void collectAttributes(const DIE &Die, DIEAttrs &Attrs);
+
+ /// \brief Hashes the attributes in \param Attrs in order.
+ void hashAttributes(const DIEAttrs &Attrs, dwarf::Tag Tag);
+
+ /// \brief Hashes the data in a block-like DIEValue, e.g. DW_FORM_block or
+ /// DW_FORM_exprloc.
+ void hashBlockData(const DIE::const_value_range &Values);
+
+ /// \brief Hashes the contents pointed to in the .debug_loc section.
+ void hashLocList(const DIELocList &LocList);
+
+ /// \brief Hashes an individual attribute.
+ void hashAttribute(const DIEValue &Value, dwarf::Tag Tag);
+
+ /// \brief Hashes an attribute that refers to another DIE.
+ void hashDIEEntry(dwarf::Attribute Attribute, dwarf::Tag Tag,
+ const DIE &Entry);
+
+ /// \brief Hashes a reference to a named type in such a way that it is
+ /// independent of whether that type is described by a declaration or a
+ /// definition.
+ void hashShallowTypeReference(dwarf::Attribute Attribute, const DIE &Entry,
+ StringRef Name);
+
+ /// \brief Hashes a reference to a previously referenced type DIE.
+ void hashRepeatedTypeReference(dwarf::Attribute Attribute,
+ unsigned DieNumber);
+
+ void hashNestedType(const DIE &Die, StringRef Name);
+
+private:
+ MD5 Hash;
+ AsmPrinter *AP;
+ DenseMap<const DIE *, unsigned> Numbering;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
new file mode 100644
index 000000000000..adc536f1add8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp
@@ -0,0 +1,246 @@
+//===-- llvm/CodeGen/AsmPrinter/DbgValueHistoryCalculator.cpp -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DbgValueHistoryCalculator.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+// \brief If @MI is a DBG_VALUE whose debug value is described by a
+// defined register, returns the number of that register.
+// Otherwise, returns 0.
+static unsigned isDescribedByReg(const MachineInstr &MI) {
+ assert(MI.isDebugValue());
+ assert(MI.getNumOperands() == 4);
+ // If the location of the variable is described using a register (directly
+ // or indirectly), that register is always the first operand.
+ return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
+}
+
+void DbgValueHistoryMap::startInstrRange(InlinedVariable Var,
+ const MachineInstr &MI) {
+ // Instruction range should start with a DBG_VALUE instruction for the
+ // variable.
+ assert(MI.isDebugValue() && "not a DBG_VALUE");
+ auto &Ranges = VarInstrRanges[Var];
+ if (!Ranges.empty() && Ranges.back().second == nullptr &&
+ Ranges.back().first->isIdenticalTo(MI)) {
+ DEBUG(dbgs() << "Coalescing identical DBG_VALUE entries:\n"
+ << "\t" << Ranges.back().first << "\t" << MI << "\n");
+ return;
+ }
+ Ranges.push_back(std::make_pair(&MI, nullptr));
+}
+
+void DbgValueHistoryMap::endInstrRange(InlinedVariable Var,
+ const MachineInstr &MI) {
+ auto &Ranges = VarInstrRanges[Var];
+ // Verify that the current instruction range is not yet closed.
+ assert(!Ranges.empty() && Ranges.back().second == nullptr);
+ // For now, instruction ranges are not allowed to cross basic block
+ // boundaries.
+ assert(Ranges.back().first->getParent() == MI.getParent());
+ Ranges.back().second = &MI;
+}
+
+unsigned DbgValueHistoryMap::getRegisterForVar(InlinedVariable Var) const {
+ const auto &I = VarInstrRanges.find(Var);
+ if (I == VarInstrRanges.end())
+ return 0;
+ const auto &Ranges = I->second;
+ if (Ranges.empty() || Ranges.back().second != nullptr)
+ return 0;
+ return isDescribedByReg(*Ranges.back().first);
+}
+
+namespace {
+// Maps physreg numbers to the variables they describe.
+typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+typedef std::map<unsigned, SmallVector<InlinedVariable, 1>> RegDescribedVarsMap;
+}
+
+// \brief Claim that @Var is not described by @RegNo anymore.
+static void dropRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ InlinedVariable Var) {
+ const auto &I = RegVars.find(RegNo);
+ assert(RegNo != 0U && I != RegVars.end());
+ auto &VarSet = I->second;
+ const auto &VarPos = std::find(VarSet.begin(), VarSet.end(), Var);
+ assert(VarPos != VarSet.end());
+ VarSet.erase(VarPos);
+ // Don't keep empty sets in a map to keep it as small as possible.
+ if (VarSet.empty())
+ RegVars.erase(I);
+}
+
+// \brief Claim that @Var is now described by @RegNo.
+static void addRegDescribedVar(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ InlinedVariable Var) {
+ assert(RegNo != 0U);
+ auto &VarSet = RegVars[RegNo];
+ assert(std::find(VarSet.begin(), VarSet.end(), Var) == VarSet.end());
+ VarSet.push_back(Var);
+}
+
+// \brief Terminate the location range for variables described by the register
+// at @I by inserting @ClobberingInstr into their history.
+static void clobberRegisterUses(RegDescribedVarsMap &RegVars,
+ RegDescribedVarsMap::iterator I,
+ DbgValueHistoryMap &HistMap,
+ const MachineInstr &ClobberingInstr) {
+ // Iterate over all variables described by this register and add this
+ // instruction to their history, clobbering it.
+ for (const auto &Var : I->second)
+ HistMap.endInstrRange(Var, ClobberingInstr);
+ RegVars.erase(I);
+}
+
+// \brief Terminate the location range for variables described by register
+// @RegNo by inserting @ClobberingInstr into their history.
+static void clobberRegisterUses(RegDescribedVarsMap &RegVars, unsigned RegNo,
+ DbgValueHistoryMap &HistMap,
+ const MachineInstr &ClobberingInstr) {
+ const auto &I = RegVars.find(RegNo);
+ if (I == RegVars.end())
+ return;
+ clobberRegisterUses(RegVars, I, HistMap, ClobberingInstr);
+}
+
+// \brief Returns the first instruction in @MBB which corresponds to
+// the function epilogue, or nullptr if @MBB doesn't contain an epilogue.
+static const MachineInstr *getFirstEpilogueInst(const MachineBasicBlock &MBB) {
+ auto LastMI = MBB.getLastNonDebugInstr();
+ if (LastMI == MBB.end() || !LastMI->isReturn())
+ return nullptr;
+ // Assume that the epilogue starts with the instruction that has the same
+ // debug location as the return instruction.
+ DebugLoc LastLoc = LastMI->getDebugLoc();
+ auto Res = LastMI;
+ for (MachineBasicBlock::const_reverse_iterator I(std::next(LastMI)),
+ E = MBB.rend();
+ I != E; ++I) {
+ if (I->getDebugLoc() != LastLoc)
+ return &*Res;
+ Res = &*I;
+ }
+ // If all instructions have the same debug location, assume the whole MBB is
+ // an epilogue.
+ return &*MBB.begin();
+}
+
+// \brief Collect registers that are modified in the function body (their
+// contents are changed outside of the prologue and epilogue).
+static void collectChangingRegs(const MachineFunction *MF,
+ const TargetRegisterInfo *TRI,
+ BitVector &Regs) {
+ for (const auto &MBB : *MF) {
+ auto FirstEpilogueInst = getFirstEpilogueInst(MBB);
+
+ for (const auto &MI : MBB) {
+ // Avoid looking at prologue or epilogue instructions.
+ if (&MI == FirstEpilogueInst)
+ break;
+ if (MI.getFlag(MachineInstr::FrameSetup))
+ continue;
+
+ // Look for register defs and register masks. Register masks are
+ // typically on calls and they clobber everything not in the mask.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
+ ++AI)
+ Regs.set(*AI);
+ } else if (MO.isRegMask()) {
+ Regs.setBitsNotInMask(MO.getRegMask());
+ }
+ }
+ }
+ }
+}
+
+void llvm::calculateDbgValueHistory(const MachineFunction *MF,
+ const TargetRegisterInfo *TRI,
+ DbgValueHistoryMap &Result) {
+ BitVector ChangingRegs(TRI->getNumRegs());
+ collectChangingRegs(MF, TRI, ChangingRegs);
+
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ RegDescribedVarsMap RegVars;
+ for (const auto &MBB : *MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isDebugValue()) {
+ // Not a DBG_VALUE instruction. It may clobber registers which describe
+ // some variables.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg()) {
+ // If this is a register def operand, it may end a debug value
+ // range.
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid();
+ ++AI)
+ if (ChangingRegs.test(*AI))
+ clobberRegisterUses(RegVars, *AI, Result, MI);
+ } else if (MO.isRegMask()) {
+ // If this is a register mask operand, clobber all debug values in
+ // non-CSRs.
+ for (int I = ChangingRegs.find_first(); I != -1;
+ I = ChangingRegs.find_next(I)) {
+ // Don't consider SP to be clobbered by register masks.
+ if (unsigned(I) != SP && TRI->isPhysicalRegister(I) &&
+ MO.clobbersPhysReg(I)) {
+ clobberRegisterUses(RegVars, I, Result, MI);
+ }
+ }
+ }
+ }
+ continue;
+ }
+
+ assert(MI.getNumOperands() > 1 && "Invalid DBG_VALUE instruction!");
+ // Use the base variable (without any DW_OP_piece expressions)
+ // as index into History. The full variables including the
+ // piece expressions are attached to the MI.
+ const DILocalVariable *RawVar = MI.getDebugVariable();
+ assert(RawVar->isValidLocationForIntrinsic(MI.getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ InlinedVariable Var(RawVar, MI.getDebugLoc()->getInlinedAt());
+
+ if (unsigned PrevReg = Result.getRegisterForVar(Var))
+ dropRegDescribedVar(RegVars, PrevReg, Var);
+
+ Result.startInstrRange(Var, MI);
+
+ if (unsigned NewReg = isDescribedByReg(MI))
+ addRegDescribedVar(RegVars, NewReg, Var);
+ }
+
+ // Make sure locations for register-described variables are valid only
+ // until the end of the basic block (unless it's the last basic block, in
+ // which case let their liveness run off to the end of the function).
+ if (!MBB.empty() && &MBB != &MF->back()) {
+ for (auto I = RegVars.begin(), E = RegVars.end(); I != E;) {
+ auto CurElem = I++; // CurElem can be erased below.
+ if (ChangingRegs.test(CurElem->first))
+ clobberRegisterUses(RegVars, CurElem, Result, MBB.back());
+ }
+ }
+ }
+}
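+
+// Illustrative example (hypothetical MIR, not part of this patch): given
+//
+//   DBG_VALUE %eax, 0, !"x", !expr   ; "x" currently lives in EAX
+//   ...
+//   %eax = ADD32ri %eax, 1           ; defines (clobbers) EAX
+//
+// the history map ends up with one range for "x" that opens at the DBG_VALUE
+// and is closed by the ADD, because the ADD defines a register that
+// collectChangingRegs() marked as changing.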
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
new file mode 100644
index 000000000000..16d2d7fd7e99
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h
@@ -0,0 +1,59 @@
+//===-- llvm/CodeGen/AsmPrinter/DbgValueHistoryCalculator.h ----*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DBGVALUEHISTORYCALCULATOR_H
+
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+
+namespace llvm {
+
+class MachineFunction;
+class MachineInstr;
+class TargetRegisterInfo;
+
+// For each user variable, keep a list of instruction ranges where this variable
+// is accessible. The variables are listed in order of appearance.
+class DbgValueHistoryMap {
+ // Each instruction range starts with a DBG_VALUE instruction, specifying the
+ // location of a variable, which is assumed to be valid until the end of the
+ // range. If the end is not specified, the location is valid until the start
+ // instruction of the next instruction range, or until the end of the
+ // function.
+public:
+ typedef std::pair<const MachineInstr *, const MachineInstr *> InstrRange;
+ typedef SmallVector<InstrRange, 4> InstrRanges;
+ typedef std::pair<const DILocalVariable *, const DILocation *>
+ InlinedVariable;
+ typedef MapVector<InlinedVariable, InstrRanges> InstrRangesMap;
+
+private:
+ InstrRangesMap VarInstrRanges;
+
+public:
+ void startInstrRange(InlinedVariable Var, const MachineInstr &MI);
+ void endInstrRange(InlinedVariable Var, const MachineInstr &MI);
+ // Returns the register currently describing @Var. If @Var is currently
+ // inaccessible or is not described by a register, returns 0.
+ unsigned getRegisterForVar(InlinedVariable Var) const;
+
+ bool empty() const { return VarInstrRanges.empty(); }
+ void clear() { VarInstrRanges.clear(); }
+ InstrRangesMap::const_iterator begin() const { return VarInstrRanges.begin(); }
+ InstrRangesMap::const_iterator end() const { return VarInstrRanges.end(); }
+};
+
+void calculateDbgValueHistory(const MachineFunction *MF,
+ const TargetRegisterInfo *TRI,
+ DbgValueHistoryMap &Result);
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
new file mode 100644
index 000000000000..16ffe2e15acd
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -0,0 +1,230 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for different debug information format backends.
+// LLVM currently supports DWARF and CodeView.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DebugHandlerBase.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+DebugHandlerBase::DebugHandlerBase(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
+
+// Each LexicalScope has a first and last instruction to mark the beginning
+// and end of the scope, respectively. Create an inverse map that lists the
+// scopes that start (and end) at an instruction. One instruction may start
+// (or end) multiple scopes. Ignore scopes that are not reachable.
+void DebugHandlerBase::identifyScopeMarkers() {
+ SmallVector<LexicalScope *, 4> WorkList;
+ WorkList.push_back(LScopes.getCurrentFunctionScope());
+ while (!WorkList.empty()) {
+ LexicalScope *S = WorkList.pop_back_val();
+
+ const SmallVectorImpl<LexicalScope *> &Children = S->getChildren();
+ if (!Children.empty())
+ WorkList.append(Children.begin(), Children.end());
+
+ if (S->isAbstractScope())
+ continue;
+
+ for (const InsnRange &R : S->getRanges()) {
+ assert(R.first && "InsnRange does not have first instruction!");
+ assert(R.second && "InsnRange does not have second instruction!");
+ requestLabelBeforeInsn(R.first);
+ requestLabelAfterInsn(R.second);
+ }
+ }
+}
+
+// Return Label preceding the instruction.
+MCSymbol *DebugHandlerBase::getLabelBeforeInsn(const MachineInstr *MI) {
+ MCSymbol *Label = LabelsBeforeInsn.lookup(MI);
+ assert(Label && "Didn't insert label before instruction");
+ return Label;
+}
+
+// Return Label immediately following the instruction.
+MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
+ return LabelsAfterInsn.lookup(MI);
+}
+
+// Determine the relative position of the pieces described by P1 and P2.
+// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
+// 1 if P1 is entirely after P2.
+int DebugHandlerBase::pieceCmp(const DIExpression *P1, const DIExpression *P2) {
+ unsigned l1 = P1->getBitPieceOffset();
+ unsigned l2 = P2->getBitPieceOffset();
+ unsigned r1 = l1 + P1->getBitPieceSize();
+ unsigned r2 = l2 + P2->getBitPieceSize();
+ if (r1 <= l2)
+ return -1;
+ else if (r2 <= l1)
+ return 1;
+ else
+ return 0;
+}
+
+/// Determine whether two variable pieces overlap.
+bool DebugHandlerBase::piecesOverlap(const DIExpression *P1, const DIExpression *P2) {
+ if (!P1->isBitPiece() || !P2->isBitPiece())
+ return true;
+ return pieceCmp(P1, P2) == 0;
+}
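+
+// Worked example (bit ranges assumed): a piece covering bits [0, 32) and one
+// covering bits [32, 64) do not overlap (pieceCmp returns -1), while [0, 32)
+// and [16, 48) do (pieceCmp returns 0); only the latter pair is treated as
+// describing the same part of the variable.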
+
+/// If this type is derived from a base type then return base type size.
+uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
+ DIType *Ty = TyRef.resolve();
+ assert(Ty);
+ DIDerivedType *DDTy = dyn_cast<DIDerivedType>(Ty);
+ if (!DDTy)
+ return Ty->getSizeInBits();
+
+ unsigned Tag = DDTy->getTag();
+
+ if (Tag != dwarf::DW_TAG_member && Tag != dwarf::DW_TAG_typedef &&
+ Tag != dwarf::DW_TAG_const_type && Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type)
+ return DDTy->getSizeInBits();
+
+ DIType *BaseType = DDTy->getBaseType().resolve();
+
+ assert(BaseType && "Unexpected invalid base type");
+
+ // If this is a derived type, go ahead and get the base type, unless it's a
+ // reference, in which case it's just the size of the field. Pointer types
+ // have no need of this since they're a different kind of qualification on
+ // the type.
+ if (BaseType->getTag() == dwarf::DW_TAG_reference_type ||
+ BaseType->getTag() == dwarf::DW_TAG_rvalue_reference_type)
+ return Ty->getSizeInBits();
+
+ return getBaseTypeSize(BaseType);
+}
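+
+// Illustrative example (assumption): for `typedef const int T;` the DIE chain
+// is DW_TAG_typedef -> DW_TAG_const_type -> base type `int`, and
+// getBaseTypeSize() walks through both wrappers to return the size of `int`
+// (typically 32 bits) rather than the usually absent size on the qualifier
+// DIEs themselves.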
+
+void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+ // Grab the lexical scopes for the function, if we don't have any of those
+ // then we're not going to be able to do anything.
+ LScopes.initialize(*MF);
+ if (LScopes.empty())
+ return;
+
+ // Make sure that each lexical scope will have a begin/end label.
+ identifyScopeMarkers();
+
+ // Calculate history for local variables.
+ assert(DbgValues.empty() && "DbgValues map wasn't cleaned!");
+ calculateDbgValueHistory(MF, Asm->MF->getSubtarget().getRegisterInfo(),
+ DbgValues);
+
+ // Request labels for the full history.
+ for (const auto &I : DbgValues) {
+ const auto &Ranges = I.second;
+ if (Ranges.empty())
+ continue;
+
+ // The first mention of a function argument gets the CurrentFnBegin
+ // label, so arguments are visible when breaking at function entry.
+ const DILocalVariable *DIVar = Ranges.front().first->getDebugVariable();
+ if (DIVar->isParameter() &&
+ getDISubprogram(DIVar->getScope())->describes(MF->getFunction())) {
+ LabelsBeforeInsn[Ranges.front().first] = Asm->getFunctionBegin();
+ if (Ranges.front().first->getDebugExpression()->isBitPiece()) {
+ // Mark all non-overlapping initial pieces.
+ for (auto I = Ranges.begin(); I != Ranges.end(); ++I) {
+ const DIExpression *Piece = I->first->getDebugExpression();
+ if (std::all_of(Ranges.begin(), I,
+ [&](DbgValueHistoryMap::InstrRange Pred) {
+ return !piecesOverlap(Piece, Pred.first->getDebugExpression());
+ }))
+ LabelsBeforeInsn[I->first] = Asm->getFunctionBegin();
+ else
+ break;
+ }
+ }
+ }
+
+ for (const auto &Range : Ranges) {
+ requestLabelBeforeInsn(Range.first);
+ if (Range.second)
+ requestLabelAfterInsn(Range.second);
+ }
+ }
+
+ PrevInstLoc = DebugLoc();
+ PrevLabel = Asm->getFunctionBegin();
+}
+
+void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
+ if (!MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI == nullptr);
+ CurMI = MI;
+
+ // Insert labels where requested.
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsBeforeInsn.find(MI);
+
+ // No label needed.
+ if (I == LabelsBeforeInsn.end())
+ return;
+
+ // Label already assigned.
+ if (I->second)
+ return;
+
+ if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->EmitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+}
+
+void DebugHandlerBase::endInstruction() {
+ if (!MMI->hasDebugInfo())
+ return;
+
+ assert(CurMI != nullptr);
+ // Don't create a new label after DBG_VALUE instructions.
+ // They don't generate code.
+ if (!CurMI->isDebugValue())
+ PrevLabel = nullptr;
+
+ DenseMap<const MachineInstr *, MCSymbol *>::iterator I =
+ LabelsAfterInsn.find(CurMI);
+ CurMI = nullptr;
+
+ // No label needed.
+ if (I == LabelsAfterInsn.end())
+ return;
+
+ // Label already assigned.
+ if (I->second)
+ return;
+
+ // We need a label after this instruction.
+ if (!PrevLabel) {
+ PrevLabel = MMI->getContext().createTempSymbol();
+ Asm->OutStreamer->EmitLabel(PrevLabel);
+ }
+ I->second = PrevLabel;
+}
+
+void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+ DbgValues.clear();
+ LabelsBeforeInsn.clear();
+ LabelsAfterInsn.clear();
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
new file mode 100644
index 000000000000..b8bbcec133fd
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -0,0 +1,109 @@
+//===-- llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.h --------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Common functionality for different debug information format backends.
+// LLVM currently supports DWARF and CodeView.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGHANDLERBASE_H
+
+#include "AsmPrinterHandler.h"
+#include "DbgValueHistoryCalculator.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineInstr.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class MachineModuleInfo;
+
+/// Base class for debug information backends. Common functionality related to
+/// tracking which variables and scopes are alive at a given PC lives here.
+class DebugHandlerBase : public AsmPrinterHandler {
+protected:
+ DebugHandlerBase(AsmPrinter *A);
+
+ /// Target of debug info emission.
+ AsmPrinter *Asm;
+
+ /// Collected machine module information.
+ MachineModuleInfo *MMI;
+
+ /// Previous instruction's location information. This is used to
+ /// determine label locations to indicate scope boundaries in DWARF
+ /// debug info.
+ DebugLoc PrevInstLoc;
+ MCSymbol *PrevLabel = nullptr;
+
+ /// This location indicates end of function prologue and beginning of
+ /// function body.
+ DebugLoc PrologEndLoc;
+
+ /// If nonnull, stores the current machine instruction we're processing.
+ const MachineInstr *CurMI = nullptr;
+
+ LexicalScopes LScopes;
+
+ /// History of DBG_VALUE and clobber instructions for each user
+ /// variable. Variables are listed in order of appearance.
+ DbgValueHistoryMap DbgValues;
+
+ /// Maps an instruction to the label emitted before it.
+ /// FIXME: Make this private from DwarfDebug, we have the necessary accessors
+ /// for it.
+ DenseMap<const MachineInstr *, MCSymbol *> LabelsBeforeInsn;
+
+ /// Maps an instruction to the label emitted after it.
+ DenseMap<const MachineInstr *, MCSymbol *> LabelsAfterInsn;
+
+ /// Identify instructions that mark the beginning or
+ /// end of a scope.
+ void identifyScopeMarkers();
+
+ /// Ensure that a label will be emitted before MI.
+ void requestLabelBeforeInsn(const MachineInstr *MI) {
+ LabelsBeforeInsn.insert(std::make_pair(MI, nullptr));
+ }
+
+ /// Ensure that a label will be emitted after MI.
+ void requestLabelAfterInsn(const MachineInstr *MI) {
+ LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
+ }
+
+ // AsmPrinterHandler overrides.
+public:
+ void beginInstruction(const MachineInstr *MI) override;
+ void endInstruction() override;
+
+ void beginFunction(const MachineFunction *MF) override;
+ void endFunction(const MachineFunction *MF) override;
+
+ /// Return Label preceding the instruction.
+ MCSymbol *getLabelBeforeInsn(const MachineInstr *MI);
+
+ /// Return Label immediately following the instruction.
+ MCSymbol *getLabelAfterInsn(const MachineInstr *MI);
+
+ /// Determine the relative position of the pieces described by P1 and P2.
+ /// Returns -1 if P1 is entirely before P2, 0 if P1 and P2 overlap,
+ /// 1 if P1 is entirely after P2.
+ static int pieceCmp(const DIExpression *P1, const DIExpression *P2);
+
+ /// Determine whether two variable pieces overlap.
+ static bool piecesOverlap(const DIExpression *P1, const DIExpression *P2);
+
+ /// If this type is derived from a base type then return base type size.
+ static uint64_t getBaseTypeSize(const DITypeRef TyRef);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
new file mode 100644
index 000000000000..20acd45e5720
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocEntry.h
@@ -0,0 +1,184 @@
+//===-- llvm/CodeGen/DebugLocEntry.h - Entry in debug_loc list -*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCENTRY_H
+
+#include "DebugLocStream.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Debug.h"
+
+namespace llvm {
+class AsmPrinter;
+
+/// \brief This struct describes location entries emitted in the .debug_loc
+/// section.
+class DebugLocEntry {
+ /// Begin and end symbols for the address range over which this location is valid.
+ const MCSymbol *Begin;
+ const MCSymbol *End;
+
+public:
+ /// \brief A single location or constant.
+ struct Value {
+ Value(const DIExpression *Expr, int64_t i)
+ : Expression(Expr), EntryKind(E_Integer) {
+ Constant.Int = i;
+ }
+ Value(const DIExpression *Expr, const ConstantFP *CFP)
+ : Expression(Expr), EntryKind(E_ConstantFP) {
+ Constant.CFP = CFP;
+ }
+ Value(const DIExpression *Expr, const ConstantInt *CIP)
+ : Expression(Expr), EntryKind(E_ConstantInt) {
+ Constant.CIP = CIP;
+ }
+ Value(const DIExpression *Expr, MachineLocation Loc)
+ : Expression(Expr), EntryKind(E_Location), Loc(Loc) {
+ assert(cast<DIExpression>(Expr)->isValid());
+ }
+
+ /// Any complex address location expression for this Value.
+ const DIExpression *Expression;
+
+ /// Type of entry that this represents.
+ enum EntryType { E_Location, E_Integer, E_ConstantFP, E_ConstantInt };
+ enum EntryType EntryKind;
+
+ /// Either a constant,
+ union {
+ int64_t Int;
+ const ConstantFP *CFP;
+ const ConstantInt *CIP;
+ } Constant;
+
+ // Or a location in the machine frame.
+ MachineLocation Loc;
+
+ bool isLocation() const { return EntryKind == E_Location; }
+ bool isInt() const { return EntryKind == E_Integer; }
+ bool isConstantFP() const { return EntryKind == E_ConstantFP; }
+ bool isConstantInt() const { return EntryKind == E_ConstantInt; }
+ int64_t getInt() const { return Constant.Int; }
+ const ConstantFP *getConstantFP() const { return Constant.CFP; }
+ const ConstantInt *getConstantInt() const { return Constant.CIP; }
+ MachineLocation getLoc() const { return Loc; }
+ bool isBitPiece() const { return getExpression()->isBitPiece(); }
+ const DIExpression *getExpression() const { return Expression; }
+ friend bool operator==(const Value &, const Value &);
+ friend bool operator<(const Value &, const Value &);
+ void dump() const {
+ if (isLocation()) {
+ llvm::dbgs() << "Loc = { reg=" << Loc.getReg() << " ";
+ if (Loc.isIndirect())
+ llvm::dbgs() << '+' << Loc.getOffset();
+ llvm::dbgs() << "} ";
+ }
+ else if (isConstantInt())
+ Constant.CIP->dump();
+ else if (isConstantFP())
+ Constant.CFP->dump();
+ if (Expression)
+ Expression->dump();
+ }
+ };
+
+private:
+ /// A nonempty list of locations/constants belonging to this entry,
+ /// sorted by offset.
+ SmallVector<Value, 1> Values;
+
+public:
+ DebugLocEntry(const MCSymbol *B, const MCSymbol *E, Value Val)
+ : Begin(B), End(E) {
+ Values.push_back(std::move(Val));
+ }
+
+ /// \brief If this and Next are describing different pieces of the same
+ /// variable, merge them by appending Next's values to the current
+ /// list of values.
+ /// Return true if the merge was successful.
+ bool MergeValues(const DebugLocEntry &Next);
+
+ /// \brief Attempt to merge this DebugLocEntry with Next and return
+ /// true if the merge was successful. Entries can be merged if they
+ /// share the same Loc/Constant and if Next immediately follows this
+ /// Entry.
+ bool MergeRanges(const DebugLocEntry &Next) {
+ // If this and Next are describing the same variable, merge them.
+ if ((End == Next.Begin && Values == Next.Values)) {
+ End = Next.End;
+ return true;
+ }
+ return false;
+ }
+
+ const MCSymbol *getBeginSym() const { return Begin; }
+ const MCSymbol *getEndSym() const { return End; }
+ ArrayRef<Value> getValues() const { return Values; }
+ void addValues(ArrayRef<DebugLocEntry::Value> Vals) {
+ Values.append(Vals.begin(), Vals.end());
+ sortUniqueValues();
+ assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value V){
+ return V.isBitPiece();
+ }) && "value must be a piece");
+ }
+
+ // \brief Sort the pieces by offset.
+ // Remove any duplicate entries by dropping all but the first.
+ void sortUniqueValues() {
+ std::sort(Values.begin(), Values.end());
+ Values.erase(
+ std::unique(
+ Values.begin(), Values.end(), [](const Value &A, const Value &B) {
+ return A.getExpression() == B.getExpression();
+ }),
+ Values.end());
+ }
+
+ /// \brief Lower this entry into a DWARF expression.
+ void finalize(const AsmPrinter &AP, DebugLocStream::ListBuilder &List,
+ const DIBasicType *BT);
+};
+
+/// \brief Compare two Values for equality.
+inline bool operator==(const DebugLocEntry::Value &A,
+ const DebugLocEntry::Value &B) {
+ if (A.EntryKind != B.EntryKind)
+ return false;
+
+ if (A.Expression != B.Expression)
+ return false;
+
+ switch (A.EntryKind) {
+ case DebugLocEntry::Value::E_Location:
+ return A.Loc == B.Loc;
+ case DebugLocEntry::Value::E_Integer:
+ return A.Constant.Int == B.Constant.Int;
+ case DebugLocEntry::Value::E_ConstantFP:
+ return A.Constant.CFP == B.Constant.CFP;
+ case DebugLocEntry::Value::E_ConstantInt:
+ return A.Constant.CIP == B.Constant.CIP;
+ }
+ llvm_unreachable("unhandled EntryKind");
+}
+
+/// \brief Compare two pieces based on their offset.
+inline bool operator<(const DebugLocEntry::Value &A,
+ const DebugLocEntry::Value &B) {
+ return A.getExpression()->getBitPieceOffset() <
+ B.getExpression()->getBitPieceOffset();
+}
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp
new file mode 100644
index 000000000000..7e8ed7104af3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.cpp
@@ -0,0 +1,46 @@
+//===- DebugLocStream.cpp - DWARF debug_loc stream --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DebugLocStream.h"
+#include "DwarfDebug.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+
+using namespace llvm;
+
+bool DebugLocStream::finalizeList(AsmPrinter &Asm) {
+ if (Lists.back().EntryOffset == Entries.size()) {
+ // Empty list. Delete it.
+ Lists.pop_back();
+ return false;
+ }
+
+ // Real list. Generate a label for it.
+ Lists.back().Label = Asm.createTempSymbol("debug_loc");
+ return true;
+}
+
+void DebugLocStream::finalizeEntry() {
+ if (Entries.back().ByteOffset != DWARFBytes.size())
+ return;
+
+ // The last entry was empty. Delete it.
+ Comments.erase(Comments.begin() + Entries.back().CommentOffset,
+ Comments.end());
+ Entries.pop_back();
+
+ assert(Lists.back().EntryOffset <= Entries.size() &&
+ "Popped off more entries than are in the list");
+}
+
+DebugLocStream::ListBuilder::~ListBuilder() {
+ if (!Locs.finalizeList(Asm))
+ return;
+ V.initializeDbgValue(&MI);
+ V.setDebugLocListIndex(ListIndex);
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
new file mode 100644
index 000000000000..3656e9d95099
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DebugLocStream.h
@@ -0,0 +1,193 @@
+//===--- lib/CodeGen/DebugLocStream.h - DWARF debug_loc stream --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DEBUGLOCSTREAM_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "ByteStreamer.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class DbgVariable;
+class DwarfCompileUnit;
+class MachineInstr;
+class MCSymbol;
+
+/// \brief Byte stream of .debug_loc entries.
+///
+/// Stores a unified stream of .debug_loc entries. There's a \a List for each
+/// variable/inlined-at pair, and an \a Entry for each \a DebugLocEntry.
+///
+/// FIXME: Do we need all these temp symbols?
+/// FIXME: Why not output directly to the output stream?
+class DebugLocStream {
+public:
+ struct List {
+ DwarfCompileUnit *CU;
+ MCSymbol *Label = nullptr;
+ size_t EntryOffset;
+ List(DwarfCompileUnit *CU, size_t EntryOffset)
+ : CU(CU), EntryOffset(EntryOffset) {}
+ };
+ struct Entry {
+ const MCSymbol *BeginSym;
+ const MCSymbol *EndSym;
+ size_t ByteOffset;
+ size_t CommentOffset;
+ Entry(const MCSymbol *BeginSym, const MCSymbol *EndSym, size_t ByteOffset,
+ size_t CommentOffset)
+ : BeginSym(BeginSym), EndSym(EndSym), ByteOffset(ByteOffset),
+ CommentOffset(CommentOffset) {}
+ };
+
+private:
+ SmallVector<List, 4> Lists;
+ SmallVector<Entry, 32> Entries;
+ SmallString<256> DWARFBytes;
+ SmallVector<std::string, 32> Comments;
+
+ /// \brief Only verbose textual output needs comments. This will be set to
+ /// true for that case, and false otherwise.
+ bool GenerateComments;
+
+public:
+ DebugLocStream(bool GenerateComments) : GenerateComments(GenerateComments) { }
+ size_t getNumLists() const { return Lists.size(); }
+ const List &getList(size_t LI) const { return Lists[LI]; }
+ ArrayRef<List> getLists() const { return Lists; }
+
+ class ListBuilder;
+ class EntryBuilder;
+
+private:
+ /// \brief Start a new .debug_loc entry list.
+ ///
+ /// Start a new .debug_loc entry list. Return the new list's index so it can
+ /// be retrieved later via \a getList().
+ ///
+ /// Until the next call, \a startEntry() will add entries to this list.
+ size_t startList(DwarfCompileUnit *CU) {
+ size_t LI = Lists.size();
+ Lists.emplace_back(CU, Entries.size());
+ return LI;
+ }
+
+ /// Finalize a .debug_loc entry list.
+ ///
+ /// If there are no entries in this list, delete it outright. Otherwise,
+ /// create a label with \a Asm.
+ ///
+ /// \return false iff the list is deleted.
+ bool finalizeList(AsmPrinter &Asm);
+
+ /// \brief Start a new .debug_loc entry.
+ ///
+ /// Until the next call, bytes added to the stream will be added to this
+ /// entry.
+ void startEntry(const MCSymbol *BeginSym, const MCSymbol *EndSym) {
+ Entries.emplace_back(BeginSym, EndSym, DWARFBytes.size(), Comments.size());
+ }
+
+ /// Finalize a .debug_loc entry, deleting if it's empty.
+ void finalizeEntry();
+
+public:
+ BufferByteStreamer getStreamer() {
+ return BufferByteStreamer(DWARFBytes, Comments, GenerateComments);
+ }
+
+ ArrayRef<Entry> getEntries(const List &L) const {
+ size_t LI = getIndex(L);
+ return makeArrayRef(Entries)
+ .slice(Lists[LI].EntryOffset, getNumEntries(LI));
+ }
+
+ ArrayRef<char> getBytes(const Entry &E) const {
+ size_t EI = getIndex(E);
+ return makeArrayRef(DWARFBytes.begin(), DWARFBytes.end())
+ .slice(Entries[EI].ByteOffset, getNumBytes(EI));
+ }
+ ArrayRef<std::string> getComments(const Entry &E) const {
+ size_t EI = getIndex(E);
+ return makeArrayRef(Comments)
+ .slice(Entries[EI].CommentOffset, getNumComments(EI));
+ }
+
+private:
+ size_t getIndex(const List &L) const {
+ assert(&Lists.front() <= &L && &L <= &Lists.back() &&
+ "Expected valid list");
+ return &L - &Lists.front();
+ }
+ size_t getIndex(const Entry &E) const {
+ assert(&Entries.front() <= &E && &E <= &Entries.back() &&
+ "Expected valid entry");
+ return &E - &Entries.front();
+ }
+ size_t getNumEntries(size_t LI) const {
+ if (LI + 1 == Lists.size())
+ return Entries.size() - Lists[LI].EntryOffset;
+ return Lists[LI + 1].EntryOffset - Lists[LI].EntryOffset;
+ }
+ size_t getNumBytes(size_t EI) const {
+ if (EI + 1 == Entries.size())
+ return DWARFBytes.size() - Entries[EI].ByteOffset;
+ return Entries[EI + 1].ByteOffset - Entries[EI].ByteOffset;
+ }
+ size_t getNumComments(size_t EI) const {
+ if (EI + 1 == Entries.size())
+ return Comments.size() - Entries[EI].CommentOffset;
+ return Entries[EI + 1].CommentOffset - Entries[EI].CommentOffset;
+ }
+};
+
+/// Builder for DebugLocStream lists.
+class DebugLocStream::ListBuilder {
+ DebugLocStream &Locs;
+ AsmPrinter &Asm;
+ DbgVariable &V;
+ const MachineInstr &MI;
+ size_t ListIndex;
+
+public:
+ ListBuilder(DebugLocStream &Locs, DwarfCompileUnit &CU, AsmPrinter &Asm,
+ DbgVariable &V, const MachineInstr &MI)
+ : Locs(Locs), Asm(Asm), V(V), MI(MI), ListIndex(Locs.startList(&CU)) {}
+
+ /// Finalize the list.
+ ///
+ /// If the list is empty, delete it. Otherwise, finalize it by creating a
+ /// temp symbol in \a Asm and setting up the \a DbgVariable.
+ ~ListBuilder();
+
+ DebugLocStream &getLocs() { return Locs; }
+};
+
+/// Builder for DebugLocStream entries.
+class DebugLocStream::EntryBuilder {
+ DebugLocStream &Locs;
+
+public:
+ EntryBuilder(ListBuilder &List, const MCSymbol *Begin, const MCSymbol *End)
+ : Locs(List.getLocs()) {
+ Locs.startEntry(Begin, End);
+ }
+
+ /// Finalize the entry, deleting it if it's empty.
+ ~EntryBuilder() { Locs.finalizeEntry(); }
+
+ BufferByteStreamer getStreamer() { return Locs.getStreamer(); }
+};
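+
+// Hypothetical usage sketch (names assumed): the two builders are intended to
+// be nested RAII scopes, e.g.
+//
+//   DebugLocStream::ListBuilder List(Locs, CU, Asm, Var, FirstMI);
+//   for (const auto &Range : Ranges) {
+//     DebugLocStream::EntryBuilder Entry(List, Range.Begin, Range.End);
+//     emitLocationBytes(Entry.getStreamer());   // hypothetical helper
+//   }
+//   // ~EntryBuilder drops empty entries; ~ListBuilder drops an empty list.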
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
new file mode 100644
index 000000000000..4ad3e1867328
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp
@@ -0,0 +1,289 @@
+//=-- llvm/CodeGen/DwarfAccelTable.cpp - Dwarf Accelerator Tables -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfAccelTable.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+// The length of the header data is always going to be 4 + 4 + 4*NumAtoms.
+DwarfAccelTable::DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom> atomList)
+ : Header(8 + (atomList.size() * 4)), HeaderData(atomList),
+ Entries(Allocator) {}
+
+void DwarfAccelTable::AddName(DwarfStringPoolEntryRef Name, const DIE *die,
+ char Flags) {
+ assert(Data.empty() && "Already finalized!");
+ // If the string is already in the list, add this die to its entry;
+ // otherwise, add a new one.
+ DataArray &DIEs = Entries[Name.getString()];
+ assert(!DIEs.Name || DIEs.Name == Name);
+ DIEs.Name = Name;
+ DIEs.Values.push_back(new (Allocator) HashDataContents(die, Flags));
+}
+
+void DwarfAccelTable::ComputeBucketCount() {
+ // First get the number of unique hashes.
+ std::vector<uint32_t> uniques(Data.size());
+ for (size_t i = 0, e = Data.size(); i < e; ++i)
+ uniques[i] = Data[i]->HashValue;
+ array_pod_sort(uniques.begin(), uniques.end());
+ std::vector<uint32_t>::iterator p =
+ std::unique(uniques.begin(), uniques.end());
+ uint32_t num = std::distance(uniques.begin(), p);
+
+ // Then compute the bucket count, with a minimum of 1 bucket.
+ if (num > 1024)
+ Header.bucket_count = num / 4;
+ else if (num > 16)
+ Header.bucket_count = num / 2;
+ else
+ Header.bucket_count = num > 0 ? num : 1;
+
+ Header.hashes_count = num;
+}
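+
+// Worked example (counts assumed): with 300 unique hashes, 16 < 300 <= 1024,
+// so bucket_count becomes 150; with 5000 unique hashes it becomes 1250; with
+// no names at all the table still gets a single (empty) bucket.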
+
+// compareDIEs - comparison predicate that sorts DIEs by their offset.
+static bool compareDIEs(const DwarfAccelTable::HashDataContents *A,
+ const DwarfAccelTable::HashDataContents *B) {
+ return A->Die->getOffset() < B->Die->getOffset();
+}
+
+void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, StringRef Prefix) {
+ // Create the individual hash data outputs.
+ Data.reserve(Entries.size());
+ for (StringMap<DataArray>::iterator EI = Entries.begin(), EE = Entries.end();
+ EI != EE; ++EI) {
+
+ // Unique the entries.
+ std::stable_sort(EI->second.Values.begin(), EI->second.Values.end(), compareDIEs);
+ EI->second.Values.erase(
+ std::unique(EI->second.Values.begin(), EI->second.Values.end()),
+ EI->second.Values.end());
+
+ HashData *Entry = new (Allocator) HashData(EI->getKey(), EI->second);
+ Data.push_back(Entry);
+ }
+
+ // Figure out how many buckets we need, then compute the bucket
+ // contents and the final ordering. We'll emit the hashes and offsets
+ // by doing a walk during the emission phase. We add temporary
+ // symbols to the data so that we can reference them when emitting the
+ // offsets later; the symbols themselves are emitted with the data.
+ ComputeBucketCount();
+
+ // Compute bucket contents and final ordering.
+ Buckets.resize(Header.bucket_count);
+ for (size_t i = 0, e = Data.size(); i < e; ++i) {
+ uint32_t bucket = Data[i]->HashValue % Header.bucket_count;
+ Buckets[bucket].push_back(Data[i]);
+ Data[i]->Sym = Asm->createTempSymbol(Prefix);
+ }
+
+ // Sort the contents of the buckets by hash value so that hash
+ // collisions end up together. Stable sort makes testing easier and
+ // doesn't cost much more.
+ for (size_t i = 0; i < Buckets.size(); ++i)
+ std::stable_sort(Buckets[i].begin(), Buckets[i].end(),
+ [] (HashData *LHS, HashData *RHS) {
+ return LHS->HashValue < RHS->HashValue;
+ });
+}
+
+// Emits the header for the table via the AsmPrinter.
+void DwarfAccelTable::EmitHeader(AsmPrinter *Asm) {
+ Asm->OutStreamer->AddComment("Header Magic");
+ Asm->EmitInt32(Header.magic);
+ Asm->OutStreamer->AddComment("Header Version");
+ Asm->EmitInt16(Header.version);
+ Asm->OutStreamer->AddComment("Header Hash Function");
+ Asm->EmitInt16(Header.hash_function);
+ Asm->OutStreamer->AddComment("Header Bucket Count");
+ Asm->EmitInt32(Header.bucket_count);
+ Asm->OutStreamer->AddComment("Header Hash Count");
+ Asm->EmitInt32(Header.hashes_count);
+ Asm->OutStreamer->AddComment("Header Data Length");
+ Asm->EmitInt32(Header.header_data_len);
+ Asm->OutStreamer->AddComment("HeaderData Die Offset Base");
+ Asm->EmitInt32(HeaderData.die_offset_base);
+ Asm->OutStreamer->AddComment("HeaderData Atom Count");
+ Asm->EmitInt32(HeaderData.Atoms.size());
+ for (size_t i = 0; i < HeaderData.Atoms.size(); i++) {
+ Atom A = HeaderData.Atoms[i];
+ Asm->OutStreamer->AddComment(dwarf::AtomTypeString(A.type));
+ Asm->EmitInt16(A.type);
+ Asm->OutStreamer->AddComment(dwarf::FormEncodingString(A.form));
+ Asm->EmitInt16(A.form);
+ }
+}
+
+// Walk through and emit the buckets for the table. Each index is
+// an offset into the list of hashes.
+void DwarfAccelTable::EmitBuckets(AsmPrinter *Asm) {
+ unsigned index = 0;
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ Asm->OutStreamer->AddComment("Bucket " + Twine(i));
+ if (Buckets[i].size() != 0)
+ Asm->EmitInt32(index);
+ else
+ Asm->EmitInt32(UINT32_MAX);
+ // Buckets point into the list of hashes, not to the data. Do not
+ // increment the index multiple times in case of hash collisions.
+ uint64_t PrevHash = UINT64_MAX;
+ for (auto *HD : Buckets[i]) {
+ uint32_t HashValue = HD->HashValue;
+ if (PrevHash != HashValue)
+ ++index;
+ PrevHash = HashValue;
+ }
+ }
+}
+
+// Walk through the buckets and emit the individual hashes for each
+// bucket.
+void DwarfAccelTable::EmitHashes(AsmPrinter *Asm) {
+ uint64_t PrevHash = UINT64_MAX;
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ for (HashList::const_iterator HI = Buckets[i].begin(),
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
+ uint32_t HashValue = (*HI)->HashValue;
+ if (PrevHash == HashValue)
+ continue;
+ Asm->OutStreamer->AddComment("Hash in Bucket " + Twine(i));
+ Asm->EmitInt32(HashValue);
+ PrevHash = HashValue;
+ }
+ }
+}
+
+// Walk through the buckets and emit the individual offsets for each
+// element in each bucket. This is done via a symbol subtraction from the
+// beginning of the section. The non-section symbol will be output later
+// when we emit the actual data.
+void DwarfAccelTable::emitOffsets(AsmPrinter *Asm, const MCSymbol *SecBegin) {
+ uint64_t PrevHash = UINT64_MAX;
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ for (HashList::const_iterator HI = Buckets[i].begin(),
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
+ uint32_t HashValue = (*HI)->HashValue;
+ if (PrevHash == HashValue)
+ continue;
+ PrevHash = HashValue;
+ Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i));
+ MCContext &Context = Asm->OutStreamer->getContext();
+ const MCExpr *Sub = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create((*HI)->Sym, Context),
+ MCSymbolRefExpr::create(SecBegin, Context), Context);
+ Asm->OutStreamer->EmitValue(Sub, sizeof(uint32_t));
+ }
+ }
+}
+
+// Walk through the buckets and emit the full data for each element in
+// the bucket. For the string case emit the DIEs and the various offsets.
+// Terminate each HashData bucket with 0.
+void DwarfAccelTable::EmitData(AsmPrinter *Asm, DwarfDebug *D) {
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i) {
+ uint64_t PrevHash = UINT64_MAX;
+ for (HashList::const_iterator HI = Buckets[i].begin(),
+ HE = Buckets[i].end();
+ HI != HE; ++HI) {
+ // Terminate the previous entry if there is no hash collision
+ // with the current one.
+ if (PrevHash != UINT64_MAX && PrevHash != (*HI)->HashValue)
+ Asm->EmitInt32(0);
+ // Remember to emit the label for our offset.
+ Asm->OutStreamer->EmitLabel((*HI)->Sym);
+ Asm->OutStreamer->AddComment((*HI)->Str);
+ Asm->emitDwarfStringOffset((*HI)->Data.Name);
+ Asm->OutStreamer->AddComment("Num DIEs");
+ Asm->EmitInt32((*HI)->Data.Values.size());
+ for (HashDataContents *HD : (*HI)->Data.Values) {
+ // Emit the DIE offset
+ DwarfCompileUnit *CU = D->lookupUnit(HD->Die->getUnit());
+ assert(CU && "Accelerated DIE should belong to a CU.");
+ Asm->EmitInt32(HD->Die->getOffset() + CU->getDebugInfoOffset());
+ // If we have multiple Atoms emit that info too.
+ // FIXME: A bit of a hack, we either emit only one atom or all info.
+ if (HeaderData.Atoms.size() > 1) {
+ Asm->EmitInt16(HD->Die->getTag());
+ Asm->EmitInt8(HD->Flags);
+ }
+ }
+ PrevHash = (*HI)->HashValue;
+ }
+ // Emit the final end marker for the bucket.
+ if (!Buckets[i].empty())
+ Asm->EmitInt32(0);
+ }
+}
+
+// Emit the entire data structure to the output file.
+void DwarfAccelTable::emit(AsmPrinter *Asm, const MCSymbol *SecBegin,
+ DwarfDebug *D) {
+ // Emit the header.
+ EmitHeader(Asm);
+
+ // Emit the buckets.
+ EmitBuckets(Asm);
+
+ // Emit the hashes.
+ EmitHashes(Asm);
+
+ // Emit the offsets.
+ emitOffsets(Asm, SecBegin);
+
+ // Emit the hash data.
+ EmitData(Asm, D);
+}
+
+#ifndef NDEBUG
+void DwarfAccelTable::print(raw_ostream &O) {
+
+ Header.print(O);
+ HeaderData.print(O);
+
+ O << "Entries: \n";
+ for (StringMap<DataArray>::const_iterator EI = Entries.begin(),
+ EE = Entries.end();
+ EI != EE; ++EI) {
+ O << "Name: " << EI->getKeyData() << "\n";
+ for (HashDataContents *HD : EI->second.Values)
+ HD->print(O);
+ }
+
+ O << "Buckets and Hashes: \n";
+ for (size_t i = 0, e = Buckets.size(); i < e; ++i)
+ for (HashList::const_iterator HI = Buckets[i].begin(),
+ HE = Buckets[i].end();
+ HI != HE; ++HI)
+ (*HI)->print(O);
+
+ O << "Data: \n";
+ for (std::vector<HashData *>::const_iterator DI = Data.begin(),
+ DE = Data.end();
+ DI != DE; ++DI)
+ (*DI)->print(O);
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
new file mode 100644
index 000000000000..4d81441f6a72
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -0,0 +1,256 @@
+//==-- llvm/CodeGen/DwarfAccelTable.h - Dwarf Accelerator Tables -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf accelerator tables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFACCELTABLE_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormattedStream.h"
+#include <vector>
+
+// The dwarf accelerator tables are indirect hash tables optimized
+// for null lookup rather than for access to known data. They are output into
+// an on-disk format that looks like this:
+//
+// .-------------.
+// | HEADER |
+// |-------------|
+// | BUCKETS |
+// |-------------|
+// | HASHES |
+// |-------------|
+// | OFFSETS |
+// |-------------|
+// | DATA |
+// `-------------'
+//
+// where the header contains a magic number, a version, the type of hash
+// function, the number of buckets, the total number of hashes, and the length
+// of an implementation-specific header-data struct that follows the header.
+//
+// The buckets contain an index (e.g. 6) into the hashes array. The hashes
+// section contains all of the 32-bit hash values in contiguous memory, and
+// the offsets contain the offset into the data area for the particular
+// hash.
+//
+// For a lookup example, we could hash a function name and take it modulo the
+// number of buckets giving us our bucket. From there we take the bucket value
+// as an index into the hashes table and look at each successive hash as long
+// as the hash value is still the same modulo result (bucket value) as earlier.
+// If we have a match we look at that same entry in the offsets table and
+// grab the offset in the data for our final match.
+
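The lookup procedure described above, restated as a minimal C++ sketch (not part of the patch; the struct and field names are invented for the example, and the four arrays are assumed to have already been parsed out of the section):

    #include <cstdint>
    #include <vector>

    struct ParsedTable {
      uint32_t bucket_count = 0;
      std::vector<uint32_t> buckets;  // per-bucket index into `hashes`, UINT32_MAX if empty
      std::vector<uint32_t> hashes;   // all 32-bit hash values, grouped by bucket
      std::vector<uint32_t> offsets;  // parallel to `hashes`: offset of the HashData block
    };

    // Returns the data-area offset for `hash`, or UINT32_MAX if the name is absent.
    uint32_t lookupHash(const ParsedTable &T, uint32_t hash) {
      uint32_t bucket = hash % T.bucket_count;
      uint32_t idx = T.buckets[bucket];
      if (idx == UINT32_MAX)
        return UINT32_MAX;                       // empty bucket: definitely absent
      // Scan successive hashes while they still belong to this bucket.
      while (idx < T.hashes.size() && T.hashes[idx] % T.bucket_count == bucket) {
        if (T.hashes[idx] == hash)
          return T.offsets[idx];                 // caller then walks the HashData entries
        ++idx;
      }
      return UINT32_MAX;
    }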
+namespace llvm {
+
+class AsmPrinter;
+class DwarfDebug;
+
+class DwarfAccelTable {
+
+ static uint32_t HashDJB(StringRef Str) {
+ uint32_t h = 5381;
+ for (unsigned i = 0, e = Str.size(); i != e; ++i)
+ h = ((h << 5) + h) + Str[i];
+ return h;
+ }
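
This is the Bernstein (djb2) string hash: h starts at 5381 and is updated as h = h*33 + c for each byte. A tiny standalone check of one value (not part of the patch):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t h = 5381;
      h = ((h << 5) + h) + 'a'; // 5381 * 33 + 97
      assert(h == 177670u);     // HashDJB("a")
      return 0;
    }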
+
+ // Helper function to compute the number of buckets needed based on
+ // the number of unique hashes.
+ void ComputeBucketCount(void);
+
+ struct TableHeader {
+ uint32_t magic; // 'HASH' magic value to allow endian detection
+ uint16_t version; // Version number.
+ uint16_t hash_function; // The hash function enumeration that was used.
+ uint32_t bucket_count; // The number of buckets in this hash table.
+ uint32_t hashes_count; // The total number of unique hash values
+ // and hash data offsets in this table.
+ uint32_t header_data_len; // The bytes to skip to get to the hash
+ // indexes (buckets) for correct alignment.
+ // Also written to disk is the implementation specific header data.
+
+ static const uint32_t MagicHash = 0x48415348;
+
+ TableHeader(uint32_t data_len)
+ : magic(MagicHash), version(1),
+ hash_function(dwarf::DW_hash_function_djb), bucket_count(0),
+ hashes_count(0), header_data_len(data_len) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &O) {
+ O << "Magic: " << format("0x%x", magic) << "\n"
+ << "Version: " << version << "\n"
+ << "Hash Function: " << hash_function << "\n"
+ << "Bucket Count: " << bucket_count << "\n"
+ << "Header Data Length: " << header_data_len << "\n";
+ }
+ void dump() { print(dbgs()); }
+#endif
+ };
+
+public:
+ // The HeaderData describes the form of each set of data. In general this
+ // is a list of atoms (atom_count) where each atom contains a type
+ // (AtomType type) of data, and an encoding form (form). In the case of
+ // data that is referenced via DW_FORM_ref_* the die_offset_base is
+ // used to describe the offset for all forms in the list of atoms.
+ // This also serves as a public interface of sorts.
+ // When written to disk this will have the form:
+ //
+ // uint32_t die_offset_base
+ // uint32_t atom_count
+ // atom_count Atoms
+
+ // Make these public so that they can be used as a general interface to
+ // the class.
+ struct Atom {
+ uint16_t type; // enum AtomType
+ uint16_t form; // DWARF DW_FORM_ defines
+
+ LLVM_CONSTEXPR Atom(uint16_t type, uint16_t form)
+ : type(type), form(form) {}
+#ifndef NDEBUG
+ void print(raw_ostream &O) {
+ O << "Type: " << dwarf::AtomTypeString(type) << "\n"
+ << "Form: " << dwarf::FormEncodingString(form) << "\n";
+ }
+ void dump() { print(dbgs()); }
+#endif
+ };
+
+private:
+ struct TableHeaderData {
+ uint32_t die_offset_base;
+ SmallVector<Atom, 3> Atoms;
+
+ TableHeaderData(ArrayRef<Atom> AtomList, uint32_t offset = 0)
+ : die_offset_base(offset), Atoms(AtomList.begin(), AtomList.end()) {}
+
+#ifndef NDEBUG
+ void print(raw_ostream &O) {
+ O << "die_offset_base: " << die_offset_base << "\n";
+ for (size_t i = 0; i < Atoms.size(); i++)
+ Atoms[i].print(O);
+ }
+ void dump() { print(dbgs()); }
+#endif
+ };
+
+ // The data itself consists of a str_offset, a count of the DIEs in the
+ // hash and the offsets to the DIEs themselves.
+ // On disk each data section is ended with a 0 KeyType as the end of the
+ // hash chain.
+ // On output this looks like:
+ // uint32_t str_offset
+ // uint32_t hash_data_count
+ // HashData[hash_data_count]
+public:
+ struct HashDataContents {
+ const DIE *Die; // Offsets
+ char Flags; // Specific flags to output
+
+ HashDataContents(const DIE *D, char Flags) : Die(D), Flags(Flags) {}
+#ifndef NDEBUG
+ void print(raw_ostream &O) const {
+ O << " Offset: " << Die->getOffset() << "\n";
+ O << " Tag: " << dwarf::TagString(Die->getTag()) << "\n";
+ O << " Flags: " << Flags << "\n";
+ }
+#endif
+ };
+
+private:
+ // String Data
+ struct DataArray {
+ DwarfStringPoolEntryRef Name;
+ std::vector<HashDataContents *> Values;
+ };
+ friend struct HashData;
+ struct HashData {
+ StringRef Str;
+ uint32_t HashValue;
+ MCSymbol *Sym;
+ DwarfAccelTable::DataArray &Data; // offsets
+ HashData(StringRef S, DwarfAccelTable::DataArray &Data)
+ : Str(S), Data(Data) {
+ HashValue = DwarfAccelTable::HashDJB(S);
+ }
+#ifndef NDEBUG
+ void print(raw_ostream &O) {
+ O << "Name: " << Str << "\n";
+ O << " Hash Value: " << format("0x%x", HashValue) << "\n";
+ O << " Symbol: ";
+ if (Sym)
+ O << *Sym;
+ else
+ O << "<none>";
+ O << "\n";
+ for (HashDataContents *C : Data.Values) {
+ O << " Offset: " << C->Die->getOffset() << "\n";
+ O << " Tag: " << dwarf::TagString(C->Die->getTag()) << "\n";
+ O << " Flags: " << C->Flags << "\n";
+ }
+ }
+ void dump() { print(dbgs()); }
+#endif
+ };
+
+ DwarfAccelTable(const DwarfAccelTable &) = delete;
+ void operator=(const DwarfAccelTable &) = delete;
+
+ // Internal Functions
+ void EmitHeader(AsmPrinter *);
+ void EmitBuckets(AsmPrinter *);
+ void EmitHashes(AsmPrinter *);
+ void emitOffsets(AsmPrinter *, const MCSymbol *);
+ void EmitData(AsmPrinter *, DwarfDebug *D);
+
+ // Allocator for HashData and HashDataContents.
+ BumpPtrAllocator Allocator;
+
+ // Output Variables
+ TableHeader Header;
+ TableHeaderData HeaderData;
+ std::vector<HashData *> Data;
+
+ typedef StringMap<DataArray, BumpPtrAllocator &> StringEntries;
+ StringEntries Entries;
+
+ // Buckets/Hashes/Offsets
+ typedef std::vector<HashData *> HashList;
+ typedef std::vector<HashList> BucketList;
+ BucketList Buckets;
+ HashList Hashes;
+
+ // Public Implementation
+public:
+ DwarfAccelTable(ArrayRef<DwarfAccelTable::Atom>);
+ void AddName(DwarfStringPoolEntryRef Name, const DIE *Die, char Flags = 0);
+ void FinalizeTable(AsmPrinter *, StringRef);
+ void emit(AsmPrinter *, const MCSymbol *, DwarfDebug *);
+#ifndef NDEBUG
+ void print(raw_ostream &O);
+ void dump() { print(dbgs()); }
+#endif
+};
+}
+#endif
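
Taken together, the public interface above (constructor with an Atom list, AddName, FinalizeTable, emit) is driven roughly as follows. This is a hedged sketch, not part of the patch: it assumes an AsmPrinter, a DwarfDebug, the target accelerator MCSection, a string-pool entry and an accelerated DIE are already available, which approximates how DwarfDebug's accelerator-emission path uses the class.

    #include "DwarfAccelTable.h"
    #include "llvm/CodeGen/AsmPrinter.h"
    #include "llvm/MC/MCSection.h"
    #include "llvm/MC/MCStreamer.h"
    using namespace llvm;

    static void emitNamesTable(AsmPrinter *Asm, DwarfDebug *DD, MCSection *Sec,
                               DwarfStringPoolEntryRef NameEntry, const DIE &Die) {
      // One atom per hash-data entry: a 4-byte DIE offset.
      DwarfAccelTable Names(
          DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4));

      // Register each name that should be findable, with the DIE it refers to.
      Names.AddName(NameEntry, &Die);

      // Bucketize, sort by hash value, and create per-entry temp symbols.
      Names.FinalizeTable(Asm, "Names");

      // Emit header, buckets, hashes, offsets and data into the section.
      Asm->OutStreamer->SwitchSection(Sec);
      Names.emit(Asm, Sec->getBeginSymbol(), DD);
    }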
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
new file mode 100644
index 000000000000..2eae1b234473
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -0,0 +1,179 @@
+//===-- CodeGen/AsmPrinter/DwarfException.cpp - Dwarf Exception Impl ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfException.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+DwarfCFIExceptionBase::DwarfCFIExceptionBase(AsmPrinter *A)
+ : EHStreamer(A), shouldEmitCFI(false) {}
+
+void DwarfCFIExceptionBase::markFunctionEnd() {
+ endFragment();
+
+ if (MMI->getLandingPads().empty())
+ return;
+
+ // Map all labels and get rid of any dead landing pads.
+ MMI->TidyLandingPads();
+}
+
+void DwarfCFIExceptionBase::endFragment() {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->EmitCFIEndProc();
+}
+
+DwarfCFIException::DwarfCFIException(AsmPrinter *A)
+ : DwarfCFIExceptionBase(A), shouldEmitPersonality(false),
+ forceEmitPersonality(false), shouldEmitLSDA(false),
+ shouldEmitMoves(false), moveTypeModule(AsmPrinter::CFI_M_None) {}
+
+DwarfCFIException::~DwarfCFIException() {}
+
+/// endModule - Emit all exception information that should come after the
+/// content.
+void DwarfCFIException::endModule() {
+ // SjLj uses this pass and it doesn't need this info.
+ if (!Asm->MAI->usesCFIForEH())
+ return;
+
+ if (moveTypeModule == AsmPrinter::CFI_M_Debug)
+ Asm->OutStreamer->EmitCFISections(false, true);
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+
+ if ((PerEncoding & 0x80) != dwarf::DW_EH_PE_indirect)
+ return;
+
+ // Emit references to all used personality functions
+ for (const Function *Personality : MMI->getPersonalities()) {
+ if (!Personality)
+ continue;
+ MCSymbol *Sym = Asm->getSymbol(Personality);
+ TLOF.emitPersonalityValue(*Asm->OutStreamer, Asm->getDataLayout(), Sym);
+ }
+}
+
+static MCSymbol *getExceptionSym(AsmPrinter *Asm) {
+ return Asm->getCurExceptionSym();
+}
+
+void DwarfCFIException::beginFunction(const MachineFunction *MF) {
+ shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+ const Function *F = MF->getFunction();
+
+ // If any landing pads survive, we need an EH table.
+ bool hasLandingPads = !MMI->getLandingPads().empty();
+
+ // See if we need frame move info.
+ AsmPrinter::CFIMoveType MoveType = Asm->needsCFIMoves();
+ if (MoveType == AsmPrinter::CFI_M_EH ||
+ (MoveType == AsmPrinter::CFI_M_Debug &&
+ moveTypeModule == AsmPrinter::CFI_M_None))
+ moveTypeModule = MoveType;
+
+ shouldEmitMoves = MoveType != AsmPrinter::CFI_M_None;
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+
+ // Emit a personality function even when there are no landing pads
+ forceEmitPersonality =
+ // ...if a personality function is explicitly specified
+ F->hasPersonalityFn() &&
+ // ... and it's not known to be a noop in the absence of invokes
+ !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ // ... and we're not explicitly asked not to emit it
+ F->needsUnwindTableEntry();
+
+ shouldEmitPersonality =
+ (forceEmitPersonality ||
+ (hasLandingPads && PerEncoding != dwarf::DW_EH_PE_omit)) &&
+ Per;
+
+ unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+ shouldEmitLSDA = shouldEmitPersonality &&
+ LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+ shouldEmitCFI = MF->getMMI().getContext().getAsmInfo()->usesCFIForEH() &&
+ (shouldEmitPersonality || shouldEmitMoves);
+ beginFragment(&*MF->begin(), getExceptionSym);
+}
+
+void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) {
+ if (!shouldEmitCFI)
+ return;
+
+ Asm->OutStreamer->EmitCFIStartProc(/*IsSimple=*/false);
+
+ // Indicate personality routine, if any.
+ if (!shouldEmitPersonality)
+ return;
+
+ auto *F = MBB->getParent()->getFunction();
+ auto *P = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ assert(P && "Expected personality function");
+
+ // If we are forced to emit this personality, make sure to record
+ // it because it might not appear in any landingpad
+ if (forceEmitPersonality)
+ MMI->addPersonality(P);
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+ const MCSymbol *Sym =
+ TLOF.getCFIPersonalitySymbol(P, *Asm->Mang, Asm->TM, MMI);
+ Asm->OutStreamer->EmitCFIPersonality(Sym, PerEncoding);
+
+ // Provide LSDA information.
+ if (shouldEmitLSDA)
+ Asm->OutStreamer->EmitCFILsda(ESP(Asm), TLOF.getLSDAEncoding());
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void DwarfCFIException::endFunction(const MachineFunction *) {
+ if (!shouldEmitPersonality)
+ return;
+
+ emitExceptionTable();
+}
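
For orientation (not part of the patch): on a CFI-based ELF target, the streamer calls in beginFragment/endFragment above correspond to textual assembler output along these lines when both shouldEmitPersonality and shouldEmitLSDA are set. The encodings and symbol names are illustrative; the real values come from TargetLoweringObjectFile (0x9b is the usual indirect+pcrel+sdata4 personality encoding and 0x1b the pcrel+sdata4 LSDA encoding on x86-64 Linux).

    _Z3foov:
            .cfi_startproc                                       # beginFragment
            .cfi_personality 0x9b, DW.ref.__gxx_personality_v0   # EmitCFIPersonality
            .cfi_lsda 0x1b, .Lexception0                         # EmitCFILsda
            ...                                                  # function body
            .cfi_endproc                                         # endFragment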
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
new file mode 100644
index 000000000000..7822814c7a0f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -0,0 +1,833 @@
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+namespace llvm {
+
+DwarfCompileUnit::DwarfCompileUnit(unsigned UID, const DICompileUnit *Node,
+ AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU)
+ : DwarfUnit(dwarf::DW_TAG_compile_unit, Node, A, DW, DWU), UniqueID(UID),
+ Skeleton(nullptr), BaseAddress(nullptr) {
+ insertDIE(Node, &getUnitDie());
+ MacroLabelBegin = Asm->createTempSymbol("cu_macro_begin");
+}
+
+/// addLabelAddress - Add a dwarf label attribute data and value using
+/// DW_FORM_addr or DW_FORM_GNU_addr_index.
+///
+void DwarfCompileUnit::addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label) {
+
+ // Don't use the address pool in non-fission or in the skeleton unit itself.
+ // FIXME: Once GDB supports this, it's probably worthwhile using the address
+ // pool from the skeleton - maybe even in non-fission (possibly fewer
+ // relocations by sharing them in the pool, but we have other ideas about how
+ // to reduce the number of relocations as well/instead).
+ if (!DD->useSplitDwarf() || !Skeleton)
+ return addLocalLabelAddress(Die, Attribute, Label);
+
+ if (Label)
+ DD->addArangeLabel(SymbolCU(this, Label));
+
+ unsigned idx = DD->getAddressPool().getIndex(Label);
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_GNU_addr_index,
+ DIEInteger(idx));
+}
+
+void DwarfCompileUnit::addLocalLabelAddress(DIE &Die,
+ dwarf::Attribute Attribute,
+ const MCSymbol *Label) {
+ if (Label)
+ DD->addArangeLabel(SymbolCU(this, Label));
+
+ if (Label)
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_addr,
+ DIELabel(Label));
+ else
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_addr,
+ DIEInteger(0));
+}
+
+unsigned DwarfCompileUnit::getOrCreateSourceID(StringRef FileName,
+ StringRef DirName) {
+ // If we print assembly, we can't separate .file entries according to
+ // compile units. Thus all files will belong to the default compile unit.
+
+ // FIXME: add a better feature test than hasRawTextSupport. Even better,
+ // extend .file to support this.
+ return Asm->OutStreamer->EmitDwarfFileDirective(
+ 0, DirName, FileName,
+ Asm->OutStreamer->hasRawTextSupport() ? 0 : getUniqueID());
+}
+
+// Return const expression if value is a GEP to access merged global
+// constant. e.g.
+// i8* getelementptr ({ i8, i8, i8, i8 }* @_MergedGlobals, i32 0, i32 0)
+static const ConstantExpr *getMergedGlobalExpr(const Value *V) {
+ const ConstantExpr *CE = dyn_cast_or_null<ConstantExpr>(V);
+ if (!CE || CE->getNumOperands() != 3 ||
+ CE->getOpcode() != Instruction::GetElementPtr)
+ return nullptr;
+
+ // First operand points to a global struct.
+ Value *Ptr = CE->getOperand(0);
+ GlobalValue *GV = dyn_cast<GlobalValue>(Ptr);
+ if (!GV || !isa<StructType>(GV->getValueType()))
+ return nullptr;
+
+ // Second operand is zero.
+ const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CE->getOperand(1));
+ if (!CI || !CI->isZero())
+ return nullptr;
+
+ // Third operand is offset.
+ if (!isa<ConstantInt>(CE->getOperand(2)))
+ return nullptr;
+
+ return CE;
+}
+
+/// getOrCreateGlobalVariableDIE - get or create global variable DIE.
+DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
+ const DIGlobalVariable *GV) {
+ // Check for pre-existence.
+ if (DIE *Die = getDIE(GV))
+ return Die;
+
+ assert(GV);
+
+ auto *GVContext = GV->getScope();
+ auto *GTy = DD->resolve(GV->getType());
+
+ // Construct the context before querying for the existence of the DIE in
+ // case such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(GVContext);
+
+ // Add to map.
+ DIE *VariableDIE = &createAndAddDIE(GV->getTag(), *ContextDIE, GV);
+ DIScope *DeclContext;
+ if (auto *SDMDecl = GV->getStaticDataMemberDeclaration()) {
+ DeclContext = resolve(SDMDecl->getScope());
+ assert(SDMDecl->isStaticMember() && "Expected static member decl");
+ assert(GV->isDefinition());
+ // We need the declaration DIE that is in the static member's class.
+ DIE *VariableSpecDIE = getOrCreateStaticMemberDIE(SDMDecl);
+ addDIEEntry(*VariableDIE, dwarf::DW_AT_specification, *VariableSpecDIE);
+ } else {
+ DeclContext = GV->getScope();
+ // Add name and type.
+ addString(*VariableDIE, dwarf::DW_AT_name, GV->getDisplayName());
+ addType(*VariableDIE, GTy);
+
+ // Add scoping info.
+ if (!GV->isLocalToUnit())
+ addFlag(*VariableDIE, dwarf::DW_AT_external);
+
+ // Add line number info.
+ addSourceLine(*VariableDIE, GV);
+ }
+
+ if (!GV->isDefinition())
+ addFlag(*VariableDIE, dwarf::DW_AT_declaration);
+ else
+ addGlobalName(GV->getName(), *VariableDIE, DeclContext);
+
+ // Add location.
+ bool addToAccelTable = false;
+ if (auto *Global = dyn_cast_or_null<GlobalVariable>(GV->getVariable())) {
+ // We cannot describe the location of dllimport'd variables: the computation
+ // of their address requires loads from the IAT.
+ if (!Global->hasDLLImportStorageClass()) {
+ addToAccelTable = true;
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ const MCSymbol *Sym = Asm->getSymbol(Global);
+ if (Global->isThreadLocal()) {
+ if (Asm->TM.Options.EmulatedTLS) {
+ // TODO: add debug info for emulated thread local mode.
+ } else {
+ // FIXME: Make this work with -gsplit-dwarf.
+ unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+ assert((PointerSize == 4 || PointerSize == 8) &&
+ "Add support for other sizes if necessary");
+ // Based on GCC's support for TLS:
+ if (!DD->useSplitDwarf()) {
+ // 1) Start with a const4u/const8u of the appropriate pointer size
+ addUInt(*Loc, dwarf::DW_FORM_data1, PointerSize == 4
+ ? dwarf::DW_OP_const4u
+ : dwarf::DW_OP_const8u);
+ // 2) containing the (relocated) offset of the TLS variable
+ // within the module's TLS block.
+ addExpr(*Loc, dwarf::DW_FORM_udata,
+ Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+ } else {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ DD->getAddressPool().getIndex(Sym, /* TLS */ true));
+ }
+ // 3) followed by an OP to make the debugger do a TLS lookup.
+ addUInt(*Loc, dwarf::DW_FORM_data1,
+ DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+ : dwarf::DW_OP_form_tls_address);
+ }
+ } else {
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
+ }
+
+ addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+ if (DD->useAllLinkageNames())
+ addLinkageName(*VariableDIE, GV->getLinkageName());
+ }
+ } else if (const ConstantInt *CI =
+ dyn_cast_or_null<ConstantInt>(GV->getVariable())) {
+ addConstantValue(*VariableDIE, CI, GTy);
+ } else if (const ConstantExpr *CE = getMergedGlobalExpr(GV->getVariable())) {
+ auto *Ptr = cast<GlobalValue>(CE->getOperand(0));
+ if (!Ptr->hasDLLImportStorageClass()) {
+ addToAccelTable = true;
+ // GV is a merged global.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ MCSymbol *Sym = Asm->getSymbol(Ptr);
+ DD->addArangeLabel(SymbolCU(this, Sym));
+ addOpAddress(*Loc, Sym);
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ SmallVector<Value *, 3> Idx(CE->op_begin() + 1, CE->op_end());
+ addUInt(*Loc, dwarf::DW_FORM_udata,
+ Asm->getDataLayout().getIndexedOffsetInType(Ptr->getValueType(),
+ Idx));
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+ addBlock(*VariableDIE, dwarf::DW_AT_location, Loc);
+ }
+ }
+
+ if (addToAccelTable) {
+ DD->addAccelName(GV->getName(), *VariableDIE);
+
+ // If the linkage name is different from the name, go ahead and output
+ // that as well into the name table.
+ if (GV->getLinkageName() != "" && GV->getName() != GV->getLinkageName())
+ DD->addAccelName(GV->getLinkageName(), *VariableDIE);
+ }
+
+ return VariableDIE;
+}
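
The three numbered TLS steps above produce one of the following location expressions (a schematic restatement, not additional patch content; which form is chosen depends on useSplitDwarf() and useGNUTLSOpcode() exactly as in the code):

    non-split DWARF:  DW_OP_const4u/DW_OP_const8u <TLS offset expr>  DW_OP_form_tls_address
    split DWARF:      DW_OP_GNU_const_index <address-pool index>     DW_OP_form_tls_address
    (DW_OP_GNU_push_tls_address replaces DW_OP_form_tls_address when useGNUTLSOpcode() is set)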
+
+void DwarfCompileUnit::addRange(RangeSpan Range) {
+ bool SameAsPrevCU = this == DD->getPrevCU();
+ DD->setPrevCU(this);
+ // If we have no current ranges, just add the range and return. Otherwise,
+ // check the current section and CU against the previous section and CU we
+ // emitted into (and that the subprogram was contained within). If these are
+ // the same, extend our current range; otherwise add this as a new range.
+ if (CURanges.empty() || !SameAsPrevCU ||
+ (&CURanges.back().getEnd()->getSection() !=
+ &Range.getEnd()->getSection())) {
+ CURanges.push_back(Range);
+ return;
+ }
+
+ CURanges.back().setEnd(Range.getEnd());
+}
+
+DIE::value_iterator
+DwarfCompileUnit::addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label, const MCSymbol *Sec) {
+ if (Asm->MAI->doesDwarfUseRelocationsAcrossSections())
+ return addLabel(Die, Attribute,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4,
+ Label);
+ return addSectionDelta(Die, Attribute, Label, Sec);
+}
+
+void DwarfCompileUnit::initStmtList() {
+ // Define start line table label for each Compile Unit.
+ MCSymbol *LineTableStartSym =
+ Asm->OutStreamer->getDwarfLineTableSymbol(getUniqueID());
+
+ // DW_AT_stmt_list is an offset to the line number information for this
+ // compile unit in the debug_line section. For split dwarf this is
+ // left in the skeleton CU and so not included.
+ // The line table entries are not always emitted in assembly, so it
+ // is not okay to use line_table_start here.
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ StmtListValue =
+ addSectionLabel(UnitDie, dwarf::DW_AT_stmt_list, LineTableStartSym,
+ TLOF.getDwarfLineSection()->getBeginSymbol());
+}
+
+void DwarfCompileUnit::applyStmtList(DIE &D) {
+ D.addValue(DIEValueAllocator, *StmtListValue);
+}
+
+void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
+ const MCSymbol *End) {
+ assert(Begin && "Begin label should not be null!");
+ assert(End && "End label should not be null!");
+ assert(Begin->isDefined() && "Invalid starting label");
+ assert(End->isDefined() && "Invalid end label");
+
+ addLabelAddress(D, dwarf::DW_AT_low_pc, Begin);
+ if (DD->getDwarfVersion() < 4)
+ addLabelAddress(D, dwarf::DW_AT_high_pc, End);
+ else
+ addLabelDelta(D, dwarf::DW_AT_high_pc, End, Begin);
+}
+
+// Find DIE for the given subprogram and attach appropriate DW_AT_low_pc
+// and DW_AT_high_pc attributes. If there are global variables in this
+// scope then create and insert DIEs for these variables.
+DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
+ DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
+
+ attachLowHighPC(*SPDie, Asm->getFunctionBegin(), Asm->getFunctionEnd());
+ if (DD->useAppleExtensionAttributes() &&
+ !DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
+ *DD->getCurrentFunction()))
+ addFlag(*SPDie, dwarf::DW_AT_APPLE_omit_frame_ptr);
+
+ // Only include DW_AT_frame_base in full debug info
+ if (!includeMinimalInlineScopes()) {
+ const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
+ MachineLocation Location(RI->getFrameRegister(*Asm->MF));
+ if (RI->isPhysicalRegister(Location.getReg()))
+ addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ }
+
+ // Add the name to the name table; we do this here because we're guaranteed
+ // to have concrete versions of our DW_TAG_subprogram nodes.
+ DD->addSubprogramNames(SP, *SPDie);
+
+ return *SPDie;
+}
+
+// Construct a DIE for this scope.
+void DwarfCompileUnit::constructScopeDIE(
+ LexicalScope *Scope, SmallVectorImpl<DIE *> &FinalChildren) {
+ if (!Scope || !Scope->getScopeNode())
+ return;
+
+ auto *DS = Scope->getScopeNode();
+
+ assert((Scope->getInlinedAt() || !isa<DISubprogram>(DS)) &&
+ "Only handle inlined subprograms here, use "
+ "constructSubprogramScopeDIE for non-inlined "
+ "subprograms");
+
+ SmallVector<DIE *, 8> Children;
+
+ // We try to create the scope DIE first, then the children DIEs. This will
+ // avoid creating un-used children then removing them later when we find out
+ // the scope DIE is null.
+ DIE *ScopeDIE;
+ if (Scope->getParent() && isa<DISubprogram>(DS)) {
+ ScopeDIE = constructInlinedScopeDIE(Scope);
+ if (!ScopeDIE)
+ return;
+ // We create children when the scope DIE is not null.
+ createScopeChildrenDIE(Scope, Children);
+ } else {
+ // Early exit when we know the scope DIE is going to be null.
+ if (DD->isLexicalScopeDIENull(Scope))
+ return;
+
+ unsigned ChildScopeCount;
+
+ // We create children here when we know the scope DIE is not going to be
+ // null and the children will be added to the scope DIE.
+ createScopeChildrenDIE(Scope, Children, &ChildScopeCount);
+
+ // Skip imported directives in gmlt-like data.
+ if (!includeMinimalInlineScopes()) {
+ // There is no need to emit empty lexical block DIE.
+ for (const auto *IE : ImportedEntities[DS])
+ Children.push_back(
+ constructImportedEntityDIE(cast<DIImportedEntity>(IE)));
+ }
+
+ // If there are only other scopes as children, put them directly in the
+ // parent instead, as this scope would serve no purpose.
+ if (Children.size() == ChildScopeCount) {
+ FinalChildren.insert(FinalChildren.end(),
+ std::make_move_iterator(Children.begin()),
+ std::make_move_iterator(Children.end()));
+ return;
+ }
+ ScopeDIE = constructLexicalScopeDIE(Scope);
+ assert(ScopeDIE && "Scope DIE should not be null.");
+ }
+
+ // Add children
+ for (auto &I : Children)
+ ScopeDIE->addChild(std::move(I));
+
+ FinalChildren.push_back(std::move(ScopeDIE));
+}
+
+DIE::value_iterator
+DwarfCompileUnit::addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo) {
+ return Die.addValue(DIEValueAllocator, Attribute,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4,
+ new (DIEValueAllocator) DIEDelta(Hi, Lo));
+}
+
+void DwarfCompileUnit::addScopeRangeList(DIE &ScopeDIE,
+ SmallVector<RangeSpan, 2> Range) {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+ // Emit the offset into .debug_ranges as a relocatable label. emitDIE will handle
+ // emitting it appropriately.
+ const MCSymbol *RangeSectionSym =
+ TLOF.getDwarfRangesSection()->getBeginSymbol();
+
+ RangeSpanList List(Asm->createTempSymbol("debug_ranges"), std::move(Range));
+
+ // Under fission, ranges are specified by constant offsets relative to the
+ // CU's DW_AT_GNU_ranges_base.
+ if (isDwoUnit())
+ addSectionDelta(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+ RangeSectionSym);
+ else
+ addSectionLabel(ScopeDIE, dwarf::DW_AT_ranges, List.getSym(),
+ RangeSectionSym);
+
+ // Add the range list to the set of ranges to be emitted.
+ (Skeleton ? Skeleton : this)->CURangeLists.push_back(std::move(List));
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+ DIE &Die, SmallVector<RangeSpan, 2> Ranges) {
+ if (Ranges.size() == 1) {
+ const auto &single = Ranges.front();
+ attachLowHighPC(Die, single.getStart(), single.getEnd());
+ } else
+ addScopeRangeList(Die, std::move(Ranges));
+}
+
+void DwarfCompileUnit::attachRangesOrLowHighPC(
+ DIE &Die, const SmallVectorImpl<InsnRange> &Ranges) {
+ SmallVector<RangeSpan, 2> List;
+ List.reserve(Ranges.size());
+ for (const InsnRange &R : Ranges)
+ List.push_back(RangeSpan(DD->getLabelBeforeInsn(R.first),
+ DD->getLabelAfterInsn(R.second)));
+ attachRangesOrLowHighPC(Die, std::move(List));
+}
+
+// This scope represents inlined body of a function. Construct DIE to
+// represent this concrete inlined copy of the function.
+DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
+ assert(Scope->getScopeNode());
+ auto *DS = Scope->getScopeNode();
+ auto *InlinedSP = getDISubprogram(DS);
+ // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
+ // was inlined from another compile unit.
+ DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+ assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
+
+ auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
+ addDIEEntry(*ScopeDIE, dwarf::DW_AT_abstract_origin, *OriginDIE);
+
+ attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+ // Add the call site information to the DIE.
+ const DILocation *IA = Scope->getInlinedAt();
+ addUInt(*ScopeDIE, dwarf::DW_AT_call_file, None,
+ getOrCreateSourceID(IA->getFilename(), IA->getDirectory()));
+ addUInt(*ScopeDIE, dwarf::DW_AT_call_line, None, IA->getLine());
+ if (IA->getDiscriminator())
+ addUInt(*ScopeDIE, dwarf::DW_AT_GNU_discriminator, None,
+ IA->getDiscriminator());
+
+ // Add the name to the name table; we do this here because we're guaranteed
+ // to have concrete versions of our DW_TAG_inlined_subroutine nodes.
+ DD->addSubprogramNames(InlinedSP, *ScopeDIE);
+
+ return ScopeDIE;
+}
+
+// Construct new DW_TAG_lexical_block for this scope and attach
+// DW_AT_low_pc/DW_AT_high_pc labels.
+DIE *DwarfCompileUnit::constructLexicalScopeDIE(LexicalScope *Scope) {
+ if (DD->isLexicalScopeDIENull(Scope))
+ return nullptr;
+
+ auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_lexical_block);
+ if (Scope->isAbstractScope())
+ return ScopeDIE;
+
+ attachRangesOrLowHighPC(*ScopeDIE, Scope->getRanges());
+
+ return ScopeDIE;
+}
+
+/// constructVariableDIE - Construct a DIE for the given DbgVariable.
+DIE *DwarfCompileUnit::constructVariableDIE(DbgVariable &DV, bool Abstract) {
+ auto D = constructVariableDIEImpl(DV, Abstract);
+ DV.setDIE(*D);
+ return D;
+}
+
+DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
+ bool Abstract) {
+ // Define variable debug information entry.
+ auto VariableDie = DIE::get(DIEValueAllocator, DV.getTag());
+
+ if (Abstract) {
+ applyVariableAttributes(DV, *VariableDie);
+ return VariableDie;
+ }
+
+ // Add variable address.
+
+ unsigned Offset = DV.getDebugLocListIndex();
+ if (Offset != ~0U) {
+ addLocationList(*VariableDie, dwarf::DW_AT_location, Offset);
+ return VariableDie;
+ }
+
+ // Check if variable is described by a DBG_VALUE instruction.
+ if (const MachineInstr *DVInsn = DV.getMInsn()) {
+ assert(DVInsn->getNumOperands() == 4);
+ if (DVInsn->getOperand(0).isReg()) {
+ const MachineOperand RegOp = DVInsn->getOperand(0);
+ // If the second operand is an immediate, this is an indirect value.
+ if (DVInsn->getOperand(1).isImm()) {
+ MachineLocation Location(RegOp.getReg(),
+ DVInsn->getOperand(1).getImm());
+ addVariableAddress(DV, *VariableDie, Location);
+ } else if (RegOp.getReg())
+ addVariableAddress(DV, *VariableDie, MachineLocation(RegOp.getReg()));
+ } else if (DVInsn->getOperand(0).isImm()) {
+ // This variable is described by a single constant.
+ // Check whether it has a DIExpression.
+ auto *Expr = DV.getSingleExpression();
+ if (Expr && Expr->getNumElements()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ // If there is an expression, emit raw unsigned bytes.
+ DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm());
+ DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end());
+ addBlock(*VariableDie, dwarf::DW_AT_location, Loc);
+ } else
+ addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
+ } else if (DVInsn->getOperand(0).isFPImm())
+ addConstantFPValue(*VariableDie, DVInsn->getOperand(0));
+ else if (DVInsn->getOperand(0).isCImm())
+ addConstantValue(*VariableDie, DVInsn->getOperand(0).getCImm(),
+ DV.getType());
+
+ return VariableDie;
+ }
+
+ // ... otherwise, fall back to the frame index.
+ if (DV.getFrameIndex().empty())
+ return VariableDie;
+
+ auto Expr = DV.getExpression().begin();
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ for (auto FI : DV.getFrameIndex()) {
+ unsigned FrameReg = 0;
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
+ assert(Expr != DV.getExpression().end() && "Wrong number of expressions");
+ DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
+ FrameReg, Offset);
+ DwarfExpr.AddExpression((*Expr)->expr_op_begin(), (*Expr)->expr_op_end());
+ ++Expr;
+ }
+ addBlock(*VariableDie, dwarf::DW_AT_location, Loc);
+
+ return VariableDie;
+}
+
+DIE *DwarfCompileUnit::constructVariableDIE(DbgVariable &DV,
+ const LexicalScope &Scope,
+ DIE *&ObjectPointer) {
+ auto Var = constructVariableDIE(DV, Scope.isAbstractScope());
+ if (DV.isObjectPointer())
+ ObjectPointer = Var;
+ return Var;
+}
+
+DIE *DwarfCompileUnit::createScopeChildrenDIE(LexicalScope *Scope,
+ SmallVectorImpl<DIE *> &Children,
+ unsigned *ChildScopeCount) {
+ DIE *ObjectPointer = nullptr;
+
+ for (DbgVariable *DV : DU->getScopeVariables().lookup(Scope))
+ Children.push_back(constructVariableDIE(*DV, *Scope, ObjectPointer));
+
+ unsigned ChildCountWithoutScopes = Children.size();
+
+ for (LexicalScope *LS : Scope->getChildren())
+ constructScopeDIE(LS, Children);
+
+ if (ChildScopeCount)
+ *ChildScopeCount = Children.size() - ChildCountWithoutScopes;
+
+ return ObjectPointer;
+}
+
+void DwarfCompileUnit::constructSubprogramScopeDIE(LexicalScope *Scope) {
+ assert(Scope && Scope->getScopeNode());
+ assert(!Scope->getInlinedAt());
+ assert(!Scope->isAbstractScope());
+ auto *Sub = cast<DISubprogram>(Scope->getScopeNode());
+
+ DD->getProcessedSPNodes().insert(Sub);
+
+ DIE &ScopeDIE = updateSubprogramScopeDIE(Sub);
+
+ // If this is a variadic function, add an unspecified parameter.
+ DITypeRefArray FnArgs = Sub->getType()->getTypeArray();
+
+ // Collect lexical scope children first.
+ // ObjectPointer might be a local (non-argument) variable if it's a
+ // block's synthetic this pointer.
+ if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, ScopeDIE))
+ addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer, *ObjectPointer);
+
+ // If we have a single element of null, it is a function that returns void.
+ // If we have more than one element and the last one is null, it is a
+ // variadic function.
+ if (FnArgs.size() > 1 && !FnArgs[FnArgs.size() - 1] &&
+ !includeMinimalInlineScopes())
+ ScopeDIE.addChild(
+ DIE::get(DIEValueAllocator, dwarf::DW_TAG_unspecified_parameters));
+}
+
+DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
+ DIE &ScopeDIE) {
+ // We create children when the scope DIE is not null.
+ SmallVector<DIE *, 8> Children;
+ DIE *ObjectPointer = createScopeChildrenDIE(Scope, Children);
+
+ // Add children
+ for (auto &I : Children)
+ ScopeDIE.addChild(std::move(I));
+
+ return ObjectPointer;
+}
+
+void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
+ LexicalScope *Scope) {
+ DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+ if (AbsDef)
+ return;
+
+ auto *SP = cast<DISubprogram>(Scope->getScopeNode());
+
+ DIE *ContextDIE;
+
+ if (includeMinimalInlineScopes())
+ ContextDIE = &getUnitDie();
+ // Some of this is duplicated from DwarfUnit::getOrCreateSubprogramDIE, with
+ // the important distinction that the debug node is not associated with the
+ // DIE (since the debug node will be associated with the concrete DIE, if
+ // any). It could be refactored to some common utility function.
+ else if (auto *SPDecl = SP->getDeclaration()) {
+ ContextDIE = &getUnitDie();
+ getOrCreateSubprogramDIE(SPDecl);
+ } else
+ ContextDIE = getOrCreateContextDIE(resolve(SP->getScope()));
+
+ // Passing null as the associated node because the abstract definition
+ // shouldn't be found by lookup.
+ AbsDef = &createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, nullptr);
+ applySubprogramAttributesToDefinition(SP, *AbsDef);
+
+ if (!includeMinimalInlineScopes())
+ addUInt(*AbsDef, dwarf::DW_AT_inline, None, dwarf::DW_INL_inlined);
+ if (DIE *ObjectPointer = createAndAddScopeChildren(Scope, *AbsDef))
+ addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
+}
+
+DIE *DwarfCompileUnit::constructImportedEntityDIE(
+ const DIImportedEntity *Module) {
+ DIE *IMDie = DIE::get(DIEValueAllocator, (dwarf::Tag)Module->getTag());
+ insertDIE(Module, IMDie);
+ DIE *EntityDie;
+ auto *Entity = resolve(Module->getEntity());
+ if (auto *NS = dyn_cast<DINamespace>(Entity))
+ EntityDie = getOrCreateNameSpace(NS);
+ else if (auto *M = dyn_cast<DIModule>(Entity))
+ EntityDie = getOrCreateModule(M);
+ else if (auto *SP = dyn_cast<DISubprogram>(Entity))
+ EntityDie = getOrCreateSubprogramDIE(SP);
+ else if (auto *T = dyn_cast<DIType>(Entity))
+ EntityDie = getOrCreateTypeDIE(T);
+ else if (auto *GV = dyn_cast<DIGlobalVariable>(Entity))
+ EntityDie = getOrCreateGlobalVariableDIE(GV);
+ else
+ EntityDie = getDIE(Entity);
+ assert(EntityDie);
+ addSourceLine(*IMDie, Module->getLine(), Module->getScope()->getFilename(),
+ Module->getScope()->getDirectory());
+ addDIEEntry(*IMDie, dwarf::DW_AT_import, *EntityDie);
+ StringRef Name = Module->getName();
+ if (!Name.empty())
+ addString(*IMDie, dwarf::DW_AT_name, Name);
+
+ return IMDie;
+}
+
+void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
+ DIE *D = getDIE(SP);
+ if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+ if (D)
+ // If this subprogram has an abstract definition, reference that
+ addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
+ } else {
+ if (!D && !includeMinimalInlineScopes())
+ // Lazily construct the subprogram if we didn't see either concrete or
+ // inlined versions during codegen (except in -gmlt, where we want
+ // to omit these entirely)
+ D = getOrCreateSubprogramDIE(SP);
+ if (D)
+ // And attach the attributes
+ applySubprogramAttributesToDefinition(SP, *D);
+ }
+}
+
+void DwarfCompileUnit::emitHeader(bool UseOffsets) {
+ // Don't bother labeling the .dwo unit, as its offset isn't used.
+ if (!Skeleton) {
+ LabelBegin = Asm->createTempSymbol("cu_begin");
+ Asm->OutStreamer->EmitLabel(LabelBegin);
+ }
+
+ DwarfUnit::emitHeader(UseOffsets);
+}
+
+/// addGlobalName - Add a new global name to the compile unit.
+void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+ const DIScope *Context) {
+ if (includeMinimalInlineScopes())
+ return;
+ std::string FullName = getParentContextString(Context) + Name.str();
+ GlobalNames[FullName] = &Die;
+}
+
+/// Add a new global type to the unit.
+void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {
+ if (includeMinimalInlineScopes())
+ return;
+ std::string FullName = getParentContextString(Context) + Ty->getName().str();
+ GlobalTypes[FullName] = &Die;
+}
+
+/// addVariableAddress - Add DW_AT_location attribute for a
+/// DbgVariable based on provided MachineLocation.
+void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
+ MachineLocation Location) {
+ if (DV.hasComplexAddress())
+ addComplexAddress(DV, Die, dwarf::DW_AT_location, Location);
+ else if (DV.isBlockByrefVariable())
+ addBlockByrefAddress(DV, Die, dwarf::DW_AT_location, Location);
+ else
+ addAddress(Die, dwarf::DW_AT_location, Location);
+}
+
+/// Add an address attribute to a die based on the location provided.
+void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MachineLocation &Location) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+
+ bool validReg;
+ if (Location.isReg())
+ validReg = addRegisterOpPiece(*Loc, Location.getReg());
+ else
+ validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+ if (!validReg)
+ return;
+
+ // Now attach the location information to the DIE.
+ addBlock(Die, Attribute, Loc);
+}
+
+/// Start with the address based on the location provided, and generate the
+/// DWARF information necessary to find the actual variable given the extra
+/// address information encoded in the DbgVariable, starting from the starting
+/// location. Add the DWARF information to the die.
+void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ const DIExpression *Expr = DV.getSingleExpression();
+ bool ValidReg;
+ const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+ if (Location.getOffset()) {
+ ValidReg = DwarfExpr.AddMachineRegIndirect(TRI, Location.getReg(),
+ Location.getOffset());
+ if (ValidReg)
+ DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end());
+ } else
+ ValidReg = DwarfExpr.AddMachineRegExpression(TRI, Expr, Location.getReg());
+
+ // Now attach the location information to the DIE.
+ if (ValidReg)
+ addBlock(Die, Attribute, Loc);
+}
+
+/// Add a Dwarf loclistptr attribute data and value.
+void DwarfCompileUnit::addLocationList(DIE &Die, dwarf::Attribute Attribute,
+ unsigned Index) {
+ dwarf::Form Form = DD->getDwarfVersion() >= 4 ? dwarf::DW_FORM_sec_offset
+ : dwarf::DW_FORM_data4;
+ Die.addValue(DIEValueAllocator, Attribute, Form, DIELocList(Index));
+}
+
+void DwarfCompileUnit::applyVariableAttributes(const DbgVariable &Var,
+ DIE &VariableDie) {
+ StringRef Name = Var.getName();
+ if (!Name.empty())
+ addString(VariableDie, dwarf::DW_AT_name, Name);
+ addSourceLine(VariableDie, Var.getVariable());
+ addType(VariableDie, Var.getType());
+ if (Var.isArtificial())
+ addFlag(VariableDie, dwarf::DW_AT_artificial);
+}
+
+/// Add a Dwarf expression attribute data and value.
+void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
+ const MCExpr *Expr) {
+ Die.addValue(DIEValueAllocator, (dwarf::Attribute)0, Form, DIEExpr(Expr));
+}
+
+void DwarfCompileUnit::applySubprogramAttributesToDefinition(
+ const DISubprogram *SP, DIE &SPDie) {
+ auto *SPDecl = SP->getDeclaration();
+ auto *Context = resolve(SPDecl ? SPDecl->getScope() : SP->getScope());
+ applySubprogramAttributes(SP, SPDie, includeMinimalInlineScopes());
+ addGlobalName(SP->getName(), SPDie, Context);
+}
+
+bool DwarfCompileUnit::isDwoUnit() const {
+ return DD->useSplitDwarf() && Skeleton;
+}
+
+bool DwarfCompileUnit::includeMinimalInlineScopes() const {
+ return getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly ||
+ (DD->useSplitDwarf() && !Skeleton);
+}
+} // end llvm namespace
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
new file mode 100644
index 000000000000..90f74a3686ea
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -0,0 +1,265 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFCOMPILEUNIT_H
+
+#include "DwarfUnit.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Dwarf.h"
+
+namespace llvm {
+
+class StringRef;
+class AsmPrinter;
+class DIE;
+class DwarfDebug;
+class DwarfFile;
+class MCSymbol;
+class LexicalScope;
+
+class DwarfCompileUnit : public DwarfUnit {
+ /// A numeric ID unique among all CUs in the module
+ unsigned UniqueID;
+
+ /// Offset of the UnitDie from beginning of debug info section.
+ unsigned DebugInfoOffset = 0;
+
+ /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
+ /// the need to search for it in applyStmtList.
+ DIE::value_iterator StmtListValue;
+
+ /// Skeleton unit associated with this unit.
+ DwarfCompileUnit *Skeleton;
+
+ /// The start of the unit within its section.
+ MCSymbol *LabelBegin;
+
+ /// The start of the unit macro info within macro section.
+ MCSymbol *MacroLabelBegin;
+
+ typedef llvm::SmallVector<const MDNode *, 8> ImportedEntityList;
+ typedef llvm::DenseMap<const MDNode *, ImportedEntityList>
+ ImportedEntityMap;
+
+ ImportedEntityMap ImportedEntities;
+
+ /// GlobalNames - A map of globally visible named entities for this unit.
+ StringMap<const DIE *> GlobalNames;
+
+ /// GlobalTypes - A map of globally visible types for this unit.
+ StringMap<const DIE *> GlobalTypes;
+
+ // List of range lists for a given compile unit, separate from the ranges for
+ // the CU itself.
+ SmallVector<RangeSpanList, 1> CURangeLists;
+
+ // List of ranges for a given compile unit.
+ SmallVector<RangeSpan, 2> CURanges;
+
+ // The base address of this unit, if any. Used for relative references in
+ // ranges/locs.
+ const MCSymbol *BaseAddress;
+
+ /// \brief Construct a DIE for the given DbgVariable without initializing the
+ /// DbgVariable's DIE reference.
+ DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract);
+
+ bool isDwoUnit() const override;
+
+ bool includeMinimalInlineScopes() const;
+
+public:
+ DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A,
+ DwarfDebug *DW, DwarfFile *DWU);
+
+ unsigned getUniqueID() const { return UniqueID; }
+ unsigned getDebugInfoOffset() const { return DebugInfoOffset; }
+ void setDebugInfoOffset(unsigned DbgInfoOff) { DebugInfoOffset = DbgInfoOff; }
+
+ DwarfCompileUnit *getSkeleton() const {
+ return Skeleton;
+ }
+
+ void initStmtList();
+
+ /// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
+ void applyStmtList(DIE &D);
+
+ /// getOrCreateGlobalVariableDIE - get or create global variable DIE.
+ DIE *getOrCreateGlobalVariableDIE(const DIGlobalVariable *GV);
+
+ /// addLabelAddress - Add a dwarf label attribute data and value using
+ /// either DW_FORM_addr or DW_FORM_GNU_addr_index.
+ void addLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label);
+
+ /// addLocalLabelAddress - Add a dwarf label attribute data and value using
+ /// DW_FORM_addr only.
+ void addLocalLabelAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label);
+
+ /// addSectionDelta - Add a label delta attribute data and value.
+ DIE::value_iterator addSectionDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo);
+
+ DwarfCompileUnit &getCU() override { return *this; }
+
+ unsigned getOrCreateSourceID(StringRef FileName, StringRef DirName) override;
+
+ void addImportedEntity(const DIImportedEntity* IE) {
+ DIScope *Scope = IE->getScope();
+ assert(Scope && "Invalid Scope encoding!");
+ if (!isa<DILocalScope>(Scope))
+ // No need to add imported entities that are not local declarations.
+ return;
+
+ auto *LocalScope = cast<DILocalScope>(Scope)->getNonLexicalBlockFileScope();
+ ImportedEntities[LocalScope].push_back(IE);
+ }
+
+ /// addRange - Add an address range to the list of ranges for this unit.
+ void addRange(RangeSpan Range);
+
+ void attachLowHighPC(DIE &D, const MCSymbol *Begin, const MCSymbol *End);
+
+ /// addSectionLabel - Add a Dwarf section label attribute data and value.
+ ///
+ DIE::value_iterator addSectionLabel(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Label,
+ const MCSymbol *Sec);
+
+ /// \brief Find DIE for the given subprogram and attach appropriate
+ /// DW_AT_low_pc and DW_AT_high_pc attributes. If there are global
+ /// variables in this scope then create and insert DIEs for these
+ /// variables.
+ DIE &updateSubprogramScopeDIE(const DISubprogram *SP);
+
+ void constructScopeDIE(LexicalScope *Scope,
+ SmallVectorImpl<DIE *> &FinalChildren);
+
+ /// \brief A helper function to construct a RangeSpanList for a given
+ /// lexical scope.
+ void addScopeRangeList(DIE &ScopeDIE, SmallVector<RangeSpan, 2> Range);
+
+ void attachRangesOrLowHighPC(DIE &D, SmallVector<RangeSpan, 2> Ranges);
+
+ void attachRangesOrLowHighPC(DIE &D,
+ const SmallVectorImpl<InsnRange> &Ranges);
+ /// \brief This scope represents inlined body of a function. Construct
+ /// DIE to represent this concrete inlined copy of the function.
+ DIE *constructInlinedScopeDIE(LexicalScope *Scope);
+
+ /// \brief Construct new DW_TAG_lexical_block for this scope and
+ /// attach DW_AT_low_pc/DW_AT_high_pc labels.
+ DIE *constructLexicalScopeDIE(LexicalScope *Scope);
+
+ /// constructVariableDIE - Construct a DIE for the given DbgVariable.
+ DIE *constructVariableDIE(DbgVariable &DV, bool Abstract = false);
+
+ DIE *constructVariableDIE(DbgVariable &DV, const LexicalScope &Scope,
+ DIE *&ObjectPointer);
+
+ /// A helper function to create children of a Scope DIE.
+ DIE *createScopeChildrenDIE(LexicalScope *Scope,
+ SmallVectorImpl<DIE *> &Children,
+ unsigned *ChildScopeCount = nullptr);
+
+ /// \brief Construct a DIE for this subprogram scope.
+ void constructSubprogramScopeDIE(LexicalScope *Scope);
+
+ DIE *createAndAddScopeChildren(LexicalScope *Scope, DIE &ScopeDIE);
+
+ void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+
+ /// \brief Construct import_module DIE.
+ DIE *constructImportedEntityDIE(const DIImportedEntity *Module);
+
+ void finishSubprogramDefinition(const DISubprogram *SP);
+
+ /// Set the skeleton unit associated with this unit.
+ void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; }
+
+ const MCSymbol *getSectionSym() const {
+ assert(Section);
+ return Section->getBeginSymbol();
+ }
+
+ unsigned getLength() {
+ return sizeof(uint32_t) + // Length field
+ getHeaderSize() + UnitDie.getSize();
+ }
+
+ void emitHeader(bool UseOffsets) override;
+
+ MCSymbol *getLabelBegin() const {
+ assert(Section);
+ return LabelBegin;
+ }
+
+ MCSymbol *getMacroLabelBegin() const {
+ return MacroLabelBegin;
+ }
+
+ /// Add a new global name to the compile unit.
+ void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override;
+
+ /// Add a new global type to the compile unit.
+ void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) override;
+
+ const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
+ const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
+
+ /// Add DW_AT_location attribute for a DbgVariable based on provided
+ /// MachineLocation.
+ void addVariableAddress(const DbgVariable &DV, DIE &Die,
+ MachineLocation Location);
+ /// Add an address attribute to a die based on the location provided.
+ void addAddress(DIE &Die, dwarf::Attribute Attribute,
+ const MachineLocation &Location);
+
+ /// Start with the address based on the location provided, and generate the
+ /// DWARF information necessary to find the actual variable (navigating the
+ /// extra location information encoded in the type) based on the starting
+ /// location. Add the DWARF information to the die.
+ void addComplexAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location);
+
+ /// Add a Dwarf loclistptr attribute data and value.
+ void addLocationList(DIE &Die, dwarf::Attribute Attribute, unsigned Index);
+ void applyVariableAttributes(const DbgVariable &Var, DIE &VariableDie);
+
+ /// Add a Dwarf expression attribute data and value.
+ void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
+
+ void applySubprogramAttributesToDefinition(const DISubprogram *SP,
+ DIE &SPDie);
+
+ /// getRangeLists - Get the vector of range lists.
+ const SmallVectorImpl<RangeSpanList> &getRangeLists() const {
+ return (Skeleton ? Skeleton : this)->CURangeLists;
+ }
+
+ /// getRanges - Get the list of ranges for this unit.
+ const SmallVectorImpl<RangeSpan> &getRanges() const { return CURanges; }
+ SmallVector<RangeSpan, 2> takeRanges() { return std::move(CURanges); }
+
+ void setBaseAddress(const MCSymbol *Base) { BaseAddress = Base; }
+ const MCSymbol *getBaseAddress() const { return BaseAddress; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
new file mode 100644
index 000000000000..7fba7688f7fb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -0,0 +1,1970 @@
+//===-- llvm/CodeGen/DwarfDebug.cpp - Dwarf Debug Framework ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfDebug.h"
+#include "ByteStreamer.h"
+#include "DIEHash.h"
+#include "DebugLocEntry.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfExpression.h"
+#include "DwarfUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+static cl::opt<bool>
+DisableDebugInfoPrinting("disable-debug-info-print", cl::Hidden,
+ cl::desc("Disable debug info printing"));
+
+static cl::opt<bool> UnknownLocations(
+ "use-unknown-locations", cl::Hidden,
+ cl::desc("Make an absence of debug location information explicit."),
+ cl::init(false));
+
+static cl::opt<bool>
+GenerateGnuPubSections("generate-gnu-dwarf-pub-sections", cl::Hidden,
+ cl::desc("Generate GNU-style pubnames and pubtypes"),
+ cl::init(false));
+
+static cl::opt<bool> GenerateARangeSection("generate-arange-section",
+ cl::Hidden,
+ cl::desc("Generate dwarf aranges"),
+ cl::init(false));
+
+namespace {
+enum DefaultOnOff { Default, Enable, Disable };
+}
+
+static cl::opt<DefaultOnOff>
+DwarfAccelTables("dwarf-accel-tables", cl::Hidden,
+ cl::desc("Output prototype dwarf accelerator tables."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"), clEnumValEnd),
+ cl::init(Default));
+
+static cl::opt<DefaultOnOff>
+SplitDwarf("split-dwarf", cl::Hidden,
+ cl::desc("Output DWARF5 split debug info."),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"), clEnumValEnd),
+ cl::init(Default));
+
+static cl::opt<DefaultOnOff>
+DwarfPubSections("generate-dwarf-pub-sections", cl::Hidden,
+ cl::desc("Generate DWARF pubnames and pubtypes sections"),
+ cl::values(clEnumVal(Default, "Default for platform"),
+ clEnumVal(Enable, "Enabled"),
+ clEnumVal(Disable, "Disabled"), clEnumValEnd),
+ cl::init(Default));
+
+enum LinkageNameOption {
+ DefaultLinkageNames,
+ AllLinkageNames,
+ AbstractLinkageNames
+};
+static cl::opt<LinkageNameOption>
+ DwarfLinkageNames("dwarf-linkage-names", cl::Hidden,
+ cl::desc("Which DWARF linkage-name attributes to emit."),
+ cl::values(clEnumValN(DefaultLinkageNames, "Default",
+ "Default for platform"),
+ clEnumValN(AllLinkageNames, "All", "All"),
+ clEnumValN(AbstractLinkageNames, "Abstract",
+ "Abstract subprograms"),
+ clEnumValEnd),
+ cl::init(DefaultLinkageNames));
+
+static const char *const DWARFGroupName = "DWARF Emission";
+static const char *const DbgTimerName = "DWARF Debug Writer";
+
+void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+ BS.EmitInt8(
+ Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
+ : dwarf::OperationEncodingString(Op));
+}
+
+void DebugLocDwarfExpression::EmitSigned(int64_t Value) {
+ BS.EmitSLEB128(Value, Twine(Value));
+}
+
+void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) {
+ BS.EmitULEB128(Value, Twine(Value));
+}
+
+bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) {
+ // This information is not available while emitting .debug_loc entries.
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+
+bool DbgVariable::isBlockByrefVariable() const {
+ assert(Var && "Invalid complex DbgVariable!");
+ return Var->getType().resolve()->isBlockByrefStruct();
+}
+
+const DIType *DbgVariable::getType() const {
+ DIType *Ty = Var->getType().resolve();
+ // FIXME: isBlockByrefVariable should be reformulated in terms of complex
+ // addresses instead.
+ if (Ty->isBlockByrefStruct()) {
+ /* Byref variables, in Blocks, are declared by the programmer as
+ "SomeType VarName;", but the compiler creates a
+ __Block_byref_x_VarName struct, and gives the variable VarName
+ either the struct, or a pointer to the struct, as its type. This
+ is necessary for various behind-the-scenes things the compiler
+ needs to do with by-reference variables in blocks.
+
+ However, as far as the original *programmer* is concerned, the
+ variable should still have type 'SomeType', as originally declared.
+
+ The following function dives into the __Block_byref_x_VarName
+ struct to find the original type of the variable. This will be
+ passed back to the code generating the type for the Debug
+ Information Entry for the variable 'VarName'. 'VarName' will then
+ have the original type 'SomeType' in its debug information.
+
+ The original type 'SomeType' will be the type of the field named
+ 'VarName' inside the __Block_byref_x_VarName struct.
+
+ NOTE: In order for this to not completely fail on the debugger
+ side, the Debug Information Entry for the variable VarName needs to
+ have a DW_AT_location that tells the debugger how to unwind through
+ the pointers and __Block_byref_x_VarName struct to find the actual
+ value of the variable. The function addBlockByrefType does this. */
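+ // Illustrative sketch of such a compiler-generated struct (the exact
+ // layout varies and may include copy/dispose helpers):
+ //   struct __Block_byref_x_VarName {
+ //     void *__isa;
+ //     struct __Block_byref_x_VarName *__forwarding;
+ //     int __flags;
+ //     int __size;
+ //     SomeType VarName; // the field the loop below searches for by name
+ //   };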
+ DIType *subType = Ty;
+ uint16_t tag = Ty->getTag();
+
+ if (tag == dwarf::DW_TAG_pointer_type)
+ subType = resolve(cast<DIDerivedType>(Ty)->getBaseType());
+
+ auto Elements = cast<DICompositeType>(subType)->getElements();
+ for (unsigned i = 0, N = Elements.size(); i < N; ++i) {
+ auto *DT = cast<DIDerivedType>(Elements[i]);
+ if (getName() == DT->getName())
+ return resolve(DT->getBaseType());
+ }
+ }
+ return Ty;
+}
+
+static LLVM_CONSTEXPR DwarfAccelTable::Atom TypeAtoms[] = {
+ DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
+ DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
+ DwarfAccelTable::Atom(dwarf::DW_ATOM_type_flags, dwarf::DW_FORM_data1)};
+
+DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
+ : DebugHandlerBase(A), DebugLocs(A->OutStreamer->isVerboseAsm()),
+ InfoHolder(A, "info_string", DIEValueAllocator),
+ SkeletonHolder(A, "skel_string", DIEValueAllocator),
+ IsDarwin(Triple(A->getTargetTriple()).isOSDarwin()),
+ AccelNames(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
+ dwarf::DW_FORM_data4)),
+ AccelObjC(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
+ dwarf::DW_FORM_data4)),
+ AccelNamespace(DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset,
+ dwarf::DW_FORM_data4)),
+ AccelTypes(TypeAtoms), DebuggerTuning(DebuggerKind::Default) {
+
+ CurFn = nullptr;
+ Triple TT(Asm->getTargetTriple());
+
+ // Make sure we know our "debugger tuning." The target option takes
+ // precedence; fall back to triple-based defaults.
+ if (Asm->TM.Options.DebuggerTuning != DebuggerKind::Default)
+ DebuggerTuning = Asm->TM.Options.DebuggerTuning;
+ else if (IsDarwin)
+ DebuggerTuning = DebuggerKind::LLDB;
+ else if (TT.isPS4CPU())
+ DebuggerTuning = DebuggerKind::SCE;
+ else
+ DebuggerTuning = DebuggerKind::GDB;
+
+ // Turn on accelerator tables for LLDB by default.
+ if (DwarfAccelTables == Default)
+ HasDwarfAccelTables = tuneForLLDB();
+ else
+ HasDwarfAccelTables = DwarfAccelTables == Enable;
+
+ HasAppleExtensionAttributes = tuneForLLDB();
+
+ // Handle split DWARF. Off by default for now.
+ if (SplitDwarf == Default)
+ HasSplitDwarf = false;
+ else
+ HasSplitDwarf = SplitDwarf == Enable;
+
+ // Pubnames/pubtypes on by default for GDB.
+ if (DwarfPubSections == Default)
+ HasDwarfPubSections = tuneForGDB();
+ else
+ HasDwarfPubSections = DwarfPubSections == Enable;
+
+ // SCE defaults to linkage names only for abstract subprograms.
+ if (DwarfLinkageNames == DefaultLinkageNames)
+ UseAllLinkageNames = !tuneForSCE();
+ else
+ UseAllLinkageNames = DwarfLinkageNames == AllLinkageNames;
+
+ unsigned DwarfVersionNumber = Asm->TM.Options.MCOptions.DwarfVersion;
+ DwarfVersion = DwarfVersionNumber ? DwarfVersionNumber
+ : MMI->getModule()->getDwarfVersion();
+ // Use dwarf 4 by default if nothing is requested.
+ DwarfVersion = DwarfVersion ? DwarfVersion : dwarf::DWARF_VERSION;
+
+ // Work around a GDB bug. GDB doesn't support the standard opcode;
+ // SCE doesn't support GNU's; LLDB prefers the standard opcode, which
+ // is defined as of DWARF 3.
+ // See GDB bug 11616 - DW_OP_form_tls_address is unimplemented
+ // https://sourceware.org/bugzilla/show_bug.cgi?id=11616
+ UseGNUTLSOpcode = tuneForGDB() || DwarfVersion < 3;
+
+ // GDB does not fully support the DWARF 4 representation for bitfields.
+ UseDWARF2Bitfields = (DwarfVersion < 4) || tuneForGDB();
+
+ Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
+}
+
+// Define out of line so we don't have to include DwarfUnit.h in DwarfDebug.h.
+DwarfDebug::~DwarfDebug() { }
+
+static bool isObjCClass(StringRef Name) {
+ return Name.startswith("+") || Name.startswith("-");
+}
+
+static bool hasObjCCategory(StringRef Name) {
+ if (!isObjCClass(Name))
+ return false;
+
+ return Name.find(") ") != StringRef::npos;
+}
+
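+// For an ObjC method symbol such as "-[NSString(MyCategory) compare:]"
+// (illustrative example), the helpers below yield Class = "NSString",
+// Category = "NSString(MyCategory)", and the method name "compare:".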
+static void getObjCClassCategory(StringRef In, StringRef &Class,
+ StringRef &Category) {
+ if (!hasObjCCategory(In)) {
+ Class = In.slice(In.find('[') + 1, In.find(' '));
+ Category = "";
+ return;
+ }
+
+ Class = In.slice(In.find('[') + 1, In.find('('));
+ Category = In.slice(In.find('[') + 1, In.find(' '));
+}
+
+static StringRef getObjCMethodName(StringRef In) {
+ return In.slice(In.find(' ') + 1, In.find(']'));
+}
+
+// Add the various names to the Dwarf accelerator table names.
+// TODO: Determine whether or not we should add names for programs
+// that do not have a DW_AT_name or DW_AT_linkage_name field - this
+// is only slightly different than the lookup of non-standard ObjC names.
+void DwarfDebug::addSubprogramNames(const DISubprogram *SP, DIE &Die) {
+ if (!SP->isDefinition())
+ return;
+ addAccelName(SP->getName(), Die);
+
+ // If the linkage name is different than the name, go ahead and output
+ // that as well into the name table.
+ if (SP->getLinkageName() != "" && SP->getName() != SP->getLinkageName())
+ addAccelName(SP->getLinkageName(), Die);
+
+ // If this is an Objective-C selector name add it to the ObjC accelerator
+ // too.
+ if (isObjCClass(SP->getName())) {
+ StringRef Class, Category;
+ getObjCClassCategory(SP->getName(), Class, Category);
+ addAccelObjC(Class, Die);
+ if (Category != "")
+ addAccelObjC(Category, Die);
+ // Also add the base method name to the name table.
+ addAccelName(getObjCMethodName(SP->getName()), Die);
+ }
+}
+
+/// Check whether we should create a DIE for the given Scope, return true
+/// if we don't create a DIE (the corresponding DIE is null).
+bool DwarfDebug::isLexicalScopeDIENull(LexicalScope *Scope) {
+ if (Scope->isAbstractScope())
+ return false;
+
+ // We don't create a DIE if there is no Range.
+ const SmallVectorImpl<InsnRange> &Ranges = Scope->getRanges();
+ if (Ranges.empty())
+ return true;
+
+ if (Ranges.size() > 1)
+ return false;
+
+ // We don't create a DIE if we have a single Range and the end label
+ // is null.
+ return !getLabelAfterInsn(Ranges.front().second);
+}
+
+template <typename Func> void forBothCUs(DwarfCompileUnit &CU, Func F) {
+ F(CU);
+ if (auto *SkelCU = CU.getSkeleton())
+ F(*SkelCU);
+}
+
+void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) {
+ assert(Scope && Scope->getScopeNode());
+ assert(Scope->isAbstractScope());
+ assert(!Scope->getInlinedAt());
+
+ const MDNode *SP = Scope->getScopeNode();
+
+ ProcessedSPNodes.insert(SP);
+
+ // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
+ // was inlined from another compile unit.
+ auto &CU = *CUMap.lookup(cast<DISubprogram>(SP)->getUnit());
+ forBothCUs(CU, [&](DwarfCompileUnit &CU) {
+ CU.constructAbstractSubprogramScopeDIE(Scope);
+ });
+}
+
+void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const {
+ if (!GenerateGnuPubSections)
+ return;
+
+ U.addFlag(D, dwarf::DW_AT_GNU_pubnames);
+}
+
+// Create new DwarfCompileUnit for the given metadata node with tag
+// DW_TAG_compile_unit.
+DwarfCompileUnit &
+DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
+ StringRef FN = DIUnit->getFilename();
+ CompilationDir = DIUnit->getDirectory();
+
+ auto OwnedUnit = make_unique<DwarfCompileUnit>(
+ InfoHolder.getUnits().size(), DIUnit, Asm, this, &InfoHolder);
+ DwarfCompileUnit &NewCU = *OwnedUnit;
+ DIE &Die = NewCU.getUnitDie();
+ InfoHolder.addUnit(std::move(OwnedUnit));
+ if (useSplitDwarf()) {
+ NewCU.setSkeleton(constructSkeletonCU(NewCU));
+ NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
+ DIUnit->getSplitDebugFilename());
+ }
+
+ // LTO with assembly output shares a single line table amongst multiple CUs.
+ // To avoid the compilation directory being ambiguous, let the line table
+ // explicitly describe the directory of all files, never relying on the
+ // compilation directory.
+ if (!Asm->OutStreamer->hasRawTextSupport() || SingleCU)
+ Asm->OutStreamer->getContext().setMCLineTableCompilationDir(
+ NewCU.getUniqueID(), CompilationDir);
+
+ NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer());
+ NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+ DIUnit->getSourceLanguage());
+ NewCU.addString(Die, dwarf::DW_AT_name, FN);
+
+ if (!useSplitDwarf()) {
+ NewCU.initStmtList();
+
+ // If we're using split dwarf the compilation dir is going to be in the
+ // skeleton CU and so we don't need to duplicate it here.
+ if (!CompilationDir.empty())
+ NewCU.addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+ addGnuPubAttributes(NewCU, Die);
+ }
+
+ if (useAppleExtensionAttributes()) {
+ if (DIUnit->isOptimized())
+ NewCU.addFlag(Die, dwarf::DW_AT_APPLE_optimized);
+
+ StringRef Flags = DIUnit->getFlags();
+ if (!Flags.empty())
+ NewCU.addString(Die, dwarf::DW_AT_APPLE_flags, Flags);
+
+ if (unsigned RVer = DIUnit->getRuntimeVersion())
+ NewCU.addUInt(Die, dwarf::DW_AT_APPLE_major_runtime_vers,
+ dwarf::DW_FORM_data1, RVer);
+ }
+
+ if (useSplitDwarf())
+ NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoDWOSection());
+ else
+ NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection());
+
+ if (DIUnit->getDWOId()) {
+ // This CU is either a clang module DWO or a skeleton CU.
+ NewCU.addUInt(Die, dwarf::DW_AT_GNU_dwo_id, dwarf::DW_FORM_data8,
+ DIUnit->getDWOId());
+ if (!DIUnit->getSplitDebugFilename().empty())
+ // This is a prefabricated skeleton CU.
+ NewCU.addString(Die, dwarf::DW_AT_GNU_dwo_name,
+ DIUnit->getSplitDebugFilename());
+ }
+
+ CUMap.insert(std::make_pair(DIUnit, &NewCU));
+ CUDieMap.insert(std::make_pair(&Die, &NewCU));
+ return NewCU;
+}
+
+void DwarfDebug::constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
+ const DIImportedEntity *N) {
+ if (DIE *D = TheCU.getOrCreateContextDIE(N->getScope()))
+ D->addChild(TheCU.constructImportedEntityDIE(N));
+}
+
+// Emit all Dwarf sections that should come prior to the content. Create
+// global DIEs and emit initial debug info sections. This is invoked by
+// the target AsmPrinter.
+void DwarfDebug::beginModule() {
+ NamedRegionTimer T(DbgTimerName, DWARFGroupName, TimePassesIsEnabled);
+ if (DisableDebugInfoPrinting)
+ return;
+
+ const Module *M = MMI->getModule();
+
+ unsigned NumDebugCUs = std::distance(M->debug_compile_units_begin(),
+ M->debug_compile_units_end());
+ // Tell MMI whether we have debug info.
+ MMI->setDebugInfoAvailability(NumDebugCUs > 0);
+ SingleCU = NumDebugCUs == 1;
+
+ for (DICompileUnit *CUNode : M->debug_compile_units()) {
+ DwarfCompileUnit &CU = constructDwarfCompileUnit(CUNode);
+ for (auto *IE : CUNode->getImportedEntities())
+ CU.addImportedEntity(IE);
+ for (auto *GV : CUNode->getGlobalVariables())
+ CU.getOrCreateGlobalVariableDIE(GV);
+ for (auto *Ty : CUNode->getEnumTypes()) {
+ // The enum types array by design contains pointers to
+ // MDNodes rather than DIRefs. Unique them here.
+ CU.getOrCreateTypeDIE(cast<DIType>(Ty));
+ }
+ for (auto *Ty : CUNode->getRetainedTypes()) {
+ // The retained types array by design contains pointers to
+ // MDNodes rather than DIRefs. Unique them here.
+ if (DIType *RT = dyn_cast<DIType>(Ty))
+ if (!RT->isExternalTypeRef())
+ // There is no point in force-emitting a forward declaration.
+ CU.getOrCreateTypeDIE(RT);
+ }
+ // Emit imported_modules last so that the relevant context is already
+ // available.
+ for (auto *IE : CUNode->getImportedEntities())
+ constructAndAddImportedEntityDIE(CU, IE);
+ }
+}
+
+void DwarfDebug::finishVariableDefinitions() {
+ for (const auto &Var : ConcreteVariables) {
+ DIE *VariableDie = Var->getDIE();
+ assert(VariableDie);
+ // FIXME: Consider the time-space tradeoff of just storing the unit pointer
+ // in the ConcreteVariables list, rather than looking it up again here.
+ // DIE::getUnit isn't simple - it walks parent pointers, etc.
+ DwarfCompileUnit *Unit = lookupUnit(VariableDie->getUnit());
+ assert(Unit);
+ DbgVariable *AbsVar = getExistingAbstractVariable(
+ InlinedVariable(Var->getVariable(), Var->getInlinedAt()));
+ if (AbsVar && AbsVar->getDIE()) {
+ Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+ *AbsVar->getDIE());
+ } else
+ Unit->applyVariableAttributes(*Var, *VariableDie);
+ }
+}
+
+void DwarfDebug::finishSubprogramDefinitions() {
+ for (auto &F : MMI->getModule()->functions())
+ if (auto *SP = F.getSubprogram())
+ if (ProcessedSPNodes.count(SP) &&
+ SP->getUnit()->getEmissionKind() != DICompileUnit::NoDebug)
+ forBothCUs(*CUMap.lookup(SP->getUnit()), [&](DwarfCompileUnit &CU) {
+ CU.finishSubprogramDefinition(SP);
+ });
+}
+
+void DwarfDebug::finalizeModuleInfo() {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+
+ finishSubprogramDefinitions();
+
+ finishVariableDefinitions();
+
+ // Handle anything that needs to be done on a per-unit basis after
+ // all other generation.
+ for (const auto &P : CUMap) {
+ auto &TheCU = *P.second;
+ // Emit DW_AT_containing_type attribute to connect types with their
+ // vtable holding type.
+ TheCU.constructContainingTypeDIEs();
+
+ // Add CU specific attributes if we need to add any.
+ // If we're splitting the dwarf out now that we've got the entire
+ // CU then add the dwo id to it.
+ auto *SkCU = TheCU.getSkeleton();
+ if (useSplitDwarf()) {
+ // Emit a unique identifier for this CU.
+ uint64_t ID = DIEHash(Asm).computeCUSignature(TheCU.getUnitDie());
+ TheCU.addUInt(TheCU.getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+ SkCU->addUInt(SkCU->getUnitDie(), dwarf::DW_AT_GNU_dwo_id,
+ dwarf::DW_FORM_data8, ID);
+
+ // We don't keep track of which addresses are used in which CU so this
+ // is a bit pessimistic under LTO.
+ if (!AddrPool.isEmpty()) {
+ const MCSymbol *Sym = TLOF.getDwarfAddrSection()->getBeginSymbol();
+ SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_addr_base,
+ Sym, Sym);
+ }
+ if (!SkCU->getRangeLists().empty()) {
+ const MCSymbol *Sym = TLOF.getDwarfRangesSection()->getBeginSymbol();
+ SkCU->addSectionLabel(SkCU->getUnitDie(), dwarf::DW_AT_GNU_ranges_base,
+ Sym, Sym);
+ }
+ }
+
+ // If we have code split among multiple sections or non-contiguous
+ // ranges of code then emit a DW_AT_ranges attribute on the unit that will
+ // remain in the .o file, otherwise add a DW_AT_low_pc.
+ // FIXME: We should use ranges to allow reordering of code, a la
+ // .subsections_via_symbols in Mach-O. This would mean turning on
+ // ranges for all subprogram DIEs for Mach-O.
+ DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+ if (unsigned NumRanges = TheCU.getRanges().size()) {
+ if (NumRanges > 1)
+ // A DW_AT_low_pc attribute may also be specified in combination with
+ // DW_AT_ranges to specify the default base address for use in
+ // location lists (see Section 2.6.2) and range lists (see Section
+ // 2.17.3).
+ U.addUInt(U.getUnitDie(), dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr, 0);
+ else
+ U.setBaseAddress(TheCU.getRanges().front().getStart());
+ U.attachRangesOrLowHighPC(U.getUnitDie(), TheCU.takeRanges());
+ }
+
+ auto *CUNode = cast<DICompileUnit>(P.first);
+ // If compile Unit has macros, emit "DW_AT_macro_info" attribute.
+ if (CUNode->getMacros())
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoSection()->getBeginSymbol());
+ }
+
+ // Compute DIE offsets and sizes.
+ InfoHolder.computeSizeAndOffsets();
+ if (useSplitDwarf())
+ SkeletonHolder.computeSizeAndOffsets();
+}
+
+// Emit all Dwarf sections that should come after the content.
+void DwarfDebug::endModule() {
+ assert(CurFn == nullptr);
+ assert(CurMI == nullptr);
+
+ // If we aren't actually generating debug info (see beginModule, which is
+ // conditionalized on !DisableDebugInfoPrinting and the presence of the
+ // llvm.dbg.cu metadata node), there is nothing to emit.
+ if (!MMI->hasDebugInfo())
+ return;
+
+ // Finalize the debug info for the module.
+ finalizeModuleInfo();
+
+ emitDebugStr();
+
+ if (useSplitDwarf())
+ emitDebugLocDWO();
+ else
+ // Emit info into a debug loc section.
+ emitDebugLoc();
+
+ // Emit the corresponding abbreviations into an abbrev section.
+ emitAbbreviations();
+
+ // Emit all the DIEs into a debug info section.
+ emitDebugInfo();
+
+ // Emit info into a debug aranges section.
+ if (GenerateARangeSection)
+ emitDebugARanges();
+
+ // Emit info into a debug ranges section.
+ emitDebugRanges();
+
+ // Emit info into a debug macinfo section.
+ emitDebugMacinfo();
+
+ if (useSplitDwarf()) {
+ emitDebugStrDWO();
+ emitDebugInfoDWO();
+ emitDebugAbbrevDWO();
+ emitDebugLineDWO();
+ // Emit DWO addresses.
+ AddrPool.emit(*Asm, Asm->getObjFileLowering().getDwarfAddrSection());
+ }
+
+ // Emit info into the dwarf accelerator table sections.
+ if (useDwarfAccelTables()) {
+ emitAccelNames();
+ emitAccelObjC();
+ emitAccelNamespaces();
+ emitAccelTypes();
+ }
+
+ // Emit the pubnames and pubtypes sections if requested.
+ if (HasDwarfPubSections) {
+ emitDebugPubNames(GenerateGnuPubSections);
+ emitDebugPubTypes(GenerateGnuPubSections);
+ }
+
+ // clean up.
+ AbstractVariables.clear();
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *
+DwarfDebug::getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed) {
+ // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first;
+ auto I = AbstractVariables.find(Cleansed);
+ if (I != AbstractVariables.end())
+ return I->second.get();
+ return nullptr;
+}
+
+DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
+ const DILocalVariable *Cleansed;
+ return getExistingAbstractVariable(IV, Cleansed);
+}
+
+void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
+ LexicalScope *Scope) {
+ auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
+ InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
+ AbstractVariables[Var] = std::move(AbsDbgVariable);
+}
+
+void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV,
+ const MDNode *ScopeNode) {
+ const DILocalVariable *Cleansed = nullptr;
+ if (getExistingAbstractVariable(IV, Cleansed))
+ return;
+
+ createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope(
+ cast<DILocalScope>(ScopeNode)));
+}
+
+void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(
+ InlinedVariable IV, const MDNode *ScopeNode) {
+ const DILocalVariable *Cleansed = nullptr;
+ if (getExistingAbstractVariable(IV, Cleansed))
+ return;
+
+ if (LexicalScope *Scope =
+ LScopes.findAbstractScope(cast_or_null<DILocalScope>(ScopeNode)))
+ createAbstractVariable(Cleansed, Scope);
+}
+
+// Collect variable information from side table maintained by MMI.
+void DwarfDebug::collectVariableInfoFromMMITable(
+ DenseSet<InlinedVariable> &Processed) {
+ for (const auto &VI : MMI->getVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ assert(VI.Var->isValidLocationForIntrinsic(VI.Loc) &&
+ "Expected inlined-at fields to agree");
+
+ InlinedVariable Var(VI.Var, VI.Loc->getInlinedAt());
+ Processed.insert(Var);
+ LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
+
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode());
+ auto RegVar = make_unique<DbgVariable>(Var.first, Var.second);
+ RegVar->initializeMMI(VI.Expr, VI.Slot);
+ if (InfoHolder.addScopeVariable(Scope, RegVar.get()))
+ ConcreteVariables.push_back(std::move(RegVar));
+ }
+}
+
+// Get .debug_loc entry for the instruction range starting at MI.
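+// A DBG_VALUE here is expected to have exactly four operands: the location
+// (register, immediate, FP or other constant), a second operand that is an
+// immediate offset only for register-indirect locations, the variable, and
+// the expression; see the assert and the operand checks below.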
+static DebugLocEntry::Value getDebugLocValue(const MachineInstr *MI) {
+ const DIExpression *Expr = MI->getDebugExpression();
+
+ assert(MI->getNumOperands() == 4);
+ if (MI->getOperand(0).isReg()) {
+ MachineLocation MLoc;
+ // If the second operand is an immediate, this is a
+ // register-indirect address.
+ if (!MI->getOperand(1).isImm())
+ MLoc.set(MI->getOperand(0).getReg());
+ else
+ MLoc.set(MI->getOperand(0).getReg(), MI->getOperand(1).getImm());
+ return DebugLocEntry::Value(Expr, MLoc);
+ }
+ if (MI->getOperand(0).isImm())
+ return DebugLocEntry::Value(Expr, MI->getOperand(0).getImm());
+ if (MI->getOperand(0).isFPImm())
+ return DebugLocEntry::Value(Expr, MI->getOperand(0).getFPImm());
+ if (MI->getOperand(0).isCImm())
+ return DebugLocEntry::Value(Expr, MI->getOperand(0).getCImm());
+
+ llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
+}
+
+/// \brief If this and Next are describing different pieces of the same
+/// variable, merge them by appending Next's values to the current
+/// list of values.
+/// Return true if the merge was successful.
+bool DebugLocEntry::MergeValues(const DebugLocEntry &Next) {
+ if (Begin == Next.Begin) {
+ auto *FirstExpr = cast<DIExpression>(Values[0].Expression);
+ auto *FirstNextExpr = cast<DIExpression>(Next.Values[0].Expression);
+ if (!FirstExpr->isBitPiece() || !FirstNextExpr->isBitPiece())
+ return false;
+
+ // We can only merge entries if none of the pieces overlap any others.
+ // In doing so, we can take advantage of the fact that both lists are
+ // sorted.
+ for (unsigned i = 0, j = 0; i < Values.size(); ++i) {
+ for (; j < Next.Values.size(); ++j) {
+ int res = DebugHandlerBase::pieceCmp(
+ cast<DIExpression>(Values[i].Expression),
+ cast<DIExpression>(Next.Values[j].Expression));
+ if (res == 0) // The two expressions overlap, we can't merge.
+ return false;
+ // Values[i] is entirely before Next.Values[j],
+ // so go back to the next entry of Values.
+ else if (res == -1)
+ break;
+ // Next.Values[j] is entirely before Values[i], so go on to the
+ // next entry of Next.Values.
+ }
+ }
+
+ addValues(Next.Values);
+ End = Next.End;
+ return true;
+ }
+ return false;
+}
+
+/// Build the location list for all DBG_VALUEs in the function that
+/// describe the same variable. If the ranges of several independent
+/// pieces of the same variable overlap partially, split them up and
+/// combine the ranges. The resulting DebugLocEntries will have
+/// strictly monotonically increasing begin addresses and will never
+/// overlap.
+//
+// Input:
+//
+// Ranges History [var, loc, piece ofs size]
+// 0 | [x, (reg0, piece 0, 32)]
+// 1 | | [x, (reg1, piece 32, 32)] <- IsPieceOfPrevEntry
+// 2 | | ...
+// 3 | [clobber reg0]
+// 4 [x, (mem, piece 0, 64)] <- overlapping with both previous pieces of
+// x.
+//
+// Output:
+//
+// [0-1] [x, (reg0, piece 0, 32)]
+// [1-3] [x, (reg0, piece 0, 32), (reg1, piece 32, 32)]
+// [3-4] [x, (reg1, piece 32, 32)]
+// [4- ] [x, (mem, piece 0, 64)]
+void
+DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::InstrRanges &Ranges) {
+ SmallVector<DebugLocEntry::Value, 4> OpenRanges;
+
+ for (auto I = Ranges.begin(), E = Ranges.end(); I != E; ++I) {
+ const MachineInstr *Begin = I->first;
+ const MachineInstr *End = I->second;
+ assert(Begin->isDebugValue() && "Invalid History entry");
+
+ // Check if a variable is inaccessible in this range.
+ if (Begin->getNumOperands() > 1 &&
+ Begin->getOperand(0).isReg() && !Begin->getOperand(0).getReg()) {
+ OpenRanges.clear();
+ continue;
+ }
+
+ // If this piece overlaps with any open ranges, truncate them.
+ const DIExpression *DIExpr = Begin->getDebugExpression();
+ auto Last = std::remove_if(OpenRanges.begin(), OpenRanges.end(),
+ [&](DebugLocEntry::Value R) {
+ return piecesOverlap(DIExpr, R.getExpression());
+ });
+ OpenRanges.erase(Last, OpenRanges.end());
+
+ const MCSymbol *StartLabel = getLabelBeforeInsn(Begin);
+ assert(StartLabel && "Forgot label before DBG_VALUE starting a range!");
+
+ const MCSymbol *EndLabel;
+ if (End != nullptr)
+ EndLabel = getLabelAfterInsn(End);
+ else if (std::next(I) == Ranges.end())
+ EndLabel = Asm->getFunctionEnd();
+ else
+ EndLabel = getLabelBeforeInsn(std::next(I)->first);
+ assert(EndLabel && "Forgot label after instruction ending a range!");
+
+ DEBUG(dbgs() << "DotDebugLoc: " << *Begin << "\n");
+
+ auto Value = getDebugLocValue(Begin);
+ DebugLocEntry Loc(StartLabel, EndLabel, Value);
+ bool couldMerge = false;
+
+ // If this is a piece, it may belong to the current DebugLocEntry.
+ if (DIExpr->isBitPiece()) {
+ // Add this value to the list of open ranges.
+ OpenRanges.push_back(Value);
+
+ // Attempt to add the piece to the last entry.
+ if (!DebugLoc.empty())
+ if (DebugLoc.back().MergeValues(Loc))
+ couldMerge = true;
+ }
+
+ if (!couldMerge) {
+ // Need to add a new DebugLocEntry. Add all values from still
+ // valid non-overlapping pieces.
+ if (OpenRanges.size())
+ Loc.addValues(OpenRanges);
+
+ DebugLoc.push_back(std::move(Loc));
+ }
+
+ // Attempt to coalesce the ranges of two otherwise identical
+ // DebugLocEntries.
+ auto CurEntry = DebugLoc.rbegin();
+ DEBUG({
+ dbgs() << CurEntry->getValues().size() << " Values:\n";
+ for (auto &Value : CurEntry->getValues())
+ Value.dump();
+ dbgs() << "-----\n";
+ });
+
+ auto PrevEntry = std::next(CurEntry);
+ if (PrevEntry != DebugLoc.rend() && PrevEntry->MergeRanges(*CurEntry))
+ DebugLoc.pop_back();
+ }
+}
+
+DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope,
+ InlinedVariable IV) {
+ ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode());
+ ConcreteVariables.push_back(make_unique<DbgVariable>(IV.first, IV.second));
+ InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get());
+ return ConcreteVariables.back().get();
+}
+
+// Determine whether this DBG_VALUE is valid at the beginning of the function.
+static bool validAtEntry(const MachineInstr *MInsn) {
+ auto MBB = MInsn->getParent();
+ // Is it in the entry basic block?
+ if (!MBB->pred_empty())
+ return false;
+ for (MachineBasicBlock::const_reverse_iterator I(MInsn); I != MBB->rend(); ++I)
+ if (!(I->isDebugValue() || I->getFlag(MachineInstr::FrameSetup)))
+ return false;
+ return true;
+}
+
+// Find variables for each lexical scope.
+void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU,
+ const DISubprogram *SP,
+ DenseSet<InlinedVariable> &Processed) {
+ // Grab the variable info that was squirreled away in the MMI side-table.
+ collectVariableInfoFromMMITable(Processed);
+
+ for (const auto &I : DbgValues) {
+ InlinedVariable IV = I.first;
+ if (Processed.count(IV))
+ continue;
+
+ // Instruction ranges, specifying where IV is accessible.
+ const auto &Ranges = I.second;
+ if (Ranges.empty())
+ continue;
+
+ LexicalScope *Scope = nullptr;
+ if (const DILocation *IA = IV.second)
+ Scope = LScopes.findInlinedScope(IV.first->getScope(), IA);
+ else
+ Scope = LScopes.findLexicalScope(IV.first->getScope());
+ // If variable scope is not found then skip this variable.
+ if (!Scope)
+ continue;
+
+ Processed.insert(IV);
+ DbgVariable *RegVar = createConcreteVariable(*Scope, IV);
+
+ const MachineInstr *MInsn = Ranges.front().first;
+ assert(MInsn->isDebugValue() && "History must begin with debug value");
+
+ // Check if there is a single DBG_VALUE, valid throughout the function.
+ // A single constant is also considered valid for the entire function.
+ if (Ranges.size() == 1 &&
+ (MInsn->getOperand(0).isImm() ||
+ (validAtEntry(MInsn) && Ranges.front().second == nullptr))) {
+ RegVar->initializeDbgValue(MInsn);
+ continue;
+ }
+
+ // Handle multiple DBG_VALUE instructions describing one variable.
+ DebugLocStream::ListBuilder List(DebugLocs, TheCU, *Asm, *RegVar, *MInsn);
+
+ // Build the location list for this variable.
+ SmallVector<DebugLocEntry, 8> Entries;
+ buildLocationList(Entries, Ranges);
+
+ // If the variable has a DIBasicType, extract it. Basic types cannot have
+ // unique identifiers, so don't bother resolving the type with the
+ // identifier map.
+ const DIBasicType *BT = dyn_cast<DIBasicType>(
+ static_cast<const Metadata *>(IV.first->getType()));
+
+ // Finalize the entry by lowering it into a DWARF bytestream.
+ for (auto &Entry : Entries)
+ Entry.finalize(*Asm, List, BT);
+ }
+
+ // Collect info for variables that were optimized out.
+ for (const DILocalVariable *DV : SP->getVariables()) {
+ if (Processed.insert(InlinedVariable(DV, nullptr)).second)
+ if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope()))
+ createConcreteVariable(*Scope, InlinedVariable(DV, nullptr));
+ }
+}
+
+// Process beginning of an instruction.
+void DwarfDebug::beginInstruction(const MachineInstr *MI) {
+ DebugHandlerBase::beginInstruction(MI);
+ assert(CurMI);
+
+ // Check if source location changes, but ignore DBG_VALUE locations.
+ if (!MI->isDebugValue()) {
+ const DebugLoc &DL = MI->getDebugLoc();
+ if (DL != PrevInstLoc) {
+ if (DL) {
+ unsigned Flags = 0;
+ PrevInstLoc = DL;
+ if (DL == PrologEndLoc) {
+ Flags |= DWARF2_FLAG_PROLOGUE_END;
+ PrologEndLoc = DebugLoc();
+ Flags |= DWARF2_FLAG_IS_STMT;
+ }
+ if (DL.getLine() !=
+ Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine())
+ Flags |= DWARF2_FLAG_IS_STMT;
+
+ const MDNode *Scope = DL.getScope();
+ recordSourceLine(DL.getLine(), DL.getCol(), Scope, Flags);
+ } else if (UnknownLocations) {
+ PrevInstLoc = DL;
+ recordSourceLine(0, 0, nullptr, 0);
+ }
+ }
+ }
+}
+
+static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
+ // First known non-DBG_VALUE and non-frame setup location marks
+ // the beginning of the function body.
+ for (const auto &MBB : *MF)
+ for (const auto &MI : MBB)
+ if (!MI.isDebugValue() && !MI.getFlag(MachineInstr::FrameSetup) &&
+ MI.getDebugLoc())
+ return MI.getDebugLoc();
+ return DebugLoc();
+}
+
+// Gather pre-function debug information. Assumed to be called immediately
+// after the function entry point has been emitted.
+void DwarfDebug::beginFunction(const MachineFunction *MF) {
+ CurFn = MF;
+
+ // If there's no debug info for the function we're not going to do anything.
+ if (!MMI->hasDebugInfo())
+ return;
+
+ auto DI = MF->getFunction()->getSubprogram();
+ if (!DI)
+ return;
+
+ // Grab the lexical scopes for the function, if we don't have any of those
+ // then we're not going to be able to do anything.
+ DebugHandlerBase::beginFunction(MF);
+ if (LScopes.empty())
+ return;
+
+ // Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
+ // belongs to so that we add to the correct per-cu line table in the
+ // non-asm case.
+ LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ // FnScope->getScopeNode() and DI should represent the same function,
+ // though they may not be the same MDNode when inline functions are merged
+ // in LTO and the debug info metadata still differs: either two versions of
+ // a linkonce_odr function were written/copied into two separate files, or
+ // the metadata isn't structurally identical (e.g. file path/name info from
+ // clang, which includes the directory of the cpp file being built even when
+ // the file name is absolute, such as an <> lookup header).
+ auto *SP = cast<DISubprogram>(FnScope->getScopeNode());
+ DwarfCompileUnit *TheCU = CUMap.lookup(SP->getUnit());
+ if (!TheCU) {
+ assert(SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug &&
+ "DICompileUnit missing from llvm.dbg.cu?");
+ return;
+ }
+ if (Asm->OutStreamer->hasRawTextSupport())
+ // Use a single line table if we are generating assembly.
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
+ else
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(TheCU->getUniqueID());
+
+ // Record beginning of function.
+ PrologEndLoc = findPrologueEndLoc(MF);
+ if (DILocation *L = PrologEndLoc) {
+ // We'd like to list the prologue as "not statements" but GDB behaves
+ // poorly if we do that. Revisit this with caution/GDB (7.5+) testing.
+ auto *SP = L->getInlinedAtScope()->getSubprogram();
+ recordSourceLine(SP->getScopeLine(), 0, SP, DWARF2_FLAG_IS_STMT);
+ }
+}
+
+// Gather and emit post-function debug information.
+void DwarfDebug::endFunction(const MachineFunction *MF) {
+ assert(CurFn == MF &&
+ "endFunction should be called with the same function as beginFunction");
+
+ const DISubprogram *SP = MF->getFunction()->getSubprogram();
+ if (!MMI->hasDebugInfo() || LScopes.empty() || !SP ||
+ SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
+ // If we don't have a lexical scope for this function then there will
+ // be a hole in the range information. Keep note of this by setting the
+ // previously used section to nullptr.
+ PrevCU = nullptr;
+ CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
+ // Mark functions that have a valid DISubprogram but no debug info on any
+ // instruction as processed.
+ if (SP)
+ ProcessedSPNodes.insert(SP);
+ return;
+ }
+
+ // Set DwarfDwarfCompileUnitID in MCContext to default value.
+ Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
+
+ LexicalScope *FnScope = LScopes.getCurrentFunctionScope();
+ SP = cast<DISubprogram>(FnScope->getScopeNode());
+ DwarfCompileUnit &TheCU = *CUMap.lookup(SP->getUnit());
+
+ DenseSet<InlinedVariable> ProcessedVars;
+ collectVariableInfo(TheCU, SP, ProcessedVars);
+
+ // Add the range of this function to the list of ranges for the CU.
+ TheCU.addRange(RangeSpan(Asm->getFunctionBegin(), Asm->getFunctionEnd()));
+
+ // Under -gmlt, skip building the subprogram if there are no inlined
+ // subroutines inside it.
+ if (TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
+ LScopes.getAbstractScopesList().empty() && !IsDarwin) {
+ assert(InfoHolder.getScopeVariables().empty());
+ assert(DbgValues.empty());
+ // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
+ // by a -gmlt CU. Add a test and remove this assertion.
+ assert(AbstractVariables.empty());
+ PrevLabel = nullptr;
+ CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
+ return;
+ }
+
+#ifndef NDEBUG
+ size_t NumAbstractScopes = LScopes.getAbstractScopesList().size();
+#endif
+ // Construct abstract scopes.
+ for (LexicalScope *AScope : LScopes.getAbstractScopesList()) {
+ auto *SP = cast<DISubprogram>(AScope->getScopeNode());
+ // Collect info for variables that were optimized out.
+ for (const DILocalVariable *DV : SP->getVariables()) {
+ if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second)
+ continue;
+ ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr),
+ DV->getScope());
+ assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes
+ && "ensureAbstractVariableIsCreated inserted abstract scopes");
+ }
+ constructAbstractSubprogramScopeDIE(AScope);
+ }
+
+ TheCU.constructSubprogramScopeDIE(FnScope);
+ if (auto *SkelCU = TheCU.getSkeleton())
+ if (!LScopes.getAbstractScopesList().empty())
+ SkelCU->constructSubprogramScopeDIE(FnScope);
+
+ // Clear debug info
+ // Ownership of DbgVariables is a bit subtle - ScopeVariables owns all the
+ // DbgVariables except those that are also in AbstractVariables (since they
+ // can be used cross-function)
+ InfoHolder.getScopeVariables().clear();
+ PrevLabel = nullptr;
+ CurFn = nullptr;
+ DebugHandlerBase::endFunction(MF);
+}
+
+// Register a source line with debug info. Returns the unique label that was
+// emitted and which provides correspondence to the source line list.
+void DwarfDebug::recordSourceLine(unsigned Line, unsigned Col, const MDNode *S,
+ unsigned Flags) {
+ StringRef Fn;
+ StringRef Dir;
+ unsigned Src = 1;
+ unsigned Discriminator = 0;
+ if (auto *Scope = cast_or_null<DIScope>(S)) {
+ Fn = Scope->getFilename();
+ Dir = Scope->getDirectory();
+ if (auto *LBF = dyn_cast<DILexicalBlockFile>(Scope))
+ Discriminator = LBF->getDiscriminator();
+
+ unsigned CUID = Asm->OutStreamer->getContext().getDwarfCompileUnitID();
+ Src = static_cast<DwarfCompileUnit &>(*InfoHolder.getUnits()[CUID])
+ .getOrCreateSourceID(Fn, Dir);
+ }
+ Asm->OutStreamer->EmitDwarfLocDirective(Src, Line, Col, Flags, 0,
+ Discriminator, Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// Emit Methods
+//===----------------------------------------------------------------------===//
+
+// Emit the debug info section.
+void DwarfDebug::emitDebugInfo() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.emitUnits(/* UseOffsets */ false);
+}
+
+// Emit the abbreviation section.
+void DwarfDebug::emitAbbreviations() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+
+ Holder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevSection());
+}
+
+void DwarfDebug::emitAccel(DwarfAccelTable &Accel, MCSection *Section,
+ StringRef TableName) {
+ Accel.FinalizeTable(Asm, TableName);
+ Asm->OutStreamer->SwitchSection(Section);
+
+ // Emit the full data.
+ Accel.emit(Asm, Section->getBeginSymbol(), this);
+}
+
+// Emit visible names into a hashed accelerator table section.
+void DwarfDebug::emitAccelNames() {
+ emitAccel(AccelNames, Asm->getObjFileLowering().getDwarfAccelNamesSection(),
+ "Names");
+}
+
+// Emit objective C classes and categories into a hashed accelerator table
+// section.
+void DwarfDebug::emitAccelObjC() {
+ emitAccel(AccelObjC, Asm->getObjFileLowering().getDwarfAccelObjCSection(),
+ "ObjC");
+}
+
+// Emit namespace dies into a hashed accelerator table.
+void DwarfDebug::emitAccelNamespaces() {
+ emitAccel(AccelNamespace,
+ Asm->getObjFileLowering().getDwarfAccelNamespaceSection(),
+ "namespac");
+}
+
+// Emit type dies into a hashed accelerator table.
+void DwarfDebug::emitAccelTypes() {
+ emitAccel(AccelTypes, Asm->getObjFileLowering().getDwarfAccelTypesSection(),
+ "types");
+}
+
+// Public name handling.
+// The format for the various pubnames:
+//
+// dwarf pubnames - offset/name pairs where the offset is the offset into the CU
+// for the DIE that is named.
+//
+// gnu pubnames - offset/index value/name tuples where the offset is the offset
+// into the CU and the index value is computed according to the type of value
+// for the DIE that is named.
+//
+// For type units the offset is the offset of the skeleton DIE. For split dwarf
+// it's the offset within the debug_info/debug_types dwo section, however, the
+// reference in the pubname header doesn't change.
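+//
+// As an illustrative sketch, each gnu pubnames entry emitted below is a DIE
+// offset, a single descriptor byte packing the kind and linkage (see
+// computeIndexValue), and the NUL-terminated name.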
+
+/// computeIndexValue - Compute the gdb index value for the DIE and CU.
+static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
+ const DIE *Die) {
+ dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
+
+ // We could have a specification DIE that has most of our knowledge;
+ // look for that now.
+ if (DIEValue SpecVal = Die->findAttribute(dwarf::DW_AT_specification)) {
+ DIE &SpecDIE = SpecVal.getDIEEntry().getEntry();
+ if (SpecDIE.findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+ } else if (Die->findAttribute(dwarf::DW_AT_external))
+ Linkage = dwarf::GIEL_EXTERNAL;
+
+ switch (Die->getTag()) {
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_enumeration_type:
+ return dwarf::PubIndexEntryDescriptor(
+ dwarf::GIEK_TYPE, CU->getLanguage() != dwarf::DW_LANG_C_plus_plus
+ ? dwarf::GIEL_STATIC
+ : dwarf::GIEL_EXTERNAL);
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_subrange_type:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE, dwarf::GIEL_STATIC);
+ case dwarf::DW_TAG_namespace:
+ return dwarf::GIEK_TYPE;
+ case dwarf::DW_TAG_subprogram:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_FUNCTION, Linkage);
+ case dwarf::DW_TAG_variable:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE, Linkage);
+ case dwarf::DW_TAG_enumerator:
+ return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_VARIABLE,
+ dwarf::GIEL_STATIC);
+ default:
+ return dwarf::GIEK_NONE;
+ }
+}
+
+/// emitDebugPubNames - Emit visible names into a debug pubnames section.
+///
+void DwarfDebug::emitDebugPubNames(bool GnuStyle) {
+ MCSection *PSec = GnuStyle
+ ? Asm->getObjFileLowering().getDwarfGnuPubNamesSection()
+ : Asm->getObjFileLowering().getDwarfPubNamesSection();
+
+ emitDebugPubSection(GnuStyle, PSec, "Names",
+ &DwarfCompileUnit::getGlobalNames);
+}
+
+void DwarfDebug::emitDebugPubSection(
+ bool GnuStyle, MCSection *PSec, StringRef Name,
+ const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const) {
+ for (const auto &NU : CUMap) {
+ DwarfCompileUnit *TheU = NU.second;
+
+ const auto &Globals = (TheU->*Accessor)();
+
+ if (Globals.empty())
+ continue;
+
+ if (auto *Skeleton = TheU->getSkeleton())
+ TheU = Skeleton;
+
+ // Start the dwarf pubnames section.
+ Asm->OutStreamer->SwitchSection(PSec);
+
+ // Emit the header.
+ Asm->OutStreamer->AddComment("Length of Public " + Name + " Info");
+ MCSymbol *BeginLabel = Asm->createTempSymbol("pub" + Name + "_begin");
+ MCSymbol *EndLabel = Asm->createTempSymbol("pub" + Name + "_end");
+ Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+
+ Asm->OutStreamer->EmitLabel(BeginLabel);
+
+ Asm->OutStreamer->AddComment("DWARF Version");
+ Asm->EmitInt16(dwarf::DW_PUBNAMES_VERSION);
+
+ Asm->OutStreamer->AddComment("Offset of Compilation Unit Info");
+ Asm->emitDwarfSymbolReference(TheU->getLabelBegin());
+
+ Asm->OutStreamer->AddComment("Compilation Unit Length");
+ Asm->EmitInt32(TheU->getLength());
+
+ // Emit the pubnames for this compilation unit.
+ for (const auto &GI : Globals) {
+ const char *Name = GI.getKeyData();
+ const DIE *Entity = GI.second;
+
+ Asm->OutStreamer->AddComment("DIE offset");
+ Asm->EmitInt32(Entity->getOffset());
+
+ if (GnuStyle) {
+ dwarf::PubIndexEntryDescriptor Desc = computeIndexValue(TheU, Entity);
+ Asm->OutStreamer->AddComment(
+ Twine("Kind: ") + dwarf::GDBIndexEntryKindString(Desc.Kind) + ", " +
+ dwarf::GDBIndexEntryLinkageString(Desc.Linkage));
+ Asm->EmitInt8(Desc.toBits());
+ }
+
+ Asm->OutStreamer->AddComment("External Name");
+ Asm->OutStreamer->EmitBytes(StringRef(Name, GI.getKeyLength() + 1));
+ }
+
+ Asm->OutStreamer->AddComment("End Mark");
+ Asm->EmitInt32(0);
+ Asm->OutStreamer->EmitLabel(EndLabel);
+ }
+}
+
+void DwarfDebug::emitDebugPubTypes(bool GnuStyle) {
+ MCSection *PSec = GnuStyle
+ ? Asm->getObjFileLowering().getDwarfGnuPubTypesSection()
+ : Asm->getObjFileLowering().getDwarfPubTypesSection();
+
+ emitDebugPubSection(GnuStyle, PSec, "Types",
+ &DwarfCompileUnit::getGlobalTypes);
+}
+
+/// Emit null-terminated strings into a debug str section.
+void DwarfDebug::emitDebugStr() {
+ DwarfFile &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder;
+ Holder.emitStrings(Asm->getObjFileLowering().getDwarfStrSection());
+}
+
+void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
+ const DebugLocStream::Entry &Entry) {
+ auto &&Comments = DebugLocs.getComments(Entry);
+ auto Comment = Comments.begin();
+ auto End = Comments.end();
+ for (uint8_t Byte : DebugLocs.getBytes(Entry))
+ Streamer.EmitInt8(Byte, Comment != End ? *(Comment++) : "");
+}
+
+static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
+ ByteStreamer &Streamer,
+ const DebugLocEntry::Value &Value,
+ unsigned PieceOffsetInBits) {
+ DebugLocDwarfExpression DwarfExpr(AP.getDwarfDebug()->getDwarfVersion(),
+ Streamer);
+ // Regular entry.
+ if (Value.isInt()) {
+ if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
+ BT->getEncoding() == dwarf::DW_ATE_signed_char))
+ DwarfExpr.AddSignedConstant(Value.getInt());
+ else
+ DwarfExpr.AddUnsignedConstant(Value.getInt());
+ } else if (Value.isLocation()) {
+ MachineLocation Loc = Value.getLoc();
+ const DIExpression *Expr = Value.getExpression();
+ if (!Expr || !Expr->getNumElements())
+ // Regular entry.
+ AP.EmitDwarfRegOp(Streamer, Loc);
+ else {
+ // Complex address entry.
+ const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
+ if (Loc.getOffset()) {
+ DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
+ DwarfExpr.AddExpression(Expr->expr_op_begin(), Expr->expr_op_end(),
+ PieceOffsetInBits);
+ } else
+ DwarfExpr.AddMachineRegExpression(TRI, Expr, Loc.getReg(),
+ PieceOffsetInBits);
+ }
+ } else if (Value.isConstantFP()) {
+ APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
+ DwarfExpr.AddUnsignedConstant(RawBytes);
+ }
+}
+
+void DebugLocEntry::finalize(const AsmPrinter &AP,
+ DebugLocStream::ListBuilder &List,
+ const DIBasicType *BT) {
+ DebugLocStream::EntryBuilder Entry(List, Begin, End);
+ BufferByteStreamer Streamer = Entry.getStreamer();
+ const DebugLocEntry::Value &Value = Values[0];
+ if (Value.isBitPiece()) {
+ // Emit all pieces that belong to the same variable and range.
+ assert(std::all_of(Values.begin(), Values.end(), [](DebugLocEntry::Value P) {
+ return P.isBitPiece();
+ }) && "all values are expected to be pieces");
+ assert(std::is_sorted(Values.begin(), Values.end()) &&
+ "pieces are expected to be sorted");
+
+ unsigned Offset = 0;
+ for (auto Piece : Values) {
+ const DIExpression *Expr = Piece.getExpression();
+ unsigned PieceOffset = Expr->getBitPieceOffset();
+ unsigned PieceSize = Expr->getBitPieceSize();
+ assert(Offset <= PieceOffset && "overlapping or duplicate pieces");
+ if (Offset < PieceOffset) {
+ // The DWARF spec seriously mandates pieces with no locations for gaps.
+ DebugLocDwarfExpression Expr(AP.getDwarfDebug()->getDwarfVersion(),
+ Streamer);
+ Expr.AddOpPiece(PieceOffset-Offset, 0);
+ Offset += PieceOffset-Offset;
+ }
+ Offset += PieceSize;
+
+ emitDebugLocValue(AP, BT, Streamer, Piece, PieceOffset);
+ }
+ } else {
+ assert(Values.size() == 1 && "only pieces may have >1 value");
+ emitDebugLocValue(AP, BT, Streamer, Value, 0);
+ }
+}
+
+void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry) {
+ // Emit the size.
+ Asm->OutStreamer->AddComment("Loc expr size");
+ Asm->EmitInt16(DebugLocs.getBytes(Entry).size());
+
+ // Emit the entry.
+ APByteStreamer Streamer(*Asm);
+ emitDebugLocEntry(Streamer, Entry);
+}
+
+// Emit locations into the debug loc section.
+void DwarfDebug::emitDebugLoc() {
+ // Start the dwarf loc section.
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfLocSection());
+ unsigned char Size = Asm->getDataLayout().getPointerSize();
+ for (const auto &List : DebugLocs.getLists()) {
+ Asm->OutStreamer->EmitLabel(List.Label);
+ const DwarfCompileUnit *CU = List.CU;
+ for (const auto &Entry : DebugLocs.getEntries(List)) {
+ // Set up the range. This range is relative to the entry point of the
+ // compile unit. This is a hard coded 0 for low_pc when we're emitting
+ // ranges, or the DW_AT_low_pc on the compile unit otherwise.
+ if (auto *Base = CU->getBaseAddress()) {
+ Asm->EmitLabelDifference(Entry.BeginSym, Base, Size);
+ Asm->EmitLabelDifference(Entry.EndSym, Base, Size);
+ } else {
+ Asm->OutStreamer->EmitSymbolValue(Entry.BeginSym, Size);
+ Asm->OutStreamer->EmitSymbolValue(Entry.EndSym, Size);
+ }
+
+ emitDebugLocEntryLocation(Entry);
+ }
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ }
+}
+
+void DwarfDebug::emitDebugLocDWO() {
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfLocDWOSection());
+ for (const auto &List : DebugLocs.getLists()) {
+ Asm->OutStreamer->EmitLabel(List.Label);
+ for (const auto &Entry : DebugLocs.getEntries(List)) {
+ // Just always use start_length for now - at least that's one address
+ // rather than two. We could get fancier and try to, say, reuse an
+ // address we know we've emitted elsewhere (the start of the function?
+ // The start of the CU or CU subrange that encloses this range?)
+ Asm->EmitInt8(dwarf::DW_LLE_start_length_entry);
+ unsigned idx = AddrPool.getIndex(Entry.BeginSym);
+ Asm->EmitULEB128(idx);
+ Asm->EmitLabelDifference(Entry.EndSym, Entry.BeginSym, 4);
+
+ emitDebugLocEntryLocation(Entry);
+ }
+ Asm->EmitInt8(dwarf::DW_LLE_end_of_list_entry);
+ }
+}
+
+struct ArangeSpan {
+ const MCSymbol *Start, *End;
+};
+
+// Emit a debug aranges section, containing a CU lookup for any
+// address we can tie back to a CU.
+void DwarfDebug::emitDebugARanges() {
+ // Provides a unique id per text section.
+ MapVector<MCSection *, SmallVector<SymbolCU, 8>> SectionMap;
+
+ // Filter labels by section.
+ for (const SymbolCU &SCU : ArangeLabels) {
+ if (SCU.Sym->isInSection()) {
+ // Make a note of this symbol and its section.
+ MCSection *Section = &SCU.Sym->getSection();
+ if (!Section->getKind().isMetadata())
+ SectionMap[Section].push_back(SCU);
+ } else {
+ // Some symbols (e.g. common/bss on mach-o) can have no section but still
+ // appear in the output. This sucks as we rely on sections to build
+ // arange spans. We can do it without, but it's icky.
+ SectionMap[nullptr].push_back(SCU);
+ }
+ }
+
+ DenseMap<DwarfCompileUnit *, std::vector<ArangeSpan>> Spans;
+
+ for (auto &I : SectionMap) {
+ MCSection *Section = I.first;
+ SmallVector<SymbolCU, 8> &List = I.second;
+ if (List.size() < 1)
+ continue;
+
+ // If we have no section (e.g. common), just write out
+ // individual spans for each symbol.
+ if (!Section) {
+ for (const SymbolCU &Cur : List) {
+ ArangeSpan Span;
+ Span.Start = Cur.Sym;
+ Span.End = nullptr;
+ assert(Cur.CU);
+ Spans[Cur.CU].push_back(Span);
+ }
+ continue;
+ }
+
+ // Sort the symbols by offset within the section.
+ std::sort(
+ List.begin(), List.end(), [&](const SymbolCU &A, const SymbolCU &B) {
+ unsigned IA = A.Sym ? Asm->OutStreamer->GetSymbolOrder(A.Sym) : 0;
+ unsigned IB = B.Sym ? Asm->OutStreamer->GetSymbolOrder(B.Sym) : 0;
+
+ // Symbols with no order assigned should be placed at the end.
+ // (e.g. section end labels)
+ if (IA == 0)
+ return false;
+ if (IB == 0)
+ return true;
+ return IA < IB;
+ });
+
+ // Insert a final terminator.
+ List.push_back(SymbolCU(nullptr, Asm->OutStreamer->endSection(Section)));
+
+ // Build spans between each label.
+ const MCSymbol *StartSym = List[0].Sym;
+ for (size_t n = 1, e = List.size(); n < e; n++) {
+ const SymbolCU &Prev = List[n - 1];
+ const SymbolCU &Cur = List[n];
+
+ // Try to build the longest span we can within the same CU.
+ if (Cur.CU != Prev.CU) {
+ ArangeSpan Span;
+ Span.Start = StartSym;
+ Span.End = Cur.Sym;
+ assert(Prev.CU);
+ Spans[Prev.CU].push_back(Span);
+ StartSym = Cur.Sym;
+ }
+ }
+ }
+
+ // Start the dwarf aranges section.
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfARangesSection());
+
+ unsigned PtrSize = Asm->getDataLayout().getPointerSize();
+
+ // Build a list of CUs used.
+ std::vector<DwarfCompileUnit *> CUs;
+ for (const auto &it : Spans) {
+ DwarfCompileUnit *CU = it.first;
+ CUs.push_back(CU);
+ }
+
+ // Sort the CU list (again, to ensure consistent output order).
+ std::sort(CUs.begin(), CUs.end(),
+ [](const DwarfCompileUnit *A, const DwarfCompileUnit *B) {
+ return A->getUniqueID() < B->getUniqueID();
+ });
+
+ // Emit an arange table for each CU we used.
+ for (DwarfCompileUnit *CU : CUs) {
+ std::vector<ArangeSpan> &List = Spans[CU];
+
+ // Describe the skeleton CU's offset and length, not the dwo file's.
+ if (auto *Skel = CU->getSkeleton())
+ CU = Skel;
+
+ // Emit size of content not including length itself.
+ unsigned ContentSize =
+ sizeof(int16_t) + // DWARF ARange version number
+ sizeof(int32_t) + // Offset of CU in the .debug_info section
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ sizeof(int8_t); // Segment Size (in bytes)
+
+ unsigned TupleSize = PtrSize * 2;
+
+ // Section 7.20 of the DWARF spec requires the table to be aligned to a tuple.
+ unsigned Padding =
+ OffsetToAlignment(sizeof(int32_t) + ContentSize, TupleSize);
+
+ ContentSize += Padding;
+ ContentSize += (List.size() + 1) * TupleSize;
+
+ // For each compile unit, write the list of spans it covers.
+ Asm->OutStreamer->AddComment("Length of ARange Set");
+ Asm->EmitInt32(ContentSize);
+ Asm->OutStreamer->AddComment("DWARF Arange version number");
+ Asm->EmitInt16(dwarf::DW_ARANGES_VERSION);
+ Asm->OutStreamer->AddComment("Offset Into Debug Info Section");
+ Asm->emitDwarfSymbolReference(CU->getLabelBegin());
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(PtrSize);
+ Asm->OutStreamer->AddComment("Segment Size (in bytes)");
+ Asm->EmitInt8(0);
+
+ Asm->OutStreamer->emitFill(Padding, 0xff);
+
+ for (const ArangeSpan &Span : List) {
+ Asm->EmitLabelReference(Span.Start, PtrSize);
+
+ // Calculate the size as the distance from the span start to its end.
+ if (Span.End) {
+ Asm->EmitLabelDifference(Span.End, Span.Start, PtrSize);
+ } else {
+ // For symbols without an end marker (e.g. common), we
+ // write a single arange entry containing just that one symbol.
+ uint64_t Size = SymSize[Span.Start];
+ if (Size == 0)
+ Size = 1;
+
+ Asm->OutStreamer->EmitIntValue(Size, PtrSize);
+ }
+ }
+
+ Asm->OutStreamer->AddComment("ARange terminator");
+ Asm->OutStreamer->EmitIntValue(0, PtrSize);
+ Asm->OutStreamer->EmitIntValue(0, PtrSize);
+ }
+}
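The span-building logic above can be summarized by a standalone sketch (hypothetical names; plain offsets stand in for symbols) that turns section-sorted labels into one span per maximal run of labels belonging to the same CU, with the final span running to the end of the section.

#include <cstddef>
#include <utility>
#include <vector>

struct Span { std::size_t Start, End; };

// Labels is sorted by offset within one section and carries the id of the CU
// each label belongs to. A new span starts whenever the CU changes; the last
// span ends at the section end, mirroring the terminator label pushed above.
std::vector<std::pair<unsigned, Span>>
buildSpans(const std::vector<std::pair<std::size_t, unsigned>> &Labels,
           std::size_t SectionEnd) {
  std::vector<std::pair<unsigned, Span>> Spans;
  if (Labels.empty())
    return Spans;
  std::size_t Start = Labels.front().first;
  for (std::size_t I = 1; I < Labels.size(); ++I) {
    if (Labels[I].second != Labels[I - 1].second) {
      Spans.push_back({Labels[I - 1].second, Span{Start, Labels[I].first}});
      Start = Labels[I].first;
    }
  }
  Spans.push_back({Labels.back().second, Span{Start, SectionEnd}});
  return Spans;
}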
+
+/// Emit address ranges into a debug ranges section.
+void DwarfDebug::emitDebugRanges() {
+ // Start the dwarf ranges section.
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfRangesSection());
+
+ // Size for our labels.
+ unsigned char Size = Asm->getDataLayout().getPointerSize();
+
+ // Grab the specific ranges for the compile units in the module.
+ for (const auto &I : CUMap) {
+ DwarfCompileUnit *TheCU = I.second;
+
+ if (auto *Skel = TheCU->getSkeleton())
+ TheCU = Skel;
+
+ // Iterate over the misc ranges for the compile units in the module.
+ for (const RangeSpanList &List : TheCU->getRangeLists()) {
+ // Emit our symbol so we can find the beginning of the range.
+ Asm->OutStreamer->EmitLabel(List.getSym());
+
+ for (const RangeSpan &Range : List.getRanges()) {
+ const MCSymbol *Begin = Range.getStart();
+ const MCSymbol *End = Range.getEnd();
+ assert(Begin && "Range without a begin symbol?");
+ assert(End && "Range without an end symbol?");
+ if (auto *Base = TheCU->getBaseAddress()) {
+ Asm->EmitLabelDifference(Begin, Base, Size);
+ Asm->EmitLabelDifference(End, Base, Size);
+ } else {
+ Asm->OutStreamer->EmitSymbolValue(Begin, Size);
+ Asm->OutStreamer->EmitSymbolValue(End, Size);
+ }
+ }
+
+ // And terminate the list with two 0 values.
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->EmitIntValue(0, Size);
+ }
+ }
+}
+
+void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
+ for (auto *MN : Nodes) {
+ if (auto *M = dyn_cast<DIMacro>(MN))
+ emitMacro(*M);
+ else if (auto *F = dyn_cast<DIMacroFile>(MN))
+ emitMacroFile(*F, U);
+ else
+ llvm_unreachable("Unexpected DI type!");
+ }
+}
+
+void DwarfDebug::emitMacro(DIMacro &M) {
+ Asm->EmitULEB128(M.getMacinfoType());
+ Asm->EmitULEB128(M.getLine());
+ StringRef Name = M.getName();
+ StringRef Value = M.getValue();
+ Asm->OutStreamer->EmitBytes(Name);
+ if (!Value.empty()) {
+ // There should be one space between macro name and macro value.
+ Asm->EmitInt8(' ');
+ Asm->OutStreamer->EmitBytes(Value);
+ }
+ Asm->EmitInt8('\0');
+}
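For reference, a minimal sketch of how one macinfo record produced by emitMacro is laid out, assuming a plain byte buffer in place of the streamer (helper names are illustrative only).

#include <cstdint>
#include <string>
#include <vector>

// Append X as ULEB128.
void appendULEB128(std::vector<uint8_t> &Out, uint64_t X) {
  do {
    uint8_t Byte = X & 0x7f;
    X >>= 7;
    Out.push_back(X ? uint8_t(Byte | 0x80) : Byte);
  } while (X);
}

// One define/undef record: type, line, "name[ value]", then a NUL terminator,
// with a single space separating the name from an optional value.
void appendMacro(std::vector<uint8_t> &Out, uint64_t Type, uint64_t Line,
                 const std::string &Name, const std::string &Value) {
  appendULEB128(Out, Type);
  appendULEB128(Out, Line);
  Out.insert(Out.end(), Name.begin(), Name.end());
  if (!Value.empty()) {
    Out.push_back(' ');
    Out.insert(Out.end(), Value.begin(), Value.end());
  }
  Out.push_back('\0');
}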
+
+void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
+ assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file);
+ Asm->EmitULEB128(dwarf::DW_MACINFO_start_file);
+ Asm->EmitULEB128(F.getLine());
+ DIFile *File = F.getFile();
+ unsigned FID =
+ U.getOrCreateSourceID(File->getFilename(), File->getDirectory());
+ Asm->EmitULEB128(FID);
+ handleMacroNodes(F.getElements(), U);
+ Asm->EmitULEB128(dwarf::DW_MACINFO_end_file);
+}
+
+/// Emit macros into a debug macinfo section.
+void DwarfDebug::emitDebugMacinfo() {
+ // Start the dwarf macinfo section.
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfMacinfoSection());
+
+ for (const auto &P : CUMap) {
+ auto &TheCU = *P.second;
+ auto *SkCU = TheCU.getSkeleton();
+ DwarfCompileUnit &U = SkCU ? *SkCU : TheCU;
+ auto *CUNode = cast<DICompileUnit>(P.first);
+ Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin());
+ handleMacroNodes(CUNode->getMacros(), U);
+ }
+ Asm->OutStreamer->AddComment("End Of Macro List Mark");
+ Asm->EmitInt8(0);
+}
+
+// DWARF5 Experimental Separate Dwarf emitters.
+
+void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
+ std::unique_ptr<DwarfCompileUnit> NewU) {
+ NewU->addString(Die, dwarf::DW_AT_GNU_dwo_name,
+ U.getCUNode()->getSplitDebugFilename());
+
+ if (!CompilationDir.empty())
+ NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
+
+ addGnuPubAttributes(*NewU, Die);
+
+ SkeletonHolder.addUnit(std::move(NewU));
+}
+
+// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list,
+// DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id,
+// DW_AT_addr_base, DW_AT_ranges_base.
+DwarfCompileUnit &DwarfDebug::constructSkeletonCU(const DwarfCompileUnit &CU) {
+
+ auto OwnedUnit = make_unique<DwarfCompileUnit>(
+ CU.getUniqueID(), CU.getCUNode(), Asm, this, &SkeletonHolder);
+ DwarfCompileUnit &NewCU = *OwnedUnit;
+ NewCU.initSection(Asm->getObjFileLowering().getDwarfInfoSection());
+
+ NewCU.initStmtList();
+
+ initSkeletonUnit(CU, NewCU.getUnitDie(), std::move(OwnedUnit));
+
+ return NewCU;
+}
+
+// Emit the .debug_info.dwo section for separated dwarf. This contains the
+// compile units that would normally be in debug_info.
+void DwarfDebug::emitDebugInfoDWO() {
+ assert(useSplitDwarf() && "No split dwarf debug info?");
+ // Don't emit relocations into the dwo file.
+ InfoHolder.emitUnits(/* UseOffsets */ true);
+}
+
+// Emit the .debug_abbrev.dwo section for separated dwarf. This contains the
+// abbreviations for the .debug_info.dwo section.
+void DwarfDebug::emitDebugAbbrevDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ InfoHolder.emitAbbrevs(Asm->getObjFileLowering().getDwarfAbbrevDWOSection());
+}
+
+void DwarfDebug::emitDebugLineDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ Asm->OutStreamer->SwitchSection(
+ Asm->getObjFileLowering().getDwarfLineDWOSection());
+ SplitTypeUnitFileTable.Emit(*Asm->OutStreamer, MCDwarfLineTableParams());
+}
+
+// Emit the .debug_str.dwo section for separated dwarf. This contains the
+// string section and is identical in format to traditional .debug_str
+// sections.
+void DwarfDebug::emitDebugStrDWO() {
+ assert(useSplitDwarf() && "No split dwarf?");
+ MCSection *OffSec = Asm->getObjFileLowering().getDwarfStrOffDWOSection();
+ InfoHolder.emitStrings(Asm->getObjFileLowering().getDwarfStrDWOSection(),
+ OffSec);
+}
+
+MCDwarfDwoLineTable *DwarfDebug::getDwoLineTable(const DwarfCompileUnit &CU) {
+ if (!useSplitDwarf())
+ return nullptr;
+ if (SingleCU)
+ SplitTypeUnitFileTable.setCompilationDir(CU.getCUNode()->getDirectory());
+ return &SplitTypeUnitFileTable;
+}
+
+uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
+ MD5 Hash;
+ Hash.update(Identifier);
+ // ... take the least significant 8 bytes and return those. Our MD5
+ // implementation always returns its results in little-endian order, so read
+ // the bytes accordingly.
+ MD5::MD5Result Result;
+ Hash.final(Result);
+ return support::endian::read64le(Result + 8);
+}
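A standalone sketch of the same computation, assuming a raw 16-byte MD5 digest is already in hand (this is not the LLVM MD5 API, just an illustration of the little-endian read of bytes 8..15).

#include <cstdint>

// Interpret bytes 8..15 of a 16-byte MD5 digest as a little-endian 64-bit
// value, which is what read64le(Result + 8) does above.
uint64_t signatureFromDigest(const uint8_t Digest[16]) {
  uint64_t Sig = 0;
  for (int I = 7; I >= 0; --I)
    Sig = (Sig << 8) | Digest[8 + I];
  return Sig;
}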
+
+void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
+ StringRef Identifier, DIE &RefDie,
+ const DICompositeType *CTy) {
+ // Fast path: if we're already building some type units and one of them has
+ // used the address pool, we know we're going to throw away all this work
+ // anyway, so don't bother building dependent types.
+ if (!TypeUnitsUnderConstruction.empty() && AddrPool.hasBeenUsed())
+ return;
+
+ auto Ins = TypeSignatures.insert(std::make_pair(CTy, 0));
+ if (!Ins.second) {
+ CU.addDIETypeSignature(RefDie, Ins.first->second);
+ return;
+ }
+
+ bool TopLevelType = TypeUnitsUnderConstruction.empty();
+ AddrPool.resetUsedFlag();
+
+ auto OwnedUnit = make_unique<DwarfTypeUnit>(CU, Asm, this, &InfoHolder,
+ getDwoLineTable(CU));
+ DwarfTypeUnit &NewTU = *OwnedUnit;
+ DIE &UnitDie = NewTU.getUnitDie();
+ TypeUnitsUnderConstruction.push_back(
+ std::make_pair(std::move(OwnedUnit), CTy));
+
+ NewTU.addUInt(UnitDie, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
+ CU.getLanguage());
+
+ uint64_t Signature = makeTypeSignature(Identifier);
+ NewTU.setTypeSignature(Signature);
+ Ins.first->second = Signature;
+
+ if (useSplitDwarf())
+ NewTU.initSection(Asm->getObjFileLowering().getDwarfTypesDWOSection());
+ else {
+ CU.applyStmtList(UnitDie);
+ NewTU.initSection(
+ Asm->getObjFileLowering().getDwarfTypesSection(Signature));
+ }
+
+ NewTU.setType(NewTU.createTypeDIE(CTy));
+
+ if (TopLevelType) {
+ auto TypeUnitsToAdd = std::move(TypeUnitsUnderConstruction);
+ TypeUnitsUnderConstruction.clear();
+
+ // Types referencing entries in the address table cannot be placed in type
+ // units.
+ if (AddrPool.hasBeenUsed()) {
+
+ // Remove all the types built while building this type.
+ // This is pessimistic as some of these types might not be dependent on
+ // the type that used an address.
+ for (const auto &TU : TypeUnitsToAdd)
+ TypeSignatures.erase(TU.second);
+
+ // Construct this type in the CU directly.
+ // This is inefficient because all the dependent types will be rebuilt
+ // from scratch, including building them in type units, discovering that
+ // they depend on addresses, throwing them out and rebuilding them.
+ CU.constructTypeDIE(RefDie, cast<DICompositeType>(CTy));
+ return;
+ }
+
+ // If the type wasn't dependent on fission addresses, finish adding the type
+ // and all its dependent types.
+ for (auto &TU : TypeUnitsToAdd) {
+ InfoHolder.computeSizeAndOffsetsForUnit(TU.first.get());
+ InfoHolder.emitUnit(TU.first.get(), useSplitDwarf());
+ }
+ }
+ CU.addDIETypeSignature(RefDie, Signature);
+}
+
+// Accelerator table mutators - add each name along with its companion
+// DIE to the proper table while ensuring that the name that we're going
+// to reference is in the string table. We do this since the names we
+// add may not be identical to the names in the DIE.
+void DwarfDebug::addAccelName(StringRef Name, const DIE &Die) {
+ if (!useDwarfAccelTables())
+ return;
+ AccelNames.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+}
+
+void DwarfDebug::addAccelObjC(StringRef Name, const DIE &Die) {
+ if (!useDwarfAccelTables())
+ return;
+ AccelObjC.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+}
+
+void DwarfDebug::addAccelNamespace(StringRef Name, const DIE &Die) {
+ if (!useDwarfAccelTables())
+ return;
+ AccelNamespace.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+}
+
+void DwarfDebug::addAccelType(StringRef Name, const DIE &Die, char Flags) {
+ if (!useDwarfAccelTables())
+ return;
+ AccelTypes.AddName(InfoHolder.getStringPool().getEntry(*Asm, Name), &Die);
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
new file mode 100644
index 000000000000..6b06757628b6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -0,0 +1,571 @@
+//===-- llvm/CodeGen/DwarfDebug.h - Dwarf Debug Framework ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFDEBUG_H
+
+#include "DbgValueHistoryCalculator.h"
+#include "DebugHandlerBase.h"
+#include "DebugLocStream.h"
+#include "DwarfAccelTable.h"
+#include "DwarfFile.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Target/TargetOptions.h"
+#include <memory>
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class ConstantInt;
+class ConstantFP;
+class DebugLocEntry;
+class DwarfCompileUnit;
+class DwarfDebug;
+class DwarfTypeUnit;
+class DwarfUnit;
+class MachineModuleInfo;
+
+//===----------------------------------------------------------------------===//
+/// This class is used to track local variable information.
+///
+/// Variables can be created from allocas, in which case they're generated from
+/// the MMI table. Such variables can have multiple expressions and frame
+/// indices. The \a Expr and \a FrameIndex arrays must match.
+///
+/// Variables can be created from \c DBG_VALUE instructions. Those whose
+/// location changes over time use \a DebugLocListIndex, while those with a
+/// single instruction use \a MInsn and (optionally) a single entry of \a Expr.
+///
+/// Variables that have been optimized out use none of these fields.
+class DbgVariable {
+ const DILocalVariable *Var; /// Variable Descriptor.
+ const DILocation *IA; /// Inlined at location.
+ SmallVector<const DIExpression *, 1> Expr; /// Complex address.
+ DIE *TheDIE = nullptr; /// Variable DIE.
+ unsigned DebugLocListIndex = ~0u; /// Offset in DebugLocs.
+ const MachineInstr *MInsn = nullptr; /// DBG_VALUE instruction.
+ SmallVector<int, 1> FrameIndex; /// Frame index.
+
+public:
+ /// Construct a DbgVariable.
+ ///
+ /// Creates a variable without any DW_AT_location. Call \a initializeMMI()
+ /// for MMI entries, or \a initializeDbgValue() for DBG_VALUE instructions.
+ DbgVariable(const DILocalVariable *V, const DILocation *IA)
+ : Var(V), IA(IA) {}
+
+ /// Initialize from the MMI table.
+ void initializeMMI(const DIExpression *E, int FI) {
+ assert(Expr.empty() && "Already initialized?");
+ assert(FrameIndex.empty() && "Already initialized?");
+ assert(!MInsn && "Already initialized?");
+
+ assert((!E || E->isValid()) && "Expected valid expression");
+ assert(~FI && "Expected valid index");
+
+ Expr.push_back(E);
+ FrameIndex.push_back(FI);
+ }
+
+ /// Initialize from a DBG_VALUE instruction.
+ void initializeDbgValue(const MachineInstr *DbgValue) {
+ assert(Expr.empty() && "Already initialized?");
+ assert(FrameIndex.empty() && "Already initialized?");
+ assert(!MInsn && "Already initialized?");
+
+ assert(Var == DbgValue->getDebugVariable() && "Wrong variable");
+ assert(IA == DbgValue->getDebugLoc()->getInlinedAt() && "Wrong inlined-at");
+
+ MInsn = DbgValue;
+ if (auto *E = DbgValue->getDebugExpression())
+ if (E->getNumElements())
+ Expr.push_back(E);
+ }
+
+ // Accessors.
+ const DILocalVariable *getVariable() const { return Var; }
+ const DILocation *getInlinedAt() const { return IA; }
+ ArrayRef<const DIExpression *> getExpression() const { return Expr; }
+ const DIExpression *getSingleExpression() const {
+ assert(MInsn && Expr.size() <= 1);
+ return Expr.size() ? Expr[0] : nullptr;
+ }
+ void setDIE(DIE &D) { TheDIE = &D; }
+ DIE *getDIE() const { return TheDIE; }
+ void setDebugLocListIndex(unsigned O) { DebugLocListIndex = O; }
+ unsigned getDebugLocListIndex() const { return DebugLocListIndex; }
+ StringRef getName() const { return Var->getName(); }
+ const MachineInstr *getMInsn() const { return MInsn; }
+ ArrayRef<int> getFrameIndex() const { return FrameIndex; }
+
+ void addMMIEntry(const DbgVariable &V) {
+ assert(DebugLocListIndex == ~0U && !MInsn && "not an MMI entry");
+ assert(V.DebugLocListIndex == ~0U && !V.MInsn && "not an MMI entry");
+ assert(V.Var == Var && "conflicting variable");
+ assert(V.IA == IA && "conflicting inlined-at location");
+
+ assert(!FrameIndex.empty() && "Expected an MMI entry");
+ assert(!V.FrameIndex.empty() && "Expected an MMI entry");
+ assert(Expr.size() == FrameIndex.size() && "Mismatched expressions");
+ assert(V.Expr.size() == V.FrameIndex.size() && "Mismatched expressions");
+
+ Expr.append(V.Expr.begin(), V.Expr.end());
+ FrameIndex.append(V.FrameIndex.begin(), V.FrameIndex.end());
+ assert(std::all_of(Expr.begin(), Expr.end(), [](const DIExpression *E) {
+ return E && E->isBitPiece();
+ }) && "conflicting locations for variable");
+ }
+
+ // Translate tag to proper Dwarf tag.
+ dwarf::Tag getTag() const {
+ // FIXME: Why don't we just infer this tag and store it all along?
+ if (Var->isParameter())
+ return dwarf::DW_TAG_formal_parameter;
+
+ return dwarf::DW_TAG_variable;
+ }
+ /// Return true if DbgVariable is artificial.
+ bool isArtificial() const {
+ if (Var->isArtificial())
+ return true;
+ if (getType()->isArtificial())
+ return true;
+ return false;
+ }
+
+ bool isObjectPointer() const {
+ if (Var->isObjectPointer())
+ return true;
+ if (getType()->isObjectPointer())
+ return true;
+ return false;
+ }
+
+ bool hasComplexAddress() const {
+ assert(MInsn && "Expected DBG_VALUE, not MMI variable");
+ assert(FrameIndex.empty() && "Expected DBG_VALUE, not MMI variable");
+ assert(
+ (Expr.empty() || (Expr.size() == 1 && Expr.back()->getNumElements())) &&
+ "Invalid Expr for DBG_VALUE");
+ return !Expr.empty();
+ }
+ bool isBlockByrefVariable() const;
+ const DIType *getType() const;
+
+private:
+ template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
+ return Ref.resolve();
+ }
+};
+
+
+/// Helper used to pair up a symbol and its DWARF compile unit.
+struct SymbolCU {
+ SymbolCU(DwarfCompileUnit *CU, const MCSymbol *Sym) : Sym(Sym), CU(CU) {}
+ const MCSymbol *Sym;
+ DwarfCompileUnit *CU;
+};
+
+/// Collects and handles dwarf debug information.
+class DwarfDebug : public DebugHandlerBase {
+ /// All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
+
+ /// Maps MDNode with its corresponding DwarfCompileUnit.
+ MapVector<const MDNode *, DwarfCompileUnit *> CUMap;
+
+ /// Maps a CU DIE with its corresponding DwarfCompileUnit.
+ DenseMap<const DIE *, DwarfCompileUnit *> CUDieMap;
+
+ /// List of all labels used in aranges generation.
+ std::vector<SymbolCU> ArangeLabels;
+
+ /// Size of each symbol emitted (for those symbols that have a specific size).
+ DenseMap<const MCSymbol *, uint64_t> SymSize;
+
+ /// Collection of abstract variables.
+ DenseMap<const MDNode *, std::unique_ptr<DbgVariable>> AbstractVariables;
+ SmallVector<std::unique_ptr<DbgVariable>, 64> ConcreteVariables;
+
+ /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists
+ /// can refer to them in spite of insertions into this list.
+ DebugLocStream DebugLocs;
+
+ /// This is a collection of subprogram MDNodes that are processed to
+ /// create DIEs.
+ SmallPtrSet<const MDNode *, 16> ProcessedSPNodes;
+
+ /// If nonnull, stores the current machine function we're processing.
+ const MachineFunction *CurFn;
+
+ /// If nonnull, stores the CU in which the previous subprogram was contained.
+ const DwarfCompileUnit *PrevCU;
+
+ /// As an optimization, there is no need to emit an entry in the directory
+ /// table for the same directory as DW_AT_comp_dir.
+ StringRef CompilationDir;
+
+ /// Holder for the file specific debug information.
+ DwarfFile InfoHolder;
+
+ /// Holders for the various debug information flags that we might need to
+ /// have exposed. See accessor functions below for description.
+
+ /// Map from MDNodes for user-defined types to their type signatures. Also
+ /// used to keep track of which types we have emitted type units for.
+ DenseMap<const MDNode *, uint64_t> TypeSignatures;
+
+ SmallVector<
+ std::pair<std::unique_ptr<DwarfTypeUnit>, const DICompositeType *>, 1>
+ TypeUnitsUnderConstruction;
+
+ /// Whether to emit the pubnames/pubtypes sections.
+ bool HasDwarfPubSections;
+
+ /// Whether to use the GNU TLS opcode (instead of the standard opcode).
+ bool UseGNUTLSOpcode;
+
+ /// Whether to use DWARF 2 bitfields (instead of the DWARF 4 format).
+ bool UseDWARF2Bitfields;
+
+ /// Whether to emit all linkage names, or just abstract subprograms.
+ bool UseAllLinkageNames;
+
+ /// Version of dwarf we're emitting.
+ unsigned DwarfVersion;
+
+ /// DWARF5 Experimental Options
+ /// @{
+ bool HasDwarfAccelTables;
+ bool HasAppleExtensionAttributes;
+ bool HasSplitDwarf;
+
+ /// Separated Dwarf Variables
+ /// In general these will all be for bits that are left in the
+ /// original object file, rather than things that are meant
+ /// to be in the .dwo sections.
+
+ /// Holder for the skeleton information.
+ DwarfFile SkeletonHolder;
+
+ /// Store file names for type units under fission in a line table
+ /// header that will be emitted into debug_line.dwo.
+ // FIXME: replace this with a map from comp_dir to table so that we
+ // can emit multiple tables during LTO each of which uses directory
+ // 0, referencing the comp_dir of all the type units that use it.
+ MCDwarfDwoLineTable SplitTypeUnitFileTable;
+ /// @}
+
+ /// True iff this module contains only a single CU.
+ bool SingleCU;
+ bool IsDarwin;
+
+ AddressPool AddrPool;
+
+ DwarfAccelTable AccelNames;
+ DwarfAccelTable AccelObjC;
+ DwarfAccelTable AccelNamespace;
+ DwarfAccelTable AccelTypes;
+
+ // Identify a debugger for "tuning" the debug info.
+ DebuggerKind DebuggerTuning;
+
+ /// \defgroup DebuggerTuning Predicates to tune DWARF for a given debugger.
+ ///
+ /// Returns whether we are "tuning" for a given debugger.
+ /// Should be used only within the constructor, to set feature flags.
+ /// @{
+ bool tuneForGDB() const { return DebuggerTuning == DebuggerKind::GDB; }
+ bool tuneForLLDB() const { return DebuggerTuning == DebuggerKind::LLDB; }
+ bool tuneForSCE() const { return DebuggerTuning == DebuggerKind::SCE; }
+ /// @}
+
+ MCDwarfDwoLineTable *getDwoLineTable(const DwarfCompileUnit &);
+
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return InfoHolder.getUnits();
+ }
+
+ typedef DbgValueHistoryMap::InlinedVariable InlinedVariable;
+
+ /// Find abstract variable associated with Var.
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV,
+ const DILocalVariable *&Cleansed);
+ DbgVariable *getExistingAbstractVariable(InlinedVariable IV);
+ void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope);
+ void ensureAbstractVariableIsCreated(InlinedVariable Var,
+ const MDNode *Scope);
+ void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var,
+ const MDNode *Scope);
+
+ DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV);
+
+ /// Construct a DIE for this abstract scope.
+ void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+
+ void finishVariableDefinitions();
+
+ void finishSubprogramDefinitions();
+
+ /// Finish off debug information after all functions have been
+ /// processed.
+ void finalizeModuleInfo();
+
+ /// Emit the debug info section.
+ void emitDebugInfo();
+
+ /// Emit the abbreviation section.
+ void emitAbbreviations();
+
+ /// Emit a specified accelerator table.
+ void emitAccel(DwarfAccelTable &Accel, MCSection *Section,
+ StringRef TableName);
+
+ /// Emit visible names into a hashed accelerator table section.
+ void emitAccelNames();
+
+ /// Emit objective C classes and categories into a hashed
+ /// accelerator table section.
+ void emitAccelObjC();
+
+ /// Emit namespace dies into a hashed accelerator table.
+ void emitAccelNamespaces();
+
+ /// Emit type dies into a hashed accelerator table.
+ void emitAccelTypes();
+
+ /// Emit visible names into a debug pubnames section.
+ /// \param GnuStyle determines whether or not we want to emit
+ /// additional information into the table a la newer gcc for gdb
+ /// index.
+ void emitDebugPubNames(bool GnuStyle = false);
+
+ /// Emit visible types into a debug pubtypes section.
+ /// \param GnuStyle determines whether or not we want to emit
+ /// additional information into the table a la newer gcc for gdb
+ /// index.
+ void emitDebugPubTypes(bool GnuStyle = false);
+
+ void emitDebugPubSection(
+ bool GnuStyle, MCSection *PSec, StringRef Name,
+ const StringMap<const DIE *> &(DwarfCompileUnit::*Accessor)() const);
+
+ /// Emit null-terminated strings into a debug str section.
+ void emitDebugStr();
+
+ /// Emit variable locations into a debug loc section.
+ void emitDebugLoc();
+
+ /// Emit variable locations into a debug loc dwo section.
+ void emitDebugLocDWO();
+
+ /// Emit address ranges into a debug aranges section.
+ void emitDebugARanges();
+
+ /// Emit address ranges into a debug ranges section.
+ void emitDebugRanges();
+
+ /// Emit macros into a debug macinfo section.
+ void emitDebugMacinfo();
+ void emitMacro(DIMacro &M);
+ void emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U);
+ void handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U);
+
+ /// DWARF 5 Experimental Split Dwarf Emitters
+
+ /// Initialize common features of skeleton units.
+ void initSkeletonUnit(const DwarfUnit &U, DIE &Die,
+ std::unique_ptr<DwarfCompileUnit> NewU);
+
+ /// Construct the split debug info compile unit for the debug info
+ /// section.
+ DwarfCompileUnit &constructSkeletonCU(const DwarfCompileUnit &CU);
+
+ /// Emit the debug info dwo section.
+ void emitDebugInfoDWO();
+
+ /// Emit the debug abbrev dwo section.
+ void emitDebugAbbrevDWO();
+
+ /// Emit the debug line dwo section.
+ void emitDebugLineDWO();
+
+ /// Emit the debug str dwo section.
+ void emitDebugStrDWO();
+
+ /// Flags to let the linker know we have emitted new style pubnames. Only
+ /// emit it here if we don't have a skeleton CU for split dwarf.
+ void addGnuPubAttributes(DwarfUnit &U, DIE &D) const;
+
+ /// Create new DwarfCompileUnit for the given metadata node with tag
+ /// DW_TAG_compile_unit.
+ DwarfCompileUnit &constructDwarfCompileUnit(const DICompileUnit *DIUnit);
+
+ /// Construct imported_module or imported_declaration DIE.
+ void constructAndAddImportedEntityDIE(DwarfCompileUnit &TheCU,
+ const DIImportedEntity *N);
+
+ /// Register a source line with debug info. Returns the unique
+ /// label that was emitted and which provides correspondence to the
+ /// source line list.
+ void recordSourceLine(unsigned Line, unsigned Col, const MDNode *Scope,
+ unsigned Flags);
+
+ /// Populate LexicalScope entries with variables' info.
+ void collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP,
+ DenseSet<InlinedVariable> &ProcessedVars);
+
+ /// Build the location list for all DBG_VALUEs in the
+ /// function that describe the same variable.
+ void buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::InstrRanges &Ranges);
+
+ /// Collect variable information from the side table maintained
+ /// by MMI.
+ void collectVariableInfoFromMMITable(DenseSet<InlinedVariable> &P);
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfDebug(AsmPrinter *A, Module *M);
+
+ ~DwarfDebug() override;
+
+ /// Emit all Dwarf sections that should come prior to the
+ /// content.
+ void beginModule();
+
+ /// Emit all Dwarf sections that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function debug information.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function debug information.
+ void endFunction(const MachineFunction *MF) override;
+
+ /// Process beginning of an instruction.
+ void beginInstruction(const MachineInstr *MI) override;
+
+ /// Perform an MD5 checksum of \p Identifier and return the lower 64 bits.
+ static uint64_t makeTypeSignature(StringRef Identifier);
+
+ /// Add a DIE to the set of types that we're going to pull into
+ /// type units.
+ void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier,
+ DIE &Die, const DICompositeType *CTy);
+
+ /// Add a label so that arange data can be generated for it.
+ void addArangeLabel(SymbolCU SCU) { ArangeLabels.push_back(SCU); }
+
+ /// For symbols that have a size designated (e.g. common symbols),
+ /// this tracks that size.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {
+ SymSize[Sym] = Size;
+ }
+
+ /// Returns whether we should emit all DW_AT_[MIPS_]linkage_name.
+ /// If not, we still might emit certain cases.
+ bool useAllLinkageNames() const { return UseAllLinkageNames; }
+
+ /// Returns whether to use DW_OP_GNU_push_tls_address, instead of the
+ /// standard DW_OP_form_tls_address opcode.
+ bool useGNUTLSOpcode() const { return UseGNUTLSOpcode; }
+
+ /// Returns whether to use the DWARF2 format for bitfields instead of the
+ /// DWARF4 format.
+ bool useDWARF2Bitfields() const { return UseDWARF2Bitfields; }
+
+ // Experimental DWARF5 features.
+
+ /// Returns whether or not to emit tables that dwarf consumers can
+ /// use to accelerate lookup.
+ bool useDwarfAccelTables() const { return HasDwarfAccelTables; }
+
+ bool useAppleExtensionAttributes() const {
+ return HasAppleExtensionAttributes;
+ }
+
+ /// Returns whether or not the debug info should be emitted according to
+ /// the split dwarf proposal.
+ bool useSplitDwarf() const { return HasSplitDwarf; }
+
+ /// Returns the Dwarf Version.
+ unsigned getDwarfVersion() const { return DwarfVersion; }
+
+ /// Returns the previous CU that was being updated.
+ const DwarfCompileUnit *getPrevCU() const { return PrevCU; }
+ void setPrevCU(const DwarfCompileUnit *PrevCU) { this->PrevCU = PrevCU; }
+
+ /// Returns the entries for the .debug_loc section.
+ const DebugLocStream &getDebugLocs() const { return DebugLocs; }
+
+ /// Emit an entry for the debug loc section. This can be used to
+ /// handle an entry that's going to be emitted into the debug loc section.
+ void emitDebugLocEntry(ByteStreamer &Streamer,
+ const DebugLocStream::Entry &Entry);
+
+ /// Emit the location for a debug loc entry, including the size header.
+ void emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry);
+
+ /// Find the MDNode for the given reference.
+ template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
+ return Ref.resolve();
+ }
+
+ /// Find the DwarfCompileUnit for the given CU Die.
+ DwarfCompileUnit *lookupUnit(const DIE *CU) const {
+ return CUDieMap.lookup(CU);
+ }
+
+ void addSubprogramNames(const DISubprogram *SP, DIE &Die);
+
+ AddressPool &getAddressPool() { return AddrPool; }
+
+ void addAccelName(StringRef Name, const DIE &Die);
+
+ void addAccelObjC(StringRef Name, const DIE &Die);
+
+ void addAccelNamespace(StringRef Name, const DIE &Die);
+
+ void addAccelType(StringRef Name, const DIE &Die, char Flags);
+
+ const MachineFunction *getCurrentFunction() const { return CurFn; }
+
+ /// A helper function to check whether the DIE for a given Scope is
+ /// going to be null.
+ bool isLexicalScopeDIENull(LexicalScope *Scope);
+
+ // FIXME: Sink these functions down into DwarfFile/Dwarf*Unit.
+
+ SmallPtrSet<const MDNode *, 16> &getProcessedSPNodes() {
+ return ProcessedSPNodes;
+ }
+};
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
new file mode 100644
index 000000000000..8287f289f22b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -0,0 +1,95 @@
+//===-- DwarfException.h - Dwarf Exception Framework -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXCEPTION_H
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCDwarf.h"
+
+namespace llvm {
+class MachineFunction;
+class ARMTargetStreamer;
+
+class LLVM_LIBRARY_VISIBILITY DwarfCFIExceptionBase : public EHStreamer {
+protected:
+ DwarfCFIExceptionBase(AsmPrinter *A);
+
+ /// Per-function flag to indicate if frame CFI info should be emitted.
+ bool shouldEmitCFI;
+
+ void markFunctionEnd() override;
+ void endFragment() override;
+};
+
+class LLVM_LIBRARY_VISIBILITY DwarfCFIException : public DwarfCFIExceptionBase {
+ /// Per-function flag to indicate if .cfi_personality should be emitted.
+ bool shouldEmitPersonality;
+
+ /// Per-function flag to indicate if .cfi_personality must be emitted.
+ bool forceEmitPersonality;
+
+ /// Per-function flag to indicate if .cfi_lsda should be emitted.
+ bool shouldEmitLSDA;
+
+ /// Per-function flag to indicate if frame moves info should be emitted.
+ bool shouldEmitMoves;
+
+ AsmPrinter::CFIMoveType moveTypeModule;
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ DwarfCFIException(AsmPrinter *A);
+ ~DwarfCFIException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes being emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ void beginFragment(const MachineBasicBlock *MBB,
+ ExceptionSymbolProvider ESP) override;
+};
+
+class LLVM_LIBRARY_VISIBILITY ARMException : public DwarfCFIExceptionBase {
+ void emitTypeInfos(unsigned TTypeEncoding) override;
+ ARMTargetStreamer &getTargetStreamer();
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ ARMException(AsmPrinter *A);
+ ~ARMException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes being emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+};
+} // End of namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
new file mode 100644
index 000000000000..7dbc6cb39951
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -0,0 +1,285 @@
+//===-- llvm/CodeGen/DwarfExpression.cpp - Dwarf Debug Framework ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing dwarf debug info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfExpression.h"
+#include "DwarfDebug.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ if (DwarfReg < 32) {
+ EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+ } else {
+ EmitOp(dwarf::DW_OP_regx, Comment);
+ EmitUnsigned(DwarfReg);
+ }
+}
+
+void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset, bool Deref) {
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ if (DwarfReg < 32) {
+ EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+ } else {
+ EmitOp(dwarf::DW_OP_bregx);
+ EmitUnsigned(DwarfReg);
+ }
+ EmitSigned(Offset);
+ if (Deref)
+ EmitOp(dwarf::DW_OP_deref);
+}
+
+void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+ assert(SizeInBits > 0 && "piece has size zero");
+ const unsigned SizeOfByte = 8;
+ if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
+ EmitOp(dwarf::DW_OP_bit_piece);
+ EmitUnsigned(SizeInBits);
+ EmitUnsigned(OffsetInBits);
+ } else {
+ EmitOp(dwarf::DW_OP_piece);
+ unsigned ByteSize = SizeInBits / SizeOfByte;
+ EmitUnsigned(ByteSize);
+ }
+}
+
+void DwarfExpression::AddShr(unsigned ShiftBy) {
+ EmitOp(dwarf::DW_OP_constu);
+ EmitUnsigned(ShiftBy);
+ EmitOp(dwarf::DW_OP_shr);
+}
+
+bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI,
+ unsigned MachineReg, int Offset) {
+ if (isFrameRegister(TRI, MachineReg)) {
+ // If the variable's offset is relative to the frame register, use DW_OP_fbreg.
+ EmitOp(dwarf::DW_OP_fbreg);
+ EmitSigned(Offset);
+ return true;
+ }
+
+ int DwarfReg = TRI.getDwarfRegNum(MachineReg, false);
+ if (DwarfReg < 0)
+ return false;
+
+ AddRegIndirect(DwarfReg, Offset);
+ return true;
+}
+
+bool DwarfExpression::AddMachineRegPiece(const TargetRegisterInfo &TRI,
+ unsigned MachineReg,
+ unsigned PieceSizeInBits,
+ unsigned PieceOffsetInBits) {
+ if (!TRI.isPhysicalRegister(MachineReg))
+ return false;
+
+ int Reg = TRI.getDwarfRegNum(MachineReg, false);
+
+ // If this is a valid register number, emit it.
+ if (Reg >= 0) {
+ AddReg(Reg);
+ if (PieceSizeInBits)
+ AddOpPiece(PieceSizeInBits, PieceOffsetInBits);
+ return true;
+ }
+
+ // Walk up the super-register chain until we find a valid number.
+ // For example, EAX on x86_64 is a 32-bit piece of RAX with offset 0.
+ for (MCSuperRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) {
+ Reg = TRI.getDwarfRegNum(*SR, false);
+ if (Reg >= 0) {
+ unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);
+ unsigned Size = TRI.getSubRegIdxSize(Idx);
+ unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
+ AddReg(Reg, "super-register");
+ if (PieceOffsetInBits == RegOffset) {
+ AddOpPiece(Size, RegOffset);
+ } else {
+ // If this is part of a variable in a sub-register at a
+ // non-zero offset, we need to manually shift the value into
+ // place, since the DW_OP_piece describes the part of the
+ // variable, not the position of the subregister.
+ if (RegOffset)
+ AddShr(RegOffset);
+ AddOpPiece(Size, PieceOffsetInBits);
+ }
+ return true;
+ }
+ }
+
+ // Otherwise, attempt to find a covering set of sub-register numbers.
+ // For example, Q0 on ARM is a composition of D0+D1.
+ //
+ // Keep track of the current position so we can emit the more
+ // efficient DW_OP_piece.
+ unsigned CurPos = PieceOffsetInBits;
+ // The size of the register in bits, assuming 8 bits per byte.
+ unsigned RegSize = TRI.getMinimalPhysRegClass(MachineReg)->getSize() * 8;
+ // Keep track of the bits in the register we already emitted, so we
+ // can avoid emitting redundant aliasing subregs.
+ SmallBitVector Coverage(RegSize, false);
+ for (MCSubRegIterator SR(MachineReg, &TRI); SR.isValid(); ++SR) {
+ unsigned Idx = TRI.getSubRegIndex(MachineReg, *SR);
+ unsigned Size = TRI.getSubRegIdxSize(Idx);
+ unsigned Offset = TRI.getSubRegIdxOffset(Idx);
+ Reg = TRI.getDwarfRegNum(*SR, false);
+
+ // Intersection between the bits we already emitted and the bits
+ // covered by this subregister.
+ SmallBitVector Intersection(RegSize, false);
+ Intersection.set(Offset, Offset + Size);
+ Intersection ^= Coverage;
+
+ // If this sub-register has a DWARF number and we haven't covered
+ // its range, emit a DWARF piece for it.
+ if (Reg >= 0 && Intersection.any()) {
+ AddReg(Reg, "sub-register");
+ AddOpPiece(Size, Offset == CurPos ? 0 : Offset);
+ CurPos = Offset + Size;
+
+ // Mark it as emitted.
+ Coverage.set(Offset, Offset + Size);
+ }
+ }
+
+ return CurPos > PieceOffsetInBits;
+}
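The intent of the coverage loop above can be sketched with standard containers only; the snippet below (hypothetical types, emission reduced to counting) keeps a bit vector of bits already described so that aliasing sub-registers are emitted at most once.

#include <vector>

struct SubReg { unsigned Offset, Size; bool HasDwarfNum; };

// Walk sub-registers and count how many piece operations would be emitted
// for bits not yet covered by an earlier sub-register.
unsigned countSubRegPieces(const std::vector<SubReg> &Subs, unsigned RegSize) {
  std::vector<bool> Coverage(RegSize, false);
  unsigned Pieces = 0;
  for (const SubReg &S : Subs) {
    bool Uncovered = false;
    for (unsigned B = S.Offset; B < S.Offset + S.Size && B < RegSize; ++B)
      Uncovered = Uncovered || !Coverage[B];
    if (S.HasDwarfNum && Uncovered) {
      ++Pieces; // the real code emits the sub-register number plus a piece op
      for (unsigned B = S.Offset; B < S.Offset + S.Size && B < RegSize; ++B)
        Coverage[B] = true;
    }
  }
  return Pieces;
}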
+
+void DwarfExpression::AddStackValue() {
+ if (DwarfVersion >= 4)
+ EmitOp(dwarf::DW_OP_stack_value);
+}
+
+void DwarfExpression::AddSignedConstant(int64_t Value) {
+ EmitOp(dwarf::DW_OP_consts);
+ EmitSigned(Value);
+ AddStackValue();
+}
+
+void DwarfExpression::AddUnsignedConstant(uint64_t Value) {
+ EmitOp(dwarf::DW_OP_constu);
+ EmitUnsigned(Value);
+ AddStackValue();
+}
+
+void DwarfExpression::AddUnsignedConstant(const APInt &Value) {
+ unsigned Size = Value.getBitWidth();
+ const uint64_t *Data = Value.getRawData();
+
+ // Chop it up into 64-bit pieces, because that's the maximum that
+ // AddUnsignedConstant takes.
+ unsigned Offset = 0;
+ while (Offset < Size) {
+ AddUnsignedConstant(*Data++);
+ if (Offset == 0 && Size <= 64)
+ break;
+ AddOpPiece(std::min(Size-Offset, 64u), Offset);
+ Offset += 64;
+ }
+}
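A minimal sketch of the chopping strategy, with the emission calls replaced by a list of (value, size, offset) chunks; in the real loop a single chunk that already fits in 64 bits skips the piece operation.

#include <algorithm>
#include <cstdint>
#include <vector>

struct Chunk { uint64_t Value; unsigned SizeInBits, OffsetInBits; };

// Split a multi-word constant into 64-bit chunks; each chunk corresponds to
// one DW_OP_constu, followed by a piece op covering its bits.
std::vector<Chunk> chopConstant(const std::vector<uint64_t> &Words,
                                unsigned BitWidth) {
  std::vector<Chunk> Chunks;
  unsigned Offset = 0;
  for (uint64_t W : Words) {
    if (Offset >= BitWidth)
      break;
    Chunks.push_back({W, std::min(BitWidth - Offset, 64u), Offset});
    Offset += 64;
  }
  return Chunks;
}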
+
+static unsigned getOffsetOrZero(unsigned OffsetInBits,
+ unsigned PieceOffsetInBits) {
+ if (OffsetInBits == PieceOffsetInBits)
+ return 0;
+ assert(OffsetInBits >= PieceOffsetInBits && "overlapping pieces");
+ return OffsetInBits;
+}
+
+bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ const DIExpression *Expr,
+ unsigned MachineReg,
+ unsigned PieceOffsetInBits) {
+ auto I = Expr->expr_op_begin();
+ auto E = Expr->expr_op_end();
+ if (I == E)
+ return AddMachineRegPiece(TRI, MachineReg);
+
+ // Pattern-match combinations for which more efficient representations exist
+ // first.
+ bool ValidReg = false;
+ switch (I->getOp()) {
+ case dwarf::DW_OP_bit_piece: {
+ unsigned OffsetInBits = I->getArg(0);
+ unsigned SizeInBits = I->getArg(1);
+ // Piece always comes at the end of the expression.
+ return AddMachineRegPiece(TRI, MachineReg, SizeInBits,
+ getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+ }
+ case dwarf::DW_OP_plus:
+ case dwarf::DW_OP_minus: {
+ // [DW_OP_reg,Offset,DW_OP_plus, DW_OP_deref] --> [DW_OP_breg, Offset].
+ // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset].
+ auto N = I.getNext();
+ if (N != E && N->getOp() == dwarf::DW_OP_deref) {
+ unsigned Offset = I->getArg(0);
+ ValidReg = AddMachineRegIndirect(
+ TRI, MachineReg, I->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
+ std::advance(I, 2);
+ break;
+ } else
+ ValidReg = AddMachineRegPiece(TRI, MachineReg);
+ }
+ case dwarf::DW_OP_deref: {
+ // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
+ ValidReg = AddMachineRegIndirect(TRI, MachineReg);
+ ++I;
+ break;
+ }
+ default:
+ llvm_unreachable("unsupported operand");
+ }
+
+ if (!ValidReg)
+ return false;
+
+ // Emit remaining elements of the expression.
+ AddExpression(I, E, PieceOffsetInBits);
+ return true;
+}
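The peephole at the top of this function can be illustrated in isolation. The standalone snippet below (hypothetical types) only checks whether a leading plus/minus operand followed by a deref collapses into a base-register-plus-offset form.

#include <cstdint>
#include <utility>
#include <vector>

enum Op { Plus, Minus, Deref, BitPiece };

// Returns true if the expression starts with {plus|minus <offset>, deref},
// in which case the two operations are consumed and the location becomes a
// single register+offset (DW_OP_breg-style) operand.
bool matchBregPattern(const std::vector<std::pair<Op, int64_t>> &Ops,
                      int64_t &Offset) {
  if (Ops.size() < 2 || Ops[1].first != Deref)
    return false;
  if (Ops[0].first == Plus) {
    Offset = Ops[0].second;
    return true;
  }
  if (Ops[0].first == Minus) {
    Offset = -Ops[0].second;
    return true;
  }
  return false;
}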
+
+void DwarfExpression::AddExpression(DIExpression::expr_op_iterator I,
+ DIExpression::expr_op_iterator E,
+ unsigned PieceOffsetInBits) {
+ for (; I != E; ++I) {
+ switch (I->getOp()) {
+ case dwarf::DW_OP_bit_piece: {
+ unsigned OffsetInBits = I->getArg(0);
+ unsigned SizeInBits = I->getArg(1);
+ AddOpPiece(SizeInBits, getOffsetOrZero(OffsetInBits, PieceOffsetInBits));
+ break;
+ }
+ case dwarf::DW_OP_plus:
+ EmitOp(dwarf::DW_OP_plus_uconst);
+ EmitUnsigned(I->getArg(0));
+ break;
+ case dwarf::DW_OP_minus:
+ // There is no OP_minus_uconst.
+ EmitOp(dwarf::DW_OP_constu);
+ EmitUnsigned(I->getArg(0));
+ EmitOp(dwarf::DW_OP_minus);
+ break;
+ case dwarf::DW_OP_deref:
+ EmitOp(dwarf::DW_OP_deref);
+ break;
+ default:
+ llvm_unreachable("unhandled opcode found in expression");
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
new file mode 100644
index 000000000000..5fff28d8a13c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -0,0 +1,152 @@
+//===-- llvm/CodeGen/DwarfExpression.h - Dwarf Expression ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing DWARF expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFEXPRESSION_H
+
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+
+class AsmPrinter;
+class ByteStreamer;
+class TargetRegisterInfo;
+class DwarfUnit;
+class DIELoc;
+
+/// Base class containing the logic for constructing DWARF expressions
+/// independently of whether they are emitted into a DIE or into a .debug_loc
+/// entry.
+class DwarfExpression {
+protected:
+ // Various convenience accessors that extract things out of AsmPrinter.
+ unsigned DwarfVersion;
+
+public:
+ DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
+ virtual ~DwarfExpression() {}
+
+ /// Output a dwarf operand and an optional assembler comment.
+ virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+ /// Emit a raw signed value.
+ virtual void EmitSigned(int64_t Value) = 0;
+ /// Emit a raw unsigned value.
+ virtual void EmitUnsigned(uint64_t Value) = 0;
+ /// Return whether the given machine register is the frame register in the
+ /// current function.
+ virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
+
+ /// Emit a dwarf register operation.
+ void AddReg(int DwarfReg, const char *Comment = nullptr);
+ /// Emit a (double-)indirect dwarf register operation.
+ void AddRegIndirect(int DwarfReg, int Offset, bool Deref = false);
+
+ /// Emit a dwarf register operation for describing
+ /// - a small value occupying only part of a register or
+ /// - a register representing only part of a value.
+ void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+ /// Emit a shift-right dwarf expression.
+ void AddShr(unsigned ShiftBy);
+ /// Emit a DW_OP_stack_value, if supported.
+ ///
+ /// The proper way to describe a constant value is
+ /// DW_OP_constu <const>, DW_OP_stack_value.
+ /// Unfortunately, DW_OP_stack_value was not available until DWARF-4,
+ /// so we will continue to generate DW_OP_constu <const> for DWARF-2
+ /// and DWARF-3. Technically, this is incorrect since DW_OP_const <const>
+ /// actually describes a value at a constant address, not a constant value.
+ /// However, in the past there was no better way to describe a constant
+ /// value, so the producers and consumers started to rely on heuristics
+ /// to disambiguate the value vs. location status of the expression.
+ /// See PR21176 for more details.
+ void AddStackValue();
+
+ /// Emit an indirect dwarf register operation for the given machine register.
+ /// \return false if no DWARF register exists for MachineReg.
+ bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ int Offset = 0);
+
+ /// \brief Emit a partial DWARF register operation.
+ /// \param MachineReg the register
+ /// \param PieceSizeInBits size and
+ /// \param PieceOffsetInBits offset of the piece in bits, if this is one
+ /// piece of an aggregate value.
+ ///
+ /// If size and offset are zero, an operation for the entire
+ /// register is emitted: some targets do not provide a DWARF
+ /// register number for every register. If this is the case, this
+ /// function will attempt to emit a DWARF register by emitting a
+ /// piece of a super-register or by piecing together multiple
+ /// subregisters that alias the register.
+ ///
+ /// \return false if no DWARF register exists for MachineReg.
+ bool AddMachineRegPiece(const TargetRegisterInfo &TRI, unsigned MachineReg,
+ unsigned PieceSizeInBits = 0,
+ unsigned PieceOffsetInBits = 0);
+
+ /// Emit a signed constant.
+ void AddSignedConstant(int64_t Value);
+ /// Emit an unsigned constant.
+ void AddUnsignedConstant(uint64_t Value);
+ /// Emit an unsigned constant.
+ void AddUnsignedConstant(const APInt &Value);
+
+ /// \brief Emit an entire expression on top of a machine register location.
+ ///
+ /// \param PieceOffsetInBits If this is one piece out of a fragmented
+ /// location, this is the offset of the piece inside the entire variable.
+ /// \return false if no DWARF register exists for MachineReg.
+ bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
+ const DIExpression *Expr, unsigned MachineReg,
+ unsigned PieceOffsetInBits = 0);
+ /// Emit the remaining operations in the DIExpression iterator range [I, E).
+ /// \param PieceOffsetInBits If this is one piece out of a fragmented
+ /// location, this is the offset of the piece inside the entire variable.
+ void AddExpression(DIExpression::expr_op_iterator I,
+ DIExpression::expr_op_iterator E,
+ unsigned PieceOffsetInBits = 0);
+};
+
+/// DwarfExpression implementation for .debug_loc entries.
+class DebugLocDwarfExpression : public DwarfExpression {
+ ByteStreamer &BS;
+
+public:
+ DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)
+ : DwarfExpression(DwarfVersion), BS(BS) {}
+
+ void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void EmitSigned(int64_t Value) override;
+ void EmitUnsigned(uint64_t Value) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
+};
+
+/// DwarfExpression implementation for singular DW_AT_location.
+class DIEDwarfExpression : public DwarfExpression {
+ const AsmPrinter &AP;
+ DwarfUnit &DU;
+ DIELoc &DIE;
+
+public:
+ DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
+ void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
+ void EmitSigned(int64_t Value) override;
+ void EmitUnsigned(uint64_t Value) override;
+ bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) override;
+};
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
new file mode 100644
index 000000000000..e9fe98ab3cf9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -0,0 +1,180 @@
+//===-- llvm/CodeGen/DwarfFile.cpp - Dwarf Debug Framework ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfFile.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "DwarfUnit.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+namespace llvm {
+DwarfFile::DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA)
+ : Asm(AP), StrPool(DA, *Asm, Pref) {}
+
+DwarfFile::~DwarfFile() {
+ for (DIEAbbrev *Abbrev : Abbreviations)
+ Abbrev->~DIEAbbrev();
+}
+
+// Define a unique number for the abbreviation.
+//
+DIEAbbrev &DwarfFile::assignAbbrevNumber(DIE &Die) {
+ FoldingSetNodeID ID;
+ DIEAbbrev Abbrev = Die.generateAbbrev();
+ Abbrev.Profile(ID);
+
+ void *InsertPos;
+ if (DIEAbbrev *Existing =
+ AbbreviationsSet.FindNodeOrInsertPos(ID, InsertPos)) {
+ Die.setAbbrevNumber(Existing->getNumber());
+ return *Existing;
+ }
+
+ // Move the abbreviation to the heap and assign a number.
+ DIEAbbrev *New = new (AbbrevAllocator) DIEAbbrev(std::move(Abbrev));
+ Abbreviations.push_back(New);
+ New->setNumber(Abbreviations.size());
+ Die.setAbbrevNumber(Abbreviations.size());
+
+ // Store it for lookup.
+ AbbreviationsSet.InsertNode(New, InsertPos);
+ return *New;
+}
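A minimal sketch of the uniquing scheme, using a plain hash map keyed by a precomputed profile string in place of the FoldingSet (names are hypothetical).

#include <string>
#include <unordered_map>

// Identical profiles share one abbreviation number; a new profile gets the
// next index, matching the FoldingSet-based logic above.
unsigned assignNumber(std::unordered_map<std::string, unsigned> &Table,
                      const std::string &Profile) {
  auto It = Table.find(Profile);
  if (It != Table.end())
    return It->second; // reuse the existing number
  unsigned Number = static_cast<unsigned>(Table.size()) + 1;
  Table.emplace(Profile, Number);
  return Number;
}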
+
+void DwarfFile::addUnit(std::unique_ptr<DwarfCompileUnit> U) {
+ CUs.push_back(std::move(U));
+}
+
+// Emit the various dwarf units to the unit section USection with
+// the abbreviations going into ASection.
+void DwarfFile::emitUnits(bool UseOffsets) {
+ for (const auto &TheU : CUs)
+ emitUnit(TheU.get(), UseOffsets);
+}
+
+void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) {
+ DIE &Die = TheU->getUnitDie();
+ MCSection *USection = TheU->getSection();
+ Asm->OutStreamer->SwitchSection(USection);
+
+ TheU->emitHeader(UseOffsets);
+
+ Asm->emitDwarfDIE(Die);
+}
+
+// Compute the size and offset for each DIE.
+void DwarfFile::computeSizeAndOffsets() {
+ // Offset from the first CU in the debug info section is 0 initially.
+ unsigned SecOffset = 0;
+
+ // Iterate over each compile unit and set the size and offsets for each
+ // DIE within each compile unit. All offsets are CU relative.
+ for (const auto &TheU : CUs) {
+ TheU->setDebugInfoOffset(SecOffset);
+ SecOffset += computeSizeAndOffsetsForUnit(TheU.get());
+ }
+}
+
+unsigned DwarfFile::computeSizeAndOffsetsForUnit(DwarfUnit *TheU) {
+ // CU-relative offset is reset to 0 here.
+ unsigned Offset = sizeof(int32_t) + // Length of Unit Info
+ TheU->getHeaderSize(); // Unit-specific headers
+
+ // The return value here is CU-relative, after laying out
+ // all of the CU DIE.
+ return computeSizeAndOffset(TheU->getUnitDie(), Offset);
+}
+
+// Compute the size and offset of a DIE. The offset is relative to start of the
+// CU. It returns the offset after laying out the DIE.
+unsigned DwarfFile::computeSizeAndOffset(DIE &Die, unsigned Offset) {
+ // Record the abbreviation.
+ const DIEAbbrev &Abbrev = assignAbbrevNumber(Die);
+
+ // Set DIE offset
+ Die.setOffset(Offset);
+
+ // Start the size with the size of abbreviation code.
+ Offset += getULEB128Size(Die.getAbbrevNumber());
+
+ // Size the DIE attribute values.
+ for (const auto &V : Die.values())
+ // Size attribute value.
+ Offset += V.SizeOf(Asm);
+
+ // Size the DIE children if any.
+ if (Die.hasChildren()) {
+ (void)Abbrev;
+ assert(Abbrev.hasChildren() && "Children flag not set");
+
+ for (auto &Child : Die.children())
+ Offset = computeSizeAndOffset(Child, Offset);
+
+ // End of children marker.
+ Offset += sizeof(int8_t);
+ }
+
+ Die.setSize(Offset - Die.getOffset());
+ return Offset;
+}
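The recursion above is a pre-order layout over the DIE tree. A standalone sketch under that simplification (plain structs; a fixed attribute byte count stands in for summing SizeOf over the values):

#include <cstdint>
#include <vector>

struct Node {
  unsigned AbbrevCode = 1; // stands in for the DIE's abbreviation number
  unsigned AttrBytes = 0;  // total size of the attribute values
  std::vector<Node> Children;
  unsigned Offset = 0, Size = 0;
};

// Number of bytes needed to encode V as ULEB128.
unsigned ulebSize(uint64_t V) {
  unsigned N = 0;
  do { ++N; V >>= 7; } while (V);
  return N;
}

// Pre-order layout: a node costs its ULEB128 abbreviation code plus its
// attribute bytes, children follow immediately, and a one-byte
// end-of-children marker closes any non-empty child list.
unsigned layout(Node &N, unsigned Offset) {
  N.Offset = Offset;
  Offset += ulebSize(N.AbbrevCode) + N.AttrBytes;
  if (!N.Children.empty()) {
    for (Node &C : N.Children)
      Offset = layout(C, Offset);
    Offset += 1; // end-of-children marker
  }
  N.Size = Offset - N.Offset;
  return Offset;
}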
+
+void DwarfFile::emitAbbrevs(MCSection *Section) {
+ // Check to see if it is worth the effort.
+ if (!Abbreviations.empty()) {
+ // Start the debug abbrev section.
+ Asm->OutStreamer->SwitchSection(Section);
+ Asm->emitDwarfAbbrevs(Abbreviations);
+ }
+}
+
+// Emit strings into a string section.
+void DwarfFile::emitStrings(MCSection *StrSection, MCSection *OffsetSection) {
+ StrPool.emit(*Asm, StrSection, OffsetSection);
+}
+
+bool DwarfFile::addScopeVariable(LexicalScope *LS, DbgVariable *Var) {
+ SmallVectorImpl<DbgVariable *> &Vars = ScopeVariables[LS];
+ const DILocalVariable *DV = Var->getVariable();
+ // Variables with positive arg numbers are parameters.
+ if (unsigned ArgNum = DV->getArg()) {
+ // Keep all parameters in order at the start of the variable list to ensure
+ // function types are correct (no out-of-order parameters).
+ //
+ // This could be improved by only doing it for optimized builds (unoptimized
+ // builds have the right order to begin with), searching from the back (this
+ // would catch the unoptimized case quickly), or doing a binary search
+ // rather than linear search.
+ auto I = Vars.begin();
+ while (I != Vars.end()) {
+ unsigned CurNum = (*I)->getVariable()->getArg();
+ // A local (non-parameter) variable has been found, insert immediately
+ // before it.
+ if (CurNum == 0)
+ break;
+ // A later indexed parameter has been found, insert immediately before it.
+ if (CurNum > ArgNum)
+ break;
+ if (CurNum == ArgNum) {
+ (*I)->addMMIEntry(*Var);
+ return false;
+ }
+ ++I;
+ }
+ Vars.insert(I, Var);
+ return true;
+ }
+
+ Vars.push_back(Var);
+ return true;
+}
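+
+// Illustrative ordering (hypothetical scope): if parameters with argument
+// numbers 2 and 1 and then a local variable (argument number 0) are added in
+// that order, the list ends up as [arg 1, arg 2, local] - parameters first,
+// in argument order, with locals appended afterwards. Adding a variable for
+// argument 2 a second time merges it into the existing entry via
+// addMMIEntry() and returns false.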
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
new file mode 100644
index 000000000000..b73d89b0e499
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfFile.h
@@ -0,0 +1,128 @@
+//===-- llvm/CodeGen/DwarfFile.h - Dwarf Debug Framework -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFFILE_H
+
+#include "AddressPool.h"
+#include "DwarfStringPool.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Allocator.h"
+#include <memory>
+#include <vector>
+
+namespace llvm {
+class AsmPrinter;
+class DbgVariable;
+class DwarfCompileUnit;
+class DwarfUnit;
+class DIEAbbrev;
+class MCSymbol;
+class DIE;
+class LexicalScope;
+class StringRef;
+class DwarfDebug;
+class MCSection;
+class MDNode;
+class DwarfFile {
+ // Target of Dwarf emission, used for sizing of abbreviations.
+ AsmPrinter *Asm;
+
+ BumpPtrAllocator AbbrevAllocator;
+
+ // Used to uniquely define abbreviations.
+ FoldingSet<DIEAbbrev> AbbreviationsSet;
+
+ // A list of all the unique abbreviations in use.
+ std::vector<DIEAbbrev *> Abbreviations;
+
+ // A pointer to all units in the section.
+ SmallVector<std::unique_ptr<DwarfCompileUnit>, 1> CUs;
+
+ DwarfStringPool StrPool;
+
+ // Collection of dbg variables of a scope.
+ DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> ScopeVariables;
+
+ // Collection of abstract subprogram DIEs.
+ DenseMap<const MDNode *, DIE *> AbstractSPDies;
+
+ /// Maps type-system MDNodes to their corresponding DIEs. These DIEs can be
+ /// shared across CUs; that is why the map is kept here instead of in
+ /// DwarfCompileUnit.
+ DenseMap<const MDNode *, DIE *> DITypeNodeToDieMap;
+
+public:
+ DwarfFile(AsmPrinter *AP, StringRef Pref, BumpPtrAllocator &DA);
+
+ ~DwarfFile();
+
+ const SmallVectorImpl<std::unique_ptr<DwarfCompileUnit>> &getUnits() {
+ return CUs;
+ }
+
+ /// \brief Compute the size and offset of a DIE given an incoming Offset.
+ unsigned computeSizeAndOffset(DIE &Die, unsigned Offset);
+
+ /// \brief Compute the size and offset of all the DIEs.
+ void computeSizeAndOffsets();
+
+ /// \brief Compute the size and offset of all the DIEs in the given unit.
+ /// \returns The size of the root DIE.
+ unsigned computeSizeAndOffsetsForUnit(DwarfUnit *TheU);
+
+ /// Define a unique number for the abbreviation.
+ ///
+ /// Compute the abbreviation for \c Die, look up its unique number, and
+ /// return a reference to it in the uniquing table.
+ DIEAbbrev &assignAbbrevNumber(DIE &Die);
+
+ /// \brief Add a unit to the list of CUs.
+ void addUnit(std::unique_ptr<DwarfCompileUnit> U);
+
+ /// \brief Emit all of the units to their assigned sections.
+ void emitUnits(bool UseOffsets);
+
+ /// \brief Emit the given unit to its section.
+ void emitUnit(DwarfUnit *U, bool UseOffsets);
+
+ /// \brief Emit a set of abbreviations to the specific section.
+ void emitAbbrevs(MCSection *);
+
+ /// \brief Emit all of the strings to the section given.
+ void emitStrings(MCSection *StrSection, MCSection *OffsetSection = nullptr);
+
+ /// \brief Returns the string pool.
+ DwarfStringPool &getStringPool() { return StrPool; }
+
+ /// \returns false if the variable was merged with a previous one.
+ bool addScopeVariable(LexicalScope *LS, DbgVariable *Var);
+
+ DenseMap<LexicalScope *, SmallVector<DbgVariable *, 8>> &getScopeVariables() {
+ return ScopeVariables;
+ }
+
+ DenseMap<const MDNode *, DIE *> &getAbstractSPDies() {
+ return AbstractSPDies;
+ }
+
+ void insertDIE(const MDNode *TypeMD, DIE *Die) {
+ DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die));
+ }
+ DIE *getDIE(const MDNode *TypeMD) {
+ return DITypeNodeToDieMap.lookup(TypeMD);
+ }
+};
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
new file mode 100644
index 000000000000..2066f745e318
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -0,0 +1,74 @@
+//===-- llvm/CodeGen/DwarfStringPool.cpp - Dwarf Debug Framework ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfStringPool.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+
+using namespace llvm;
+
+DwarfStringPool::DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm,
+ StringRef Prefix)
+ : Pool(A), Prefix(Prefix),
+ ShouldCreateSymbols(Asm.MAI->doesDwarfUseRelocationsAcrossSections()) {}
+
+DwarfStringPool::EntryRef DwarfStringPool::getEntry(AsmPrinter &Asm,
+ StringRef Str) {
+ auto I = Pool.insert(std::make_pair(Str, EntryTy()));
+ if (I.second) {
+ auto &Entry = I.first->second;
+ Entry.Index = Pool.size() - 1;
+ Entry.Offset = NumBytes;
+ Entry.Symbol = ShouldCreateSymbols ? Asm.createTempSymbol(Prefix) : nullptr;
+
+ NumBytes += Str.size() + 1;
+ assert(NumBytes > Entry.Offset && "Unexpected overflow");
+ }
+ return EntryRef(*I.first);
+}
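+
+// Minimal usage sketch (assuming a BumpPtrAllocator Alloc, an AsmPrinter AP
+// and a hypothetical "str" label prefix); repeated lookups of the same string
+// return the same entry, keeping its first-assigned index, offset and label:
+//
+//   DwarfStringPool Pool(Alloc, AP, "str");
+//   auto Main = Pool.getEntry(AP, "main"); // index 0, offset 0
+//   auto Dup  = Pool.getEntry(AP, "main"); // same entry as Main
+//   auto Argv = Pool.getEntry(AP, "argv"); // index 1, offset 5 ("main" + NUL)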
+
+void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
+ MCSection *OffsetSection) {
+ if (Pool.empty())
+ return;
+
+ // Start the dwarf str section.
+ Asm.OutStreamer->SwitchSection(StrSection);
+
+ // Gather all of the string pool entries into an array indexed by their ID so
+ // that they are emitted in insertion order.
+ SmallVector<const StringMapEntry<EntryTy> *, 64> Entries(Pool.size());
+
+ for (const auto &E : Pool)
+ Entries[E.getValue().Index] = &E;
+
+ for (const auto &Entry : Entries) {
+ assert(ShouldCreateSymbols == static_cast<bool>(Entry->getValue().Symbol) &&
+ "Mismatch between setting and entry");
+
+ // Emit a label for reference from debug information entries.
+ if (ShouldCreateSymbols)
+ Asm.OutStreamer->EmitLabel(Entry->getValue().Symbol);
+
+ // Emit the string itself with a terminating null byte.
+ Asm.OutStreamer->AddComment("string offset=" +
+ Twine(Entry->getValue().Offset));
+ Asm.OutStreamer->EmitBytes(
+ StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
+ }
+
+ // If we've got an offset section go ahead and emit that now as well.
+ if (OffsetSection) {
+ Asm.OutStreamer->SwitchSection(OffsetSection);
+ unsigned size = 4; // FIXME: DWARF64 is 8.
+ for (const auto &Entry : Entries)
+ Asm.OutStreamer->EmitIntValue(Entry->getValue().Offset, size);
+ }
+}
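+
+// Resulting layout (sketch): the string section holds the NUL-terminated
+// strings back to back in index order, each preceded by its label when
+// ShouldCreateSymbols is set, while the optional offset section holds one
+// 4-byte entry per string giving its byte offset into the string section
+// (8-byte entries would be needed for DWARF64, per the FIXME above).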
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
new file mode 100644
index 000000000000..93a168485a54
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.h
@@ -0,0 +1,49 @@
+//===-- llvm/CodeGen/DwarfStringPool.h - Dwarf Debug Framework -*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFSTRINGPOOL_H
+
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/DwarfStringPoolEntry.h"
+#include "llvm/Support/Allocator.h"
+#include <utility>
+
+namespace llvm {
+
+class AsmPrinter;
+class MCSymbol;
+class MCSection;
+class StringRef;
+
+// A pool of the strings used by this unit, together with the symbols and
+// offsets through which they are referenced, either directly or via an index.
+class DwarfStringPool {
+ typedef DwarfStringPoolEntry EntryTy;
+ StringMap<EntryTy, BumpPtrAllocator &> Pool;
+ StringRef Prefix;
+ unsigned NumBytes = 0;
+ bool ShouldCreateSymbols;
+
+public:
+ typedef DwarfStringPoolEntryRef EntryRef;
+
+ DwarfStringPool(BumpPtrAllocator &A, AsmPrinter &Asm, StringRef Prefix);
+
+ void emit(AsmPrinter &Asm, MCSection *StrSection,
+ MCSection *OffsetSection = nullptr);
+
+ bool empty() const { return Pool.empty(); }
+
+ /// Get a reference to an entry in the string pool.
+ EntryRef getEntry(AsmPrinter &Asm, StringRef Str);
+};
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
new file mode 100644
index 000000000000..4100d728a53b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -0,0 +1,1541 @@
+//===-- llvm/CodeGen/DwarfUnit.cpp - Dwarf Type and Compile Units ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DwarfUnit.h"
+#include "DwarfAccelTable.h"
+#include "DwarfCompileUnit.h"
+#include "DwarfDebug.h"
+#include "DwarfExpression.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfdebug"
+
+static cl::opt<bool>
+GenerateDwarfTypeUnits("generate-type-units", cl::Hidden,
+ cl::desc("Generate DWARF4 type units."),
+ cl::init(false));
+
+DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
+ DIELoc &DIE)
+ : DwarfExpression(AP.getDwarfDebug()->getDwarfVersion()), AP(AP), DU(DU),
+ DIE(DIE) {}
+
+void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+ DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
+}
+void DIEDwarfExpression::EmitSigned(int64_t Value) {
+ DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
+}
+void DIEDwarfExpression::EmitUnsigned(uint64_t Value) {
+ DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
+}
+bool DIEDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) {
+ return MachineReg == TRI.getFrameRegister(*AP.MF);
+}
+
+DwarfUnit::DwarfUnit(dwarf::Tag UnitTag, const DICompileUnit *Node,
+ AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU)
+ : CUNode(Node), UnitDie(*DIE::get(DIEValueAllocator, UnitTag)), Asm(A),
+ DD(DW), DU(DWU), IndexTyDie(nullptr), Section(nullptr) {
+ assert(UnitTag == dwarf::DW_TAG_compile_unit ||
+ UnitTag == dwarf::DW_TAG_type_unit);
+}
+
+DwarfTypeUnit::DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A,
+ DwarfDebug *DW, DwarfFile *DWU,
+ MCDwarfDwoLineTable *SplitLineTable)
+ : DwarfUnit(dwarf::DW_TAG_type_unit, CU.getCUNode(), A, DW, DWU), CU(CU),
+ SplitLineTable(SplitLineTable) {
+ if (SplitLineTable)
+ addSectionOffset(UnitDie, dwarf::DW_AT_stmt_list, 0);
+}
+
+DwarfUnit::~DwarfUnit() {
+ for (unsigned j = 0, M = DIEBlocks.size(); j < M; ++j)
+ DIEBlocks[j]->~DIEBlock();
+ for (unsigned j = 0, M = DIELocs.size(); j < M; ++j)
+ DIELocs[j]->~DIELoc();
+}
+
+int64_t DwarfUnit::getDefaultLowerBound() const {
+ switch (getLanguage()) {
+ default:
+ break;
+
+ case dwarf::DW_LANG_C89:
+ case dwarf::DW_LANG_C99:
+ case dwarf::DW_LANG_C:
+ case dwarf::DW_LANG_C_plus_plus:
+ case dwarf::DW_LANG_ObjC:
+ case dwarf::DW_LANG_ObjC_plus_plus:
+ return 0;
+
+ case dwarf::DW_LANG_Fortran77:
+ case dwarf::DW_LANG_Fortran90:
+ case dwarf::DW_LANG_Fortran95:
+ return 1;
+
+ // The languages below have valid values only if the DWARF version >= 4.
+ case dwarf::DW_LANG_Java:
+ case dwarf::DW_LANG_Python:
+ case dwarf::DW_LANG_UPC:
+ case dwarf::DW_LANG_D:
+ if (dwarf::DWARF_VERSION >= 4)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Ada83:
+ case dwarf::DW_LANG_Ada95:
+ case dwarf::DW_LANG_Cobol74:
+ case dwarf::DW_LANG_Cobol85:
+ case dwarf::DW_LANG_Modula2:
+ case dwarf::DW_LANG_Pascal83:
+ case dwarf::DW_LANG_PLI:
+ if (dwarf::DWARF_VERSION >= 4)
+ return 1;
+ break;
+
+ // The languages below have valid values only if the DWARF version >= 5.
+ case dwarf::DW_LANG_OpenCL:
+ case dwarf::DW_LANG_Go:
+ case dwarf::DW_LANG_Haskell:
+ case dwarf::DW_LANG_C_plus_plus_03:
+ case dwarf::DW_LANG_C_plus_plus_11:
+ case dwarf::DW_LANG_OCaml:
+ case dwarf::DW_LANG_Rust:
+ case dwarf::DW_LANG_C11:
+ case dwarf::DW_LANG_Swift:
+ case dwarf::DW_LANG_Dylan:
+ case dwarf::DW_LANG_C_plus_plus_14:
+ if (dwarf::DWARF_VERSION >= 5)
+ return 0;
+ break;
+
+ case dwarf::DW_LANG_Modula3:
+ case dwarf::DW_LANG_Julia:
+ case dwarf::DW_LANG_Fortran03:
+ case dwarf::DW_LANG_Fortran08:
+ if (dwarf::DWARF_VERSION >= 5)
+ return 1;
+ break;
+ }
+
+ return -1;
+}
+
+/// Check whether the DIE for this MDNode can be shared across CUs.
+static bool isShareableAcrossCUs(const DINode *D) {
+ // When the MDNode can be part of the type system, the DIE can be shared
+ // across CUs.
+ // Combining type units and cross-CU DIE sharing is lower value (since
+ // cross-CU DIE sharing is used in LTO and removes type redundancy at that
+ // level already) but may be implementable for some value in projects
+ // building multiple independent libraries with LTO and then linking those
+ // together.
+ return (isa<DIType>(D) ||
+ (isa<DISubprogram>(D) && !cast<DISubprogram>(D)->isDefinition())) &&
+ !GenerateDwarfTypeUnits;
+}
+
+DIE *DwarfUnit::getDIE(const DINode *D) const {
+ if (isShareableAcrossCUs(D))
+ return DU->getDIE(D);
+ return MDNodeToDieMap.lookup(D);
+}
+
+void DwarfUnit::insertDIE(const DINode *Desc, DIE *D) {
+ if (isShareableAcrossCUs(Desc)) {
+ DU->insertDIE(Desc, D);
+ return;
+ }
+ MDNodeToDieMap.insert(std::make_pair(Desc, D));
+}
+
+void DwarfUnit::addFlag(DIE &Die, dwarf::Attribute Attribute) {
+ if (DD->getDwarfVersion() >= 4)
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_flag_present,
+ DIEInteger(1));
+ else
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_flag,
+ DIEInteger(1));
+}
+
+void DwarfUnit::addUInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, uint64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(false, Integer);
+ Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer));
+}
+
+void DwarfUnit::addUInt(DIEValueList &Block, dwarf::Form Form,
+ uint64_t Integer) {
+ addUInt(Block, (dwarf::Attribute)0, Form, Integer);
+}
+
+void DwarfUnit::addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, int64_t Integer) {
+ if (!Form)
+ Form = DIEInteger::BestForm(true, Integer);
+ Die.addValue(DIEValueAllocator, Attribute, *Form, DIEInteger(Integer));
+}
+
+void DwarfUnit::addSInt(DIELoc &Die, Optional<dwarf::Form> Form,
+ int64_t Integer) {
+ addSInt(Die, (dwarf::Attribute)0, Form, Integer);
+}
+
+void DwarfUnit::addString(DIE &Die, dwarf::Attribute Attribute,
+ StringRef String) {
+ Die.addValue(DIEValueAllocator, Attribute,
+ isDwoUnit() ? dwarf::DW_FORM_GNU_str_index : dwarf::DW_FORM_strp,
+ DIEString(DU->getStringPool().getEntry(*Asm, String)));
+}
+
+DIEValueList::value_iterator DwarfUnit::addLabel(DIEValueList &Die,
+ dwarf::Attribute Attribute,
+ dwarf::Form Form,
+ const MCSymbol *Label) {
+ return Die.addValue(DIEValueAllocator, Attribute, Form, DIELabel(Label));
+}
+
+void DwarfUnit::addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label) {
+ addLabel(Die, (dwarf::Attribute)0, Form, Label);
+}
+
+void DwarfUnit::addSectionOffset(DIE &Die, dwarf::Attribute Attribute,
+ uint64_t Integer) {
+ if (DD->getDwarfVersion() >= 4)
+ addUInt(Die, Attribute, dwarf::DW_FORM_sec_offset, Integer);
+ else
+ addUInt(Die, Attribute, dwarf::DW_FORM_data4, Integer);
+}
+
+unsigned DwarfTypeUnit::getOrCreateSourceID(StringRef FileName,
+ StringRef DirName) {
+ return SplitLineTable ? SplitLineTable->getFile(DirName, FileName)
+ : getCU().getOrCreateSourceID(FileName, DirName);
+}
+
+void DwarfUnit::addOpAddress(DIELoc &Die, const MCSymbol *Sym) {
+ if (!DD->useSplitDwarf()) {
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_addr);
+ addLabel(Die, dwarf::DW_FORM_udata, Sym);
+ } else {
+ addUInt(Die, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_addr_index);
+ addUInt(Die, dwarf::DW_FORM_GNU_addr_index,
+ DD->getAddressPool().getIndex(Sym));
+ }
+}
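+
+// Sketch of the two encodings above: without split DWARF the location
+// expression gains "DW_OP_addr <address of Sym>", while under Fission it
+// gains "DW_OP_GNU_addr_index <ULEB128 index>", where the index is Sym's slot
+// in the address pool that DwarfDebug later emits as the .debug_addr table.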
+
+void DwarfUnit::addLabelDelta(DIE &Die, dwarf::Attribute Attribute,
+ const MCSymbol *Hi, const MCSymbol *Lo) {
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_data4,
+ new (DIEValueAllocator) DIEDelta(Hi, Lo));
+}
+
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry) {
+ addDIEEntry(Die, Attribute, DIEEntry(Entry));
+}
+
+void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
+ // Flag the type unit reference as a declaration so that if it contains
+ // members (implicit special members, static data member definitions, member
+ // declarations for definitions in this CU, etc) consumers don't get confused
+ // and think this is a full definition.
+ addFlag(Die, dwarf::DW_AT_declaration);
+
+ Die.addValue(DIEValueAllocator, dwarf::DW_AT_signature,
+ dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));
+}
+
+void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
+ StringRef Identifier) {
+ uint64_t Signature = DD->makeTypeSignature(Identifier);
+ Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8,
+ DIEInteger(Signature));
+}
+
+void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
+ DIEEntry Entry) {
+ const DIE *DieCU = Die.getUnitOrNull();
+ const DIE *EntryCU = Entry.getEntry().getUnitOrNull();
+ if (!DieCU)
+ // We assume that Die belongs to this CU, if it is not linked to any CU yet.
+ DieCU = &getUnitDie();
+ if (!EntryCU)
+ EntryCU = &getUnitDie();
+ Die.addValue(DIEValueAllocator, Attribute,
+ EntryCU == DieCU ? dwarf::DW_FORM_ref4 : dwarf::DW_FORM_ref_addr,
+ Entry);
+}
+
+DIE &DwarfUnit::createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N) {
+ DIE &Die = Parent.addChild(DIE::get(DIEValueAllocator, (dwarf::Tag)Tag));
+ if (N)
+ insertDIE(N, &Die);
+ return Die;
+}
+
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Loc) {
+ Loc->ComputeSize(Asm);
+ DIELocs.push_back(Loc); // Memoize so we can call the destructor later on.
+ Die.addValue(DIEValueAllocator, Attribute,
+ Loc->BestForm(DD->getDwarfVersion()), Loc);
+}
+
+void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute,
+ DIEBlock *Block) {
+ Block->ComputeSize(Asm);
+ DIEBlocks.push_back(Block); // Memoize so we can call the destructor later on.
+ Die.addValue(DIEValueAllocator, Attribute, Block->BestForm(), Block);
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, StringRef File,
+ StringRef Directory) {
+ if (Line == 0)
+ return;
+
+ unsigned FileID = getOrCreateSourceID(File, Directory);
+ assert(FileID && "Invalid file id");
+ addUInt(Die, dwarf::DW_AT_decl_file, None, FileID);
+ addUInt(Die, dwarf::DW_AT_decl_line, None, Line);
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DILocalVariable *V) {
+ assert(V);
+
+ addSourceLine(Die, V->getLine(), V->getScope()->getFilename(),
+ V->getScope()->getDirectory());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIGlobalVariable *G) {
+ assert(G);
+
+ addSourceLine(Die, G->getLine(), G->getFilename(), G->getDirectory());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DISubprogram *SP) {
+ assert(SP);
+
+ addSourceLine(Die, SP->getLine(), SP->getFilename(), SP->getDirectory());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIType *Ty) {
+ assert(Ty);
+
+ addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DIObjCProperty *Ty) {
+ assert(Ty);
+
+ addSourceLine(Die, Ty->getLine(), Ty->getFilename(), Ty->getDirectory());
+}
+
+void DwarfUnit::addSourceLine(DIE &Die, const DINamespace *NS) {
+ addSourceLine(Die, NS->getLine(), NS->getFilename(), NS->getDirectory());
+}
+
+bool DwarfUnit::addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+ unsigned SizeInBits, unsigned OffsetInBits) {
+ DIEDwarfExpression Expr(*Asm, *this, TheDie);
+ Expr.AddMachineRegPiece(*Asm->MF->getSubtarget().getRegisterInfo(), Reg,
+ SizeInBits, OffsetInBits);
+ return true;
+}
+
+bool DwarfUnit::addRegisterOffset(DIELoc &TheDie, unsigned Reg,
+ int64_t Offset) {
+ DIEDwarfExpression Expr(*Asm, *this, TheDie);
+ return Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
+ Reg, Offset);
+}
+
+/* Byref variables, in Blocks, are declared by the programmer as "SomeType
+ VarName;", but the compiler creates a __Block_byref_x_VarName struct, and
+ gives the variable VarName either the struct, or a pointer to the struct, as
+ its type. This is necessary for various behind-the-scenes things the
+ compiler needs to do with by-reference variables in Blocks.
+
+ However, as far as the original *programmer* is concerned, the variable
+ should still have type 'SomeType', as originally declared.
+
+ The function getBlockByrefType dives into the __Block_byref_x_VarName
+ struct to find the original type of the variable, which is then assigned to
+ the variable's Debug Information Entry as its real type. So far, so good.
+ However now the debugger will expect the variable VarName to have the type
+ SomeType. So we need the location attribute for the variable to be an
+ expression that explains to the debugger how to navigate through the
+ pointers and struct to find the actual variable of type SomeType.
+
+ The following function does just that. We start by getting
+ the "normal" location for the variable. This will be the location
+ of either the struct __Block_byref_x_VarName or the pointer to the
+ struct __Block_byref_x_VarName.
+
+ The struct will look something like:
+
+ struct __Block_byref_x_VarName {
+ ... <various fields>
+ struct __Block_byref_x_VarName *forwarding;
+ ... <various other fields>
+ SomeType VarName;
+ ... <maybe more fields>
+ };
+
+ If we are given the struct directly (as our starting point) we
+ need to tell the debugger to:
+
+ 1). Add the offset of the forwarding field.
+
+ 2). Follow that pointer to get the real __Block_byref_x_VarName
+ struct to use (the real one may have been copied onto the heap).
+
+ 3). Add the offset for the field VarName, to find the actual variable.
+
+ If we started with a pointer to the struct, then we need to
+ dereference that pointer first, before the other steps.
+ Translating this into DWARF ops, we will need to append the following
+ to the current location description for the variable:
+
+ DW_OP_deref -- optional, if we start with a pointer
+ DW_OP_plus_uconst <forward_fld_offset>
+ DW_OP_deref
+ DW_OP_plus_uconst <varName_fld_offset>
+
+ That is what this function does. */
+
+void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location) {
+ const DIType *Ty = DV.getType();
+ const DIType *TmpTy = Ty;
+ uint16_t Tag = Ty->getTag();
+ bool isPointer = false;
+
+ StringRef varName = DV.getName();
+
+ if (Tag == dwarf::DW_TAG_pointer_type) {
+ auto *DTy = cast<DIDerivedType>(Ty);
+ TmpTy = resolve(DTy->getBaseType());
+ isPointer = true;
+ }
+
+ // Find the __forwarding field and the variable field in the __Block_byref
+ // struct.
+ DINodeArray Fields = cast<DICompositeType>(TmpTy)->getElements();
+ const DIDerivedType *varField = nullptr;
+ const DIDerivedType *forwardingField = nullptr;
+
+ for (unsigned i = 0, N = Fields.size(); i < N; ++i) {
+ auto *DT = cast<DIDerivedType>(Fields[i]);
+ StringRef fieldName = DT->getName();
+ if (fieldName == "__forwarding")
+ forwardingField = DT;
+ else if (fieldName == varName)
+ varField = DT;
+ }
+
+ // Get the offsets for the forwarding field and the variable field.
+ unsigned forwardingFieldOffset = forwardingField->getOffsetInBits() >> 3;
+ unsigned varFieldOffset = varField->getOffsetInBits() >> 2;
+
+ // Decode the original location, and use that as the start of the byref
+ // variable's location.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+
+ bool validReg;
+ if (Location.isReg())
+ validReg = addRegisterOpPiece(*Loc, Location.getReg());
+ else
+ validReg = addRegisterOffset(*Loc, Location.getReg(), Location.getOffset());
+
+ if (!validReg)
+ return;
+
+ // If we started with a pointer to the __Block_byref... struct, then
+ // the first thing we need to do is dereference the pointer (DW_OP_deref).
+ if (isPointer)
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+ // Next add the offset for the '__forwarding' field:
+ // DW_OP_plus_uconst ForwardingFieldOffset. Note there's no point in
+ // adding the offset if it's 0.
+ if (forwardingFieldOffset > 0) {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(*Loc, dwarf::DW_FORM_udata, forwardingFieldOffset);
+ }
+
+ // Now dereference the __forwarding field to get to the real __Block_byref
+ // struct: DW_OP_deref.
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+
+ // Now that we've got the real __Block_byref... struct, add the offset
+ // for the variable's field to get to the location of the actual variable:
+ // DW_OP_plus_uconst varFieldOffset. Again, don't add if it's 0.
+ if (varFieldOffset > 0) {
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(*Loc, dwarf::DW_FORM_udata, varFieldOffset);
+ }
+
+ // Now attach the location information to the DIE.
+ addBlock(Die, Attribute, Loc);
+}
+
+/// Return true if type encoding is unsigned.
+static bool isUnsignedDIType(DwarfDebug *DD, const DIType *Ty) {
+ if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ // FIXME: Enums without a fixed underlying type have unknown signedness
+ // here, leading to incorrectly emitted constants.
+ if (CTy->getTag() == dwarf::DW_TAG_enumeration_type)
+ return false;
+
+ // (Pieces of) aggregate types that get hacked apart by SROA may be
+ // represented by a constant. Encode them as unsigned bytes.
+ return true;
+ }
+
+ if (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ dwarf::Tag T = (dwarf::Tag)Ty->getTag();
+ // Encode pointer constants as unsigned bytes. This is used at least for
+ // null pointer constant emission.
+ // FIXME: reference and rvalue_reference /probably/ shouldn't be allowed
+ // here, but accept them for now due to a bug in SROA producing bogus
+ // dbg.values.
+ if (T == dwarf::DW_TAG_pointer_type ||
+ T == dwarf::DW_TAG_ptr_to_member_type ||
+ T == dwarf::DW_TAG_reference_type ||
+ T == dwarf::DW_TAG_rvalue_reference_type)
+ return true;
+ assert(T == dwarf::DW_TAG_typedef || T == dwarf::DW_TAG_const_type ||
+ T == dwarf::DW_TAG_volatile_type ||
+ T == dwarf::DW_TAG_restrict_type);
+ DITypeRef Deriv = DTy->getBaseType();
+ assert(Deriv && "Expected valid base type");
+ return isUnsignedDIType(DD, DD->resolve(Deriv));
+ }
+
+ auto *BTy = cast<DIBasicType>(Ty);
+ unsigned Encoding = BTy->getEncoding();
+ assert((Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char ||
+ Encoding == dwarf::DW_ATE_signed ||
+ Encoding == dwarf::DW_ATE_signed_char ||
+ Encoding == dwarf::DW_ATE_float || Encoding == dwarf::DW_ATE_UTF ||
+ Encoding == dwarf::DW_ATE_boolean ||
+ (Ty->getTag() == dwarf::DW_TAG_unspecified_type &&
+ Ty->getName() == "decltype(nullptr)")) &&
+ "Unsupported encoding");
+ return Encoding == dwarf::DW_ATE_unsigned ||
+ Encoding == dwarf::DW_ATE_unsigned_char ||
+ Encoding == dwarf::DW_ATE_UTF || Encoding == dwarf::DW_ATE_boolean ||
+ Ty->getTag() == dwarf::DW_TAG_unspecified_type;
+}
+
+void DwarfUnit::addConstantFPValue(DIE &Die, const MachineOperand &MO) {
+ assert(MO.isFPImm() && "Invalid machine operand!");
+ DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
+ APFloat FPImm = MO.getFPImm()->getValueAPF();
+
+ // Get the raw data form of the floating point.
+ const APInt FltVal = FPImm.bitcastToAPInt();
+ const char *FltPtr = (const char *)FltVal.getRawData();
+
+ int NumBytes = FltVal.getBitWidth() / 8; // 8 bits per byte.
+ bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+ int Incr = (LittleEndian ? 1 : -1);
+ int Start = (LittleEndian ? 0 : NumBytes - 1);
+ int Stop = (LittleEndian ? NumBytes : -1);
+
+ // Output the constant to DWARF one byte at a time.
+ for (; Start != Stop; Start += Incr)
+ addUInt(*Block, dwarf::DW_FORM_data1, (unsigned char)0xFF & FltPtr[Start]);
+
+ addBlock(Die, dwarf::DW_AT_const_value, Block);
+}
+
+void DwarfUnit::addConstantFPValue(DIE &Die, const ConstantFP *CFP) {
+ // Pass this down to addConstantValue as an unsigned bag of bits.
+ addConstantValue(Die, CFP->getValueAPF().bitcastToAPInt(), true);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const ConstantInt *CI,
+ const DIType *Ty) {
+ addConstantValue(Die, CI->getValue(), Ty);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const MachineOperand &MO,
+ const DIType *Ty) {
+ assert(MO.isImm() && "Invalid machine operand!");
+
+ addConstantValue(Die, isUnsignedDIType(DD, Ty), MO.getImm());
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, bool Unsigned, uint64_t Val) {
+ // FIXME: This is a bit conservative/simple - it always emits negative values
+ // sign-extended to 64 bits rather than minimizing the number of bytes.
+ addUInt(Die, dwarf::DW_AT_const_value,
+ Unsigned ? dwarf::DW_FORM_udata : dwarf::DW_FORM_sdata, Val);
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty) {
+ addConstantValue(Die, Val, isUnsignedDIType(DD, Ty));
+}
+
+void DwarfUnit::addConstantValue(DIE &Die, const APInt &Val, bool Unsigned) {
+ unsigned CIBitWidth = Val.getBitWidth();
+ if (CIBitWidth <= 64) {
+ addConstantValue(Die, Unsigned,
+ Unsigned ? Val.getZExtValue() : Val.getSExtValue());
+ return;
+ }
+
+ DIEBlock *Block = new (DIEValueAllocator) DIEBlock;
+
+ // Get the raw data form of the large APInt.
+ const uint64_t *Ptr64 = Val.getRawData();
+
+ int NumBytes = Val.getBitWidth() / 8; // 8 bits per byte.
+ bool LittleEndian = Asm->getDataLayout().isLittleEndian();
+
+ // Output the constant to DWARF one byte at a time.
+ for (int i = 0; i < NumBytes; i++) {
+ uint8_t c;
+ if (LittleEndian)
+ c = Ptr64[i / 8] >> (8 * (i & 7));
+ else
+ c = Ptr64[(NumBytes - 1 - i) / 8] >> (8 * ((NumBytes - 1 - i) & 7));
+ addUInt(*Block, dwarf::DW_FORM_data1, c);
+ }
+
+ addBlock(Die, dwarf::DW_AT_const_value, Block);
+}
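+
+// Worked example (hypothetical value): for a 128-bit APInt whose low 64-bit
+// word is 0x0807060504030201 and whose high word is 0x100F0E0D0C0B0A09, a
+// little-endian target emits the DW_FORM_data1 bytes 01 02 ... 10 in that
+// order, while a big-endian target emits them reversed, 10 0F ... 01.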
+
+void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) {
+ if (!LinkageName.empty())
+ addString(Die,
+ DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name
+ : dwarf::DW_AT_MIPS_linkage_name,
+ GlobalValue::getRealLinkageName(LinkageName));
+}
+
+void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) {
+ // Add template parameters.
+ for (const auto *Element : TParams) {
+ if (auto *TTP = dyn_cast<DITemplateTypeParameter>(Element))
+ constructTemplateTypeParameterDIE(Buffer, TTP);
+ else if (auto *TVP = dyn_cast<DITemplateValueParameter>(Element))
+ constructTemplateValueParameterDIE(Buffer, TVP);
+ }
+}
+
+DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
+ if (!Context || isa<DIFile>(Context))
+ return &getUnitDie();
+ if (auto *T = dyn_cast<DIType>(Context))
+ return getOrCreateTypeDIE(T);
+ if (auto *NS = dyn_cast<DINamespace>(Context))
+ return getOrCreateNameSpace(NS);
+ if (auto *SP = dyn_cast<DISubprogram>(Context))
+ return getOrCreateSubprogramDIE(SP);
+ if (auto *M = dyn_cast<DIModule>(Context))
+ return getOrCreateModule(M);
+ return getDIE(Context);
+}
+
+DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
+ auto *Context = resolve(Ty->getScope());
+ DIE *ContextDIE = getOrCreateContextDIE(Context);
+
+ if (DIE *TyDIE = getDIE(Ty))
+ return TyDIE;
+
+ // Create new type.
+ DIE &TyDIE = createAndAddDIE(Ty->getTag(), *ContextDIE, Ty);
+
+ constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));
+
+ if (!Ty->isExternalTypeRef())
+ updateAcceleratorTables(Context, Ty, TyDIE);
+ return &TyDIE;
+}
+
+DIE *DwarfUnit::getOrCreateTypeDIE(const MDNode *TyNode) {
+ if (!TyNode)
+ return nullptr;
+
+ auto *Ty = cast<DIType>(TyNode);
+
+ // DW_TAG_restrict_type is not supported in DWARF2
+ if (Ty->getTag() == dwarf::DW_TAG_restrict_type && DD->getDwarfVersion() <= 2)
+ return getOrCreateTypeDIE(resolve(cast<DIDerivedType>(Ty)->getBaseType()));
+
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ auto *Context = resolve(Ty->getScope());
+ DIE *ContextDIE = getOrCreateContextDIE(Context);
+ assert(ContextDIE);
+
+ if (DIE *TyDIE = getDIE(Ty))
+ return TyDIE;
+
+ // Create new type.
+ DIE &TyDIE = createAndAddDIE(Ty->getTag(), *ContextDIE, Ty);
+
+ updateAcceleratorTables(Context, Ty, TyDIE);
+
+ if (auto *BT = dyn_cast<DIBasicType>(Ty))
+ constructTypeDIE(TyDIE, BT);
+ else if (auto *STy = dyn_cast<DISubroutineType>(Ty))
+ constructTypeDIE(TyDIE, STy);
+ else if (auto *CTy = dyn_cast<DICompositeType>(Ty)) {
+ if (GenerateDwarfTypeUnits && !Ty->isForwardDecl())
+ if (MDString *TypeId = CTy->getRawIdentifier()) {
+ DD->addDwarfTypeUnitType(getCU(), TypeId->getString(), TyDIE, CTy);
+ // Skip updating the accelerator tables since this is not the full type.
+ return &TyDIE;
+ }
+ constructTypeDIE(TyDIE, CTy);
+ } else {
+ constructTypeDIE(TyDIE, cast<DIDerivedType>(Ty));
+ }
+
+ return &TyDIE;
+}
+
+void DwarfUnit::updateAcceleratorTables(const DIScope *Context,
+ const DIType *Ty, const DIE &TyDIE) {
+ if (!Ty->getName().empty() && !Ty->isForwardDecl()) {
+ bool IsImplementation = false;
+ if (auto *CT = dyn_cast<DICompositeType>(Ty)) {
+ // A runtime language of 0 actually means C/C++, while any non-zero value
+ // indicates some version of Objective-C/C++.
+ IsImplementation = CT->getRuntimeLang() == 0 || CT->isObjcClassComplete();
+ }
+ unsigned Flags = IsImplementation ? dwarf::DW_FLAG_type_implementation : 0;
+ DD->addAccelType(Ty->getName(), TyDIE, Flags);
+
+ if (!Context || isa<DICompileUnit>(Context) || isa<DIFile>(Context) ||
+ isa<DINamespace>(Context))
+ addGlobalType(Ty, TyDIE, Context);
+ }
+}
+
+void DwarfUnit::addType(DIE &Entity, const DIType *Ty,
+ dwarf::Attribute Attribute) {
+ assert(Ty && "Trying to add a type that doesn't exist?");
+ addDIEEntry(Entity, Attribute, DIEEntry(*getOrCreateTypeDIE(Ty)));
+}
+
+std::string DwarfUnit::getParentContextString(const DIScope *Context) const {
+ if (!Context)
+ return "";
+
+ // FIXME: Decide whether to implement this for non-C++ languages.
+ if (getLanguage() != dwarf::DW_LANG_C_plus_plus)
+ return "";
+
+ std::string CS;
+ SmallVector<const DIScope *, 1> Parents;
+ while (!isa<DICompileUnit>(Context)) {
+ Parents.push_back(Context);
+ if (Context->getScope())
+ Context = resolve(Context->getScope());
+ else
+ // Structure, etc types will have a NULL context if they're at the top
+ // level.
+ break;
+ }
+
+ // Reverse iterate over our list to go from the outermost construct to the
+ // innermost.
+ for (const DIScope *Ctx : make_range(Parents.rbegin(), Parents.rend())) {
+ StringRef Name = Ctx->getName();
+ if (Name.empty() && isa<DINamespace>(Ctx))
+ Name = "(anonymous namespace)";
+ if (!Name.empty()) {
+ CS += Name;
+ CS += "::";
+ }
+ }
+ return CS;
+}
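+
+// Example result (hypothetical C++ input): for a type nested as
+// "namespace A { struct B { struct C; }; }", the context string returned for
+// C is "A::B::"; an unnamed enclosing namespace contributes
+// "(anonymous namespace)::" instead, and non-C++ CUs always get "".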
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIBasicType *BTy) {
+ // Get core information.
+ StringRef Name = BTy->getName();
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ // An unspecified type only has a name attribute.
+ if (BTy->getTag() == dwarf::DW_TAG_unspecified_type)
+ return;
+
+ addUInt(Buffer, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ BTy->getEncoding());
+
+ uint64_t Size = BTy->getSizeInBits() >> 3;
+ addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
+ // Get core information.
+ StringRef Name = DTy->getName();
+ uint64_t Size = DTy->getSizeInBits() >> 3;
+ uint16_t Tag = Buffer.getTag();
+
+ // Map to main type, void will not have a type.
+ const DIType *FromTy = resolve(DTy->getBaseType());
+ if (FromTy)
+ addType(Buffer, FromTy);
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ // Add size if non-zero (derived types might be zero-sized.)
+ if (Size && Tag != dwarf::DW_TAG_pointer_type
+ && Tag != dwarf::DW_TAG_ptr_to_member_type
+ && Tag != dwarf::DW_TAG_reference_type
+ && Tag != dwarf::DW_TAG_rvalue_reference_type)
+ addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
+
+ if (Tag == dwarf::DW_TAG_ptr_to_member_type)
+ addDIEEntry(
+ Buffer, dwarf::DW_AT_containing_type,
+ *getOrCreateTypeDIE(resolve(cast<DIDerivedType>(DTy)->getClassType())));
+ // Add source line info if available and TyDesc is not a forward declaration.
+ if (!DTy->isForwardDecl())
+ addSourceLine(Buffer, DTy);
+}
+
+void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
+ for (unsigned i = 1, N = Args.size(); i < N; ++i) {
+ const DIType *Ty = resolve(Args[i]);
+ if (!Ty) {
+ assert(i == N-1 && "Unspecified parameter must be the last argument");
+ createAndAddDIE(dwarf::DW_TAG_unspecified_parameters, Buffer);
+ } else {
+ DIE &Arg = createAndAddDIE(dwarf::DW_TAG_formal_parameter, Buffer);
+ addType(Arg, Ty);
+ if (Ty->isArtificial())
+ addFlag(Arg, dwarf::DW_AT_artificial);
+ }
+ }
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
+ // Add return type. A void return won't have a type.
+ auto Elements = cast<DISubroutineType>(CTy)->getTypeArray();
+ if (Elements.size())
+ if (auto RTy = resolve(Elements[0]))
+ addType(Buffer, RTy);
+
+ bool isPrototyped = true;
+ if (Elements.size() == 2 && !Elements[1])
+ isPrototyped = false;
+
+ constructSubprogramArguments(Buffer, Elements);
+
+ // Add prototype flag if we're dealing with a C language and the function has
+ // been prototyped.
+ uint16_t Language = getLanguage();
+ if (isPrototyped &&
+ (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
+ Language == dwarf::DW_LANG_ObjC))
+ addFlag(Buffer, dwarf::DW_AT_prototyped);
+
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CTy->getCC() && CTy->getCC() != dwarf::DW_CC_normal)
+ addUInt(Buffer, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1,
+ CTy->getCC());
+
+ if (CTy->isLValueReference())
+ addFlag(Buffer, dwarf::DW_AT_reference);
+
+ if (CTy->isRValueReference())
+ addFlag(Buffer, dwarf::DW_AT_rvalue_reference);
+}
+
+void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ if (CTy->isExternalTypeRef()) {
+ StringRef Identifier = CTy->getIdentifier();
+ assert(!Identifier.empty() && "external type ref without identifier");
+ addFlag(Buffer, dwarf::DW_AT_declaration);
+ return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier);
+ }
+
+ // Add name if not anonymous or intermediate type.
+ StringRef Name = CTy->getName();
+
+ uint64_t Size = CTy->getSizeInBits() >> 3;
+ uint16_t Tag = Buffer.getTag();
+
+ switch (Tag) {
+ case dwarf::DW_TAG_array_type:
+ constructArrayTypeDIE(Buffer, CTy);
+ break;
+ case dwarf::DW_TAG_enumeration_type:
+ constructEnumTypeDIE(Buffer, CTy);
+ break;
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_class_type: {
+ // Add elements to structure type.
+ DINodeArray Elements = CTy->getElements();
+ for (const auto *Element : Elements) {
+ if (!Element)
+ continue;
+ if (auto *SP = dyn_cast<DISubprogram>(Element))
+ getOrCreateSubprogramDIE(SP);
+ else if (auto *DDTy = dyn_cast<DIDerivedType>(Element)) {
+ if (DDTy->getTag() == dwarf::DW_TAG_friend) {
+ DIE &ElemDie = createAndAddDIE(dwarf::DW_TAG_friend, Buffer);
+ addType(ElemDie, resolve(DDTy->getBaseType()), dwarf::DW_AT_friend);
+ } else if (DDTy->isStaticMember()) {
+ getOrCreateStaticMemberDIE(DDTy);
+ } else {
+ constructMemberDIE(Buffer, DDTy);
+ }
+ } else if (auto *Property = dyn_cast<DIObjCProperty>(Element)) {
+ DIE &ElemDie = createAndAddDIE(Property->getTag(), Buffer);
+ StringRef PropertyName = Property->getName();
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_name, PropertyName);
+ if (Property->getType())
+ addType(ElemDie, resolve(Property->getType()));
+ addSourceLine(ElemDie, Property);
+ StringRef GetterName = Property->getGetterName();
+ if (!GetterName.empty())
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_getter, GetterName);
+ StringRef SetterName = Property->getSetterName();
+ if (!SetterName.empty())
+ addString(ElemDie, dwarf::DW_AT_APPLE_property_setter, SetterName);
+ if (unsigned PropertyAttributes = Property->getAttributes())
+ addUInt(ElemDie, dwarf::DW_AT_APPLE_property_attribute, None,
+ PropertyAttributes);
+ }
+ }
+
+ if (CTy->isAppleBlockExtension())
+ addFlag(Buffer, dwarf::DW_AT_APPLE_block);
+
+ // This is outside the DWARF spec, but GDB expects a DW_AT_containing_type
+ // inside C++ composite types to point to the base class with the vtable.
+ if (auto *ContainingType =
+ dyn_cast_or_null<DICompositeType>(resolve(CTy->getVTableHolder())))
+ addDIEEntry(Buffer, dwarf::DW_AT_containing_type,
+ *getOrCreateTypeDIE(ContainingType));
+
+ if (CTy->isObjcClassComplete())
+ addFlag(Buffer, dwarf::DW_AT_APPLE_objc_complete_type);
+
+ // Add template parameters to class, structure, or union types.
+ // FIXME: The support isn't in the metadata for this yet.
+ if (Tag == dwarf::DW_TAG_class_type ||
+ Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)
+ addTemplateParams(Buffer, CTy->getTemplateParams());
+
+ break;
+ }
+ default:
+ break;
+ }
+
+ // Add name if not anonymous or intermediate type.
+ if (!Name.empty())
+ addString(Buffer, dwarf::DW_AT_name, Name);
+
+ if (Tag == dwarf::DW_TAG_enumeration_type ||
+ Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type ||
+ Tag == dwarf::DW_TAG_union_type) {
+ // Add size if non-zero (derived types might be zero-sized.)
+ // TODO: Do we care about size for enum forward declarations?
+ if (Size)
+ addUInt(Buffer, dwarf::DW_AT_byte_size, None, Size);
+ else if (!CTy->isForwardDecl())
+ // Add zero size if it is not a forward declaration.
+ addUInt(Buffer, dwarf::DW_AT_byte_size, None, 0);
+
+ // If we're a forward decl, say so.
+ if (CTy->isForwardDecl())
+ addFlag(Buffer, dwarf::DW_AT_declaration);
+
+ // Add source line info if available.
+ if (!CTy->isForwardDecl())
+ addSourceLine(Buffer, CTy);
+
+ // No harm in adding the runtime language to the declaration.
+ unsigned RLang = CTy->getRuntimeLang();
+ if (RLang)
+ addUInt(Buffer, dwarf::DW_AT_APPLE_runtime_class, dwarf::DW_FORM_data1,
+ RLang);
+ }
+}
+
+void DwarfUnit::constructTemplateTypeParameterDIE(
+ DIE &Buffer, const DITemplateTypeParameter *TP) {
+ DIE &ParamDIE =
+ createAndAddDIE(dwarf::DW_TAG_template_type_parameter, Buffer);
+ // Add the type if it exists, it could be void and therefore no type.
+ if (TP->getType())
+ addType(ParamDIE, resolve(TP->getType()));
+ if (!TP->getName().empty())
+ addString(ParamDIE, dwarf::DW_AT_name, TP->getName());
+}
+
+void DwarfUnit::constructTemplateValueParameterDIE(
+ DIE &Buffer, const DITemplateValueParameter *VP) {
+ DIE &ParamDIE = createAndAddDIE(VP->getTag(), Buffer);
+
+ // Add the type if there is one, template template and template parameter
+ // packs will not have a type.
+ if (VP->getTag() == dwarf::DW_TAG_template_value_parameter)
+ addType(ParamDIE, resolve(VP->getType()));
+ if (!VP->getName().empty())
+ addString(ParamDIE, dwarf::DW_AT_name, VP->getName());
+ if (Metadata *Val = VP->getValue()) {
+ if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
+ addConstantValue(ParamDIE, CI, resolve(VP->getType()));
+ else if (GlobalValue *GV = mdconst::dyn_extract<GlobalValue>(Val)) {
+ // We cannot describe the location of dllimport'd entities: the
+ // computation of their address requires loads from the IAT.
+ if (!GV->hasDLLImportStorageClass()) {
+ // For non-type template parameters that refer to declarations (such as
+ // global values and functions), describe the location by address.
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addOpAddress(*Loc, Asm->getSymbol(GV));
+ // Emit DW_OP_stack_value to use the address as the immediate value of
+ // the parameter, rather than a pointer to it.
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(ParamDIE, dwarf::DW_AT_location, Loc);
+ }
+ } else if (VP->getTag() == dwarf::DW_TAG_GNU_template_template_param) {
+ assert(isa<MDString>(Val));
+ addString(ParamDIE, dwarf::DW_AT_GNU_template_name,
+ cast<MDString>(Val)->getString());
+ } else if (VP->getTag() == dwarf::DW_TAG_GNU_template_parameter_pack) {
+ addTemplateParams(ParamDIE, cast<MDTuple>(Val));
+ }
+ }
+}
+
+DIE *DwarfUnit::getOrCreateNameSpace(const DINamespace *NS) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(NS->getScope());
+
+ if (DIE *NDie = getDIE(NS))
+ return NDie;
+ DIE &NDie = createAndAddDIE(dwarf::DW_TAG_namespace, *ContextDIE, NS);
+
+ StringRef Name = NS->getName();
+ if (!Name.empty())
+ addString(NDie, dwarf::DW_AT_name, NS->getName());
+ else
+ Name = "(anonymous namespace)";
+ DD->addAccelNamespace(Name, NDie);
+ addGlobalName(Name, NDie, NS->getScope());
+ addSourceLine(NDie, NS);
+ return &NDie;
+}
+
+DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(M->getScope());
+
+ if (DIE *MDie = getDIE(M))
+ return MDie;
+ DIE &MDie = createAndAddDIE(dwarf::DW_TAG_module, *ContextDIE, M);
+
+ if (!M->getName().empty()) {
+ addString(MDie, dwarf::DW_AT_name, M->getName());
+ addGlobalName(M->getName(), MDie, M->getScope());
+ }
+ if (!M->getConfigurationMacros().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_config_macros,
+ M->getConfigurationMacros());
+ if (!M->getIncludePath().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_include_path, M->getIncludePath());
+ if (!M->getISysRoot().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_isysroot, M->getISysRoot());
+
+ return &MDie;
+}
+
+DIE *DwarfUnit::getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal) {
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE (as is the case for member function
+ // declarations).
+ DIE *ContextDIE =
+ Minimal ? &getUnitDie() : getOrCreateContextDIE(resolve(SP->getScope()));
+
+ if (DIE *SPDie = getDIE(SP))
+ return SPDie;
+
+ if (auto *SPDecl = SP->getDeclaration()) {
+ if (!Minimal) {
+ // Add subprogram definitions to the CU die directly.
+ ContextDIE = &getUnitDie();
+ // Build the decl now to ensure it precedes the definition.
+ getOrCreateSubprogramDIE(SPDecl);
+ }
+ }
+
+ // DW_TAG_inlined_subroutine may refer to this DIE.
+ DIE &SPDie = createAndAddDIE(dwarf::DW_TAG_subprogram, *ContextDIE, SP);
+
+ // Stop here and fill this in later, depending on whether or not this
+ // subprogram turns out to have inlined instances.
+ if (SP->isDefinition())
+ return &SPDie;
+
+ applySubprogramAttributes(SP, SPDie);
+ return &SPDie;
+}
+
+bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
+ DIE &SPDie) {
+ DIE *DeclDie = nullptr;
+ StringRef DeclLinkageName;
+ if (auto *SPDecl = SP->getDeclaration()) {
+ DeclDie = getDIE(SPDecl);
+ assert(DeclDie && "This DIE should've already been constructed when the "
+ "definition DIE was created in "
+ "getOrCreateSubprogramDIE");
+ DeclLinkageName = SPDecl->getLinkageName();
+ unsigned DeclID =
+ getOrCreateSourceID(SPDecl->getFilename(), SPDecl->getDirectory());
+ unsigned DefID = getOrCreateSourceID(SP->getFilename(), SP->getDirectory());
+ if (DeclID != DefID)
+ addUInt(SPDie, dwarf::DW_AT_decl_file, None, DefID);
+
+ if (SP->getLine() != SPDecl->getLine())
+ addUInt(SPDie, dwarf::DW_AT_decl_line, None, SP->getLine());
+ }
+
+ // Add function template parameters.
+ addTemplateParams(SPDie, SP->getTemplateParams());
+
+ // Add the linkage name if we have one and it isn't in the Decl.
+ StringRef LinkageName = SP->getLinkageName();
+ assert(((LinkageName.empty() || DeclLinkageName.empty()) ||
+ LinkageName == DeclLinkageName) &&
+ "decl has a linkage name and it is different");
+ if (DeclLinkageName.empty() &&
+ // Always emit it for abstract subprograms.
+ (DD->useAllLinkageNames() || DU->getAbstractSPDies().lookup(SP)))
+ addLinkageName(SPDie, LinkageName);
+
+ if (!DeclDie)
+ return false;
+
+ // Refer to the function declaration where all the other attributes will be
+ // found.
+ addDIEEntry(SPDie, dwarf::DW_AT_specification, *DeclDie);
+ return true;
+}
+
+void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
+ bool Minimal) {
+ if (!Minimal)
+ if (applySubprogramDefinitionAttributes(SP, SPDie))
+ return;
+
+ // Constructors and operators for anonymous aggregates do not have names.
+ if (!SP->getName().empty())
+ addString(SPDie, dwarf::DW_AT_name, SP->getName());
+
+ // Skip the rest of the attributes under -gmlt to save space.
+ if (Minimal)
+ return;
+
+ addSourceLine(SPDie, SP);
+
+ // Add the prototype if we have a prototype and we have a C like
+ // language.
+ uint16_t Language = getLanguage();
+ if (SP->isPrototyped() &&
+ (Language == dwarf::DW_LANG_C89 || Language == dwarf::DW_LANG_C99 ||
+ Language == dwarf::DW_LANG_ObjC))
+ addFlag(SPDie, dwarf::DW_AT_prototyped);
+
+ unsigned CC = 0;
+ DITypeRefArray Args;
+ if (const DISubroutineType *SPTy = SP->getType()) {
+ Args = SPTy->getTypeArray();
+ CC = SPTy->getCC();
+ }
+
+ // Add a DW_AT_calling_convention if this has an explicit convention.
+ if (CC && CC != dwarf::DW_CC_normal)
+ addUInt(SPDie, dwarf::DW_AT_calling_convention, dwarf::DW_FORM_data1, CC);
+
+ // Add a return type. If this is a type like a C/C++ void type we don't add a
+ // return type.
+ if (Args.size())
+ if (auto Ty = resolve(Args[0]))
+ addType(SPDie, Ty);
+
+ unsigned VK = SP->getVirtuality();
+ if (VK) {
+ addUInt(SPDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1, VK);
+ if (SP->getVirtualIndex() != -1u) {
+ DIELoc *Block = getDIELoc();
+ addUInt(*Block, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(*Block, dwarf::DW_FORM_udata, SP->getVirtualIndex());
+ addBlock(SPDie, dwarf::DW_AT_vtable_elem_location, Block);
+ }
+ ContainingTypeMap.insert(
+ std::make_pair(&SPDie, resolve(SP->getContainingType())));
+ }
+
+ if (!SP->isDefinition()) {
+ addFlag(SPDie, dwarf::DW_AT_declaration);
+
+ // Add arguments. Do not add arguments for subprogram definition. They will
+ // be handled while processing variables.
+ constructSubprogramArguments(SPDie, Args);
+ }
+
+ if (SP->isArtificial())
+ addFlag(SPDie, dwarf::DW_AT_artificial);
+
+ if (!SP->isLocalToUnit())
+ addFlag(SPDie, dwarf::DW_AT_external);
+
+ if (DD->useAppleExtensionAttributes()) {
+ if (SP->isOptimized())
+ addFlag(SPDie, dwarf::DW_AT_APPLE_optimized);
+
+ if (unsigned isa = Asm->getISAEncoding())
+ addUInt(SPDie, dwarf::DW_AT_APPLE_isa, dwarf::DW_FORM_flag, isa);
+ }
+
+ if (SP->isLValueReference())
+ addFlag(SPDie, dwarf::DW_AT_reference);
+
+ if (SP->isRValueReference())
+ addFlag(SPDie, dwarf::DW_AT_rvalue_reference);
+
+ if (SP->isProtected())
+ addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_protected);
+ else if (SP->isPrivate())
+ addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_private);
+ else if (SP->isPublic())
+ addUInt(SPDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_public);
+
+ if (SP->isExplicit())
+ addFlag(SPDie, dwarf::DW_AT_explicit);
+}
+
+void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
+ DIE *IndexTy) {
+ DIE &DW_Subrange = createAndAddDIE(dwarf::DW_TAG_subrange_type, Buffer);
+ addDIEEntry(DW_Subrange, dwarf::DW_AT_type, *IndexTy);
+
+ // The LowerBound value defines the lower bound, which is typically zero for
+ // C/C++. The Count value is the number of elements. Both values are 64 bit.
+ // If Count == -1 then the array is unbounded and we do not emit
+ // DW_AT_lower_bound and DW_AT_count attributes.
+ int64_t LowerBound = SR->getLowerBound();
+ int64_t DefaultLowerBound = getDefaultLowerBound();
+ int64_t Count = SR->getCount();
+
+ if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
+ addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
+
+ if (Count != -1)
+ // FIXME: An unbounded array should reference the expression that defines
+ // the array.
+ addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
+}
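+
+// Example (C source): "int a[5]" yields a subrange with Count == 5 and
+// LowerBound == 0; since 0 matches the C default lower bound, only
+// DW_AT_count is emitted. A declaration such as "extern int b[];" has
+// Count == -1 and gets neither attribute.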
+
+DIE *DwarfUnit::getIndexTyDie() {
+ if (IndexTyDie)
+ return IndexTyDie;
+ // Construct an integer type to use for indexes.
+ IndexTyDie = &createAndAddDIE(dwarf::DW_TAG_base_type, UnitDie);
+ addString(*IndexTyDie, dwarf::DW_AT_name, "sizetype");
+ addUInt(*IndexTyDie, dwarf::DW_AT_byte_size, None, sizeof(int64_t));
+ addUInt(*IndexTyDie, dwarf::DW_AT_encoding, dwarf::DW_FORM_data1,
+ dwarf::DW_ATE_unsigned);
+ return IndexTyDie;
+}
+
+void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ if (CTy->isVector())
+ addFlag(Buffer, dwarf::DW_AT_GNU_vector);
+
+ // Emit the element type.
+ addType(Buffer, resolve(CTy->getBaseType()));
+
+ // Get an anonymous type for index type.
+ // FIXME: This type should be passed down from the front end
+ // as different languages may have different sizes for indexes.
+ DIE *IdxTy = getIndexTyDie();
+
+ // Add subranges to array type.
+ DINodeArray Elements = CTy->getElements();
+ for (unsigned i = 0, N = Elements.size(); i < N; ++i) {
+ // FIXME: Should this really be such a loose cast?
+ if (auto *Element = dyn_cast_or_null<DINode>(Elements[i]))
+ if (Element->getTag() == dwarf::DW_TAG_subrange_type)
+ constructSubrangeDIE(Buffer, cast<DISubrange>(Element), IdxTy);
+ }
+}
+
+void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
+ DINodeArray Elements = CTy->getElements();
+
+ // Add enumerators to enumeration type.
+ for (unsigned i = 0, N = Elements.size(); i < N; ++i) {
+ auto *Enum = dyn_cast_or_null<DIEnumerator>(Elements[i]);
+ if (Enum) {
+ DIE &Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
+ StringRef Name = Enum->getName();
+ addString(Enumerator, dwarf::DW_AT_name, Name);
+ int64_t Value = Enum->getValue();
+ addSInt(Enumerator, dwarf::DW_AT_const_value, dwarf::DW_FORM_sdata,
+ Value);
+ }
+ }
+ const DIType *DTy = resolve(CTy->getBaseType());
+ if (DTy) {
+ addType(Buffer, DTy);
+ addFlag(Buffer, dwarf::DW_AT_enum_class);
+ }
+}
+
+void DwarfUnit::constructContainingTypeDIEs() {
+ for (auto CI = ContainingTypeMap.begin(), CE = ContainingTypeMap.end();
+ CI != CE; ++CI) {
+ DIE &SPDie = *CI->first;
+ const DINode *D = CI->second;
+ if (!D)
+ continue;
+ DIE *NDie = getDIE(D);
+ if (!NDie)
+ continue;
+ addDIEEntry(SPDie, dwarf::DW_AT_containing_type, *NDie);
+ }
+}
+
+void DwarfUnit::constructMemberDIE(DIE &Buffer, const DIDerivedType *DT) {
+ DIE &MemberDie = createAndAddDIE(DT->getTag(), Buffer);
+ StringRef Name = DT->getName();
+ if (!Name.empty())
+ addString(MemberDie, dwarf::DW_AT_name, Name);
+
+ addType(MemberDie, resolve(DT->getBaseType()));
+
+ addSourceLine(MemberDie, DT);
+
+ if (DT->getTag() == dwarf::DW_TAG_inheritance && DT->isVirtual()) {
+
+ // For C++, virtual base classes are not at a fixed offset. Use the following
+ // expression to extract the appropriate offset from the vtable:
+ // BaseAddr = ObAddr + *((*ObAddr) - Offset)
+
+ DIELoc *VBaseLocationDie = new (DIEValueAllocator) DIELoc;
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_dup);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_constu);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_udata, DT->getOffsetInBits());
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_minus);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_deref);
+ addUInt(*VBaseLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus);
+
+ addBlock(MemberDie, dwarf::DW_AT_data_member_location, VBaseLocationDie);
+ } else {
+ uint64_t Size = DT->getSizeInBits();
+ uint64_t FieldSize = DD->getBaseTypeSize(DT);
+ uint64_t OffsetInBytes;
+
+ bool IsBitfield = FieldSize && Size != FieldSize;
+ if (IsBitfield) {
+ // Handle bitfield, assume bytes are 8 bits.
+ if (DD->useDWARF2Bitfields())
+ addUInt(MemberDie, dwarf::DW_AT_byte_size, None, FieldSize/8);
+ addUInt(MemberDie, dwarf::DW_AT_bit_size, None, Size);
+
+ uint64_t Offset = DT->getOffsetInBits();
+ uint64_t Align = DT->getAlignInBits() ? DT->getAlignInBits() : FieldSize;
+ uint64_t AlignMask = ~(Align - 1);
+ // The bits from the start of the storage unit to the start of the field.
+ uint64_t StartBitOffset = Offset - (Offset & AlignMask);
+ // The byte offset of the field's aligned storage unit inside the struct.
+ OffsetInBytes = (Offset - StartBitOffset) / 8;
+
+ if (DD->useDWARF2Bitfields()) {
+ uint64_t HiMark = (Offset + FieldSize) & AlignMask;
+ uint64_t FieldOffset = (HiMark - FieldSize);
+ Offset -= FieldOffset;
+
+ // Maybe we need to work from the other end.
+ if (Asm->getDataLayout().isLittleEndian())
+ Offset = FieldSize - (Offset + Size);
+
+ addUInt(MemberDie, dwarf::DW_AT_bit_offset, None, Offset);
+ OffsetInBytes = FieldOffset >> 3;
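+ // As a worked example of the DWARF 2 calculation above (little-endian
+ // target): a 5-bit field at bit offset 35 in a 32-bit storage unit gives
+ // HiMark = 64, FieldOffset = 32, DW_AT_bit_offset = 32 - (3 + 5) = 24,
+ // and OffsetInBytes = 4.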
+ } else {
+ addUInt(MemberDie, dwarf::DW_AT_data_bit_offset, None, Offset);
+ }
+ } else {
+ // This is not a bitfield.
+ OffsetInBytes = DT->getOffsetInBits() / 8;
+ }
+
+ if (DD->getDwarfVersion() <= 2) {
+ DIELoc *MemLocationDie = new (DIEValueAllocator) DIELoc;
+ addUInt(*MemLocationDie, dwarf::DW_FORM_data1, dwarf::DW_OP_plus_uconst);
+ addUInt(*MemLocationDie, dwarf::DW_FORM_udata, OffsetInBytes);
+ addBlock(MemberDie, dwarf::DW_AT_data_member_location, MemLocationDie);
+ } else if (!IsBitfield || DD->useDWARF2Bitfields())
+ addUInt(MemberDie, dwarf::DW_AT_data_member_location, None,
+ OffsetInBytes);
+ }
+
+ if (DT->isProtected())
+ addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_protected);
+ else if (DT->isPrivate())
+ addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_private);
+ // Otherwise C++ member and base classes are considered public.
+ else if (DT->isPublic())
+ addUInt(MemberDie, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_public);
+ if (DT->isVirtual())
+ addUInt(MemberDie, dwarf::DW_AT_virtuality, dwarf::DW_FORM_data1,
+ dwarf::DW_VIRTUALITY_virtual);
+
+ // Objective-C properties.
+ if (DINode *PNode = DT->getObjCProperty())
+ if (DIE *PDie = getDIE(PNode))
+ MemberDie.addValue(DIEValueAllocator, dwarf::DW_AT_APPLE_property,
+ dwarf::DW_FORM_ref4, DIEEntry(*PDie));
+
+ if (DT->isArtificial())
+ addFlag(MemberDie, dwarf::DW_AT_artificial);
+}
+
+DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
+ if (!DT)
+ return nullptr;
+
+ // Construct the context before querying for the existence of the DIE in case
+ // such construction creates the DIE.
+ DIE *ContextDIE = getOrCreateContextDIE(resolve(DT->getScope()));
+ assert(dwarf::isType(ContextDIE->getTag()) &&
+ "Static member should belong to a type.");
+
+ if (DIE *StaticMemberDIE = getDIE(DT))
+ return StaticMemberDIE;
+
+ DIE &StaticMemberDIE = createAndAddDIE(DT->getTag(), *ContextDIE, DT);
+
+ const DIType *Ty = resolve(DT->getBaseType());
+
+ addString(StaticMemberDIE, dwarf::DW_AT_name, DT->getName());
+ addType(StaticMemberDIE, Ty);
+ addSourceLine(StaticMemberDIE, DT);
+ addFlag(StaticMemberDIE, dwarf::DW_AT_external);
+ addFlag(StaticMemberDIE, dwarf::DW_AT_declaration);
+
+ // FIXME: We could omit private if the parent is a class_type, and
+ // public if the parent is something else.
+ if (DT->isProtected())
+ addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_protected);
+ else if (DT->isPrivate())
+ addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_private);
+ else if (DT->isPublic())
+ addUInt(StaticMemberDIE, dwarf::DW_AT_accessibility, dwarf::DW_FORM_data1,
+ dwarf::DW_ACCESS_public);
+
+ if (const ConstantInt *CI = dyn_cast_or_null<ConstantInt>(DT->getConstant()))
+ addConstantValue(StaticMemberDIE, CI, Ty);
+ if (const ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(DT->getConstant()))
+ addConstantFPValue(StaticMemberDIE, CFP);
+
+ return &StaticMemberDIE;
+}
+
+void DwarfUnit::emitHeader(bool UseOffsets) {
+ // Emit size of content not including length itself
+ Asm->OutStreamer->AddComment("Length of Unit");
+ Asm->EmitInt32(getHeaderSize() + UnitDie.getSize());
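+ // For example, a DWARF v4 compile unit whose DIE tree occupies 0x95 bytes
+ // emits 0x9C here: 7 header bytes (version, abbrev offset, address size)
+ // plus the DIE size; the 4-byte length field itself is not counted.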
+
+ Asm->OutStreamer->AddComment("DWARF version number");
+ Asm->EmitInt16(DD->getDwarfVersion());
+ Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
+
+ // We share one abbreviations table across all units so it's always at the
+ // start of the section. Use a relocatable offset where needed to ensure
+ // linking doesn't invalidate that offset.
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ if (UseOffsets)
+ Asm->EmitInt32(0);
+ else
+ Asm->emitDwarfSymbolReference(
+ TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
+
+ Asm->OutStreamer->AddComment("Address Size (in bytes)");
+ Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+}
+
+void DwarfUnit::initSection(MCSection *Section) {
+ assert(!this->Section);
+ this->Section = Section;
+}
+
+void DwarfTypeUnit::emitHeader(bool UseOffsets) {
+ DwarfUnit::emitHeader(UseOffsets);
+ Asm->OutStreamer->AddComment("Type Signature");
+ Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
+ Asm->OutStreamer->AddComment("Type DIE Offset");
+ // In a skeleton type unit there is no type DIE so emit a zero offset.
+ Asm->OutStreamer->EmitIntValue(Ty ? Ty->getOffset() : 0,
+ sizeof(Ty->getOffset()));
+}
+
+bool DwarfTypeUnit::isDwoUnit() const {
+ // Since there are no skeleton type units, all type units are dwo type units
+ // when split DWARF is being used.
+ return DD->useSplitDwarf();
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
new file mode 100644
index 000000000000..e225f92116d4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -0,0 +1,391 @@
+//===-- llvm/CodeGen/DwarfUnit.h - Dwarf Compile Unit ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing DWARF compile units.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_DWARFUNIT_H
+
+#include "DwarfDebug.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/DIE.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+
+namespace llvm {
+
+class MachineLocation;
+class MachineOperand;
+class ConstantInt;
+class ConstantFP;
+class DbgVariable;
+class DwarfCompileUnit;
+
+// Data structure to hold a range for range lists.
+class RangeSpan {
+public:
+ RangeSpan(MCSymbol *S, MCSymbol *E) : Start(S), End(E) {}
+ const MCSymbol *getStart() const { return Start; }
+ const MCSymbol *getEnd() const { return End; }
+ void setEnd(const MCSymbol *E) { End = E; }
+
+private:
+ const MCSymbol *Start, *End;
+};
+
+class RangeSpanList {
+private:
+ // Index for locating this particular span within the debug_range section.
+ MCSymbol *RangeSym;
+ // List of ranges.
+ SmallVector<RangeSpan, 2> Ranges;
+
+public:
+ RangeSpanList(MCSymbol *Sym, SmallVector<RangeSpan, 2> Ranges)
+ : RangeSym(Sym), Ranges(std::move(Ranges)) {}
+ MCSymbol *getSym() const { return RangeSym; }
+ const SmallVectorImpl<RangeSpan> &getRanges() const { return Ranges; }
+ void addRange(RangeSpan Range) { Ranges.push_back(Range); }
+};
+
+//===----------------------------------------------------------------------===//
+/// This dwarf writer support class manages information associated with a
+/// source file.
+class DwarfUnit {
+protected:
+ /// MDNode for the compile unit.
+ const DICompileUnit *CUNode;
+
+ // All DIEValues are allocated through this allocator.
+ BumpPtrAllocator DIEValueAllocator;
+
+ /// Unit debug information entry.
+ DIE &UnitDie;
+
+ /// Target of Dwarf emission.
+ AsmPrinter *Asm;
+
+ // Holders for some common dwarf information.
+ DwarfDebug *DD;
+ DwarfFile *DU;
+
+ /// An anonymous type for index type. Owned by UnitDie.
+ DIE *IndexTyDie;
+
+ /// Tracks the mapping of unit level debug information variables to debug
+ /// information entries.
+ DenseMap<const MDNode *, DIE *> MDNodeToDieMap;
+
+ /// A list of all the DIEBlocks in use.
+ std::vector<DIEBlock *> DIEBlocks;
+
+ /// A list of all the DIELocs in use.
+ std::vector<DIELoc *> DIELocs;
+
+ /// This map is used to keep track of subprogram DIEs that need
+ /// DW_AT_containing_type attribute. This attribute points to a DIE that
+ /// corresponds to the MDNode mapped with the subprogram DIE.
+ DenseMap<DIE *, const DINode *> ContainingTypeMap;
+
+ /// The section this unit will be emitted in.
+ MCSection *Section;
+
+ DwarfUnit(dwarf::Tag, const DICompileUnit *CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU);
+
+ bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie);
+
+public:
+ virtual ~DwarfUnit();
+
+ void initSection(MCSection *Section);
+
+ MCSection *getSection() const {
+ assert(Section);
+ return Section;
+ }
+
+ // Accessors.
+ AsmPrinter* getAsmPrinter() const { return Asm; }
+ uint16_t getLanguage() const { return CUNode->getSourceLanguage(); }
+ const DICompileUnit *getCUNode() const { return CUNode; }
+ DIE &getUnitDie() { return UnitDie; }
+
+ /// Return true if this compile unit has something to write out.
+ bool hasContent() const { return UnitDie.hasChildren(); }
+
+ /// Get string containing language specific context for a global name.
+ ///
+ /// Walks the metadata parent chain in a language specific manner (using the
+ /// compile unit language) and returns it as a string. This is done at the
+ /// metadata level because DIEs may not currently have been added to the
+ /// parent context and walking the DIEs looking for names is more expensive
+ /// than walking the metadata.
+ std::string getParentContextString(const DIScope *Context) const;
+
+ /// Add a new global name to the compile unit.
+ virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) {
+ }
+
+ /// Add a new global type to the compile unit.
+ virtual void addGlobalType(const DIType *Ty, const DIE &Die,
+ const DIScope *Context) {}
+
+ /// Returns the DIE map slot for the specified debug variable.
+ ///
+ /// We delegate the request to DwarfDebug when the MDNode can be part of the
+ /// type system, since DIEs for the type system can be shared across CUs and
+ /// the mappings are kept in DwarfDebug.
+ DIE *getDIE(const DINode *D) const;
+
+ /// Returns a fresh newly allocated DIELoc.
+ DIELoc *getDIELoc() { return new (DIEValueAllocator) DIELoc; }
+
+ /// Insert DIE into the map.
+ ///
+ /// We delegate the request to DwarfDebug when the MDNode can be part of the
+ /// type system, since DIEs for the type system can be shared across CUs and
+ /// the mappings are kept in DwarfDebug.
+ void insertDIE(const DINode *Desc, DIE *D);
+
+ /// Add a flag that is true to the DIE.
+ void addFlag(DIE &Die, dwarf::Attribute Attribute);
+
+ /// Add an unsigned integer attribute data and value.
+ void addUInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, uint64_t Integer);
+
+ void addUInt(DIEValueList &Block, dwarf::Form Form, uint64_t Integer);
+
+ /// Add a signed integer attribute data and value.
+ void addSInt(DIEValueList &Die, dwarf::Attribute Attribute,
+ Optional<dwarf::Form> Form, int64_t Integer);
+
+ void addSInt(DIELoc &Die, Optional<dwarf::Form> Form, int64_t Integer);
+
+ /// Add a string attribute data and value.
+ ///
+ /// We always emit a reference to the string pool instead of immediate
+ /// strings so that DIEs have more predictable sizes. In the case of split
+ /// dwarf we emit an index into another table which gets us the static offset
+ /// into the string table.
+ void addString(DIE &Die, dwarf::Attribute Attribute, StringRef Str);
+
+ /// Add a Dwarf label attribute data and value.
+ DIEValueList::value_iterator addLabel(DIEValueList &Die,
+ dwarf::Attribute Attribute,
+ dwarf::Form Form,
+ const MCSymbol *Label);
+
+ void addLabel(DIELoc &Die, dwarf::Form Form, const MCSymbol *Label);
+
+ /// Add an offset into a section attribute data and value.
+ void addSectionOffset(DIE &Die, dwarf::Attribute Attribute, uint64_t Integer);
+
+ /// Add a dwarf op address data and value using the form given and an
+ /// op of either DW_FORM_addr or DW_FORM_GNU_addr_index.
+ void addOpAddress(DIELoc &Die, const MCSymbol *Label);
+
+ /// Add a label delta attribute data and value.
+ void addLabelDelta(DIE &Die, dwarf::Attribute Attribute, const MCSymbol *Hi,
+ const MCSymbol *Lo);
+
+ /// Add a DIE attribute data and value.
+ void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIE &Entry);
+
+ /// Add a DIE attribute data and value.
+ void addDIEEntry(DIE &Die, dwarf::Attribute Attribute, DIEEntry Entry);
+
+ /// Add a type's DW_AT_signature and set the declaration flag.
+ void addDIETypeSignature(DIE &Die, uint64_t Signature);
+ /// Add an attribute containing the type signature for a unique identifier.
+ void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
+ StringRef Identifier);
+
+ /// Add block data.
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
+
+ /// Add block data.
+ void addBlock(DIE &Die, dwarf::Attribute Attribute, DIEBlock *Block);
+
+ /// Add location information to specified debug information entry.
+ void addSourceLine(DIE &Die, unsigned Line, StringRef File,
+ StringRef Directory);
+ void addSourceLine(DIE &Die, const DILocalVariable *V);
+ void addSourceLine(DIE &Die, const DIGlobalVariable *G);
+ void addSourceLine(DIE &Die, const DISubprogram *SP);
+ void addSourceLine(DIE &Die, const DIType *Ty);
+ void addSourceLine(DIE &Die, const DINamespace *NS);
+ void addSourceLine(DIE &Die, const DIObjCProperty *Ty);
+
+ /// Add constant value entry in variable DIE.
+ void addConstantValue(DIE &Die, const MachineOperand &MO, const DIType *Ty);
+ void addConstantValue(DIE &Die, const ConstantInt *CI, const DIType *Ty);
+ void addConstantValue(DIE &Die, const APInt &Val, const DIType *Ty);
+ void addConstantValue(DIE &Die, const APInt &Val, bool Unsigned);
+ void addConstantValue(DIE &Die, bool Unsigned, uint64_t Val);
+
+ /// Add constant value entry in variable DIE.
+ void addConstantFPValue(DIE &Die, const MachineOperand &MO);
+ void addConstantFPValue(DIE &Die, const ConstantFP *CFP);
+
+ /// Add a linkage name, if it isn't empty.
+ void addLinkageName(DIE &Die, StringRef LinkageName);
+
+ /// Add template parameters in buffer.
+ void addTemplateParams(DIE &Buffer, DINodeArray TParams);
+
+ /// Add register operand.
+ /// \returns false if the register does not exist, e.g., because it was never
+ /// materialized.
+ bool addRegisterOpPiece(DIELoc &TheDie, unsigned Reg,
+ unsigned SizeInBits = 0, unsigned OffsetInBits = 0);
+
+ /// Add register offset.
+ /// \returns false if the register does not exist, e.g., because it was never
+ /// materialized.
+ bool addRegisterOffset(DIELoc &TheDie, unsigned Reg, int64_t Offset);
+
+ // FIXME: Should be reformulated in terms of addComplexAddress.
+ /// Start with the address based on the location provided, and generate the
+ /// DWARF information necessary to find the actual Block variable (navigating
+ /// the Block struct) based on the starting location. Add the DWARF
+ /// information to the die. Obsolete, please use addComplexAddress instead.
+ void addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
+ dwarf::Attribute Attribute,
+ const MachineLocation &Location);
+
+ /// Add a new type attribute to the specified entity.
+ ///
+ /// This takes an attribute parameter because DW_AT_friend attributes are
+ /// also type references.
+ void addType(DIE &Entity, const DIType *Ty,
+ dwarf::Attribute Attribute = dwarf::DW_AT_type);
+
+ DIE *getOrCreateNameSpace(const DINamespace *NS);
+ DIE *getOrCreateModule(const DIModule *M);
+ DIE *getOrCreateSubprogramDIE(const DISubprogram *SP, bool Minimal = false);
+
+ void applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
+ bool Minimal = false);
+
+ /// Find existing DIE or create new DIE for the given type.
+ DIE *getOrCreateTypeDIE(const MDNode *N);
+
+ /// Create a DIE for the given composite type.
+ DIE *createTypeDIE(const DICompositeType *Ty);
+
+ /// Get context owner's DIE.
+ DIE *getOrCreateContextDIE(const DIScope *Context);
+
+ /// Construct DIEs for types that contain vtables.
+ void constructContainingTypeDIEs();
+
+ /// Construct function argument DIEs.
+ void constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args);
+
+ /// Create a DIE with the given Tag, add the DIE to its parent, and
+ /// call insertDIE if N is not null.
+ DIE &createAndAddDIE(unsigned Tag, DIE &Parent, const DINode *N = nullptr);
+
+ /// Compute the size of a header for this unit, not including the initial
+ /// length field.
+ virtual unsigned getHeaderSize() const {
+ return sizeof(int16_t) + // DWARF version number
+ sizeof(int32_t) + // Offset Into Abbrev. Section
+ sizeof(int8_t); // Pointer Size (in bytes)
+ }
+
+ /// Emit the header for this unit, not including the initial length field.
+ virtual void emitHeader(bool UseOffsets);
+
+ virtual DwarfCompileUnit &getCU() = 0;
+
+ void constructTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+
+protected:
+ /// Create new static data member DIE.
+ DIE *getOrCreateStaticMemberDIE(const DIDerivedType *DT);
+
+ /// Look up the source ID with the given directory and source file names. If
+ /// none currently exists, create a new ID and insert it in the line table.
+ virtual unsigned getOrCreateSourceID(StringRef File, StringRef Directory) = 0;
+
+ /// Look in the DwarfDebug map for the MDNode that corresponds to the
+ /// reference.
+ template <typename T> T *resolve(TypedDINodeRef<T> Ref) const {
+ return Ref.resolve();
+ }
+
+private:
+ void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
+ void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
+ void constructTypeDIE(DIE &Buffer, const DISubroutineType *DTy);
+ void constructSubrangeDIE(DIE &Buffer, const DISubrange *SR, DIE *IndexTy);
+ void constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+ void constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy);
+ void constructMemberDIE(DIE &Buffer, const DIDerivedType *DT);
+ void constructTemplateTypeParameterDIE(DIE &Buffer,
+ const DITemplateTypeParameter *TP);
+ void constructTemplateValueParameterDIE(DIE &Buffer,
+ const DITemplateValueParameter *TVP);
+
+ /// Return the default lower bound for an array.
+ ///
+ /// If the DWARF version doesn't handle the language, return -1.
+ int64_t getDefaultLowerBound() const;
+
+ /// Get an anonymous type for index type.
+ DIE *getIndexTyDie();
+
+ /// Set D as anonymous type for index which can be reused later.
+ void setIndexTyDie(DIE *D) { IndexTyDie = D; }
+
+ /// If this is a named finished type then include it in the list of types for
+ /// the accelerator tables.
+ void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+ const DIE &TyDIE);
+
+ virtual bool isDwoUnit() const = 0;
+};
+
+class DwarfTypeUnit : public DwarfUnit {
+ uint64_t TypeSignature;
+ const DIE *Ty;
+ DwarfCompileUnit &CU;
+ MCDwarfDwoLineTable *SplitLineTable;
+
+ unsigned getOrCreateSourceID(StringRef File, StringRef Directory) override;
+ bool isDwoUnit() const override;
+
+public:
+ DwarfTypeUnit(DwarfCompileUnit &CU, AsmPrinter *A, DwarfDebug *DW,
+ DwarfFile *DWU, MCDwarfDwoLineTable *SplitLineTable = nullptr);
+
+ void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
+ void setType(const DIE *Ty) { this->Ty = Ty; }
+
+ /// Emit the header for this unit, not including the initial length field.
+ void emitHeader(bool UseOffsets) override;
+ unsigned getHeaderSize() const override {
+ return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
+ sizeof(uint32_t); // Type DIE Offset
+ }
+ DwarfCompileUnit &getCU() override { return CU; }
+};
+} // end llvm namespace
+#endif
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
new file mode 100644
index 000000000000..e24dcb1bffd4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -0,0 +1,689 @@
+//===-- CodeGen/AsmPrinter/EHStreamer.cpp - Exception Directive Streamer --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing exception info into assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "EHStreamer.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+
+using namespace llvm;
+
+EHStreamer::EHStreamer(AsmPrinter *A) : Asm(A), MMI(Asm->MMI) {}
+
+EHStreamer::~EHStreamer() {}
+
+/// How many leading type ids two landing pads have in common.
+unsigned EHStreamer::sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R) {
+ const std::vector<int> &LIds = L->TypeIds, &RIds = R->TypeIds;
+ unsigned LSize = LIds.size(), RSize = RIds.size();
+ unsigned MinSize = LSize < RSize ? LSize : RSize;
+ unsigned Count = 0;
+
+ for (; Count != MinSize; ++Count)
+ if (LIds[Count] != RIds[Count])
+ return Count;
+
+ return Count;
+}
+
+/// Compute the actions table and gather the first action index for each landing
+/// pad site.
+unsigned EHStreamer::
+computeActionsTable(const SmallVectorImpl<const LandingPadInfo*> &LandingPads,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions) {
+
+ // The action table follows the call-site table in the LSDA. The individual
+ // records are of two types:
+ //
+ // * Catch clause
+ // * Exception specification
+ //
+ // The two record kinds have the same format, with only small differences.
+ // They are distinguished by the "switch value" field: Catch clauses
+ // (TypeInfos) have strictly positive switch values, and exception
+ // specifications (FilterIds) have strictly negative switch values. Value 0
+ // indicates a catch-all clause.
+ //
+ // Negative type IDs index into FilterIds. Positive type IDs index into
+ // TypeInfos. The value written for a positive type ID is just the type ID
+ // itself. For a negative type ID, however, the value written is the
+ // (negative) byte offset of the corresponding FilterIds entry. The byte
+ // offset is usually equal to the type ID (because the FilterIds entries are
+ // written using a variable width encoding, which outputs one byte per entry
+ // as long as the value written is not too large) but can differ. This kind
+ // of complication does not occur for positive type IDs because type infos are
+ // output using a fixed width encoding. FilterOffsets[i] holds the byte
+ // offset corresponding to FilterIds[i].
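+ //
+ // For example, FilterIds of {300, 1} yield FilterOffsets of {-1, -3}: the
+ // first id needs two ULEB128 bytes, so the second entry's byte offset (-3)
+ // no longer equals its type ID (-2).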
+
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+ SmallVector<int, 16> FilterOffsets;
+ FilterOffsets.reserve(FilterIds.size());
+ int Offset = -1;
+
+ for (std::vector<unsigned>::const_iterator
+ I = FilterIds.begin(), E = FilterIds.end(); I != E; ++I) {
+ FilterOffsets.push_back(Offset);
+ Offset -= getULEB128Size(*I);
+ }
+
+ FirstActions.reserve(LandingPads.size());
+
+ int FirstAction = 0;
+ unsigned SizeActions = 0;
+ const LandingPadInfo *PrevLPI = nullptr;
+
+ for (SmallVectorImpl<const LandingPadInfo *>::const_iterator
+ I = LandingPads.begin(), E = LandingPads.end(); I != E; ++I) {
+ const LandingPadInfo *LPI = *I;
+ const std::vector<int> &TypeIds = LPI->TypeIds;
+ unsigned NumShared = PrevLPI ? sharedTypeIDs(LPI, PrevLPI) : 0;
+ unsigned SizeSiteActions = 0;
+
+ if (NumShared < TypeIds.size()) {
+ unsigned SizeAction = 0;
+ unsigned PrevAction = (unsigned)-1;
+
+ if (NumShared) {
+ unsigned SizePrevIds = PrevLPI->TypeIds.size();
+ assert(Actions.size());
+ PrevAction = Actions.size() - 1;
+ SizeAction = getSLEB128Size(Actions[PrevAction].NextAction) +
+ getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+
+ for (unsigned j = NumShared; j != SizePrevIds; ++j) {
+ assert(PrevAction != (unsigned)-1 && "PrevAction is invalid!");
+ SizeAction -= getSLEB128Size(Actions[PrevAction].ValueForTypeID);
+ SizeAction += -Actions[PrevAction].NextAction;
+ PrevAction = Actions[PrevAction].Previous;
+ }
+ }
+
+ // Compute the actions.
+ for (unsigned J = NumShared, M = TypeIds.size(); J != M; ++J) {
+ int TypeID = TypeIds[J];
+ assert(-1 - TypeID < (int)FilterOffsets.size() && "Unknown filter id!");
+ int ValueForTypeID =
+ isFilterEHSelector(TypeID) ? FilterOffsets[-1 - TypeID] : TypeID;
+ unsigned SizeTypeID = getSLEB128Size(ValueForTypeID);
+
+ int NextAction = SizeAction ? -(SizeAction + SizeTypeID) : 0;
+ SizeAction = SizeTypeID + getSLEB128Size(NextAction);
+ SizeSiteActions += SizeAction;
+
+ ActionEntry Action = { ValueForTypeID, NextAction, PrevAction };
+ Actions.push_back(Action);
+ PrevAction = Actions.size() - 1;
+ }
+
+ // Record the first action of the landing pad site.
+ FirstAction = SizeActions + SizeSiteActions - SizeAction + 1;
+ } // else identical - re-use previous FirstAction
+
+ // Information used when creating the call-site table. The action record
+ // field of the call site record is the offset of the first associated
+ // action record, relative to the start of the actions table. This value is
+ // biased by 1 (1 indicating the start of the actions table), and 0
+ // indicates that there are no actions.
+ FirstActions.push_back(FirstAction);
+
+ // Compute this site's contribution to the size.
+ SizeActions += SizeSiteActions;
+
+ PrevLPI = LPI;
+ }
+
+ return SizeActions;
+}
+
+/// Return `true' if this is a call to a function marked `nounwind'. Return
+/// `false' otherwise.
+bool EHStreamer::callToNoUnwindFunction(const MachineInstr *MI) {
+ assert(MI->isCall() && "This should be a call instruction!");
+
+ bool MarkedNoUnwind = false;
+ bool SawFunc = false;
+
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+
+ if (!MO.isGlobal()) continue;
+
+ const Function *F = dyn_cast<Function>(MO.getGlobal());
+ if (!F) continue;
+
+ if (SawFunc) {
+ // Be conservative. If we have more than one function operand for this
+ // call, then we can't make the assumption that it's the callee and
+ // not a parameter to the call.
+ //
+ // FIXME: Determine if there's a way to say that `F' is the callee or
+ // parameter.
+ MarkedNoUnwind = false;
+ break;
+ }
+
+ MarkedNoUnwind = F->doesNotThrow();
+ SawFunc = true;
+ }
+
+ return MarkedNoUnwind;
+}
+
+void EHStreamer::computePadMap(
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ RangeMapType &PadMap) {
+ // Invokes and nounwind calls have entries in PadMap (due to being bracketed
+ // by try-range labels when lowered). Ordinary calls do not, so appropriate
+ // try-ranges for them need to be deduced so we can put them in the LSDA.
+ for (unsigned i = 0, N = LandingPads.size(); i != N; ++i) {
+ const LandingPadInfo *LandingPad = LandingPads[i];
+ for (unsigned j = 0, E = LandingPad->BeginLabels.size(); j != E; ++j) {
+ MCSymbol *BeginLabel = LandingPad->BeginLabels[j];
+ assert(!PadMap.count(BeginLabel) && "Duplicate landing pad labels!");
+ PadRange P = { i, j };
+ PadMap[BeginLabel] = P;
+ }
+ }
+}
+
+/// Compute the call-site table. The entry for an invoke has a try-range
+/// containing the call, a non-zero landing pad, and an appropriate action. The
+/// entry for an ordinary call has a try-range containing the call and zero for
+/// the landing pad and the action. Calls marked 'nounwind' have no entry and
+/// must not be contained in the try-range of any entry - they form gaps in the
+/// table. Entries must be ordered by try-range address.
+void EHStreamer::
+computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+ const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ const SmallVectorImpl<unsigned> &FirstActions) {
+ RangeMapType PadMap;
+ computePadMap(LandingPads, PadMap);
+
+ // The end label of the previous invoke or nounwind try-range.
+ MCSymbol *LastLabel = nullptr;
+
+ // Whether there is a potentially throwing instruction (currently this means
+ // an ordinary call) between the end of the previous try-range and now.
+ bool SawPotentiallyThrowing = false;
+
+ // Whether the last CallSite entry was for an invoke.
+ bool PreviousIsInvoke = false;
+
+ bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+
+ // Visit all instructions in order of address.
+ for (const auto &MBB : *Asm->MF) {
+ for (const auto &MI : MBB) {
+ if (!MI.isEHLabel()) {
+ if (MI.isCall())
+ SawPotentiallyThrowing |= !callToNoUnwindFunction(&MI);
+ continue;
+ }
+
+ // End of the previous try-range?
+ MCSymbol *BeginLabel = MI.getOperand(0).getMCSymbol();
+ if (BeginLabel == LastLabel)
+ SawPotentiallyThrowing = false;
+
+ // Beginning of a new try-range?
+ RangeMapType::const_iterator L = PadMap.find(BeginLabel);
+ if (L == PadMap.end())
+ // Nope, it was just some random label.
+ continue;
+
+ const PadRange &P = L->second;
+ const LandingPadInfo *LandingPad = LandingPads[P.PadIndex];
+ assert(BeginLabel == LandingPad->BeginLabels[P.RangeIndex] &&
+ "Inconsistent landing pad map!");
+
+ // For DWARF exception handling (SjLj handling doesn't use this): if some
+ // instruction between the previous try-range and this one may throw,
+ // create a call-site entry with no landing pad for the region between the
+ // try-ranges.
+ if (SawPotentiallyThrowing && Asm->MAI->usesCFIForEH()) {
+ CallSiteEntry Site = { LastLabel, BeginLabel, nullptr, 0 };
+ CallSites.push_back(Site);
+ PreviousIsInvoke = false;
+ }
+
+ LastLabel = LandingPad->EndLabels[P.RangeIndex];
+ assert(BeginLabel && LastLabel && "Invalid landing pad!");
+
+ if (!LandingPad->LandingPadLabel) {
+ // Create a gap.
+ PreviousIsInvoke = false;
+ } else {
+ // This try-range is for an invoke.
+ CallSiteEntry Site = {
+ BeginLabel,
+ LastLabel,
+ LandingPad,
+ FirstActions[P.PadIndex]
+ };
+
+ // Try to merge with the previous call-site. SJLJ doesn't do this.
+ if (PreviousIsInvoke && !IsSJLJ) {
+ CallSiteEntry &Prev = CallSites.back();
+ if (Site.LPad == Prev.LPad && Site.Action == Prev.Action) {
+ // Extend the range of the previous entry.
+ Prev.EndLabel = Site.EndLabel;
+ continue;
+ }
+ }
+
+ // Otherwise, create a new call-site.
+ if (!IsSJLJ)
+ CallSites.push_back(Site);
+ else {
+ // SjLj EH must maintain the call sites in the order assigned
+ // to them by the SjLjPrepare pass.
+ unsigned SiteNo = MMI->getCallSiteBeginLabel(BeginLabel);
+ if (CallSites.size() < SiteNo)
+ CallSites.resize(SiteNo);
+ CallSites[SiteNo - 1] = Site;
+ }
+ PreviousIsInvoke = true;
+ }
+ }
+ }
+
+ // If some instruction between the previous try-range and the end of the
+ // function may throw, create a call-site entry with no landing pad for the
+ // region following the try-range.
+ if (SawPotentiallyThrowing && !IsSJLJ && LastLabel != nullptr) {
+ CallSiteEntry Site = { LastLabel, nullptr, nullptr, 0 };
+ CallSites.push_back(Site);
+ }
+}
+
+/// Emit landing pads and actions.
+///
+/// The general organization of the table is complex, but the basic concepts are
+/// easy. First there is a header which describes the location and organization
+/// of the three components that follow.
+///
+/// 1. The landing pad site information describes the range of code covered by
+/// the try. In our case it's an accumulation of the ranges covered by the
+/// invokes in the try. There is also a reference to the landing pad that
+/// handles the exception once processed. Finally an index into the actions
+/// table.
+/// 2. The action table, in our case, is composed of pairs of type IDs and next
+/// action offset. Starting with the action index from the landing pad
+/// site, each type ID is checked for a match to the current exception. If
+/// it matches then the exception and type id are passed on to the landing
+/// pad. Otherwise the next action is looked up. This chain is terminated
+/// with a next action of zero. If no type id is found then the frame is
+/// unwound and handling continues.
+/// 3. Type ID table contains references to all the C++ typeinfo for all
+///    catches in the function. This table is reverse indexed, base 1.
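+///
+/// For instance, a landing pad with two catch clauses yields two chained
+/// action records: the call site's action index names one of them, its
+/// next-action displacement links to the other, and a next-action of zero
+/// ends the chain.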
+void EHStreamer::emitExceptionTable() {
+ const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+ const std::vector<LandingPadInfo> &PadInfos = MMI->getLandingPads();
+
+ // Sort the landing pads in order of their type ids. This is used to fold
+ // duplicate actions.
+ SmallVector<const LandingPadInfo *, 64> LandingPads;
+ LandingPads.reserve(PadInfos.size());
+
+ for (unsigned i = 0, N = PadInfos.size(); i != N; ++i)
+ LandingPads.push_back(&PadInfos[i]);
+
+ // Order landing pads lexicographically by type id.
+ std::sort(LandingPads.begin(), LandingPads.end(),
+ [](const LandingPadInfo *L,
+ const LandingPadInfo *R) { return L->TypeIds < R->TypeIds; });
+
+ // Compute the actions table and gather the first action index for each
+ // landing pad site.
+ SmallVector<ActionEntry, 32> Actions;
+ SmallVector<unsigned, 64> FirstActions;
+ unsigned SizeActions =
+ computeActionsTable(LandingPads, Actions, FirstActions);
+
+ // Compute the call-site table.
+ SmallVector<CallSiteEntry, 64> CallSites;
+ computeCallSiteTable(CallSites, LandingPads, FirstActions);
+
+ // Final tallies.
+
+ // Call sites.
+ bool IsSJLJ = Asm->MAI->getExceptionHandlingType() == ExceptionHandling::SjLj;
+ bool HaveTTData = IsSJLJ ? (!TypeInfos.empty() || !FilterIds.empty()) : true;
+
+ unsigned CallSiteTableLength;
+ if (IsSJLJ)
+ CallSiteTableLength = 0;
+ else {
+ unsigned SiteStartSize = 4; // dwarf::DW_EH_PE_udata4
+ unsigned SiteLengthSize = 4; // dwarf::DW_EH_PE_udata4
+ unsigned LandingPadSize = 4; // dwarf::DW_EH_PE_udata4
+ CallSiteTableLength =
+ CallSites.size() * (SiteStartSize + SiteLengthSize + LandingPadSize);
+ }
+
+ for (unsigned i = 0, e = CallSites.size(); i < e; ++i) {
+ CallSiteTableLength += getULEB128Size(CallSites[i].Action);
+ if (IsSJLJ)
+ CallSiteTableLength += getULEB128Size(i);
+ }
+
+ // Type infos.
+ MCSection *LSDASection = Asm->getObjFileLowering().getLSDASection();
+ unsigned TTypeEncoding;
+ unsigned TypeFormatSize;
+
+ if (!HaveTTData) {
+ // For SjLj exceptions, if there is no TypeInfo, then we just explicitly say
+ // that we're omitting that bit.
+ TTypeEncoding = dwarf::DW_EH_PE_omit;
+ // dwarf::DW_EH_PE_absptr
+ TypeFormatSize = Asm->getDataLayout().getPointerSize();
+ } else {
+ // Okay, we have actual filters or typeinfos to emit. As such, we need to
+ // pick a type encoding for them. We're about to emit a list of pointers to
+ // typeinfo objects at the end of the LSDA. However, unless we're in static
+ // mode, this reference will require a relocation by the dynamic linker.
+ //
+ // Because of this, we have a couple of options:
+ //
+ // 1) If we are in -static mode, we can always use an absolute reference
+ // from the LSDA, because the static linker will resolve it.
+ //
+ // 2) Otherwise, if the LSDA section is writable, we can output the direct
+ // reference to the typeinfo and allow the dynamic linker to relocate
+ // it. Since it is in a writable section, the dynamic linker won't
+ // have a problem.
+ //
+ //    3) Finally, if we're in PIC mode and the LSDA section isn't writable,
+ // we need to use some form of indirection. For example, on Darwin,
+ // we can output a statically-relocatable reference to a dyld stub. The
+ // offset to the stub is constant, but the contents are in a section
+ // that is updated by the dynamic linker. This is easy enough, but we
+ // need to tell the personality function of the unwinder to indirect
+ // through the dyld stub.
+ //
+ // FIXME: When (3) is actually implemented, we'll have to emit the stubs
+ // somewhere. This predicate should be moved to a shared location that is
+ // in target-independent code.
+ //
+ TTypeEncoding = Asm->getObjFileLowering().getTTypeEncoding();
+ TypeFormatSize = Asm->GetSizeOfEncodedValue(TTypeEncoding);
+ }
+
+ // Begin the exception table.
+ // Sometimes we do not want to emit the data into a separate section (e.g. ARM
+ // EHABI). In this case LSDASection will be NULL.
+ if (LSDASection)
+ Asm->OutStreamer->SwitchSection(LSDASection);
+ Asm->EmitAlignment(2);
+
+ // Emit the LSDA.
+ MCSymbol *GCCETSym =
+ Asm->OutContext.getOrCreateSymbol(Twine("GCC_except_table")+
+ Twine(Asm->getFunctionNumber()));
+ Asm->OutStreamer->EmitLabel(GCCETSym);
+ Asm->OutStreamer->EmitLabel(Asm->getCurExceptionSym());
+
+ // Emit the LSDA header.
+ Asm->EmitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
+ Asm->EmitEncodingByte(TTypeEncoding, "@TType");
+
+ // The type infos need to be aligned. GCC does this by inserting padding just
+ // before the type infos. However, this changes the size of the exception
+ // table, so you need to take this into account when you output the exception
+ // table size. However, the size is output using a variable length encoding.
+ // So by increasing the size by inserting padding, you may increase the number
+ // of bytes used for writing the size. If it increases, say by one byte, then
+ // you now need to output one less byte of padding to get the type infos
+ // aligned. However this decreases the size of the exception table. This
+ // changes the value you have to output for the exception table size. Due to
+ // the variable length encoding, the number of bytes used for writing the
+ // length may decrease. If so, you then have to increase the amount of
+ // padding. And so on. If you look carefully at the GCC code you will see that
+ // it indeed does this in a loop, going on and on until the values stabilize.
+ // We chose another solution: don't output padding inside the table like GCC
+ // does, instead output it before the table.
+ unsigned SizeTypes = TypeInfos.size() * TypeFormatSize;
+ unsigned CallSiteTableLengthSize = getULEB128Size(CallSiteTableLength);
+ unsigned TTypeBaseOffset =
+ sizeof(int8_t) + // Call site format
+ CallSiteTableLengthSize + // Call site table length size
+ CallSiteTableLength + // Call site table length
+ SizeActions + // Actions size
+ SizeTypes;
+ unsigned TTypeBaseOffsetSize = getULEB128Size(TTypeBaseOffset);
+ unsigned TotalSize =
+ sizeof(int8_t) + // LPStart format
+ sizeof(int8_t) + // TType format
+ (HaveTTData ? TTypeBaseOffsetSize : 0) + // TType base offset size
+ TTypeBaseOffset; // TType base offset
+ unsigned SizeAlign = (4 - TotalSize) & 3;
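+ // For example, a TotalSize of 10 gives SizeAlign = (4 - 10) & 3 = 2; those
+ // two pad bytes are absorbed into the ULEB128 encoding of the TType base
+ // offset below (or of the call site table length if there is no TType
+ // data), padding the table so the type infos at the end land on a 4-byte
+ // boundary.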
+
+ if (HaveTTData) {
+ // Account for any extra padding that will be added to the call site table
+ // length.
+ Asm->EmitULEB128(TTypeBaseOffset, "@TType base offset", SizeAlign);
+ SizeAlign = 0;
+ }
+
+ bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ // SjLj Exception handling
+ if (IsSJLJ) {
+ Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
+
+ // Add extra padding if it wasn't added to the TType base offset.
+ Asm->EmitULEB128(CallSiteTableLength, "Call site table length", SizeAlign);
+
+ // Emit the landing pad site information.
+ unsigned idx = 0;
+ for (SmallVectorImpl<CallSiteEntry>::const_iterator
+ I = CallSites.begin(), E = CallSites.end(); I != E; ++I, ++idx) {
+ const CallSiteEntry &S = *I;
+
+ // Offset of the landing pad, counted in 16-byte bundles relative to the
+ // @LPStart address.
+ if (VerboseAsm) {
+ Asm->OutStreamer->AddComment(">> Call Site " + Twine(idx) + " <<");
+ Asm->OutStreamer->AddComment(" On exception at call site "+Twine(idx));
+ }
+ Asm->EmitULEB128(idx);
+
+ // Offset of the first associated action record, relative to the start of
+ // the action table. This value is biased by 1 (1 indicates the start of
+ // the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
+ if (S.Action == 0)
+ Asm->OutStreamer->AddComment(" Action: cleanup");
+ else
+ Asm->OutStreamer->AddComment(" Action: " +
+ Twine((S.Action - 1) / 2 + 1));
+ }
+ Asm->EmitULEB128(S.Action);
+ }
+ } else {
+ // Itanium LSDA exception handling
+
+ // The call-site table is a list of all call sites that may throw an
+ // exception (including C++ 'throw' statements) in the procedure
+ // fragment. It immediately follows the LSDA header. Each entry indicates,
+ // for a given call, the first corresponding action record and corresponding
+ // landing pad.
+ //
+ // The table begins with the number of bytes, stored as an LEB128
+ // compressed, unsigned integer. The records immediately follow the record
+ // count. They are sorted in increasing call-site address. Each record
+ // indicates:
+ //
+ // * The position of the call-site.
+ // * The position of the landing pad.
+ // * The first action record for that call site.
+ //
+ // A missing entry in the call-site table indicates that a call is not
+ // supposed to throw.
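+ //
+ // For example, an invoke whose try-range spans [Begin, End) and lands at
+ // LPad with first action 1 is emitted as four fields: udata4 Begin-Start,
+ // udata4 End-Begin, udata4 LPad-Start, ULEB128 1, where Start is the
+ // beginning of the procedure fragment.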
+
+ // Emit the landing pad call site table.
+ Asm->EmitEncodingByte(dwarf::DW_EH_PE_udata4, "Call site");
+
+ // Add extra padding if it wasn't added to the TType base offset.
+ Asm->EmitULEB128(CallSiteTableLength, "Call site table length", SizeAlign);
+
+ unsigned Entry = 0;
+ for (SmallVectorImpl<CallSiteEntry>::const_iterator
+ I = CallSites.begin(), E = CallSites.end(); I != E; ++I) {
+ const CallSiteEntry &S = *I;
+
+ MCSymbol *EHFuncBeginSym = Asm->getFunctionBegin();
+
+ MCSymbol *BeginLabel = S.BeginLabel;
+ if (!BeginLabel)
+ BeginLabel = EHFuncBeginSym;
+ MCSymbol *EndLabel = S.EndLabel;
+ if (!EndLabel)
+ EndLabel = Asm->getFunctionEnd();
+
+ // Offset of the call site relative to the previous call site, counted in
+ // number of 16-byte bundles. The first call site is counted relative to
+ // the start of the procedure fragment.
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) + " <<");
+ Asm->EmitLabelDifference(BeginLabel, EHFuncBeginSym, 4);
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(Twine(" Call between ") +
+ BeginLabel->getName() + " and " +
+ EndLabel->getName());
+ Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+
+ // Offset of the landing pad, counted in 16-byte bundles relative to the
+ // @LPStart address.
+ if (!S.LPad) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(" has no landing pad");
+ Asm->OutStreamer->EmitIntValue(0, 4/*size*/);
+ } else {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment(Twine(" jumps to ") +
+ S.LPad->LandingPadLabel->getName());
+ Asm->EmitLabelDifference(S.LPad->LandingPadLabel, EHFuncBeginSym, 4);
+ }
+
+ // Offset of the first associated action record, relative to the start of
+ // the action table. This value is biased by 1 (1 indicates the start of
+ // the action table), and 0 indicates that there are no actions.
+ if (VerboseAsm) {
+ if (S.Action == 0)
+ Asm->OutStreamer->AddComment(" On action: cleanup");
+ else
+ Asm->OutStreamer->AddComment(" On action: " +
+ Twine((S.Action - 1) / 2 + 1));
+ }
+ Asm->EmitULEB128(S.Action);
+ }
+ }
+
+ // Emit the Action Table.
+ int Entry = 0;
+ for (SmallVectorImpl<ActionEntry>::const_iterator
+ I = Actions.begin(), E = Actions.end(); I != E; ++I) {
+ const ActionEntry &Action = *I;
+
+ if (VerboseAsm) {
+ // Emit comments that decode the action table.
+ Asm->OutStreamer->AddComment(">> Action Record " + Twine(++Entry) + " <<");
+ }
+
+ // Type Filter
+ //
+ // Used by the runtime to match the type of the thrown exception to the
+ // type of the catch clauses or the types in the exception specification.
+ if (VerboseAsm) {
+ if (Action.ValueForTypeID > 0)
+ Asm->OutStreamer->AddComment(" Catch TypeInfo " +
+ Twine(Action.ValueForTypeID));
+ else if (Action.ValueForTypeID < 0)
+ Asm->OutStreamer->AddComment(" Filter TypeInfo " +
+ Twine(Action.ValueForTypeID));
+ else
+ Asm->OutStreamer->AddComment(" Cleanup");
+ }
+ Asm->EmitSLEB128(Action.ValueForTypeID);
+
+ // Action Record
+ //
+ // Self-relative signed displacement in bytes of the next action record,
+ // or 0 if there is no next action record.
+ if (VerboseAsm) {
+ if (Action.NextAction == 0) {
+ Asm->OutStreamer->AddComment(" No further actions");
+ } else {
+ unsigned NextAction = Entry + (Action.NextAction + 1) / 2;
+ Asm->OutStreamer->AddComment(" Continue to action "+Twine(NextAction));
+ }
+ }
+ Asm->EmitSLEB128(Action.NextAction);
+ }
+
+ emitTypeInfos(TTypeEncoding);
+
+ Asm->EmitAlignment(2);
+}
+
+void EHStreamer::emitTypeInfos(unsigned TTypeEncoding) {
+ const std::vector<const GlobalValue *> &TypeInfos = MMI->getTypeInfos();
+ const std::vector<unsigned> &FilterIds = MMI->getFilterIds();
+
+ bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
+
+ int Entry = 0;
+ // Emit the Catch TypeInfos.
+ if (VerboseAsm && !TypeInfos.empty()) {
+ Asm->OutStreamer->AddComment(">> Catch TypeInfos <<");
+ Asm->OutStreamer->AddBlankLine();
+ Entry = TypeInfos.size();
+ }
+
+ for (const GlobalValue *GV : make_range(TypeInfos.rbegin(),
+ TypeInfos.rend())) {
+ if (VerboseAsm)
+ Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
+ Asm->EmitTTypeReference(GV, TTypeEncoding);
+ }
+
+ // Emit the Exception Specifications.
+ if (VerboseAsm && !FilterIds.empty()) {
+ Asm->OutStreamer->AddComment(">> Filter TypeInfos <<");
+ Asm->OutStreamer->AddBlankLine();
+ Entry = 0;
+ }
+ for (std::vector<unsigned>::const_iterator
+ I = FilterIds.begin(), E = FilterIds.end(); I < E; ++I) {
+ unsigned TypeID = *I;
+ if (VerboseAsm) {
+ --Entry;
+ if (isFilterEHSelector(TypeID))
+ Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
+ }
+
+ Asm->EmitULEB128(TypeID);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
new file mode 100644
index 000000000000..080fdd14b467
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/EHStreamer.h
@@ -0,0 +1,137 @@
+//===-- EHStreamer.h - Exception Handling Directive Streamer ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing exception info into assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_EHSTREAMER_H
+
+#include "AsmPrinterHandler.h"
+#include "llvm/ADT/DenseMap.h"
+
+namespace llvm {
+struct LandingPadInfo;
+class MachineModuleInfo;
+class MachineInstr;
+class MachineFunction;
+class MCSymbol;
+class MCSymbolRefExpr;
+
+template <typename T>
+class SmallVectorImpl;
+
+/// Emits exception handling directives.
+class LLVM_LIBRARY_VISIBILITY EHStreamer : public AsmPrinterHandler {
+protected:
+ /// Target of directive emission.
+ AsmPrinter *Asm;
+
+ /// Collected machine module information.
+ MachineModuleInfo *MMI;
+
+ /// How many leading type ids two landing pads have in common.
+ static unsigned sharedTypeIDs(const LandingPadInfo *L,
+ const LandingPadInfo *R);
+
+ /// Structure holding a try-range and the associated landing pad.
+ struct PadRange {
+ // The index of the landing pad.
+ unsigned PadIndex;
+ // The index of the begin and end labels in the landing pad's label lists.
+ unsigned RangeIndex;
+ };
+
+ typedef DenseMap<MCSymbol *, PadRange> RangeMapType;
+
+ /// Structure describing an entry in the actions table.
+ struct ActionEntry {
+ int ValueForTypeID; // The value to write - may not be equal to the type id.
+ int NextAction;
+ unsigned Previous;
+ };
+
+ /// Structure describing an entry in the call-site table.
+ struct CallSiteEntry {
+ // The 'try-range' is BeginLabel .. EndLabel.
+ MCSymbol *BeginLabel; // Null indicates the start of the function.
+ MCSymbol *EndLabel; // Null indicates the end of the function.
+
+ // LPad contains the landing pad start labels.
+ const LandingPadInfo *LPad; // Null indicates that there is no landing pad.
+ unsigned Action;
+ };
+
+ /// Compute the actions table and gather the first action index for each
+ /// landing pad site.
+ unsigned computeActionsTable(const SmallVectorImpl<const LandingPadInfo*>&LPs,
+ SmallVectorImpl<ActionEntry> &Actions,
+ SmallVectorImpl<unsigned> &FirstActions);
+
+ void computePadMap(const SmallVectorImpl<const LandingPadInfo *> &LandingPads,
+ RangeMapType &PadMap);
+
+ /// Compute the call-site table. The entry for an invoke has a try-range
+ /// containing the call, a non-zero landing pad and an appropriate action.
+ /// The entry for an ordinary call has a try-range containing the call and
+ /// zero for the landing pad and the action. Calls marked 'nounwind' have
+ /// no entry and must not be contained in the try-range of any entry - they
+ /// form gaps in the table. Entries must be ordered by try-range address.
+ void computeCallSiteTable(SmallVectorImpl<CallSiteEntry> &CallSites,
+ const SmallVectorImpl<const LandingPadInfo *> &LPs,
+ const SmallVectorImpl<unsigned> &FirstActions);
+
+ /// Emit landing pads and actions.
+ ///
+ /// The general organization of the table is complex, but the basic concepts
+ /// are easy. First there is a header which describes the location and
+ /// organization of the three components that follow.
+ /// 1. The landing pad site information describes the range of code covered
+ /// by the try. In our case it's an accumulation of the ranges covered
+ /// by the invokes in the try. There is also a reference to the landing
+ /// pad that handles the exception once processed. Finally an index into
+ /// the actions table.
+ /// 2. The action table, in our case, is composed of pairs of type ids
+ /// and next action offset. Starting with the action index from the
+ /// landing pad site, each type Id is checked for a match to the current
+ /// exception. If it matches then the exception and type id are passed
+ /// on to the landing pad. Otherwise the next action is looked up. This
+ /// chain is terminated with a next action of zero. If no type id is
+ /// found the frame is unwound and handling continues.
+ /// 3. Type id table contains references to all the C++ typeinfo for all
+ ///        catches in the function. This table is reverse indexed, base 1.
+ void emitExceptionTable();
+
+ virtual void emitTypeInfos(unsigned TTypeEncoding);
+
+ // Helpers for identifying what kind of clause an EH typeid or selector
+ // corresponds to. Negative selectors are for filter clauses, the zero
+ // selector is for cleanups, and positive selectors are for catch clauses.
+ static bool isFilterEHSelector(int Selector) { return Selector < 0; }
+ static bool isCleanupEHSelector(int Selector) { return Selector == 0; }
+ static bool isCatchEHSelector(int Selector) { return Selector > 0; }
+
+public:
+ EHStreamer(AsmPrinter *A);
+ ~EHStreamer() override;
+
+ // Unused.
+ void setSymbolSize(const MCSymbol *Sym, uint64_t Size) override {}
+ void beginInstruction(const MachineInstr *MI) override {}
+ void endInstruction() override {}
+
+ /// Return `true' if this is a call to a function marked `nounwind'. Return
+ /// `false' otherwise.
+ static bool callToNoUnwindFunction(const MachineInstr *MI);
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
new file mode 100644
index 000000000000..6a023b998b32
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -0,0 +1,123 @@
+//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the compiler plugin that is used in order to emit
+// garbage collection information in a convenient layout for parsing and
+// loading in the Erlang/OTP runtime.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class ErlangGCPrinter : public GCMetadataPrinter {
+public:
+ void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
+}
+
+static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
+ X("erlang", "erlang-compatible garbage collector");
+
+void llvm::linkErlangGCPrinter() {}
+
+void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ MCStreamer &OS = *AP.OutStreamer;
+ unsigned IntPtrSize = M.getDataLayout().getPointerSize();
+
+ // Put this in a custom .note section.
+ OS.SwitchSection(
+ AP.getObjFileLowering().getContext().getELFSection(".note.gc",
+ ELF::SHT_PROGBITS, 0));
+
+ // For each function...
+ for (GCModuleInfo::FuncInfoVec::iterator FI = Info.funcinfo_begin(),
+ IE = Info.funcinfo_end();
+ FI != IE; ++FI) {
+ GCFunctionInfo &MD = **FI;
+ if (MD.getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+ /** A compact GC layout. Emit this data structure:
+ *
+ * struct {
+ * int16_t PointCount;
+ * void *SafePointAddress[PointCount];
+ * int16_t StackFrameSize; (in words)
+ * int16_t StackArity;
+ * int16_t LiveCount;
+ * int16_t LiveOffsets[LiveCount];
+ * } __gcmap_<FUNCTIONNAME>;
+ **/
+
+ // Align to address width.
+ AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+
+ // Emit PointCount.
+ OS.AddComment("safe point count");
+ AP.EmitInt16(MD.size());
+
+ // And each safe point...
+ for (GCFunctionInfo::iterator PI = MD.begin(), PE = MD.end(); PI != PE;
+ ++PI) {
+ // Emit the address of the safe point.
+ OS.AddComment("safe point address");
+ MCSymbol *Label = PI->Label;
+ AP.EmitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
+ }
+
+ // Stack information never changes in safe points! Only print info from the
+ // first call-site.
+ GCFunctionInfo::iterator PI = MD.begin();
+
+ // Emit the stack frame size.
+ OS.AddComment("stack frame size (in words)");
+ AP.EmitInt16(MD.getFrameSize() / IntPtrSize);
+
+ // Emit stack arity, i.e. the number of stacked arguments.
+ unsigned RegisteredArgs = IntPtrSize == 4 ? 5 : 6;
+ unsigned StackArity = MD.getFunction().arg_size() > RegisteredArgs
+ ? MD.getFunction().arg_size() - RegisteredArgs
+ : 0;
+ OS.AddComment("stack arity");
+ AP.EmitInt16(StackArity);
+
+ // Emit the number of live roots in the function.
+ OS.AddComment("live root count");
+ AP.EmitInt16(MD.live_size(PI));
+
+ // And for each live root...
+ for (GCFunctionInfo::live_iterator LI = MD.live_begin(PI),
+ LE = MD.live_end(PI);
+ LI != LE; ++LI) {
+ // Emit live root's offset within the stack frame.
+ OS.AddComment("stack index (offset / wordsize)");
+ AP.EmitInt16(LI->StackOffset / IntPtrSize);
+ }
+ }
+}
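+
+// Illustrative only: a minimal host-side sketch of how one record of the
+// layout emitted above could be decoded. The helper and struct names are
+// hypothetical (this is not part of the emitter or of the Erlang/OTP
+// runtime); it assumes a little-endian target, that the caller has located
+// the start of a record in the .note.gc payload, and that safe point
+// addresses are the 4-byte values produced by EmitLabelPlusOffset above.
+#include <cstdint>
+#include <cstring>
+#include <vector>
+
+struct ErlangGCMapRecord {
+  uint16_t PointCount;
+  std::vector<uint32_t> SafePointAddrs; // 4-byte safe point labels
+  uint16_t FrameSizeWords;              // stack frame size, in words
+  uint16_t StackArity;                  // number of stacked arguments
+  std::vector<uint16_t> LiveOffsets;    // stack index = offset / wordsize
+};
+
+static const uint8_t *decodeErlangGCMapRecord(const uint8_t *P,
+                                              ErlangGCMapRecord &R) {
+  auto Read16 = [&P] { uint16_t V; std::memcpy(&V, P, 2); P += 2; return V; };
+  auto Read32 = [&P] { uint32_t V; std::memcpy(&V, P, 4); P += 4; return V; };
+  R.PointCount = Read16();
+  for (unsigned I = 0; I != R.PointCount; ++I)
+    R.SafePointAddrs.push_back(Read32());
+  R.FrameSizeWords = Read16();
+  R.StackArity = Read16();
+  uint16_t LiveCount = Read16();
+  for (unsigned I = 0; I != LiveCount; ++I)
+    R.LiveOffsets.push_back(Read16());
+  return P; // the next record starts after re-aligning to the pointer width
+}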
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
new file mode 100644
index 000000000000..c09ef6adea69
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -0,0 +1,182 @@
+//===-- OcamlGCPrinter.cpp - Ocaml frametable emitter ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements printing the assembly code for an Ocaml frametable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cctype>
+using namespace llvm;
+
+namespace {
+
+class OcamlGCMetadataPrinter : public GCMetadataPrinter {
+public:
+ void beginAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+ void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
+};
+}
+
+static GCMetadataPrinterRegistry::Add<OcamlGCMetadataPrinter>
+ Y("ocaml", "ocaml 3.10-compatible collector");
+
+void llvm::linkOcamlGCPrinter() {}
+
+static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
+ const std::string &MId = M.getModuleIdentifier();
+
+ std::string SymName;
+ SymName += "caml";
+ size_t Letter = SymName.size();
+ SymName.append(MId.begin(), std::find(MId.begin(), MId.end(), '.'));
+ SymName += "__";
+ SymName += Id;
+
+ // Capitalize the first letter of the module name.
+ SymName[Letter] = toupper(SymName[Letter]);
+
+ SmallString<128> TmpStr;
+ Mangler::getNameWithPrefix(TmpStr, SymName, M.getDataLayout());
+
+ MCSymbol *Sym = AP.OutContext.getOrCreateSymbol(TmpStr);
+
+ AP.OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
+ AP.OutStreamer->EmitLabel(Sym);
+}
+
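+// Worked example (illustrative; the helper below is hypothetical and unused
+// by the emitter): for a module identifier of "my_module.ml" and an Id of
+// "frametable", EmitCamlGlobal above constructs "camlMy_module__frametable"
+// and then applies the target's mangling prefix. The same construction as a
+// standalone function:
+static std::string camlSymbolNameSketch(const std::string &MId,
+                                        const char *Id) {
+  std::string S = "caml";
+  size_t Letter = S.size();
+  S.append(MId.begin(), std::find(MId.begin(), MId.end(), '.'));
+  S += "__";
+  S += Id;
+  S[Letter] = toupper(S[Letter]); // capitalize the module name, as above
+  return S;
+}
+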
+void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection());
+ EmitCamlGlobal(M, AP, "code_begin");
+
+ AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "data_begin");
+}
+
+/// finishAssembly - Print the frametable. The ocaml frametable format is thus:
+///
+/// extern "C" struct align(sizeof(intptr_t)) {
+/// uint16_t NumDescriptors;
+/// struct align(sizeof(intptr_t)) {
+/// void *ReturnAddress;
+/// uint16_t FrameSize;
+/// uint16_t NumLiveOffsets;
+/// uint16_t LiveOffsets[NumLiveOffsets];
+/// } Descriptors[NumDescriptors];
+/// } caml${module}__frametable;
+///
+/// Note that this precludes programs from using stack frames larger than 64K
+/// (FrameSize and LiveOffsets would overflow). The frametable printer will
+/// abort if either condition is detected in a function which uses the GC.
+///
+void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
+ AsmPrinter &AP) {
+ unsigned IntPtrSize = M.getDataLayout().getPointerSize();
+
+ AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getTextSection());
+ EmitCamlGlobal(M, AP, "code_end");
+
+ AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "data_end");
+
+ // FIXME: Why does ocaml emit this??
+ AP.OutStreamer->EmitIntValue(0, IntPtrSize);
+
+ AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection());
+ EmitCamlGlobal(M, AP, "frametable");
+
+ int NumDescriptors = 0;
+ for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+ IE = Info.funcinfo_end();
+ I != IE; ++I) {
+ GCFunctionInfo &FI = **I;
+ if (FI.getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+ for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
+ NumDescriptors++;
+ }
+ }
+
+ if (NumDescriptors >= 1 << 16) {
+ // Very rude!
+ report_fatal_error(" Too much descriptor for ocaml GC");
+ }
+ AP.EmitInt16(NumDescriptors);
+ AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+
+ for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
+ IE = Info.funcinfo_end();
+ I != IE; ++I) {
+ GCFunctionInfo &FI = **I;
+ if (FI.getStrategy().getName() != getStrategy().getName())
+ // this function is managed by some other GC
+ continue;
+
+ uint64_t FrameSize = FI.getFrameSize();
+ if (FrameSize >= 1 << 16) {
+ // Very rude!
+ report_fatal_error("Function '" + FI.getFunction().getName() +
+ "' is too large for the ocaml GC! "
+ "Frame size " +
+                         Twine(FrameSize) + " >= 65536.\n"
+ "(" +
+ Twine(uintptr_t(&FI)) + ")");
+ }
+
+ AP.OutStreamer->AddComment("live roots for " +
+ Twine(FI.getFunction().getName()));
+ AP.OutStreamer->AddBlankLine();
+
+ for (GCFunctionInfo::iterator J = FI.begin(), JE = FI.end(); J != JE; ++J) {
+ size_t LiveCount = FI.live_size(J);
+ if (LiveCount >= 1 << 16) {
+ // Very rude!
+ report_fatal_error("Function '" + FI.getFunction().getName() +
+ "' is too large for the ocaml GC! "
+ "Live root count " +
+ Twine(LiveCount) + " >= 65536.");
+ }
+
+ AP.OutStreamer->EmitSymbolValue(J->Label, IntPtrSize);
+ AP.EmitInt16(FrameSize);
+ AP.EmitInt16(LiveCount);
+
+ for (GCFunctionInfo::live_iterator K = FI.live_begin(J),
+ KE = FI.live_end(J);
+ K != KE; ++K) {
+ if (K->StackOffset >= 1 << 16) {
+ // Very rude!
+ report_fatal_error(
+ "GC root stack offset is outside of fixed stack frame and out "
+ "of range for ocaml GC!");
+ }
+ AP.EmitInt16(K->StackOffset);
+ }
+
+ AP.EmitAlignment(IntPtrSize == 4 ? 2 : 3);
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
new file mode 100644
index 000000000000..e5933d8d4160
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -0,0 +1,1273 @@
+//===-- CodeGen/AsmPrinter/WinException.cpp - Windows Exception Impl ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Win64 exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "WinException.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWin64EH.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+WinException::WinException(AsmPrinter *A) : EHStreamer(A) {
+ // MSVC's EH tables are always composed of 32-bit words. All known 64-bit
+ // platforms use an imagerel32 relocation to refer to symbols.
+ useImageRel32 = (A->getDataLayout().getPointerSizeInBits() == 64);
+}
+
+WinException::~WinException() {}
+
+/// endModule - Emit all exception information that should come after the
+/// content.
+void WinException::endModule() {
+ auto &OS = *Asm->OutStreamer;
+ const Module *M = MMI->getModule();
+ for (const Function &F : *M)
+ if (F.hasFnAttribute("safeseh"))
+ OS.EmitCOFFSafeSEH(Asm->getSymbol(&F));
+}
+
+void WinException::beginFunction(const MachineFunction *MF) {
+ shouldEmitMoves = shouldEmitPersonality = shouldEmitLSDA = false;
+
+ // If any landing pads survive, we need an EH table.
+ bool hasLandingPads = !MMI->getLandingPads().empty();
+ bool hasEHFunclets = MMI->hasEHFunclets();
+
+ const Function *F = MF->getFunction();
+
+ shouldEmitMoves = Asm->needsSEHMoves();
+
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ unsigned PerEncoding = TLOF.getPersonalityEncoding();
+ const Function *Per = nullptr;
+ if (F->hasPersonalityFn())
+ Per = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+
+ bool forceEmitPersonality =
+ F->hasPersonalityFn() && !isNoOpWithoutInvoke(classifyEHPersonality(Per)) &&
+ F->needsUnwindTableEntry();
+
+ shouldEmitPersonality =
+ forceEmitPersonality || ((hasLandingPads || hasEHFunclets) &&
+ PerEncoding != dwarf::DW_EH_PE_omit && Per);
+
+ unsigned LSDAEncoding = TLOF.getLSDAEncoding();
+ shouldEmitLSDA = shouldEmitPersonality &&
+ LSDAEncoding != dwarf::DW_EH_PE_omit;
+
+ // If we're not using CFI, we don't want the CFI or the personality, but we
+ // might want EH tables if we had EH pads.
+ if (!Asm->MAI->usesWindowsCFI()) {
+ shouldEmitLSDA = hasEHFunclets;
+ shouldEmitPersonality = false;
+ return;
+ }
+
+ beginFunclet(MF->front(), Asm->CurrentFnSym);
+}
+
+/// endFunction - Gather and emit post-function exception information.
+///
+void WinException::endFunction(const MachineFunction *MF) {
+ if (!shouldEmitPersonality && !shouldEmitMoves && !shouldEmitLSDA)
+ return;
+
+ const Function *F = MF->getFunction();
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F->hasPersonalityFn())
+ Per = classifyEHPersonality(F->getPersonalityFn());
+
+ // Get rid of any dead landing pads if we're not using funclets. In funclet
+ // schemes, the landing pad is not actually reachable. It only exists so
+ // that we can emit the right table data.
+ if (!isFuncletEHPersonality(Per))
+ MMI->TidyLandingPads();
+
+ endFunclet();
+
+ // endFunclet will emit the necessary .xdata tables for x64 SEH.
+ if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets())
+ return;
+
+ if (shouldEmitPersonality || shouldEmitLSDA) {
+ Asm->OutStreamer->PushSection();
+
+ // Just switch sections to the right xdata section.
+ MCSection *XData = Asm->OutStreamer->getAssociatedXDataSection(
+ Asm->OutStreamer->getCurrentSectionOnly());
+ Asm->OutStreamer->SwitchSection(XData);
+
+ // Emit the tables appropriate to the personality function in use. If we
+ // don't recognize the personality, assume it uses an Itanium-style LSDA.
+ if (Per == EHPersonality::MSVC_Win64SEH)
+ emitCSpecificHandlerTable(MF);
+ else if (Per == EHPersonality::MSVC_X86SEH)
+ emitExceptHandlerTable(MF);
+ else if (Per == EHPersonality::MSVC_CXX)
+ emitCXXFrameHandler3Table(MF);
+ else if (Per == EHPersonality::CoreCLR)
+ emitCLRExceptionTable(MF);
+ else
+ emitExceptionTable();
+
+ Asm->OutStreamer->PopSection();
+ }
+}
+
+/// Retrieve the MCSymbol for a funclet entry MachineBasicBlock.
+static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm,
+ const MachineBasicBlock *MBB) {
+ if (!MBB)
+ return nullptr;
+
+ assert(MBB->isEHFuncletEntry());
+
+ // Give catches and cleanups a name based off of their parent function and
+ // their funclet entry block's number.
+ const MachineFunction *MF = MBB->getParent();
+ const Function *F = MF->getFunction();
+ StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ MCContext &Ctx = MF->getContext();
+ StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch";
+ return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" +
+ Twine(MBB->getNumber()) + "@?0?" +
+ FuncLinkageName + "@4HA");
+}
+
+void WinException::beginFunclet(const MachineBasicBlock &MBB,
+ MCSymbol *Sym) {
+ CurrentFuncletEntry = &MBB;
+
+ const Function *F = Asm->MF->getFunction();
+ // If a symbol was not provided for the funclet, invent one.
+ if (!Sym) {
+ Sym = getMCSymbolForMBB(Asm, &MBB);
+
+ // Describe our funclet symbol as a function with internal linkage.
+ Asm->OutStreamer->BeginCOFFSymbolDef(Sym);
+ Asm->OutStreamer->EmitCOFFSymbolStorageClass(COFF::IMAGE_SYM_CLASS_STATIC);
+ Asm->OutStreamer->EmitCOFFSymbolType(COFF::IMAGE_SYM_DTYPE_FUNCTION
+ << COFF::SCT_COMPLEX_TYPE_SHIFT);
+ Asm->OutStreamer->EndCOFFSymbolDef();
+
+ // We want our funclet's entry point to be aligned such that no nops will be
+ // present after the label.
+ Asm->EmitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()),
+ F);
+
+ // Now that we've emitted the alignment directive, point at our funclet.
+ Asm->OutStreamer->EmitLabel(Sym);
+ }
+
+ // Mark 'Sym' as starting our funclet.
+ if (shouldEmitMoves || shouldEmitPersonality)
+ Asm->OutStreamer->EmitWinCFIStartProc(Sym);
+
+ if (shouldEmitPersonality) {
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ const Function *PerFn = nullptr;
+
+ // Determine which personality routine we are using for this funclet.
+ if (F->hasPersonalityFn())
+ PerFn = dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ const MCSymbol *PersHandlerSym =
+ TLOF.getCFIPersonalitySymbol(PerFn, *Asm->Mang, Asm->TM, MMI);
+
+ // Classify the personality routine so that we may reason about it.
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F->hasPersonalityFn())
+ Per = classifyEHPersonality(F->getPersonalityFn());
+
+ // Do not emit a .seh_handler directive if it is a C++ cleanup funclet.
+ if (Per != EHPersonality::MSVC_CXX ||
+ !CurrentFuncletEntry->isCleanupFuncletEntry())
+ Asm->OutStreamer->EmitWinEHHandler(PersHandlerSym, true, true);
+ }
+}
+
+void WinException::endFunclet() {
+ // No funclet to process? Great, we have nothing to do.
+ if (!CurrentFuncletEntry)
+ return;
+
+ if (shouldEmitMoves || shouldEmitPersonality) {
+ const Function *F = Asm->MF->getFunction();
+ EHPersonality Per = EHPersonality::Unknown;
+ if (F->hasPersonalityFn())
+ Per = classifyEHPersonality(F->getPersonalityFn());
+
+ // The .seh_handlerdata directive implicitly switches section, push the
+ // current section so that we may return to it.
+ Asm->OutStreamer->PushSection();
+
+ // Emit an UNWIND_INFO struct describing the prologue.
+ Asm->OutStreamer->EmitWinEHHandlerData();
+
+ if (Per == EHPersonality::MSVC_CXX && shouldEmitPersonality &&
+ !CurrentFuncletEntry->isCleanupFuncletEntry()) {
+ // If this is a C++ catch funclet (or the parent function),
+ // emit a reference to the LSDA for the parent function.
+ StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+ MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
+ Twine("$cppxdata$", FuncLinkageName));
+ Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
+ } else if (Per == EHPersonality::MSVC_Win64SEH && MMI->hasEHFunclets() &&
+ !CurrentFuncletEntry->isEHFuncletEntry()) {
+ // If this is the parent function in Win64 SEH, emit the LSDA immediately
+ // following .seh_handlerdata.
+ emitCSpecificHandlerTable(Asm->MF);
+ }
+
+ // Switch back to the previous section now that we are done writing to
+ // .xdata.
+ Asm->OutStreamer->PopSection();
+
+ // Emit a .seh_endproc directive to mark the end of the function.
+ Asm->OutStreamer->EmitWinCFIEndProc();
+ }
+
+ // Let's make sure we don't try to end the same funclet twice.
+ CurrentFuncletEntry = nullptr;
+}
+
+const MCExpr *WinException::create32bitRef(const MCSymbol *Value) {
+ if (!Value)
+ return MCConstantExpr::create(0, Asm->OutContext);
+ return MCSymbolRefExpr::create(Value, useImageRel32
+ ? MCSymbolRefExpr::VK_COFF_IMGREL32
+ : MCSymbolRefExpr::VK_None,
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::create32bitRef(const GlobalValue *GV) {
+ if (!GV)
+ return MCConstantExpr::create(0, Asm->OutContext);
+ return create32bitRef(Asm->getSymbol(GV));
+}
+
+const MCExpr *WinException::getLabelPlusOne(const MCSymbol *Label) {
+ return MCBinaryExpr::createAdd(create32bitRef(Label),
+ MCConstantExpr::create(1, Asm->OutContext),
+ Asm->OutContext);
+}
+
+const MCExpr *WinException::getOffset(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom) {
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(OffsetOf, Asm->OutContext),
+ MCSymbolRefExpr::create(OffsetFrom, Asm->OutContext), Asm->OutContext);
+}
+
+const MCExpr *WinException::getOffsetPlusOne(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom) {
+ return MCBinaryExpr::createAdd(getOffset(OffsetOf, OffsetFrom),
+ MCConstantExpr::create(1, Asm->OutContext),
+ Asm->OutContext);
+}
+
+int WinException::getFrameIndexOffset(int FrameIndex,
+ const WinEHFuncInfo &FuncInfo) {
+ const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering();
+ unsigned UnusedReg;
+ if (Asm->MAI->usesWindowsCFI()) {
+ int Offset =
+ TFI.getFrameIndexReferencePreferSP(*Asm->MF, FrameIndex, UnusedReg,
+ /*IgnoreSPUpdates*/ true);
+ assert(UnusedReg ==
+ Asm->MF->getSubtarget()
+ .getTargetLowering()
+ ->getStackPointerRegisterToSaveRestore());
+ return Offset;
+ }
+
+ // For 32-bit, offsets should be relative to the end of the EH registration
+ // node. For 64-bit, it's relative to SP at the end of the prologue.
+ assert(FuncInfo.EHRegNodeEndOffset != INT_MAX);
+ int Offset = TFI.getFrameIndexReference(*Asm->MF, FrameIndex, UnusedReg);
+ Offset += FuncInfo.EHRegNodeEndOffset;
+ return Offset;
+}
+
+namespace {
+
+/// Top-level state used to represent unwind to caller
+const int NullState = -1;
+
+struct InvokeStateChange {
+ /// EH Label immediately after the last invoke in the previous state, or
+ /// nullptr if the previous state was the null state.
+ const MCSymbol *PreviousEndLabel;
+
+ /// EH label immediately before the first invoke in the new state, or nullptr
+ /// if the new state is the null state.
+ const MCSymbol *NewStartLabel;
+
+ /// State of the invoke following NewStartLabel, or NullState to indicate
+ /// the presence of calls which may unwind to caller.
+ int NewState;
+};
+
+/// Iterator that reports all the invoke state changes in a range of machine
+/// basic blocks. Changes to the null state are reported whenever a call that
+/// may unwind to caller is encountered. The MBB range is expected to be an
+/// entire function or funclet, and the start and end of the range are treated
+/// as being in the NullState even if there's not an unwind-to-caller call
+/// before the first invoke or after the last one (i.e., the first state change
+/// reported is the first change to something other than NullState, and a
+/// change back to NullState is always reported at the end of iteration).
+class InvokeStateChangeIterator {
+ InvokeStateChangeIterator(const WinEHFuncInfo &EHInfo,
+ MachineFunction::const_iterator MFI,
+ MachineFunction::const_iterator MFE,
+ MachineBasicBlock::const_iterator MBBI,
+ int BaseState)
+ : EHInfo(EHInfo), MFI(MFI), MFE(MFE), MBBI(MBBI), BaseState(BaseState) {
+ LastStateChange.PreviousEndLabel = nullptr;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ scan();
+ }
+
+public:
+ static iterator_range<InvokeStateChangeIterator>
+ range(const WinEHFuncInfo &EHInfo, MachineFunction::const_iterator Begin,
+ MachineFunction::const_iterator End, int BaseState = NullState) {
+ // Reject empty ranges to simplify bookkeeping by ensuring that we can get
+ // the end of the last block.
+ assert(Begin != End);
+ auto BlockBegin = Begin->begin();
+ auto BlockEnd = std::prev(End)->end();
+ return make_range(
+ InvokeStateChangeIterator(EHInfo, Begin, End, BlockBegin, BaseState),
+ InvokeStateChangeIterator(EHInfo, End, End, BlockEnd, BaseState));
+ }
+
+ // Iterator methods.
+ bool operator==(const InvokeStateChangeIterator &O) const {
+ assert(BaseState == O.BaseState);
+ // Must be visiting same block.
+ if (MFI != O.MFI)
+ return false;
+    // Must be visiting same instr.
+ if (MBBI != O.MBBI)
+ return false;
+ // At end of block/instr iteration, we can still have two distinct states:
+ // one to report the final EndLabel, and another indicating the end of the
+ // state change iteration. Check for CurrentEndLabel equality to
+ // distinguish these.
+ return CurrentEndLabel == O.CurrentEndLabel;
+ }
+
+ bool operator!=(const InvokeStateChangeIterator &O) const {
+ return !operator==(O);
+ }
+ InvokeStateChange &operator*() { return LastStateChange; }
+ InvokeStateChange *operator->() { return &LastStateChange; }
+ InvokeStateChangeIterator &operator++() { return scan(); }
+
+private:
+ InvokeStateChangeIterator &scan();
+
+ const WinEHFuncInfo &EHInfo;
+ const MCSymbol *CurrentEndLabel = nullptr;
+ MachineFunction::const_iterator MFI;
+ MachineFunction::const_iterator MFE;
+ MachineBasicBlock::const_iterator MBBI;
+ InvokeStateChange LastStateChange;
+ bool VisitingInvoke = false;
+ int BaseState;
+};
+
+} // end anonymous namespace
+
+InvokeStateChangeIterator &InvokeStateChangeIterator::scan() {
+ bool IsNewBlock = false;
+ for (; MFI != MFE; ++MFI, IsNewBlock = true) {
+ if (IsNewBlock)
+ MBBI = MFI->begin();
+ for (auto MBBE = MFI->end(); MBBI != MBBE; ++MBBI) {
+ const MachineInstr &MI = *MBBI;
+ if (!VisitingInvoke && LastStateChange.NewState != BaseState &&
+ MI.isCall() && !EHStreamer::callToNoUnwindFunction(&MI)) {
+ // Indicate a change of state to the null state. We don't have
+ // start/end EH labels handy but the caller won't expect them for
+ // null state regions.
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ CurrentEndLabel = nullptr;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+
+ // All other state changes are at EH labels before/after invokes.
+ if (!MI.isEHLabel())
+ continue;
+ MCSymbol *Label = MI.getOperand(0).getMCSymbol();
+ if (Label == CurrentEndLabel) {
+ VisitingInvoke = false;
+ continue;
+ }
+ auto InvokeMapIter = EHInfo.LabelToStateMap.find(Label);
+ // Ignore EH labels that aren't the ones inserted before an invoke
+ if (InvokeMapIter == EHInfo.LabelToStateMap.end())
+ continue;
+ auto &StateAndEnd = InvokeMapIter->second;
+ int NewState = StateAndEnd.first;
+ // Keep track of the fact that we're between EH start/end labels so
+      // we know not to treat the invoke we'll see as unwinding to caller.
+ VisitingInvoke = true;
+ if (NewState == LastStateChange.NewState) {
+ // The state isn't actually changing here. Record the new end and
+ // keep going.
+ CurrentEndLabel = StateAndEnd.second;
+ continue;
+ }
+ // Found a state change to report
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = Label;
+ LastStateChange.NewState = NewState;
+ // Start keeping track of the new current end
+ CurrentEndLabel = StateAndEnd.second;
+ // Don't re-visit this instr on the next scan
+ ++MBBI;
+ return *this;
+ }
+ }
+ // Iteration hit the end of the block range.
+ if (LastStateChange.NewState != BaseState) {
+ // Report the end of the last new state
+ LastStateChange.PreviousEndLabel = CurrentEndLabel;
+ LastStateChange.NewStartLabel = nullptr;
+ LastStateChange.NewState = BaseState;
+ // Leave CurrentEndLabel non-null to distinguish this state from end.
+ assert(CurrentEndLabel != nullptr);
+ return *this;
+ }
+ // We've reported all state changes and hit the end state.
+ CurrentEndLabel = nullptr;
+ return *this;
+}
+
+/// Emit the language-specific data that __C_specific_handler expects. This
+/// handler lives in the x64 Microsoft C runtime and allows catching or cleaning
+/// up after faults with __try, __except, and __finally. The typeinfo values
+/// are not really RTTI data, but pointers to filter functions that return an
+/// integer (1, 0, or -1) indicating how to handle the exception. For __finally
+/// blocks and other cleanups, the landing pad label is zero, and the filter
+/// function is actually a cleanup handler with the same prototype. A catch-all
+/// entry is modeled with a null filter function field and a non-zero landing
+/// pad label.
+///
+/// Possible filter function return values:
+/// EXCEPTION_EXECUTE_HANDLER (1):
+/// Jump to the landing pad label after cleanups.
+/// EXCEPTION_CONTINUE_SEARCH (0):
+/// Continue searching this table or continue unwinding.
+/// EXCEPTION_CONTINUE_EXECUTION (-1):
+/// Resume execution at the trapping PC.
+///
+/// Inferred table structure:
+/// struct Table {
+/// int NumEntries;
+/// struct Entry {
+/// imagerel32 LabelStart;
+/// imagerel32 LabelEnd;
+/// imagerel32 FilterOrFinally; // One means catch-all.
+/// imagerel32 LabelLPad; // Zero means __finally.
+/// } Entries[NumEntries];
+/// };
+void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
+ auto &OS = *Asm->OutStreamer;
+ MCContext &Ctx = Asm->OutContext;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ // Emit a label assignment with the SEH frame offset so we can use it for
+ // llvm.x86.seh.recoverfp.
+ StringRef FLinkageName =
+ GlobalValue::getRealLinkageName(MF->getFunction()->getName());
+ MCSymbol *ParentFrameOffset =
+ Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
+ const MCExpr *MCOffset =
+ MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx);
+ Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset);
+
+ // Use the assembler to compute the number of table entries through label
+ // difference and division.
+ MCSymbol *TableBegin =
+ Ctx.createTempSymbol("lsda_begin", /*AlwaysAddSuffix=*/true);
+ MCSymbol *TableEnd =
+ Ctx.createTempSymbol("lsda_end", /*AlwaysAddSuffix=*/true);
+ const MCExpr *LabelDiff = getOffset(TableEnd, TableBegin);
+ const MCExpr *EntrySize = MCConstantExpr::create(16, Ctx);
+ const MCExpr *EntryCount = MCBinaryExpr::createDiv(LabelDiff, EntrySize, Ctx);
+ AddComment("Number of call sites");
+ OS.EmitValue(EntryCount, 4);
+
+ OS.EmitLabel(TableBegin);
+
+ // Iterate over all the invoke try ranges. Unlike MSVC, LLVM currently only
+ // models exceptions from invokes. LLVM also allows arbitrary reordering of
+ // the code, so our tables end up looking a bit different. Rather than
+ // trying to match MSVC's tables exactly, we emit a denormalized table. For
+ // each range of invokes in the same state, we emit table entries for all
+ // the actions that would be taken in that state. This means our tables are
+ // slightly bigger, which is OK.
+ const MCSymbol *LastStartLabel = nullptr;
+ int LastEHState = -1;
+ // Break out before we enter into a finally funclet.
+ // FIXME: We need to emit separate EH tables for cleanups.
+ MachineFunction::const_iterator End = MF->end();
+ MachineFunction::const_iterator Stop = std::next(MF->begin());
+ while (Stop != End && !Stop->isEHFuncletEntry())
+ ++Stop;
+ for (const auto &StateChange :
+ InvokeStateChangeIterator::range(FuncInfo, MF->begin(), Stop)) {
+ // Emit all the actions for the state we just transitioned out of
+ // if it was not the null state
+ if (LastEHState != -1)
+ emitSEHActionsForRange(FuncInfo, LastStartLabel,
+ StateChange.PreviousEndLabel, LastEHState);
+ LastStartLabel = StateChange.NewStartLabel;
+ LastEHState = StateChange.NewState;
+ }
+
+ OS.EmitLabel(TableEnd);
+}
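+
+// Conceptual sketch (hypothetical names; neither emitter nor runtime code) of
+// how __C_specific_handler consumes the table emitted above: walk the entries
+// in order, and for every entry whose try range covers the faulting IP,
+// evaluate its filter and act on the disposition values described before
+// emitCSpecificHandlerTable. The real entries hold imagerel32 values.
+struct SEHTableEntrySketch {
+  uint32_t LabelStart, LabelEnd; // try range (label + 1, as emitted above)
+  uint32_t FilterOrFinally;      // 1 means catch-all
+  uint32_t LabelLPad;            // 0 means __finally
+};
+
+static int scanSEHTableSketch(const SEHTableEntrySketch *Entries,
+                              uint32_t NumEntries, uint32_t FaultIP,
+                              int (*EvalFilter)(const SEHTableEntrySketch &)) {
+  for (uint32_t I = 0; I != NumEntries; ++I) {
+    if (FaultIP < Entries[I].LabelStart || FaultIP >= Entries[I].LabelEnd)
+      continue;
+    int Disposition = EvalFilter(Entries[I]); // 1, 0, or -1 (see above)
+    if (Disposition != 0)
+      return Disposition; // execute the handler (1) or resume execution (-1)
+  }
+  return 0; // EXCEPTION_CONTINUE_SEARCH: keep unwinding into outer frames
+}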
+
+void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
+ const MCSymbol *BeginLabel,
+ const MCSymbol *EndLabel, int State) {
+ auto &OS = *Asm->OutStreamer;
+ MCContext &Ctx = Asm->OutContext;
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ assert(BeginLabel && EndLabel);
+ while (State != -1) {
+ const SEHUnwindMapEntry &UME = FuncInfo.SEHUnwindMap[State];
+ const MCExpr *FilterOrFinally;
+ const MCExpr *ExceptOrNull;
+ auto *Handler = UME.Handler.get<MachineBasicBlock *>();
+ if (UME.IsFinally) {
+ FilterOrFinally = create32bitRef(getMCSymbolForMBB(Asm, Handler));
+ ExceptOrNull = MCConstantExpr::create(0, Ctx);
+ } else {
+ // For an except, the filter can be 1 (catch-all) or a function
+ // label.
+ FilterOrFinally = UME.Filter ? create32bitRef(UME.Filter)
+ : MCConstantExpr::create(1, Ctx);
+ ExceptOrNull = create32bitRef(Handler->getSymbol());
+ }
+
+ AddComment("LabelStart");
+ OS.EmitValue(getLabelPlusOne(BeginLabel), 4);
+ AddComment("LabelEnd");
+ OS.EmitValue(getLabelPlusOne(EndLabel), 4);
+ AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
+ : "CatchAll");
+ OS.EmitValue(FilterOrFinally, 4);
+ AddComment(UME.IsFinally ? "Null" : "ExceptionHandler");
+ OS.EmitValue(ExceptOrNull, 4);
+
+ assert(UME.ToState < State && "states should decrease");
+ State = UME.ToState;
+ }
+}
+
+void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
+ const Function *F = MF->getFunction();
+ auto &OS = *Asm->OutStreamer;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+
+ StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName());
+
+ SmallVector<std::pair<const MCExpr *, int>, 4> IPToStateTable;
+ MCSymbol *FuncInfoXData = nullptr;
+ if (shouldEmitPersonality) {
+ // If we're 64-bit, emit a pointer to the C++ EH data, and build a map from
+ // IPs to state numbers.
+ FuncInfoXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$cppxdata$", FuncLinkageName));
+ computeIP2StateTable(MF, FuncInfo, IPToStateTable);
+ } else {
+ FuncInfoXData = Asm->OutContext.getOrCreateLSDASymbol(FuncLinkageName);
+ }
+
+ int UnwindHelpOffset = 0;
+ if (Asm->MAI->usesWindowsCFI())
+ UnwindHelpOffset =
+ getFrameIndexOffset(FuncInfo.UnwindHelpFrameIdx, FuncInfo);
+
+ MCSymbol *UnwindMapXData = nullptr;
+ MCSymbol *TryBlockMapXData = nullptr;
+ MCSymbol *IPToStateXData = nullptr;
+ if (!FuncInfo.CxxUnwindMap.empty())
+ UnwindMapXData = Asm->OutContext.getOrCreateSymbol(
+ Twine("$stateUnwindMap$", FuncLinkageName));
+ if (!FuncInfo.TryBlockMap.empty())
+ TryBlockMapXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$tryMap$", FuncLinkageName));
+ if (!IPToStateTable.empty())
+ IPToStateXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$ip2state$", FuncLinkageName));
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ // FuncInfo {
+ // uint32_t MagicNumber
+ // int32_t MaxState;
+ // UnwindMapEntry *UnwindMap;
+ // uint32_t NumTryBlocks;
+ // TryBlockMapEntry *TryBlockMap;
+ // uint32_t IPMapEntries; // always 0 for x86
+ // IPToStateMapEntry *IPToStateMap; // always 0 for x86
+ // uint32_t UnwindHelp; // non-x86 only
+ // ESTypeList *ESTypeList;
+ // int32_t EHFlags;
+ // }
+ // EHFlags & 1 -> Synchronous exceptions only, no async exceptions.
+ // EHFlags & 2 -> ???
+ // EHFlags & 4 -> The function is noexcept(true), unwinding can't continue.
+ OS.EmitValueToAlignment(4);
+ OS.EmitLabel(FuncInfoXData);
+
+ AddComment("MagicNumber");
+ OS.EmitIntValue(0x19930522, 4);
+
+ AddComment("MaxState");
+ OS.EmitIntValue(FuncInfo.CxxUnwindMap.size(), 4);
+
+ AddComment("UnwindMap");
+ OS.EmitValue(create32bitRef(UnwindMapXData), 4);
+
+ AddComment("NumTryBlocks");
+ OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4);
+
+ AddComment("TryBlockMap");
+ OS.EmitValue(create32bitRef(TryBlockMapXData), 4);
+
+ AddComment("IPMapEntries");
+ OS.EmitIntValue(IPToStateTable.size(), 4);
+
+ AddComment("IPToStateXData");
+ OS.EmitValue(create32bitRef(IPToStateXData), 4);
+
+ if (Asm->MAI->usesWindowsCFI()) {
+ AddComment("UnwindHelp");
+ OS.EmitIntValue(UnwindHelpOffset, 4);
+ }
+
+ AddComment("ESTypeList");
+ OS.EmitIntValue(0, 4);
+
+ AddComment("EHFlags");
+ OS.EmitIntValue(1, 4);
+
+ // UnwindMapEntry {
+ // int32_t ToState;
+ // void (*Action)();
+ // };
+ if (UnwindMapXData) {
+ OS.EmitLabel(UnwindMapXData);
+ for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) {
+ MCSymbol *CleanupSym =
+ getMCSymbolForMBB(Asm, UME.Cleanup.dyn_cast<MachineBasicBlock *>());
+ AddComment("ToState");
+ OS.EmitIntValue(UME.ToState, 4);
+
+ AddComment("Action");
+ OS.EmitValue(create32bitRef(CleanupSym), 4);
+ }
+ }
+
+ // TryBlockMap {
+ // int32_t TryLow;
+ // int32_t TryHigh;
+ // int32_t CatchHigh;
+ // int32_t NumCatches;
+ // HandlerType *HandlerArray;
+ // };
+ if (TryBlockMapXData) {
+ OS.EmitLabel(TryBlockMapXData);
+ SmallVector<MCSymbol *, 1> HandlerMaps;
+ for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) {
+ const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I];
+
+ MCSymbol *HandlerMapXData = nullptr;
+ if (!TBME.HandlerArray.empty())
+ HandlerMapXData =
+ Asm->OutContext.getOrCreateSymbol(Twine("$handlerMap$")
+ .concat(Twine(I))
+ .concat("$")
+ .concat(FuncLinkageName));
+ HandlerMaps.push_back(HandlerMapXData);
+
+ // TBMEs should form intervals.
+ assert(0 <= TBME.TryLow && "bad trymap interval");
+ assert(TBME.TryLow <= TBME.TryHigh && "bad trymap interval");
+ assert(TBME.TryHigh < TBME.CatchHigh && "bad trymap interval");
+ assert(TBME.CatchHigh < int(FuncInfo.CxxUnwindMap.size()) &&
+ "bad trymap interval");
+
+ AddComment("TryLow");
+ OS.EmitIntValue(TBME.TryLow, 4);
+
+ AddComment("TryHigh");
+ OS.EmitIntValue(TBME.TryHigh, 4);
+
+ AddComment("CatchHigh");
+ OS.EmitIntValue(TBME.CatchHigh, 4);
+
+ AddComment("NumCatches");
+ OS.EmitIntValue(TBME.HandlerArray.size(), 4);
+
+ AddComment("HandlerArray");
+ OS.EmitValue(create32bitRef(HandlerMapXData), 4);
+ }
+
+ // All funclets use the same parent frame offset currently.
+ unsigned ParentFrameOffset = 0;
+ if (shouldEmitPersonality) {
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ ParentFrameOffset = TFI->getWinEHParentFrameOffset(*MF);
+ }
+
+ for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) {
+ const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I];
+ MCSymbol *HandlerMapXData = HandlerMaps[I];
+ if (!HandlerMapXData)
+ continue;
+ // HandlerType {
+ // int32_t Adjectives;
+ // TypeDescriptor *Type;
+ // int32_t CatchObjOffset;
+ // void (*Handler)();
+ // int32_t ParentFrameOffset; // x64 only
+ // };
+ OS.EmitLabel(HandlerMapXData);
+ for (const WinEHHandlerType &HT : TBME.HandlerArray) {
+ // Get the frame escape label with the offset of the catch object. If
+ // the index is INT_MAX, then there is no catch object, and we should
+ // emit an offset of zero, indicating that no copy will occur.
+ const MCExpr *FrameAllocOffsetRef = nullptr;
+ if (HT.CatchObj.FrameIndex != INT_MAX) {
+ int Offset = getFrameIndexOffset(HT.CatchObj.FrameIndex, FuncInfo);
+ assert(Offset != 0 && "Illegal offset for catch object!");
+ FrameAllocOffsetRef = MCConstantExpr::create(Offset, Asm->OutContext);
+ } else {
+ FrameAllocOffsetRef = MCConstantExpr::create(0, Asm->OutContext);
+ }
+
+ MCSymbol *HandlerSym =
+ getMCSymbolForMBB(Asm, HT.Handler.dyn_cast<MachineBasicBlock *>());
+
+ AddComment("Adjectives");
+ OS.EmitIntValue(HT.Adjectives, 4);
+
+ AddComment("Type");
+ OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4);
+
+ AddComment("CatchObjOffset");
+ OS.EmitValue(FrameAllocOffsetRef, 4);
+
+ AddComment("Handler");
+ OS.EmitValue(create32bitRef(HandlerSym), 4);
+
+ if (shouldEmitPersonality) {
+ AddComment("ParentFrameOffset");
+ OS.EmitIntValue(ParentFrameOffset, 4);
+ }
+ }
+ }
+ }
+
+ // IPToStateMapEntry {
+ // void *IP;
+ // int32_t State;
+ // };
+ if (IPToStateXData) {
+ OS.EmitLabel(IPToStateXData);
+ for (auto &IPStatePair : IPToStateTable) {
+ AddComment("IP");
+ OS.EmitValue(IPStatePair.first, 4);
+ AddComment("ToState");
+ OS.EmitIntValue(IPStatePair.second, 4);
+ }
+ }
+}
+
+void WinException::computeIP2StateTable(
+ const MachineFunction *MF, const WinEHFuncInfo &FuncInfo,
+ SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable) {
+
+ for (MachineFunction::const_iterator FuncletStart = MF->begin(),
+ FuncletEnd = MF->begin(),
+ End = MF->end();
+ FuncletStart != End; FuncletStart = FuncletEnd) {
+ // Find the end of the funclet
+ while (++FuncletEnd != End) {
+ if (FuncletEnd->isEHFuncletEntry()) {
+ break;
+ }
+ }
+
+ // Don't emit ip2state entries for cleanup funclets. Any interesting
+ // exceptional actions in cleanups must be handled in a separate IR
+ // function.
+ if (FuncletStart->isCleanupFuncletEntry())
+ continue;
+
+ MCSymbol *StartLabel;
+ int BaseState;
+ if (FuncletStart == MF->begin()) {
+ BaseState = NullState;
+ StartLabel = Asm->getFunctionBegin();
+ } else {
+ auto *FuncletPad =
+ cast<FuncletPadInst>(FuncletStart->getBasicBlock()->getFirstNonPHI());
+ assert(FuncInfo.FuncletBaseStateMap.count(FuncletPad) != 0);
+ BaseState = FuncInfo.FuncletBaseStateMap.find(FuncletPad)->second;
+ StartLabel = getMCSymbolForMBB(Asm, &*FuncletStart);
+ }
+ assert(StartLabel && "need local function start label");
+ IPToStateTable.push_back(
+ std::make_pair(create32bitRef(StartLabel), BaseState));
+
+ for (const auto &StateChange : InvokeStateChangeIterator::range(
+ FuncInfo, FuncletStart, FuncletEnd, BaseState)) {
+ // Compute the label to report as the start of this entry; use the EH
+ // start label for the invoke if we have one, otherwise (this is a call
+ // which may unwind to our caller and does not have an EH start label, so)
+ // use the previous end label.
+ const MCSymbol *ChangeLabel = StateChange.NewStartLabel;
+ if (!ChangeLabel)
+ ChangeLabel = StateChange.PreviousEndLabel;
+ // Emit an entry indicating that PCs after 'Label' have this EH state.
+ IPToStateTable.push_back(
+ std::make_pair(getLabelPlusOne(ChangeLabel), StateChange.NewState));
+ // FIXME: assert that NewState is between CatchLow and CatchHigh.
+ }
+ }
+}
+
+void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
+ StringRef FLinkageName) {
+ // Outlined helpers called by the EH runtime need to know the offset of the EH
+  // registration in order to recover the parent frame pointer. Now that we
+  // have code-generated the parent, we can emit the label assignment that
+ // those helpers use to get the offset of the registration node.
+ MCContext &Ctx = Asm->OutContext;
+ MCSymbol *ParentFrameOffset =
+ Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
+ unsigned UnusedReg;
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ int64_t Offset = TFI->getFrameIndexReference(
+ *Asm->MF, FuncInfo.EHRegNodeFrameIndex, UnusedReg);
+ const MCExpr *MCOffset = MCConstantExpr::create(Offset, Ctx);
+ Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset);
+}
+
+/// Emit the language-specific data that _except_handler3 and 4 expect. This is
+/// functionally equivalent to the __C_specific_handler table, except it is
+/// indexed by state number instead of IP.
+void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
+ MCStreamer &OS = *Asm->OutStreamer;
+ const Function *F = MF->getFunction();
+ StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName());
+
+ bool VerboseAsm = OS.isVerboseAsm();
+ auto AddComment = [&](const Twine &Comment) {
+ if (VerboseAsm)
+ OS.AddComment(Comment);
+ };
+
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+ emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName);
+
+ // Emit the __ehtable label that we use for llvm.x86.seh.lsda.
+ MCSymbol *LSDALabel = Asm->OutContext.getOrCreateLSDASymbol(FLinkageName);
+ OS.EmitValueToAlignment(4);
+ OS.EmitLabel(LSDALabel);
+
+ const Function *Per =
+ dyn_cast<Function>(F->getPersonalityFn()->stripPointerCasts());
+ StringRef PerName = Per->getName();
+ int BaseState = -1;
+ if (PerName == "_except_handler4") {
+ // The LSDA for _except_handler4 starts with this struct, followed by the
+ // scope table:
+ //
+ // struct EH4ScopeTable {
+ // int32_t GSCookieOffset;
+ // int32_t GSCookieXOROffset;
+ // int32_t EHCookieOffset;
+ // int32_t EHCookieXOROffset;
+ // ScopeTableEntry ScopeRecord[];
+ // };
+ //
+ // Offsets are %ebp relative.
+ //
+ // The GS cookie is present only if the function needs stack protection.
+ // GSCookieOffset = -2 means that GS cookie is not used.
+ //
+ // The EH cookie is always present.
+ //
+ // Check is done the following way:
+ // (ebp+CookieXOROffset) ^ [ebp+CookieOffset] == _security_cookie
+
+ // Retrieve the Guard Stack slot.
+ int GSCookieOffset = -2;
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ if (MFI->hasStackProtectorIndex()) {
+ unsigned UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int SSPIdx = MFI->getStackProtectorIndex();
+ GSCookieOffset = TFI->getFrameIndexReference(*MF, SSPIdx, UnusedReg);
+ }
+
+ // Retrieve the EH Guard slot.
+    // TODO(etienneb): Get rid of this value and change it for an assertion.
+ int EHCookieOffset = 9999;
+ if (FuncInfo.EHGuardFrameIndex != INT_MAX) {
+ unsigned UnusedReg;
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ int EHGuardIdx = FuncInfo.EHGuardFrameIndex;
+ EHCookieOffset = TFI->getFrameIndexReference(*MF, EHGuardIdx, UnusedReg);
+ }
+
+ AddComment("GSCookieOffset");
+ OS.EmitIntValue(GSCookieOffset, 4);
+ AddComment("GSCookieXOROffset");
+ OS.EmitIntValue(0, 4);
+ AddComment("EHCookieOffset");
+ OS.EmitIntValue(EHCookieOffset, 4);
+ AddComment("EHCookieXOROffset");
+ OS.EmitIntValue(0, 4);
+ BaseState = -2;
+ }
+
+ assert(!FuncInfo.SEHUnwindMap.empty());
+ for (const SEHUnwindMapEntry &UME : FuncInfo.SEHUnwindMap) {
+ auto *Handler = UME.Handler.get<MachineBasicBlock *>();
+ const MCSymbol *ExceptOrFinally =
+ UME.IsFinally ? getMCSymbolForMBB(Asm, Handler) : Handler->getSymbol();
+ // -1 is usually the base state for "unwind to caller", but for
+ // _except_handler4 it's -2. Do that replacement here if necessary.
+ int ToState = UME.ToState == -1 ? BaseState : UME.ToState;
+ AddComment("ToState");
+ OS.EmitIntValue(ToState, 4);
+ AddComment(UME.IsFinally ? "Null" : "FilterFunction");
+ OS.EmitValue(create32bitRef(UME.Filter), 4);
+ AddComment(UME.IsFinally ? "FinallyFunclet" : "ExceptionHandler");
+ OS.EmitValue(create32bitRef(ExceptOrFinally), 4);
+ }
+}
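+
+// Sketch of the cookie check described in the EH4ScopeTable comment above.
+// The check is performed by the CRT's _except_handler4, not by LLVM; the
+// function and parameter names here are illustrative only. The EH cookie
+// check is analogous, using EHCookieOffset/EHCookieXOROffset.
+static bool eh4CookiesLookValid(uintptr_t Ebp, int32_t GSCookieOffset,
+                                int32_t GSCookieXOROffset,
+                                uintptr_t SecurityCookie) {
+  if (GSCookieOffset == -2)
+    return true; // the GS cookie is not used by this frame
+  uintptr_t Stored = *reinterpret_cast<const uintptr_t *>(Ebp + GSCookieOffset);
+  return ((Ebp + GSCookieXOROffset) ^ Stored) == SecurityCookie;
+}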
+
+static int getTryRank(const WinEHFuncInfo &FuncInfo, int State) {
+ int Rank = 0;
+ while (State != -1) {
+ ++Rank;
+ State = FuncInfo.ClrEHUnwindMap[State].TryParentState;
+ }
+ return Rank;
+}
+
+static int getTryAncestor(const WinEHFuncInfo &FuncInfo, int Left, int Right) {
+ int LeftRank = getTryRank(FuncInfo, Left);
+ int RightRank = getTryRank(FuncInfo, Right);
+
+ while (LeftRank < RightRank) {
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
+ --RightRank;
+ }
+
+ while (RightRank < LeftRank) {
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+ --LeftRank;
+ }
+
+ while (Left != Right) {
+ Left = FuncInfo.ClrEHUnwindMap[Left].TryParentState;
+ Right = FuncInfo.ClrEHUnwindMap[Right].TryParentState;
+ }
+
+ return Left;
+}
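+
+// Worked example (standalone sketch; hypothetical helper): getTryRank and
+// getTryAncestor above amount to a lowest-common-ancestor walk over the
+// TryParentState links. With parent links {0: -1, 1: -1, 2: 0, 3: 1}, the
+// ancestor of states 2 and 3 is -1 (no shared enclosing try), while the
+// ancestor of states 2 and 0 is 0. The same algorithm over a plain parent
+// array:
+static int tryAncestorSketch(const SmallVectorImpl<int> &Parent, int Left,
+                             int Right) {
+  auto Rank = [&](int S) {
+    int R = 0;
+    for (; S != -1; S = Parent[S])
+      ++R;
+    return R;
+  };
+  int LeftRank = Rank(Left), RightRank = Rank(Right);
+  for (; LeftRank < RightRank; --RightRank)
+    Right = Parent[Right];
+  for (; RightRank < LeftRank; --LeftRank)
+    Left = Parent[Left];
+  while (Left != Right) {
+    Left = Parent[Left];
+    Right = Parent[Right];
+  }
+  return Left;
+}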
+
+void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
+ // CLR EH "states" are really just IDs that identify handlers/funclets;
+ // states, handlers, and funclets all have 1:1 mappings between them, and a
+ // handler/funclet's "state" is its index in the ClrEHUnwindMap.
+ MCStreamer &OS = *Asm->OutStreamer;
+ const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo();
+ MCSymbol *FuncBeginSym = Asm->getFunctionBegin();
+ MCSymbol *FuncEndSym = Asm->getFunctionEnd();
+
+ // A ClrClause describes a protected region.
+ struct ClrClause {
+ const MCSymbol *StartLabel; // Start of protected region
+ const MCSymbol *EndLabel; // End of protected region
+ int State; // Index of handler protecting the protected region
+ int EnclosingState; // Index of funclet enclosing the protected region
+ };
+ SmallVector<ClrClause, 8> Clauses;
+
+ // Build a map from handler MBBs to their corresponding states (i.e. their
+ // indices in the ClrEHUnwindMap).
+ int NumStates = FuncInfo.ClrEHUnwindMap.size();
+ assert(NumStates > 0 && "Don't need exception table!");
+ DenseMap<const MachineBasicBlock *, int> HandlerStates;
+ for (int State = 0; State < NumStates; ++State) {
+ MachineBasicBlock *HandlerBlock =
+ FuncInfo.ClrEHUnwindMap[State].Handler.get<MachineBasicBlock *>();
+ HandlerStates[HandlerBlock] = State;
+ // Use this loop through all handlers to verify our assumption (used in
+ // the MinEnclosingState computation) that enclosing funclets have lower
+ // state numbers than their enclosed funclets.
+ assert(FuncInfo.ClrEHUnwindMap[State].HandlerParentState < State &&
+ "ill-formed state numbering");
+ }
+ // Map the main function to the NullState.
+ HandlerStates[&MF->front()] = NullState;
+
+ // Write out a sentinel indicating the end of the standard (Windows) xdata
+ // and the start of the additional (CLR) info.
+ OS.EmitIntValue(0xffffffff, 4);
+ // Write out the number of funclets
+ OS.EmitIntValue(NumStates, 4);
+
+ // Walk the machine blocks/instrs, computing and emitting a few things:
+ // 1. Emit a list of the offsets to each handler entry, in lexical order.
+ // 2. Compute a map (EndSymbolMap) from each funclet to the symbol at its end.
+ // 3. Compute the list of ClrClauses, in the required order (inner before
+ // outer, earlier before later; the order by which a forward scan with
+ // early termination will find the innermost enclosing clause covering
+ // a given address).
+ // 4. A map (MinClauseMap) from each handler index to the index of the
+ // outermost funclet/function which contains a try clause targeting the
+ // key handler. This will be used to determine IsDuplicate-ness when
+ // emitting ClrClauses. The NullState value is used to indicate that the
+ // top-level function contains a try clause targeting the key handler.
+ // HandlerStack is a stack of (PendingStartLabel, PendingState) pairs for
+ // try regions we entered before entering the PendingState try but which
+ // we haven't yet exited.
+ SmallVector<std::pair<const MCSymbol *, int>, 4> HandlerStack;
+ // EndSymbolMap and MinClauseMap are maps described above.
+ std::unique_ptr<MCSymbol *[]> EndSymbolMap(new MCSymbol *[NumStates]);
+ SmallVector<int, 4> MinClauseMap((size_t)NumStates, NumStates);
+
+ // Visit the root function and each funclet.
+ for (MachineFunction::const_iterator FuncletStart = MF->begin(),
+ FuncletEnd = MF->begin(),
+ End = MF->end();
+ FuncletStart != End; FuncletStart = FuncletEnd) {
+ int FuncletState = HandlerStates[&*FuncletStart];
+ // Find the end of the funclet
+ MCSymbol *EndSymbol = FuncEndSym;
+ while (++FuncletEnd != End) {
+ if (FuncletEnd->isEHFuncletEntry()) {
+ EndSymbol = getMCSymbolForMBB(Asm, &*FuncletEnd);
+ break;
+ }
+ }
+ // Emit the function/funclet end and, if this is a funclet (and not the
+ // root function), record it in the EndSymbolMap.
+ OS.EmitValue(getOffset(EndSymbol, FuncBeginSym), 4);
+ if (FuncletState != NullState) {
+ // Record the end of the handler.
+ EndSymbolMap[FuncletState] = EndSymbol;
+ }
+
+ // Walk the state changes in this function/funclet and compute its clauses.
+ // Funclets always start in the null state.
+ const MCSymbol *CurrentStartLabel = nullptr;
+ int CurrentState = NullState;
+ assert(HandlerStack.empty());
+ for (const auto &StateChange :
+ InvokeStateChangeIterator::range(FuncInfo, FuncletStart, FuncletEnd)) {
+ // Close any try regions we're not still under
+ int StillPendingState =
+ getTryAncestor(FuncInfo, CurrentState, StateChange.NewState);
+ while (CurrentState != StillPendingState) {
+ assert(CurrentState != NullState &&
+ "Failed to find still-pending state!");
+ // Close the pending clause
+ Clauses.push_back({CurrentStartLabel, StateChange.PreviousEndLabel,
+ CurrentState, FuncletState});
+ // Now the next-outer try region is current
+ CurrentState = FuncInfo.ClrEHUnwindMap[CurrentState].TryParentState;
+ // Pop the new start label from the handler stack if we've exited all
+ // inner try regions of the corresponding try region.
+ if (HandlerStack.back().second == CurrentState)
+ CurrentStartLabel = HandlerStack.pop_back_val().first;
+ }
+
+ if (StateChange.NewState != CurrentState) {
+ // For each clause we're starting, update the MinClauseMap so we can
+ // know which is the topmost funclet containing a clause targeting
+ // it.
+ for (int EnteredState = StateChange.NewState;
+ EnteredState != CurrentState;
+ EnteredState =
+ FuncInfo.ClrEHUnwindMap[EnteredState].TryParentState) {
+ int &MinEnclosingState = MinClauseMap[EnteredState];
+ if (FuncletState < MinEnclosingState)
+ MinEnclosingState = FuncletState;
+ }
+ // Save the previous current start/label on the stack and update to
+ // the newly-current start/state.
+ HandlerStack.emplace_back(CurrentStartLabel, CurrentState);
+ CurrentStartLabel = StateChange.NewStartLabel;
+ CurrentState = StateChange.NewState;
+ }
+ }
+ assert(HandlerStack.empty());
+ }
+
+ // Now emit the clause info, starting with the number of clauses.
+ OS.EmitIntValue(Clauses.size(), 4);
+ for (ClrClause &Clause : Clauses) {
+ // Emit a CORINFO_EH_CLAUSE :
+ /*
+ struct CORINFO_EH_CLAUSE
+ {
+ CORINFO_EH_CLAUSE_FLAGS Flags; // actually a CorExceptionFlag
+ DWORD TryOffset;
+ DWORD TryLength; // actually TryEndOffset
+ DWORD HandlerOffset;
+ DWORD HandlerLength; // actually HandlerEndOffset
+ union
+ {
+ DWORD ClassToken; // use for catch clauses
+ DWORD FilterOffset; // use for filter clauses
+ };
+ };
+
+ enum CORINFO_EH_CLAUSE_FLAGS
+ {
+ CORINFO_EH_CLAUSE_NONE = 0,
+ CORINFO_EH_CLAUSE_FILTER = 0x0001, // This clause is for a filter
+ CORINFO_EH_CLAUSE_FINALLY = 0x0002, // This clause is a finally clause
+ CORINFO_EH_CLAUSE_FAULT = 0x0004, // This clause is a fault clause
+ };
+ typedef enum CorExceptionFlag
+ {
+ COR_ILEXCEPTION_CLAUSE_NONE,
+ COR_ILEXCEPTION_CLAUSE_FILTER = 0x0001, // This is a filter clause
+ COR_ILEXCEPTION_CLAUSE_FINALLY = 0x0002, // This is a finally clause
+ COR_ILEXCEPTION_CLAUSE_FAULT = 0x0004, // This is a fault clause
+ COR_ILEXCEPTION_CLAUSE_DUPLICATED = 0x0008, // duplicated clause. This
+ // clause was duplicated
+ // to a funclet which was
+ // pulled out of line
+ } CorExceptionFlag;
+ */
+ // Add 1 to the start/end of the EH clause; the IP associated with a
+ // call when the runtime does its scan is the IP of the next instruction
+ // (the one to which control will return after the call), so we need
+ // to add 1 to the end of the clause to cover that offset. We also add
+ // 1 to the start of the clause to make sure that the ranges reported
+ // for all clauses are disjoint. Note that we'll need some additional
+ // logic when machine traps are supported, since in that case the IP
+ // that the runtime uses is the offset of the faulting instruction
+ // itself; if such an instruction immediately follows a call but the
+ // two belong to different clauses, we'll need to insert a nop between
+ // them so the runtime can distinguish the point to which the call will
+ // return from the point at which the fault occurs.
+
+ const MCExpr *ClauseBegin =
+ getOffsetPlusOne(Clause.StartLabel, FuncBeginSym);
+ const MCExpr *ClauseEnd = getOffsetPlusOne(Clause.EndLabel, FuncBeginSym);
+
+ const ClrEHUnwindMapEntry &Entry = FuncInfo.ClrEHUnwindMap[Clause.State];
+ MachineBasicBlock *HandlerBlock = Entry.Handler.get<MachineBasicBlock *>();
+ MCSymbol *BeginSym = getMCSymbolForMBB(Asm, HandlerBlock);
+ const MCExpr *HandlerBegin = getOffset(BeginSym, FuncBeginSym);
+ MCSymbol *EndSym = EndSymbolMap[Clause.State];
+ const MCExpr *HandlerEnd = getOffset(EndSym, FuncBeginSym);
+
+ uint32_t Flags = 0;
+ switch (Entry.HandlerType) {
+ case ClrHandlerType::Catch:
+ // Leaving bits 0-2 clear indicates catch.
+ break;
+ case ClrHandlerType::Filter:
+ Flags |= 1;
+ break;
+ case ClrHandlerType::Finally:
+ Flags |= 2;
+ break;
+ case ClrHandlerType::Fault:
+ Flags |= 4;
+ break;
+ }
+ if (Clause.EnclosingState != MinClauseMap[Clause.State]) {
+ // This is a "duplicate" clause; the handler needs to be entered from a
+ // frame above the one holding the invoke.
+ assert(Clause.EnclosingState > MinClauseMap[Clause.State]);
+ Flags |= 8;
+ }
+ OS.EmitIntValue(Flags, 4);
+
+ // Write the clause start/end
+ OS.EmitValue(ClauseBegin, 4);
+ OS.EmitValue(ClauseEnd, 4);
+
+ // Write out the handler start/end
+ OS.EmitValue(HandlerBegin, 4);
+ OS.EmitValue(HandlerEnd, 4);
+
+ // Write out the type token or filter offset
+ assert(Entry.HandlerType != ClrHandlerType::Filter && "NYI: filters");
+ OS.EmitIntValue(Entry.TypeToken, 4);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h
new file mode 100644
index 000000000000..acb301016910
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -0,0 +1,106 @@
+//===-- WinException.h - Windows Exception Handling ----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for writing Windows exception info into asm files.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+#define LLVM_LIB_CODEGEN_ASMPRINTER_WIN64EXCEPTION_H
+
+#include "EHStreamer.h"
+
+namespace llvm {
+class Function;
+class GlobalValue;
+class MachineFunction;
+class MCExpr;
+class Value;
+struct WinEHFuncInfo;
+
+class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
+ /// Per-function flag to indicate if personality info should be emitted.
+ bool shouldEmitPersonality = false;
+
+ /// Per-function flag to indicate if the LSDA should be emitted.
+ bool shouldEmitLSDA = false;
+
+ /// Per-function flag to indicate if frame moves info should be emitted.
+ bool shouldEmitMoves = false;
+
+ /// True if this is a 64-bit target and we should use image relative offsets.
+ bool useImageRel32 = false;
+
+ /// Pointer to the current funclet entry BB.
+ const MachineBasicBlock *CurrentFuncletEntry = nullptr;
+
+ void emitCSpecificHandlerTable(const MachineFunction *MF);
+
+ void emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
+ const MCSymbol *BeginLabel,
+ const MCSymbol *EndLabel, int State);
+
+ /// Emit the EH table data for 32-bit and 64-bit functions using
+ /// the __CxxFrameHandler3 personality.
+ void emitCXXFrameHandler3Table(const MachineFunction *MF);
+
+ /// Emit the EH table data for _except_handler3 and _except_handler4
+ /// personality functions. These are only used on 32-bit and do not use CFI
+ /// tables.
+ void emitExceptHandlerTable(const MachineFunction *MF);
+
+ void emitCLRExceptionTable(const MachineFunction *MF);
+
+ void computeIP2StateTable(
+ const MachineFunction *MF, const WinEHFuncInfo &FuncInfo,
+ SmallVectorImpl<std::pair<const MCExpr *, int>> &IPToStateTable);
+
+ /// Emits the label used with llvm.x86.seh.recoverfp, which is used by
+ /// outlined funclets.
+ void emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
+ StringRef FLinkageName);
+
+ const MCExpr *create32bitRef(const MCSymbol *Value);
+ const MCExpr *create32bitRef(const GlobalValue *GV);
+ const MCExpr *getLabelPlusOne(const MCSymbol *Label);
+ const MCExpr *getOffset(const MCSymbol *OffsetOf, const MCSymbol *OffsetFrom);
+ const MCExpr *getOffsetPlusOne(const MCSymbol *OffsetOf,
+ const MCSymbol *OffsetFrom);
+
+ /// Gets the offset that we should use in a table for a stack object with the
+ /// given index. For targets using CFI (Win64, etc), this is relative to the
+ /// established SP at the end of the prologue. For targets without CFI (Win32
+ /// only), it is relative to the frame pointer.
+ int getFrameIndexOffset(int FrameIndex, const WinEHFuncInfo &FuncInfo);
+
+public:
+ //===--------------------------------------------------------------------===//
+ // Main entry points.
+ //
+ WinException(AsmPrinter *A);
+ ~WinException() override;
+
+ /// Emit all exception information that should come after the content.
+ void endModule() override;
+
+ /// Gather pre-function exception information. Assumes being emitted
+ /// immediately after the function entry point.
+ void beginFunction(const MachineFunction *MF) override;
+
+ /// Gather and emit post-function exception information.
+ void endFunction(const MachineFunction *) override;
+
+ /// \brief Emit target-specific EH funclet machinery.
+ void beginFunclet(const MachineBasicBlock &MBB, MCSymbol *Sym) override;
+ void endFunclet() override;
+};
+}
+
+#endif
+
diff --git a/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
new file mode 100644
index 000000000000..bf5cf105a8f8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -0,0 +1,1640 @@
+//===-- AtomicExpandPass.cpp - Expand atomic instructions -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass (at IR level) to replace atomic instructions with
+// __atomic_* library calls, or target-specific instructions which implement
+// the same semantics in a way that better fits the target backend. This can
+// include the use of (intrinsic-based) load-linked/store-conditional loops,
+// AtomicCmpXchg, or type coercions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/AtomicExpandUtils.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "atomic-expand"
+
+namespace {
+ class AtomicExpand: public FunctionPass {
+ const TargetMachine *TM;
+ const TargetLowering *TLI;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit AtomicExpand(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM), TLI(nullptr) {
+ initializeAtomicExpandPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override;
+
+ private:
+ bool bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ bool IsStore, bool IsLoad);
+ IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL);
+ LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI);
+ bool tryExpandAtomicLoad(LoadInst *LI);
+ bool expandAtomicLoadToLL(LoadInst *LI);
+ bool expandAtomicLoadToCmpXchg(LoadInst *LI);
+ StoreInst *convertAtomicStoreToIntegerType(StoreInst *SI);
+ bool expandAtomicStore(StoreInst *SI);
+ bool tryExpandAtomicRMW(AtomicRMWInst *AI);
+ Value *
+ insertRMWLLSCLoop(IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
+ void expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultTy, Value *Addr, AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp);
+ void expandPartwordAtomicRMW(
+ AtomicRMWInst *I,
+ TargetLoweringBase::AtomicExpansionKind ExpansionKind);
+ void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
+
+ AtomicCmpXchgInst *convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI);
+ static Value *insertRMWCmpXchgLoop(
+ IRBuilder<> &Builder, Type *ResultType, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg);
+
+ bool expandAtomicCmpXchg(AtomicCmpXchgInst *CI);
+ bool isIdempotentRMW(AtomicRMWInst *AI);
+ bool simplifyIdempotentRMW(AtomicRMWInst *AI);
+
+ bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
+ Value *PointerOperand, Value *ValueOperand,
+ Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2,
+ ArrayRef<RTLIB::Libcall> Libcalls);
+ void expandAtomicLoadToLibcall(LoadInst *LI);
+ void expandAtomicStoreToLibcall(StoreInst *LI);
+ void expandAtomicRMWToLibcall(AtomicRMWInst *I);
+ void expandAtomicCASToLibcall(AtomicCmpXchgInst *I);
+
+ friend bool
+ llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg);
+ };
+}
+
+char AtomicExpand::ID = 0;
+char &llvm::AtomicExpandID = AtomicExpand::ID;
+INITIALIZE_TM_PASS(AtomicExpand, "atomic-expand", "Expand Atomic instructions",
+ false, false)
+
+FunctionPass *llvm::createAtomicExpandPass(const TargetMachine *TM) {
+ return new AtomicExpand(TM);
+}
+
+namespace {
+// Helper functions to retrieve the size of atomic instructions.
+unsigned getAtomicOpSize(LoadInst *LI) {
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(LI->getType());
+}
+
+unsigned getAtomicOpSize(StoreInst *SI) {
+ const DataLayout &DL = SI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(SI->getValueOperand()->getType());
+}
+
+unsigned getAtomicOpSize(AtomicRMWInst *RMWI) {
+ const DataLayout &DL = RMWI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
+}
+
+unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
+ const DataLayout &DL = CASI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
+}
+
+// Helper functions to retrieve the alignment of atomic instructions.
+unsigned getAtomicOpAlign(LoadInst *LI) {
+ unsigned Align = LI->getAlignment();
+ // In the future, if this IR restriction is relaxed, we should
+ // return DataLayout::getABITypeAlignment when there's no align
+ // value.
+ assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
+ return Align;
+}
+
+unsigned getAtomicOpAlign(StoreInst *SI) {
+ unsigned Align = SI->getAlignment();
+ // In the future, if this IR restriction is relaxed, we should
+ // return DataLayout::getABITypeAlignment when there's no align
+ // value.
+ assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
+ return Align;
+}
+
+unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
+ // TODO(PR27168): This instruction has no alignment attribute, but unlike the
+ // default alignment for load/store, the default here is to assume
+ // it has NATURAL alignment, not DataLayout-specified alignment.
+ const DataLayout &DL = RMWI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
+}
+
+unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
+ // TODO(PR27168): same comment as above.
+ const DataLayout &DL = CASI->getModule()->getDataLayout();
+ return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
+}
+
+// Determine if a particular atomic operation has a supported size,
+// and is of appropriate alignment, to be passed through for target
+// lowering (as opposed to being turned into a __atomic* libcall).
+template <typename Inst>
+bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+ return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+}
+
+} // end anonymous namespace
+
+bool AtomicExpand::runOnFunction(Function &F) {
+ if (!TM || !TM->getSubtargetImpl(F)->enableAtomicExpand())
+ return false;
+ TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+
+ SmallVector<Instruction *, 1> AtomicInsts;
+
+ // Changing control-flow while iterating through it is a bad idea, so gather a
+ // list of all atomic instructions before we start.
+ for (inst_iterator II = inst_begin(F), E = inst_end(F); II != E; ++II) {
+ Instruction *I = &*II;
+ if (I->isAtomic() && !isa<FenceInst>(I))
+ AtomicInsts.push_back(I);
+ }
+
+ bool MadeChange = false;
+ for (auto I : AtomicInsts) {
+ auto LI = dyn_cast<LoadInst>(I);
+ auto SI = dyn_cast<StoreInst>(I);
+ auto RMWI = dyn_cast<AtomicRMWInst>(I);
+ auto CASI = dyn_cast<AtomicCmpXchgInst>(I);
+ assert((LI || SI || RMWI || CASI) && "Unknown atomic instruction");
+
+ // If the Size/Alignment is not supported, replace with a libcall.
+ if (LI) {
+ if (!atomicSizeSupported(TLI, LI)) {
+ expandAtomicLoadToLibcall(LI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (SI) {
+ if (!atomicSizeSupported(TLI, SI)) {
+ expandAtomicStoreToLibcall(SI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (RMWI) {
+ if (!atomicSizeSupported(TLI, RMWI)) {
+ expandAtomicRMWToLibcall(RMWI);
+ MadeChange = true;
+ continue;
+ }
+ } else if (CASI) {
+ if (!atomicSizeSupported(TLI, CASI)) {
+ expandAtomicCASToLibcall(CASI);
+ MadeChange = true;
+ continue;
+ }
+ }
+
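+ // If the target wants explicit fences, downgrade the instruction's own
+ // ordering to monotonic and remember the original ordering so the
+ // instruction can be bracketed with fences below.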
+ if (TLI->shouldInsertFencesForAtomic(I)) {
+ auto FenceOrdering = AtomicOrdering::Monotonic;
+ bool IsStore, IsLoad;
+ if (LI && isAcquireOrStronger(LI->getOrdering())) {
+ FenceOrdering = LI->getOrdering();
+ LI->setOrdering(AtomicOrdering::Monotonic);
+ IsStore = false;
+ IsLoad = true;
+ } else if (SI && isReleaseOrStronger(SI->getOrdering())) {
+ FenceOrdering = SI->getOrdering();
+ SI->setOrdering(AtomicOrdering::Monotonic);
+ IsStore = true;
+ IsLoad = false;
+ } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) ||
+ isAcquireOrStronger(RMWI->getOrdering()))) {
+ FenceOrdering = RMWI->getOrdering();
+ RMWI->setOrdering(AtomicOrdering::Monotonic);
+ IsStore = IsLoad = true;
+ } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
+ (isReleaseOrStronger(CASI->getSuccessOrdering()) ||
+ isAcquireOrStronger(CASI->getSuccessOrdering()))) {
+ // If a compare and swap is lowered to LL/SC, we can do smarter fence
+ // insertion, with a stronger one on the success path than on the
+ // failure path. As a result, fence insertion is directly done by
+ // expandAtomicCmpXchg in that case.
+ FenceOrdering = CASI->getSuccessOrdering();
+ CASI->setSuccessOrdering(AtomicOrdering::Monotonic);
+ CASI->setFailureOrdering(AtomicOrdering::Monotonic);
+ IsStore = IsLoad = true;
+ }
+
+ if (FenceOrdering != AtomicOrdering::Monotonic) {
+ MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad);
+ }
+ }
+
+ if (LI) {
+ if (LI->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ LI = convertAtomicLoadToIntegerType(LI);
+ assert(LI->getType()->isIntegerTy() && "invariant broken");
+ MadeChange = true;
+ }
+
+ MadeChange |= tryExpandAtomicLoad(LI);
+ } else if (SI) {
+ if (SI->getValueOperand()->getType()->isFloatingPointTy()) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ SI = convertAtomicStoreToIntegerType(SI);
+ assert(SI->getValueOperand()->getType()->isIntegerTy() &&
+ "invariant broken");
+ MadeChange = true;
+ }
+
+ if (TLI->shouldExpandAtomicStoreInIR(SI))
+ MadeChange |= expandAtomicStore(SI);
+ } else if (RMWI) {
+ // There are two different ways of expanding RMW instructions:
+ // - into a load if it is idempotent
+ // - into a Cmpxchg/LL-SC loop otherwise
+ // we try them in that order.
+
+ if (isIdempotentRMW(RMWI) && simplifyIdempotentRMW(RMWI)) {
+ MadeChange = true;
+ } else {
+ MadeChange |= tryExpandAtomicRMW(RMWI);
+ }
+ } else if (CASI) {
+ // TODO: when we're ready to make the change at the IR level, we can
+ // extend convertCmpXchgToIntegerType for floating point too.
+ assert(!CASI->getCompareOperand()->getType()->isFloatingPointTy() &&
+ "unimplemented - floating point not legal at IR level");
+ if (CASI->getCompareOperand()->getType()->isPointerTy() ) {
+ // TODO: add a TLI hook to control this so that each target can
+ // convert to lowering the original type one at a time.
+ CASI = convertCmpXchgToIntegerType(CASI);
+ assert(CASI->getCompareOperand()->getType()->isIntegerTy() &&
+ "invariant broken");
+ MadeChange = true;
+ }
+
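+ // A cmpxchg narrower than the target's minimum cmpxchg width must be
+ // widened to a full-word, masked cmpxchg loop (see expandPartwordCmpXchg).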
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(CASI);
+ if (ValueSize < MinCASSize) {
+ assert(!TLI->shouldExpandAtomicCmpXchgInIR(CASI) &&
+ "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
+ expandPartwordCmpXchg(CASI);
+ } else {
+ if (TLI->shouldExpandAtomicCmpXchgInIR(CASI))
+ MadeChange |= expandAtomicCmpXchg(CASI);
+ }
+ }
+ }
+ return MadeChange;
+}
+
+bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order,
+ bool IsStore, bool IsLoad) {
+ IRBuilder<> Builder(I);
+
+ auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad);
+
+ auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad);
+ // The trailing fence is emitted before the instruction instead of after
+ // because there is no easy way of setting Builder insertion point after
+ // an instruction. So we must erase it from the BB, and insert it back
+ // in the right place.
+ // We have a guard here because not every atomic operation generates a
+ // trailing fence.
+ if (TrailingFence) {
+ TrailingFence->removeFromParent();
+ TrailingFence->insertAfter(I);
+ }
+
+ return (LeadingFence || TrailingFence);
+}
+
+/// Get the iX type with the same bitwidth as T.
+IntegerType *AtomicExpand::getCorrespondingIntegerType(Type *T,
+ const DataLayout &DL) {
+ EVT VT = TLI->getValueType(DL, T);
+ unsigned BitWidth = VT.getStoreSizeInBits();
+ assert(BitWidth == VT.getSizeInBits() && "must be a power of two");
+ return IntegerType::get(T->getContext(), BitWidth);
+}
+
+/// Convert an atomic load of a non-integral type to an integer load of the
+/// equivalent bitwidth. See the function comment on
+/// convertAtomicStoreToIntegerType for background.
+LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
+ auto *M = LI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(LI->getType(),
+ M->getDataLayout());
+
+ IRBuilder<> Builder(LI);
+
+ Value *Addr = LI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ auto *NewLI = Builder.CreateLoad(NewAddr);
+ NewLI->setAlignment(LI->getAlignment());
+ NewLI->setVolatile(LI->isVolatile());
+ NewLI->setAtomic(LI->getOrdering(), LI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
+
+ Value *NewVal = Builder.CreateBitCast(NewLI, LI->getType());
+ LI->replaceAllUsesWith(NewVal);
+ LI->eraseFromParent();
+ return NewLI;
+}
+
+bool AtomicExpand::tryExpandAtomicLoad(LoadInst *LI) {
+ switch (TLI->shouldExpandAtomicLoadInIR(LI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC:
+ expandAtomicOpToLLSC(
+ LI, LI->getType(), LI->getPointerOperand(), LI->getOrdering(),
+ [](IRBuilder<> &Builder, Value *Loaded) { return Loaded; });
+ return true;
+ case TargetLoweringBase::AtomicExpansionKind::LLOnly:
+ return expandAtomicLoadToLL(LI);
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg:
+ return expandAtomicLoadToCmpXchg(LI);
+ }
+ llvm_unreachable("Unhandled case in tryExpandAtomicLoad");
+}
+
+bool AtomicExpand::expandAtomicLoadToLL(LoadInst *LI) {
+ IRBuilder<> Builder(LI);
+
+ // On some architectures, load-linked instructions are atomic for larger
+ // sizes than normal loads. For example, the only 64-bit load guaranteed
+ // to be single-copy atomic by ARM is an ldrexd (A3.5.3).
+ Value *Val =
+ TLI->emitLoadLinked(Builder, LI->getPointerOperand(), LI->getOrdering());
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
+
+ LI->replaceAllUsesWith(Val);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+bool AtomicExpand::expandAtomicLoadToCmpXchg(LoadInst *LI) {
+ IRBuilder<> Builder(LI);
+ AtomicOrdering Order = LI->getOrdering();
+ Value *Addr = LI->getPointerOperand();
+ Type *Ty = cast<PointerType>(Addr->getType())->getElementType();
+ Constant *DummyVal = Constant::getNullValue(Ty);
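+ // A cmpxchg with expected == new == 0 either stores back the zero it found
+ // or fails without writing; either way the memory contents are unchanged and
+ // the returned loaded value is the current contents, which is all we need.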
+
+ Value *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, DummyVal, DummyVal, Order,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(Order));
+ Value *Loaded = Builder.CreateExtractValue(Pair, 0, "loaded");
+
+ LI->replaceAllUsesWith(Loaded);
+ LI->eraseFromParent();
+
+ return true;
+}
+
+/// Convert an atomic store of a non-integral type to an integer store of the
+/// equivalent bitwidth. We used to not support floating point or vector
+/// atomics in the IR at all. The backends learned to deal with the bitcast
+/// idiom because that was the only way of expressing the notion of an atomic
+/// float or vector store. The long term plan is to teach each backend to
+/// instruction select from the original atomic store, but as a migration
+/// mechanism, we convert back to the old format which the backends understand.
+/// Each backend will need individual work to recognize the new format.
+StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
+ IRBuilder<> Builder(SI);
+ auto *M = SI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(SI->getValueOperand()->getType(),
+ M->getDataLayout());
+ Value *NewVal = Builder.CreateBitCast(SI->getValueOperand(), NewTy);
+
+ Value *Addr = SI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
+ NewSI->setAlignment(SI->getAlignment());
+ NewSI->setVolatile(SI->isVolatile());
+ NewSI->setAtomic(SI->getOrdering(), SI->getSynchScope());
+ DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
+ SI->eraseFromParent();
+ return NewSI;
+}
+
+bool AtomicExpand::expandAtomicStore(StoreInst *SI) {
+ // This function is only called on atomic stores that are too large to be
+ // atomic if implemented as a native store. So we replace them with an
+ // atomic swap, which can be implemented for example as a ldrex/strex on ARM
+ // or lock cmpxchg8/16b on X86, as these are atomic for larger sizes.
+ // It is the responsibility of the target to only signal expansion via
+ // shouldExpandAtomicRMW in cases where this is required and possible.
+ IRBuilder<> Builder(SI);
+ AtomicRMWInst *AI =
+ Builder.CreateAtomicRMW(AtomicRMWInst::Xchg, SI->getPointerOperand(),
+ SI->getValueOperand(), SI->getOrdering());
+ SI->eraseFromParent();
+
+ // Now we have an appropriate swap instruction, lower it as usual.
+ return tryExpandAtomicRMW(AI);
+}
+
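+// A CreateCmpXchgInstFun that emits an ordinary cmpxchg and unpacks the
+// resulting { value, success } pair; used by the atomicrmw expansions in
+// this file.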
+static void createCmpXchgInstFun(IRBuilder<> &Builder, Value *Addr,
+ Value *Loaded, Value *NewVal,
+ AtomicOrdering MemOpOrder,
+ Value *&Success, Value *&NewLoaded) {
+ Value* Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, MemOpOrder,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
+ Success = Builder.CreateExtractValue(Pair, 1, "success");
+ NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+}
+
+/// Emit IR to implement the given atomicrmw operation on values in registers,
+/// returning the new value.
+static Value *performAtomicOp(AtomicRMWInst::BinOp Op, IRBuilder<> &Builder,
+ Value *Loaded, Value *Inc) {
+ Value *NewVal;
+ switch (Op) {
+ case AtomicRMWInst::Xchg:
+ return Inc;
+ case AtomicRMWInst::Add:
+ return Builder.CreateAdd(Loaded, Inc, "new");
+ case AtomicRMWInst::Sub:
+ return Builder.CreateSub(Loaded, Inc, "new");
+ case AtomicRMWInst::And:
+ return Builder.CreateAnd(Loaded, Inc, "new");
+ case AtomicRMWInst::Nand:
+ return Builder.CreateNot(Builder.CreateAnd(Loaded, Inc), "new");
+ case AtomicRMWInst::Or:
+ return Builder.CreateOr(Loaded, Inc, "new");
+ case AtomicRMWInst::Xor:
+ return Builder.CreateXor(Loaded, Inc, "new");
+ case AtomicRMWInst::Max:
+ NewVal = Builder.CreateICmpSGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::Min:
+ NewVal = Builder.CreateICmpSLE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMax:
+ NewVal = Builder.CreateICmpUGT(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ case AtomicRMWInst::UMin:
+ NewVal = Builder.CreateICmpULE(Loaded, Inc);
+ return Builder.CreateSelect(NewVal, Loaded, Inc, "new");
+ default:
+ llvm_unreachable("Unknown atomic op");
+ }
+}
+
+bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
+ switch (TLI->shouldExpandAtomicRMWInIR(AI)) {
+ case TargetLoweringBase::AtomicExpansionKind::None:
+ return false;
+ case TargetLoweringBase::AtomicExpansionKind::LLSC: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ llvm_unreachable(
+ "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
+ } else {
+ auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performAtomicOp(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ };
+ expandAtomicOpToLLSC(AI, AI->getType(), AI->getPointerOperand(),
+ AI->getOrdering(), PerformOp);
+ }
+ return true;
+ }
+ case TargetLoweringBase::AtomicExpansionKind::CmpXChg: {
+ unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
+ unsigned ValueSize = getAtomicOpSize(AI);
+ if (ValueSize < MinCASSize) {
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::CmpXChg);
+ } else {
+ expandAtomicRMWToCmpXchg(AI, createCmpXchgInstFun);
+ }
+ return true;
+ }
+ default:
+ llvm_unreachable("Unhandled case in tryExpandAtomicRMW");
+ }
+}
+
+namespace {
+
+/// Result values from createMaskInstrs helper.
+struct PartwordMaskValues {
+ Type *WordType;
+ Type *ValueType;
+ Value *AlignedAddr;
+ Value *ShiftAmt;
+ Value *Mask;
+ Value *Inv_Mask;
+};
+} // end anonymous namespace
+
+/// This is a helper function which builds instructions to provide
+/// values necessary for partword atomic operations. It takes an
+/// incoming address, Addr, and ValueType, and constructs the address,
+/// shift-amounts and masks needed to work with a larger value of size
+/// WordSize.
+///
+/// AlignedAddr: Addr rounded down to a multiple of WordSize
+///
+/// ShiftAmt: Number of bits to right-shift a WordSize value loaded
+/// from AlignedAddr for it to have the same value as if
+/// ValueType was loaded from Addr.
+///
+/// Mask: Value to mask with the value loaded from AlignedAddr to
+/// include only the part that would've been loaded from Addr.
+///
+/// Inv_Mask: The inverse of Mask.
+
+static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
+ Type *ValueType, Value *Addr,
+ unsigned WordSize) {
+ PartwordMaskValues Ret;
+
+ BasicBlock *BB = I->getParent();
+ Function *F = BB->getParent();
+ Module *M = I->getModule();
+
+ LLVMContext &Ctx = F->getContext();
+ const DataLayout &DL = M->getDataLayout();
+
+ unsigned ValueSize = DL.getTypeStoreSize(ValueType);
+
+ assert(ValueSize < WordSize);
+
+ Ret.ValueType = ValueType;
+ Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
+
+ Type *WordPtrType =
+ Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
+
+ Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
+ Ret.AlignedAddr = Builder.CreateIntToPtr(
+ Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
+ "AlignedAddr");
+
+ Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
+ if (DL.isLittleEndian()) {
+ // turn bytes into bits
+ Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
+ } else {
+ // turn bytes into bits, and count from the other side.
+ Ret.ShiftAmt =
+ Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
+ }
+
+ Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
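+ // Mask has ValueSize bytes of ones shifted into the position of the value
+ // within the word; Inv_Mask selects everything else.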
+ Ret.Mask = Builder.CreateShl(
+ ConstantInt::get(Ret.WordType, (1 << ValueSize * 8) - 1), Ret.ShiftAmt,
+ "Mask");
+ Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
+
+ return Ret;
+}
+
+/// Emit IR to implement a masked version of a given atomicrmw
+/// operation. (That is, only the bits under the Mask should be
+/// affected by the operation)
+static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
+ IRBuilder<> &Builder, Value *Loaded,
+ Value *Shifted_Inc, Value *Inc,
+ const PartwordMaskValues &PMV) {
+ switch (Op) {
+ case AtomicRMWInst::Xchg: {
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, Shifted_Inc);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ // Or/Xor won't affect any other bits, so can just be done
+ // directly.
+ return performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::And:
+ case AtomicRMWInst::Nand: {
+ // The other arithmetic ops need to be masked into place.
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded, Shifted_Inc);
+ Value *NewVal_Masked = Builder.CreateAnd(NewVal, PMV.Mask);
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Masked);
+ return FinalVal;
+ }
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin: {
+ // Finally, comparison ops will operate on the full value, so
+ // truncate down to the original size, and expand out again after
+ // doing the operation.
+ Value *Loaded_Shiftdown = Builder.CreateTrunc(
+ Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
+ Value *NewVal_Shiftup = Builder.CreateShl(
+ Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
+ Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
+ Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
+ return FinalVal;
+ }
+ default:
+ llvm_unreachable("Unknown atomic op");
+ }
+}
+
+/// Expand a sub-word atomicrmw operation into an appropriate
+/// word-sized operation.
+///
+/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
+/// way as a typical atomicrmw expansion. The only difference here is
+/// that the operation inside of the loop must operate only upon a
+/// part of the value.
+void AtomicExpand::expandPartwordAtomicRMW(
+ AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
+
+ assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
+
+ AtomicOrdering MemOpOrder = AI->getOrdering();
+
+ IRBuilder<> Builder(AI);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, AI, AI->getType(), AI->getPointerOperand(),
+ TLI->getMinCmpXchgSizeInBits() / 8);
+
+ Value *ValOperand_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(AI->getValOperand(), PMV.WordType),
+ PMV.ShiftAmt, "ValOperand_Shifted");
+
+ auto PerformPartwordOp = [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performMaskedAtomicOp(AI->getOperation(), Builder, Loaded,
+ ValOperand_Shifted, AI->getValOperand(), PMV);
+ };
+
+ // TODO: When we're ready to support LLSC conversions too, use
+ // insertRMWLLSCLoop here for ExpansionKind==LLSC.
+ Value *OldResult =
+ insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
+ PerformPartwordOp, createCmpXchgInstFun);
+ Value *FinalOldResult = Builder.CreateTrunc(
+ Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ AI->replaceAllUsesWith(FinalOldResult);
+ AI->eraseFromParent();
+}
+
+void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
+ // The basic idea here is that we're expanding a cmpxchg of a
+ // smaller memory size up to a word-sized cmpxchg. To do this, we
+ // need to add a retry-loop for strong cmpxchg, so that
+ // modifications to other parts of the word don't cause a spurious
+ // failure.
+
+ // This generates code like the following:
+ // [[Setup mask values PMV.*]]
+ // %NewVal_Shifted = shl i32 %NewVal, %PMV.ShiftAmt
+ // %Cmp_Shifted = shl i32 %Cmp, %PMV.ShiftAmt
+ // %InitLoaded = load i32* %addr
+ // %InitLoaded_MaskOut = and i32 %InitLoaded, %PMV.Inv_Mask
+ // br partword.cmpxchg.loop
+ // partword.cmpxchg.loop:
+ // %Loaded_MaskOut = phi i32 [ %InitLoaded_MaskOut, %entry ],
+ // [ %OldVal_MaskOut, %partword.cmpxchg.failure ]
+ // %FullWord_NewVal = or i32 %Loaded_MaskOut, %NewVal_Shifted
+ // %FullWord_Cmp = or i32 %Loaded_MaskOut, %Cmp_Shifted
+ // %NewCI = cmpxchg i32* %PMV.AlignedAddr, i32 %FullWord_Cmp,
+ // i32 %FullWord_NewVal success_ordering failure_ordering
+ // %OldVal = extractvalue { i32, i1 } %NewCI, 0
+ // %Success = extractvalue { i32, i1 } %NewCI, 1
+ // br i1 %Success, label %partword.cmpxchg.end,
+ // label %partword.cmpxchg.failure
+ // partword.cmpxchg.failure:
+ // %OldVal_MaskOut = and i32 %OldVal, %PMV.Inv_Mask
+ // %ShouldContinue = icmp ne i32 %Loaded_MaskOut, %OldVal_MaskOut
+ // br i1 %ShouldContinue, label %partword.cmpxchg.loop,
+ // label %partword.cmpxchg.end
+ // partword.cmpxchg.end:
+ // %tmp1 = lshr i32 %OldVal, %PMV.ShiftAmt
+ // %FinalOldVal = trunc i32 %tmp1 to i8
+ // %tmp2 = insertvalue { i8, i1 } undef, i8 %FinalOldVal, 0
+ // %Res = insertvalue { i8, i1 } %tmp2, i1 %Success, 1
+
+ Value *Addr = CI->getPointerOperand();
+ Value *Cmp = CI->getCompareOperand();
+ Value *NewVal = CI->getNewValOperand();
+
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ IRBuilder<> Builder(CI);
+ LLVMContext &Ctx = Builder.getContext();
+
+ const int WordSize = TLI->getMinCmpXchgSizeInBits() / 8;
+
+ BasicBlock *EndBB =
+ BB->splitBasicBlock(CI->getIterator(), "partword.cmpxchg.end");
+ auto FailureBB =
+ BasicBlock::Create(Ctx, "partword.cmpxchg.failure", F, EndBB);
+ auto LoopBB = BasicBlock::Create(Ctx, "partword.cmpxchg.loop", F, FailureBB);
+
+ // The split call above "helpfully" added a branch at the end of BB
+ // (to the wrong place).
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+
+ PartwordMaskValues PMV = createMaskInstrs(
+ Builder, CI, CI->getCompareOperand()->getType(), Addr, WordSize);
+
+ // Shift the incoming values over, into the right location in the word.
+ Value *NewVal_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
+ Value *Cmp_Shifted =
+ Builder.CreateShl(Builder.CreateZExt(Cmp, PMV.WordType), PMV.ShiftAmt);
+
+ // Load the entire current word, and mask into place the expected and new
+ // values
+ LoadInst *InitLoaded = Builder.CreateLoad(PMV.WordType, PMV.AlignedAddr);
+ InitLoaded->setVolatile(CI->isVolatile());
+ Value *InitLoaded_MaskOut = Builder.CreateAnd(InitLoaded, PMV.Inv_Mask);
+ Builder.CreateBr(LoopBB);
+
+ // partword.cmpxchg.loop:
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded_MaskOut = Builder.CreatePHI(PMV.WordType, 2);
+ Loaded_MaskOut->addIncoming(InitLoaded_MaskOut, BB);
+
+ // Mask/Or the expected and new values into place in the loaded word.
+ Value *FullWord_NewVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shifted);
+ Value *FullWord_Cmp = Builder.CreateOr(Loaded_MaskOut, Cmp_Shifted);
+ AtomicCmpXchgInst *NewCI = Builder.CreateAtomicCmpXchg(
+ PMV.AlignedAddr, FullWord_Cmp, FullWord_NewVal, CI->getSuccessOrdering(),
+ CI->getFailureOrdering(), CI->getSynchScope());
+ NewCI->setVolatile(CI->isVolatile());
+ // When we're building a strong cmpxchg, we need a retry loop, so it might
+ // seem tempting to use a weak cmpxchg inside it. Using a strong one instead
+ // allows the ShouldContinue comparison below, and the underlying cmpxchg is
+ // expected to be a machine instruction, which is strong anyway.
+ NewCI->setWeak(CI->isWeak());
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Success = Builder.CreateExtractValue(NewCI, 1);
+
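+ // A weak cmpxchg may fail spuriously, so no retry loop is needed: fall
+ // straight through to the end block whether or not the exchange succeeded.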
+ if (CI->isWeak())
+ Builder.CreateBr(EndBB);
+ else
+ Builder.CreateCondBr(Success, EndBB, FailureBB);
+
+ // partword.cmpxchg.failure:
+ Builder.SetInsertPoint(FailureBB);
+ // Upon failure, check whether the masked-out part of the loaded value was
+ // modified. If it was not, the masked-in part must have caused the failure,
+ // so give up; otherwise retry with the freshly observed masked-out bits.
+ Value *OldVal_MaskOut = Builder.CreateAnd(OldVal, PMV.Inv_Mask);
+ Value *ShouldContinue = Builder.CreateICmpNE(Loaded_MaskOut, OldVal_MaskOut);
+ Builder.CreateCondBr(ShouldContinue, LoopBB, EndBB);
+
+ // Add the second value to the phi from above
+ Loaded_MaskOut->addIncoming(OldVal_MaskOut, FailureBB);
+
+ // partword.cmpxchg.end:
+ Builder.SetInsertPoint(CI);
+
+ Value *FinalOldVal = Builder.CreateTrunc(
+ Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
+ Value *Res = UndefValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+}
+
+void AtomicExpand::expandAtomicOpToLLSC(
+ Instruction *I, Type *ResultType, Value *Addr, AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
+ IRBuilder<> Builder(I);
+ Value *Loaded =
+ insertRMWLLSCLoop(Builder, ResultType, Addr, MemOpOrder, PerformOp);
+
+ I->replaceAllUsesWith(Loaded);
+ I->eraseFromParent();
+}
+
+Value *AtomicExpand::insertRMWLLSCLoop(
+ IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // atomicrmw.start:
+ // %loaded = @load.linked(%addr)
+ // %new = some_op iN %loaded, %incr
+ // %stored = @store_conditional(%new, %addr)
+ // %try_again = icmp ne i32 %stored, 0
+ // br i1 %try_again, label %atomicrmw.start, label %atomicrmw.end
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place).
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ Value *Loaded = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+
+ Value *NewVal = PerformOp(Builder, Loaded);
+
+ Value *StoreSuccess =
+ TLI->emitStoreConditional(Builder, NewVal, Addr, MemOpOrder);
+ Value *TryAgain = Builder.CreateICmpNE(
+ StoreSuccess, ConstantInt::get(IntegerType::get(Ctx, 32), 0), "tryagain");
+ Builder.CreateCondBr(TryAgain, LoopBB, ExitBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return Loaded;
+}
+
+/// Convert an atomic cmpxchg of a non-integral type to an integer cmpxchg of
+/// the equivalent bitwidth. We used to not support pointer cmpxchg in the
+/// IR. As a migration step, we convert back to what used to be the standard
+/// way to represent a pointer cmpxchg so that we can update backends one by
+/// one.
+AtomicCmpXchgInst *AtomicExpand::convertCmpXchgToIntegerType(AtomicCmpXchgInst *CI) {
+ auto *M = CI->getModule();
+ Type *NewTy = getCorrespondingIntegerType(CI->getCompareOperand()->getType(),
+ M->getDataLayout());
+
+ IRBuilder<> Builder(CI);
+
+ Value *Addr = CI->getPointerOperand();
+ Type *PT = PointerType::get(NewTy,
+ Addr->getType()->getPointerAddressSpace());
+ Value *NewAddr = Builder.CreateBitCast(Addr, PT);
+
+ Value *NewCmp = Builder.CreatePtrToInt(CI->getCompareOperand(), NewTy);
+ Value *NewNewVal = Builder.CreatePtrToInt(CI->getNewValOperand(), NewTy);
+
+
+ auto *NewCI = Builder.CreateAtomicCmpXchg(NewAddr, NewCmp, NewNewVal,
+ CI->getSuccessOrdering(),
+ CI->getFailureOrdering(),
+ CI->getSynchScope());
+ NewCI->setVolatile(CI->isVolatile());
+ NewCI->setWeak(CI->isWeak());
+ DEBUG(dbgs() << "Replaced " << *CI << " with " << *NewCI << "\n");
+
+ Value *OldVal = Builder.CreateExtractValue(NewCI, 0);
+ Value *Succ = Builder.CreateExtractValue(NewCI, 1);
+
+ OldVal = Builder.CreateIntToPtr(OldVal, CI->getCompareOperand()->getType());
+
+ Value *Res = UndefValue::get(CI->getType());
+ Res = Builder.CreateInsertValue(Res, OldVal, 0);
+ Res = Builder.CreateInsertValue(Res, Succ, 1);
+
+ CI->replaceAllUsesWith(Res);
+ CI->eraseFromParent();
+ return NewCI;
+}
+
+
+bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
+ AtomicOrdering SuccessOrder = CI->getSuccessOrdering();
+ AtomicOrdering FailureOrder = CI->getFailureOrdering();
+ Value *Addr = CI->getPointerOperand();
+ BasicBlock *BB = CI->getParent();
+ Function *F = BB->getParent();
+ LLVMContext &Ctx = F->getContext();
+ // If shouldInsertFencesForAtomic() returns true, then the target does not
+ // want to deal with memory orders, and emitLeading/TrailingFence should take
+ // care of everything. Otherwise, emitLeading/TrailingFence are no-op and we
+ // should preserve the ordering.
+ bool ShouldInsertFencesForAtomic = TLI->shouldInsertFencesForAtomic(CI);
+ AtomicOrdering MemOpOrder =
+ ShouldInsertFencesForAtomic ? AtomicOrdering::Monotonic : SuccessOrder;
+
+ // In implementations which use a barrier to achieve release semantics, we can
+ // delay emitting this barrier until we know a store is actually going to be
+ // attempted. The cost of this delay is that we need 2 copies of the block
+ // emitting the load-linked, affecting code size.
+ //
+ // Ideally, this logic would be unconditional except for the minsize check
+ // since in other cases the extra blocks naturally collapse down to the
+ // minimal loop. Unfortunately, this puts too much stress on later
+ // optimisations so we avoid emitting the extra logic in those cases too.
+ bool HasReleasedLoadBB = !CI->isWeak() && ShouldInsertFencesForAtomic &&
+ SuccessOrder != AtomicOrdering::Monotonic &&
+ SuccessOrder != AtomicOrdering::Acquire &&
+ !F->optForMinSize();
+
+ // There's no overhead for sinking the release barrier in a weak cmpxchg, so
+ // do it even on minsize.
+ bool UseUnconditionalReleaseBarrier = F->optForMinSize() && !CI->isWeak();
+
+ // Given: cmpxchg some_op iN* %addr, iN %desired, iN %new success_ord fail_ord
+ //
+ // The full expansion we produce is:
+ // [...]
+ // cmpxchg.start:
+ // %unreleasedload = @load.linked(%addr)
+ // %should_store = icmp eq %unreleasedload, %desired
+ // br i1 %should_store, label %cmpxchg.fencedstore,
+ // label %cmpxchg.nostore
+ // cmpxchg.fencedstore:
+ // fence?
+ // br label %cmpxchg.trystore
+ // cmpxchg.trystore:
+ // %loaded.trystore = phi [%unreleasedload, %cmpxchg.fencedstore],
+ // [%releasedload, %cmpxchg.releasedload]
+ // %stored = @store_conditional(%new, %addr)
+ // %success = icmp eq i32 %stored, 0
+ // br i1 %success, label %cmpxchg.success,
+ // label %cmpxchg.releasedload/%cmpxchg.failure
+ // cmpxchg.releasedload:
+ // %releasedload = @load.linked(%addr)
+ // %should_store = icmp eq %releasedload, %desired
+ // br i1 %should_store, label %cmpxchg.trystore,
+ // label %cmpxchg.failure
+ // cmpxchg.success:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.nostore:
+ // %loaded.nostore = phi [%unreleasedload, %cmpxchg.start],
+ // [%releasedload,
+ // %cmpxchg.releasedload/%cmpxchg.trystore]
+ // @load_linked_fail_balance()?
+ // br label %cmpxchg.failure
+ // cmpxchg.failure:
+ // fence?
+ // br label %cmpxchg.end
+ // cmpxchg.end:
+ // %loaded = phi [%loaded.nostore, %cmpxchg.failure],
+ // [%loaded.trystore, %cmpxchg.trystore]
+ // %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
+ // %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
+ // [...]
+ BasicBlock *ExitBB = BB->splitBasicBlock(CI->getIterator(), "cmpxchg.end");
+ auto FailureBB = BasicBlock::Create(Ctx, "cmpxchg.failure", F, ExitBB);
+ auto NoStoreBB = BasicBlock::Create(Ctx, "cmpxchg.nostore", F, FailureBB);
+ auto SuccessBB = BasicBlock::Create(Ctx, "cmpxchg.success", F, NoStoreBB);
+ auto ReleasedLoadBB =
+ BasicBlock::Create(Ctx, "cmpxchg.releasedload", F, SuccessBB);
+ auto TryStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.trystore", F, ReleasedLoadBB);
+ auto ReleasingStoreBB =
+ BasicBlock::Create(Ctx, "cmpxchg.fencedstore", F, TryStoreBB);
+ auto StartBB = BasicBlock::Create(Ctx, "cmpxchg.start", F, ReleasingStoreBB);
+
+ // This grabs the DebugLoc from CI
+ IRBuilder<> Builder(CI);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we might want a fence too. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(StartBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(StartBB);
+ Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *ShouldStore = Builder.CreateICmpEQ(
+ UnreleasedLoad, CI->getCompareOperand(), "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, ReleasingStoreBB, NoStoreBB);
+
+ Builder.SetInsertPoint(ReleasingStoreBB);
+ if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier)
+ TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(TryStoreBB);
+
+ Builder.SetInsertPoint(TryStoreBB);
+ Value *StoreSuccess = TLI->emitStoreConditional(
+ Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+ StoreSuccess = Builder.CreateICmpEQ(
+ StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
+ BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
+ Builder.CreateCondBr(StoreSuccess, SuccessBB,
+ CI->isWeak() ? FailureBB : RetryBB);
+
+ Builder.SetInsertPoint(ReleasedLoadBB);
+ Value *SecondLoad;
+ if (HasReleasedLoadBB) {
+ SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
+ "should_store");
+
+ // If the cmpxchg doesn't actually need any ordering when it fails, we can
+ // jump straight past that fence instruction (if it exists).
+ Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ } else
+ Builder.CreateUnreachable();
+
+ // Make sure later instructions don't get reordered with a fence if
+ // necessary.
+ Builder.SetInsertPoint(SuccessBB);
+ if (ShouldInsertFencesForAtomic)
+ TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(ExitBB);
+
+ Builder.SetInsertPoint(NoStoreBB);
+ // In the failing case, where we don't execute the store-conditional, the
+ // target might want to balance out the load-linked with a dedicated
+ // instruction (e.g., on ARM, clearing the exclusive monitor).
+ TLI->emitAtomicCmpXchgNoStoreLLBalance(Builder);
+ Builder.CreateBr(FailureBB);
+
+ Builder.SetInsertPoint(FailureBB);
+ if (ShouldInsertFencesForAtomic)
+ TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true,
+ /*IsLoad=*/true);
+ Builder.CreateBr(ExitBB);
+
+ // Finally, we have control-flow based knowledge of whether the cmpxchg
+ // succeeded or not. We expose this to later passes by converting any
+ // subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
+ // PHI.
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
+ Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
+
+ // Setup the builder so we can create any PHIs we need.
+ Value *Loaded;
+ if (!HasReleasedLoadBB)
+ Loaded = UnreleasedLoad;
+ else {
+ Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
+ PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
+ TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
+
+ Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
+ PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
+ NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
+
+ Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
+ PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
+ ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
+ ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
+
+ Loaded = ExitLoaded;
+ }
+
+ // Look for any users of the cmpxchg that are just comparing the loaded value
+ // against the desired one, and replace them with the CFG-derived version.
+ SmallVector<ExtractValueInst *, 2> PrunedInsts;
+ for (auto User : CI->users()) {
+ ExtractValueInst *EV = dyn_cast<ExtractValueInst>(User);
+ if (!EV)
+ continue;
+
+ assert(EV->getNumIndices() == 1 && EV->getIndices()[0] <= 1 &&
+ "weird extraction from { iN, i1 }");
+
+ if (EV->getIndices()[0] == 0)
+ EV->replaceAllUsesWith(Loaded);
+ else
+ EV->replaceAllUsesWith(Success);
+
+ PrunedInsts.push_back(EV);
+ }
+
+ // We can remove the instructions now we're no longer iterating through them.
+ for (auto EV : PrunedInsts)
+ EV->eraseFromParent();
+
+ if (!CI->use_empty()) {
+ // Some use of the full struct return that we don't understand has happened,
+ // so we've got to reconstruct it properly.
+ Value *Res;
+ Res = Builder.CreateInsertValue(UndefValue::get(CI->getType()), Loaded, 0);
+ Res = Builder.CreateInsertValue(Res, Success, 1);
+
+ CI->replaceAllUsesWith(Res);
+ }
+
+ CI->eraseFromParent();
+ return true;
+}
+
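+/// Return true if the given atomicrmw can never change the value in memory,
+/// e.g. `add 0`, `sub 0`, `or 0`, `xor 0`, or `and -1`.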
+bool AtomicExpand::isIdempotentRMW(AtomicRMWInst* RMWI) {
+ auto C = dyn_cast<ConstantInt>(RMWI->getValOperand());
+ if(!C)
+ return false;
+
+ AtomicRMWInst::BinOp Op = RMWI->getOperation();
+ switch(Op) {
+ case AtomicRMWInst::Add:
+ case AtomicRMWInst::Sub:
+ case AtomicRMWInst::Or:
+ case AtomicRMWInst::Xor:
+ return C->isZero();
+ case AtomicRMWInst::And:
+ return C->isMinusOne();
+ // FIXME: we could also treat Min/Max/UMin/UMax by the INT_MIN/INT_MAX/...
+ default:
+ return false;
+ }
+}
+
+bool AtomicExpand::simplifyIdempotentRMW(AtomicRMWInst* RMWI) {
+ if (auto ResultingLoad = TLI->lowerIdempotentRMWIntoFencedLoad(RMWI)) {
+ tryExpandAtomicLoad(ResultingLoad);
+ return true;
+ }
+ return false;
+}
+
+Value *AtomicExpand::insertRMWCmpXchgLoop(
+ IRBuilder<> &Builder, Type *ResultTy, Value *Addr,
+ AtomicOrdering MemOpOrder,
+ function_ref<Value *(IRBuilder<> &, Value *)> PerformOp,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ LLVMContext &Ctx = Builder.getContext();
+ BasicBlock *BB = Builder.GetInsertBlock();
+ Function *F = BB->getParent();
+
+ // Given: atomicrmw some_op iN* %addr, iN %incr ordering
+ //
+ // The standard expansion we produce is:
+ // [...]
+ // %init_loaded = load atomic iN* %addr
+ // br label %loop
+ // loop:
+ // %loaded = phi iN [ %init_loaded, %entry ], [ %new_loaded, %loop ]
+ // %new = some_op iN %loaded, %incr
+ // %pair = cmpxchg iN* %addr, iN %loaded, iN %new
+ // %new_loaded = extractvalue { iN, i1 } %pair, 0
+ // %success = extractvalue { iN, i1 } %pair, 1
+ // br i1 %success, label %atomicrmw.end, label %loop
+ // atomicrmw.end:
+ // [...]
+ BasicBlock *ExitBB =
+ BB->splitBasicBlock(Builder.GetInsertPoint(), "atomicrmw.end");
+ BasicBlock *LoopBB = BasicBlock::Create(Ctx, "atomicrmw.start", F, ExitBB);
+
+ // The split call above "helpfully" added a branch at the end of BB (to the
+ // wrong place), but we want a load. It's easiest to just remove
+ // the branch entirely.
+ std::prev(BB->end())->eraseFromParent();
+ Builder.SetInsertPoint(BB);
+ LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
+ // Atomics require at least natural alignment.
+ InitLoaded->setAlignment(ResultTy->getPrimitiveSizeInBits() / 8);
+ Builder.CreateBr(LoopBB);
+
+ // Start the main loop block now that we've taken care of the preliminaries.
+ Builder.SetInsertPoint(LoopBB);
+ PHINode *Loaded = Builder.CreatePHI(ResultTy, 2, "loaded");
+ Loaded->addIncoming(InitLoaded, BB);
+
+ Value *NewVal = PerformOp(Builder, Loaded);
+
+ Value *NewLoaded = nullptr;
+ Value *Success = nullptr;
+
+ CreateCmpXchg(Builder, Addr, Loaded, NewVal,
+ MemOpOrder == AtomicOrdering::Unordered
+ ? AtomicOrdering::Monotonic
+ : MemOpOrder,
+ Success, NewLoaded);
+ assert(Success && NewLoaded);
+
+ Loaded->addIncoming(NewLoaded, LoopBB);
+
+ Builder.CreateCondBr(Success, ExitBB, LoopBB);
+
+ Builder.SetInsertPoint(ExitBB, ExitBB->begin());
+ return NewLoaded;
+}
+
+// Note: This function is exposed externally by AtomicExpandUtils.h
+bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
+ CreateCmpXchgInstFun CreateCmpXchg) {
+ IRBuilder<> Builder(AI);
+ Value *Loaded = AtomicExpand::insertRMWCmpXchgLoop(
+ Builder, AI->getType(), AI->getPointerOperand(), AI->getOrdering(),
+ [&](IRBuilder<> &Builder, Value *Loaded) {
+ return performAtomicOp(AI->getOperation(), Builder, Loaded,
+ AI->getValOperand());
+ },
+ CreateCmpXchg);
+
+ AI->replaceAllUsesWith(Loaded);
+ AI->eraseFromParent();
+ return true;
+}
+
+// In order to use one of the sized library calls such as
+// __atomic_fetch_add_4, the alignment must be sufficient, the size
+// must be one of the potentially-specialized sizes, and the value
+// type must actually exist in C on the target (otherwise, the
+// function wouldn't actually be defined.)
+static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
+ const DataLayout &DL) {
+ // TODO: "LargestSize" is an approximation for "largest type that
+ // you can express in C". It seems to be the case that int128 is
+ // supported on all 64-bit platforms, otherwise only up to 64-bit
+ // integers are supported. If we get this wrong, then we'll try to
+ // call a sized libcall that doesn't actually exist. There should
+ // really be some more reliable way in LLVM of determining integer
+ // sizes which are valid in the target's C ABI...
+ unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
+ return Align >= Size &&
+ (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
+ Size <= LargestSize;
+}
+
+void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
+ RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
+}
+
+void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
+ RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
+}
+
+void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
+ static const RTLIB::Libcall Libcalls[6] = {
+ RTLIB::ATOMIC_COMPARE_EXCHANGE, RTLIB::ATOMIC_COMPARE_EXCHANGE_1,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
+ RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool expanded = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
+ I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
+ Libcalls);
+ (void)expanded;
+ assert(expanded && "expandAtomicOpToLibcall shouldn't fail for CAS");
+}
+
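+// For each operation, index 0 is the generic (unsized) __atomic_* libcall and
+// indices 1-5 are the 1/2/4/8/16-byte specializations; UNKNOWN_LIBCALL marks
+// variants that do not exist.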
+static ArrayRef<RTLIB::Libcall> GetRMWLibcall(AtomicRMWInst::BinOp Op) {
+ static const RTLIB::Libcall LibcallsXchg[6] = {
+ RTLIB::ATOMIC_EXCHANGE, RTLIB::ATOMIC_EXCHANGE_1,
+ RTLIB::ATOMIC_EXCHANGE_2, RTLIB::ATOMIC_EXCHANGE_4,
+ RTLIB::ATOMIC_EXCHANGE_8, RTLIB::ATOMIC_EXCHANGE_16};
+ static const RTLIB::Libcall LibcallsAdd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_ADD_1,
+ RTLIB::ATOMIC_FETCH_ADD_2, RTLIB::ATOMIC_FETCH_ADD_4,
+ RTLIB::ATOMIC_FETCH_ADD_8, RTLIB::ATOMIC_FETCH_ADD_16};
+ static const RTLIB::Libcall LibcallsSub[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_SUB_1,
+ RTLIB::ATOMIC_FETCH_SUB_2, RTLIB::ATOMIC_FETCH_SUB_4,
+ RTLIB::ATOMIC_FETCH_SUB_8, RTLIB::ATOMIC_FETCH_SUB_16};
+ static const RTLIB::Libcall LibcallsAnd[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_AND_1,
+ RTLIB::ATOMIC_FETCH_AND_2, RTLIB::ATOMIC_FETCH_AND_4,
+ RTLIB::ATOMIC_FETCH_AND_8, RTLIB::ATOMIC_FETCH_AND_16};
+ static const RTLIB::Libcall LibcallsOr[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_OR_1,
+ RTLIB::ATOMIC_FETCH_OR_2, RTLIB::ATOMIC_FETCH_OR_4,
+ RTLIB::ATOMIC_FETCH_OR_8, RTLIB::ATOMIC_FETCH_OR_16};
+ static const RTLIB::Libcall LibcallsXor[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_XOR_1,
+ RTLIB::ATOMIC_FETCH_XOR_2, RTLIB::ATOMIC_FETCH_XOR_4,
+ RTLIB::ATOMIC_FETCH_XOR_8, RTLIB::ATOMIC_FETCH_XOR_16};
+ static const RTLIB::Libcall LibcallsNand[6] = {
+ RTLIB::UNKNOWN_LIBCALL, RTLIB::ATOMIC_FETCH_NAND_1,
+ RTLIB::ATOMIC_FETCH_NAND_2, RTLIB::ATOMIC_FETCH_NAND_4,
+ RTLIB::ATOMIC_FETCH_NAND_8, RTLIB::ATOMIC_FETCH_NAND_16};
+
+ switch (Op) {
+ case AtomicRMWInst::BAD_BINOP:
+ llvm_unreachable("Should not have BAD_BINOP.");
+ case AtomicRMWInst::Xchg:
+ return makeArrayRef(LibcallsXchg);
+ case AtomicRMWInst::Add:
+ return makeArrayRef(LibcallsAdd);
+ case AtomicRMWInst::Sub:
+ return makeArrayRef(LibcallsSub);
+ case AtomicRMWInst::And:
+ return makeArrayRef(LibcallsAnd);
+ case AtomicRMWInst::Or:
+ return makeArrayRef(LibcallsOr);
+ case AtomicRMWInst::Xor:
+ return makeArrayRef(LibcallsXor);
+ case AtomicRMWInst::Nand:
+ return makeArrayRef(LibcallsNand);
+ case AtomicRMWInst::Max:
+ case AtomicRMWInst::Min:
+ case AtomicRMWInst::UMax:
+ case AtomicRMWInst::UMin:
+ // No atomic libcalls are available for max/min/umax/umin.
+ return {};
+ }
+ llvm_unreachable("Unexpected AtomicRMW operation.");
+}
+
+void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
+ ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
+
+ unsigned Size = getAtomicOpSize(I);
+ unsigned Align = getAtomicOpAlign(I);
+
+ bool Success = false;
+ if (!Libcalls.empty())
+ Success = expandAtomicOpToLibcall(
+ I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
+ I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+
+ // The expansion failed: either there were no libcalls at all for
+ // the operation (min/max), or there were only size-specialized
+ // libcalls (add/sub/etc) and we needed a generic. So, expand to a
+ // CAS libcall, via a CAS loop, instead.
+ if (!Success) {
+ expandAtomicRMWToCmpXchg(I, [this](IRBuilder<> &Builder, Value *Addr,
+ Value *Loaded, Value *NewVal,
+ AtomicOrdering MemOpOrder,
+ Value *&Success, Value *&NewLoaded) {
+ // Create the CAS instruction normally...
+ AtomicCmpXchgInst *Pair = Builder.CreateAtomicCmpXchg(
+ Addr, Loaded, NewVal, MemOpOrder,
+ AtomicCmpXchgInst::getStrongestFailureOrdering(MemOpOrder));
+ Success = Builder.CreateExtractValue(Pair, 1, "success");
+ NewLoaded = Builder.CreateExtractValue(Pair, 0, "newloaded");
+
+ // ...and then expand the CAS into a libcall.
+ expandAtomicCASToLibcall(Pair);
+ });
+ }
+}
+
+// A helper routine for the above expandAtomic*ToLibcall functions.
+//
+// 'Libcalls' contains an array of enum values for the particular
+// ATOMIC libcalls to be emitted. All of the other arguments besides
+// 'I' are extracted from the Instruction subclass by the
+// caller. Depending on the particular call, some will be null.
+bool AtomicExpand::expandAtomicOpToLibcall(
+ Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
+ Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
+ AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
+ assert(Libcalls.size() == 6);
+
+ LLVMContext &Ctx = I->getContext();
+ Module *M = I->getModule();
+ const DataLayout &DL = M->getDataLayout();
+ IRBuilder<> Builder(I);
+ IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
+
+ bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
+ Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
+
+ unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
+
+ // TODO: the "order" argument type is "int", not int32. So
+ // getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
+ ConstantInt *SizeVal64 = ConstantInt::get(Type::getInt64Ty(Ctx), Size);
+ assert(Ordering != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Constant *OrderingVal =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering));
+ Constant *Ordering2Val = nullptr;
+ if (CASExpected) {
+ assert(Ordering2 != AtomicOrdering::NotAtomic && "expect atomic MO");
+ Ordering2Val =
+ ConstantInt::get(Type::getInt32Ty(Ctx), (int)toCABI(Ordering2));
+ }
+ bool HasResult = I->getType() != Type::getVoidTy(Ctx);
+
+ RTLIB::Libcall RTLibType;
+ if (UseSizedLibcall) {
+ switch (Size) {
+ case 1: RTLibType = Libcalls[1]; break;
+ case 2: RTLibType = Libcalls[2]; break;
+ case 4: RTLibType = Libcalls[3]; break;
+ case 8: RTLibType = Libcalls[4]; break;
+ case 16: RTLibType = Libcalls[5]; break;
+ }
+ } else if (Libcalls[0] != RTLIB::UNKNOWN_LIBCALL) {
+ RTLibType = Libcalls[0];
+ } else {
+ // Can't use sized function, and there's no generic for this
+ // operation, so give up.
+ return false;
+ }
+
+  // Build up the function call. There are two kinds. First, the sized
+ // variants. These calls are going to be one of the following (with
+ // N=1,2,4,8,16):
+ // iN __atomic_load_N(iN *ptr, int ordering)
+ // void __atomic_store_N(iN *ptr, iN val, int ordering)
+ // iN __atomic_{exchange|fetch_*}_N(iN *ptr, iN val, int ordering)
+ // bool __atomic_compare_exchange_N(iN *ptr, iN *expected, iN desired,
+ // int success_order, int failure_order)
+ //
+ // Note that these functions can be used for non-integer atomic
+  // operations; the values just need to be bitcast to integers on the
+ // way in and out.
+ //
+ // And, then, the generic variants. They look like the following:
+ // void __atomic_load(size_t size, void *ptr, void *ret, int ordering)
+ // void __atomic_store(size_t size, void *ptr, void *val, int ordering)
+ // void __atomic_exchange(size_t size, void *ptr, void *val, void *ret,
+ // int ordering)
+ // bool __atomic_compare_exchange(size_t size, void *ptr, void *expected,
+ // void *desired, int success_order,
+ // int failure_order)
+ //
+ // The different signatures are built up depending on the
+ // 'UseSizedLibcall', 'CASExpected', 'ValueOperand', and 'HasResult'
+ // variables.
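+  //
+  // As an illustrative (hypothetical) example: an
+  //   atomicrmw add i32* %p, i32 %v seq_cst
+  // for which a sized libcall exists is lowered to roughly
+  //   i32 __atomic_fetch_add_4(i8* %p, i32 %v, int 5 /* __ATOMIC_SEQ_CST */)
+  // whereas an atomicrmw with no usable sized or generic libcall is instead
+  // expanded through the CAS loop in expandAtomicRMWToLibcall above.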
+
+ AllocaInst *AllocaCASExpected = nullptr;
+ Value *AllocaCASExpected_i8 = nullptr;
+ AllocaInst *AllocaValue = nullptr;
+ Value *AllocaValue_i8 = nullptr;
+ AllocaInst *AllocaResult = nullptr;
+ Value *AllocaResult_i8 = nullptr;
+
+ Type *ResultTy;
+ SmallVector<Value *, 6> Args;
+ AttributeSet Attr;
+
+ // 'size' argument.
+ if (!UseSizedLibcall) {
+ // Note, getIntPtrType is assumed equivalent to size_t.
+ Args.push_back(ConstantInt::get(DL.getIntPtrType(Ctx), Size));
+ }
+
+ // 'ptr' argument.
+ Value *PtrVal =
+ Builder.CreateBitCast(PointerOperand, Type::getInt8PtrTy(Ctx));
+ Args.push_back(PtrVal);
+
+ // 'expected' argument, if present.
+ if (CASExpected) {
+ AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
+ AllocaCASExpected->setAlignment(AllocaAlignment);
+ AllocaCASExpected_i8 =
+ Builder.CreateBitCast(AllocaCASExpected, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaCASExpected_i8, SizeVal64);
+ Builder.CreateAlignedStore(CASExpected, AllocaCASExpected, AllocaAlignment);
+ Args.push_back(AllocaCASExpected_i8);
+ }
+
+ // 'val' argument ('desired' for cas), if present.
+ if (ValueOperand) {
+ if (UseSizedLibcall) {
+ Value *IntValue =
+ Builder.CreateBitOrPointerCast(ValueOperand, SizedIntTy);
+ Args.push_back(IntValue);
+ } else {
+ AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
+ AllocaValue->setAlignment(AllocaAlignment);
+ AllocaValue_i8 =
+ Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
+ Builder.CreateAlignedStore(ValueOperand, AllocaValue, AllocaAlignment);
+ Args.push_back(AllocaValue_i8);
+ }
+ }
+
+ // 'ret' argument.
+ if (!CASExpected && HasResult && !UseSizedLibcall) {
+ AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
+ AllocaResult->setAlignment(AllocaAlignment);
+ AllocaResult_i8 =
+ Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx));
+ Builder.CreateLifetimeStart(AllocaResult_i8, SizeVal64);
+ Args.push_back(AllocaResult_i8);
+ }
+
+ // 'ordering' ('success_order' for cas) argument.
+ Args.push_back(OrderingVal);
+
+ // 'failure_order' argument, if present.
+ if (Ordering2Val)
+ Args.push_back(Ordering2Val);
+
+ // Now, the return type.
+ if (CASExpected) {
+ ResultTy = Type::getInt1Ty(Ctx);
+ Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
+ } else if (HasResult && UseSizedLibcall)
+ ResultTy = SizedIntTy;
+ else
+ ResultTy = Type::getVoidTy(Ctx);
+
+ // Done with setting up arguments and return types, create the call:
+ SmallVector<Type *, 6> ArgTys;
+ for (Value *Arg : Args)
+ ArgTys.push_back(Arg->getType());
+ FunctionType *FnType = FunctionType::get(ResultTy, ArgTys, false);
+ Constant *LibcallFn =
+ M->getOrInsertFunction(TLI->getLibcallName(RTLibType), FnType, Attr);
+ CallInst *Call = Builder.CreateCall(LibcallFn, Args);
+ Call->setAttributes(Attr);
+ Value *Result = Call;
+
+ // And then, extract the results...
+ if (ValueOperand && !UseSizedLibcall)
+ Builder.CreateLifetimeEnd(AllocaValue_i8, SizeVal64);
+
+ if (CASExpected) {
+ // The final result from the CAS is {load of 'expected' alloca, bool result
+ // from call}
+ Type *FinalResultTy = I->getType();
+ Value *V = UndefValue::get(FinalResultTy);
+ Value *ExpectedOut =
+ Builder.CreateAlignedLoad(AllocaCASExpected, AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaCASExpected_i8, SizeVal64);
+ V = Builder.CreateInsertValue(V, ExpectedOut, 0);
+ V = Builder.CreateInsertValue(V, Result, 1);
+ I->replaceAllUsesWith(V);
+ } else if (HasResult) {
+ Value *V;
+ if (UseSizedLibcall)
+ V = Builder.CreateBitOrPointerCast(Result, I->getType());
+ else {
+ V = Builder.CreateAlignedLoad(AllocaResult, AllocaAlignment);
+ Builder.CreateLifetimeEnd(AllocaResult_i8, SizeVal64);
+ }
+ I->replaceAllUsesWith(V);
+ }
+ I->eraseFromParent();
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
new file mode 100644
index 000000000000..a67e194356d8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BasicTargetTransformInfo.cpp
@@ -0,0 +1,38 @@
+//===- BasicTargetTransformInfo.cpp - Basic target-independent TTI impl ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file provides the implementation of a basic TargetTransformInfo pass
+/// predicated on the target abstractions present in the target independent
+/// code generator. It uses these (primarily TargetLowering) to model as much
+/// of the TTI query interface as possible. It is included by most targets so
+/// that they can specialize only a small subset of the query space.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/TargetTransformInfoImpl.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "basictti"
+
+// This flag is used by the template base class for BasicTTIImpl, and here to
+// provide a definition.
+cl::opt<unsigned>
+ llvm::PartialUnrollingThreshold("partial-unrolling-threshold", cl::init(0),
+ cl::desc("Threshold for partial unrolling"),
+ cl::Hidden);
+
+BasicTTIImpl::BasicTTIImpl(const TargetMachine *TM, const Function &F)
+ : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
+ TLI(ST->getTargetLowering()) {}
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.cpp b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
new file mode 100644
index 000000000000..fa705761645f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.cpp
@@ -0,0 +1,1919 @@
+//===-- BranchFolding.cpp - Fold machine code branch instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass forwards branches to unconditional branches to make them branch
+// directly to the target block. This pass often results in dead MBB's, which
+// it then removes.
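+// For example (illustrative): if bb0 conditionally branches to bb1 and bb1
+// contains only an unconditional branch to bb2, bb0 is rewritten to branch
+// directly to bb2, which may leave bb1 dead.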
+//
+// Note that this pass must be run after register allocation; it cannot handle
+// SSA form. It also must handle virtual registers for targets that emit a
+// virtual ISA (e.g. NVPTX).
+//
+//===----------------------------------------------------------------------===//
+
+#include "BranchFolding.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "branchfolding"
+
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumBranchOpts, "Number of branches optimized");
+STATISTIC(NumTailMerge , "Number of block tails merged");
+STATISTIC(NumHoist , "Number of times common instructions are hoisted");
+
+static cl::opt<cl::boolOrDefault> FlagEnableTailMerge("enable-tail-merge",
+ cl::init(cl::BOU_UNSET), cl::Hidden);
+
+// Throttle for huge numbers of predecessors (compile speed problems)
+static cl::opt<unsigned>
+TailMergeThreshold("tail-merge-threshold",
+ cl::desc("Max number of predecessors to consider tail merging"),
+ cl::init(150), cl::Hidden);
+
+// Heuristic for tail merging (and, inversely, tail duplication).
+// TODO: This should be replaced with a target query.
+static cl::opt<unsigned>
+TailMergeSize("tail-merge-size",
+ cl::desc("Min number of instructions to consider tail merging"),
+ cl::init(3), cl::Hidden);
+
+namespace {
+ /// BranchFolderPass - Wrap branch folder in a machine function pass.
+ class BranchFolderPass : public MachineFunctionPass {
+ public:
+ static char ID;
+ explicit BranchFolderPass(): MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+}
+
+char BranchFolderPass::ID = 0;
+char &llvm::BranchFolderPassID = BranchFolderPass::ID;
+
+INITIALIZE_PASS(BranchFolderPass, "branch-folder",
+ "Control Flow Optimizer", false, false)
+
+bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // TailMerge can create jump into if branches that make CFG irreducible for
+ // HW that requires structurized CFG.
+ bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+ PassConfig->getEnableTailMerge();
+ BranchFolder::MBFIWrapper MBBFreqInfo(
+ getAnalysis<MachineBlockFrequencyInfo>());
+ BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
+ getAnalysis<MachineBranchProbabilityInfo>());
+ return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
+ MF.getSubtarget().getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>());
+}
+
+BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
+ MBFIWrapper &FreqInfo,
+ const MachineBranchProbabilityInfo &ProbInfo)
+ : EnableHoistCommonCode(CommonHoist), MBBFreqInfo(FreqInfo),
+ MBPI(ProbInfo) {
+ switch (FlagEnableTailMerge) {
+ case cl::BOU_UNSET: EnableTailMerge = defaultEnableTailMerge; break;
+ case cl::BOU_TRUE: EnableTailMerge = true; break;
+ case cl::BOU_FALSE: EnableTailMerge = false; break;
+ }
+}
+
+/// RemoveDeadBlock - Remove the specified dead machine basic block from the
+/// function, updating the CFG.
+void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+ MachineFunction *MF = MBB->getParent();
+ // drop all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end()-1);
+
+ // Avoid matching if this pointer gets reused.
+ TriedMerging.erase(MBB);
+
+ // Remove the block.
+ MF->erase(MBB);
+ FuncletMembership.erase(MBB);
+ if (MLI)
+ MLI->removeBlock(MBB);
+}
+
+/// OptimizeImpDefsBlock - If a basic block is just a bunch of implicit_def
+/// followed by terminators, and if the implicitly defined registers are not
+/// used by the terminators, remove those implicit_def's. e.g.
+/// BB1:
+/// r0 = implicit_def
+/// r1 = implicit_def
+/// br
+/// This block can be optimized away later if the implicit instructions are
+/// removed.
+bool BranchFolder::OptimizeImpDefsBlock(MachineBasicBlock *MBB) {
+ SmallSet<unsigned, 4> ImpDefRegs;
+ MachineBasicBlock::iterator I = MBB->begin();
+ while (I != MBB->end()) {
+ if (!I->isImplicitDef())
+ break;
+ unsigned Reg = I->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ ImpDefRegs.insert(*SubRegs);
+ } else {
+ ImpDefRegs.insert(Reg);
+ }
+ ++I;
+ }
+ if (ImpDefRegs.empty())
+ return false;
+
+ MachineBasicBlock::iterator FirstTerm = I;
+ while (I != MBB->end()) {
+ if (!TII->isUnpredicatedTerminator(*I))
+ return false;
+ // See if it uses any of the implicitly defined registers.
+ for (const MachineOperand &MO : I->operands()) {
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (ImpDefRegs.count(Reg))
+ return false;
+ }
+ ++I;
+ }
+
+ I = MBB->begin();
+ while (I != FirstTerm) {
+ MachineInstr *ImpDefMI = &*I;
+ ++I;
+ MBB->erase(ImpDefMI);
+ }
+
+ return true;
+}
+
+/// OptimizeFunction - Perform branch folding, tail merging and other
+/// CFG optimizations on the given function. Block placement changes the layout
+/// and may create new tail merging opportunities.
+bool BranchFolder::OptimizeFunction(MachineFunction &MF,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ MachineModuleInfo *mmi,
+ MachineLoopInfo *mli, bool AfterPlacement) {
+ if (!tii) return false;
+
+ TriedMerging.clear();
+
+ AfterBlockPlacement = AfterPlacement;
+ TII = tii;
+ TRI = tri;
+ MMI = mmi;
+ MLI = mli;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ UpdateLiveIns = MRI.tracksLiveness() && TRI->trackLivenessAfterRegAlloc(MF);
+ if (!UpdateLiveIns)
+ MRI.invalidateLiveness();
+
+ // Fix CFG. The later algorithms expect it to be right.
+ bool MadeChange = false;
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
+ MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+ MadeChange |= OptimizeImpDefsBlock(&MBB);
+ }
+
+ // Recalculate funclet membership.
+ FuncletMembership = getFuncletMembership(MF);
+
+ bool MadeChangeThisIteration = true;
+ while (MadeChangeThisIteration) {
+ MadeChangeThisIteration = TailMergeBlocks(MF);
+ // No need to clean up if tail merging does not change anything after the
+ // block placement.
+ if (!AfterBlockPlacement || MadeChangeThisIteration)
+ MadeChangeThisIteration |= OptimizeBranches(MF);
+ if (EnableHoistCommonCode)
+ MadeChangeThisIteration |= HoistCommonCode(MF);
+ MadeChange |= MadeChangeThisIteration;
+ }
+
+ // See if any jump tables have become dead as the code generator
+ // did its thing.
+ MachineJumpTableInfo *JTI = MF.getJumpTableInfo();
+ if (!JTI)
+ return MadeChange;
+
+ // Walk the function to find jump tables that are live.
+ BitVector JTIsLive(JTI->getJumpTables().size());
+ for (const MachineBasicBlock &BB : MF) {
+ for (const MachineInstr &I : BB)
+ for (const MachineOperand &Op : I.operands()) {
+ if (!Op.isJTI()) continue;
+
+ // Remember that this JT is live.
+ JTIsLive.set(Op.getIndex());
+ }
+ }
+
+ // Finally, remove dead jump tables. This happens when the
+ // indirect jump was unreachable (and thus deleted).
+ for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i)
+ if (!JTIsLive.test(i)) {
+ JTI->RemoveJumpTable(i);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Tail Merging of Blocks
+//===----------------------------------------------------------------------===//
+
+/// HashMachineInstr - Compute a hash value for MI and its operands.
+static unsigned HashMachineInstr(const MachineInstr &MI) {
+ unsigned Hash = MI.getOpcode();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Op = MI.getOperand(i);
+
+ // Merge in bits from the operand if easy. We can't use MachineOperand's
+ // hash_code here because it's not deterministic and we sort by hash value
+ // later.
+ unsigned OperandHash = 0;
+ switch (Op.getType()) {
+ case MachineOperand::MO_Register:
+ OperandHash = Op.getReg();
+ break;
+ case MachineOperand::MO_Immediate:
+ OperandHash = Op.getImm();
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OperandHash = Op.getMBB()->getNumber();
+ break;
+ case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
+ OperandHash = Op.getIndex();
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ case MachineOperand::MO_ExternalSymbol:
+ // Global address / external symbol are too hard, don't bother, but do
+ // pull in the offset.
+ OperandHash = Op.getOffset();
+ break;
+ default:
+ break;
+ }
+
+ Hash += ((OperandHash << 3) | Op.getType()) << (i & 31);
+ }
+ return Hash;
+}
+
+/// HashEndOfMBB - Hash the last instruction in the MBB.
+static unsigned HashEndOfMBB(const MachineBasicBlock &MBB) {
+ MachineBasicBlock::const_iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return 0;
+
+ return HashMachineInstr(*I);
+}
+
+/// ComputeCommonTailLength - Given two machine basic blocks, compute the number
+/// of instructions they actually have in common at their end. Return
+/// iterators for the first shared instruction in each block.
+static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2,
+ MachineBasicBlock::iterator &I1,
+ MachineBasicBlock::iterator &I2) {
+ I1 = MBB1->end();
+ I2 = MBB2->end();
+
+ unsigned TailLen = 0;
+ while (I1 != MBB1->begin() && I2 != MBB2->begin()) {
+ --I1; --I2;
+ // Skip debugging pseudos; necessary to avoid changing the code.
+ while (I1->isDebugValue()) {
+ if (I1==MBB1->begin()) {
+ while (I2->isDebugValue()) {
+ if (I2==MBB2->begin())
+ // I1==DBG at begin; I2==DBG at begin
+ return TailLen;
+ --I2;
+ }
+ ++I2;
+ // I1==DBG at begin; I2==non-DBG, or first of DBGs not at begin
+ return TailLen;
+ }
+ --I1;
+ }
+ // I1==first (untested) non-DBG preceding known match
+ while (I2->isDebugValue()) {
+ if (I2==MBB2->begin()) {
+ ++I1;
+ // I1==non-DBG, or first of DBGs not at begin; I2==DBG at begin
+ return TailLen;
+ }
+ --I2;
+ }
+ // I1, I2==first (untested) non-DBGs preceding known match
+ if (!I1->isIdenticalTo(*I2) ||
+ // FIXME: This check is dubious. It's used to get around a problem where
+ // people incorrectly expect inline asm directives to remain in the same
+ // relative order. This is untenable because normal compiler
+ // optimizations (like this one) may reorder and/or merge these
+ // directives.
+ I1->isInlineAsm()) {
+ ++I1; ++I2;
+ break;
+ }
+ ++TailLen;
+ }
+ // Back past possible debugging pseudos at beginning of block. This matters
+ // when one block differs from the other only by whether debugging pseudos
+ // are present at the beginning. (This way, the various checks later for
+ // I1==MBB1->begin() work as expected.)
+ if (I1 == MBB1->begin() && I2 != MBB2->begin()) {
+ --I2;
+ while (I2->isDebugValue()) {
+ if (I2 == MBB2->begin())
+ return TailLen;
+ --I2;
+ }
+ ++I2;
+ }
+ if (I2 == MBB2->begin() && I1 != MBB1->begin()) {
+ --I1;
+ while (I1->isDebugValue()) {
+ if (I1 == MBB1->begin())
+ return TailLen;
+ --I1;
+ }
+ ++I1;
+ }
+ return TailLen;
+}
+
+void BranchFolder::computeLiveIns(MachineBasicBlock &MBB) {
+ if (!UpdateLiveIns)
+ return;
+
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveOutsNoPristines(MBB);
+ for (MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend()))
+ LiveRegs.stepBackward(MI);
+
+ for (unsigned Reg : LiveRegs) {
+ // Skip the register if we are about to add one of its super registers.
+ bool ContainsSuperReg = false;
+ for (MCSuperRegIterator SReg(Reg, TRI); SReg.isValid(); ++SReg) {
+ if (LiveRegs.contains(*SReg)) {
+ ContainsSuperReg = true;
+ break;
+ }
+ }
+ if (ContainsSuperReg)
+ continue;
+ MBB.addLiveIn(Reg);
+ }
+}
+
+/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock *NewDest) {
+ TII->ReplaceTailWithBranchTo(OldInst, NewDest);
+
+ computeLiveIns(*NewDest);
+
+ ++NumTailMerge;
+}
+
+/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
+/// MBB so that the part before the iterator falls through into the part
+/// starting at the iterator, which is moved into a new block. This returns the
+/// new MBB.
+MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1,
+ const BasicBlock *BB) {
+ if (!TII->isLegalToSplitMBBAt(CurMBB, BBI1))
+ return nullptr;
+
+ MachineFunction &MF = *CurMBB.getParent();
+
+ // Create the fall-through block.
+ MachineFunction::iterator MBBI = CurMBB.getIterator();
+  MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(BB);
+ CurMBB.getParent()->insert(++MBBI, NewMBB);
+
+ // Move all the successors of this block to the specified block.
+ NewMBB->transferSuccessors(&CurMBB);
+
+ // Add an edge from CurMBB to NewMBB for the fall-through.
+ CurMBB.addSuccessor(NewMBB);
+
+ // Splice the code over.
+ NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
+
+ // NewMBB belongs to the same loop as CurMBB.
+ if (MLI)
+ if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
+ ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
+
+ // NewMBB inherits CurMBB's block frequency.
+ MBBFreqInfo.setBlockFreq(NewMBB, MBBFreqInfo.getBlockFreq(&CurMBB));
+
+ computeLiveIns(*NewMBB);
+
+ // Add the new block to the funclet.
+ const auto &FuncletI = FuncletMembership.find(&CurMBB);
+ if (FuncletI != FuncletMembership.end()) {
+ auto n = FuncletI->second;
+ FuncletMembership[NewMBB] = n;
+ }
+
+ return NewMBB;
+}
+
+/// EstimateRuntime - Make a rough estimate of how long it will take to run
+/// the specified code.
+static unsigned EstimateRuntime(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E) {
+ unsigned Time = 0;
+ for (; I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+ if (I->isCall())
+ Time += 10;
+ else if (I->mayLoad() || I->mayStore())
+ Time += 2;
+ else
+ ++Time;
+ }
+ return Time;
+}
+
+// CurMBB needs to add an unconditional branch to SuccBB (we removed these
+// branches temporarily for tail merging). In the case where CurMBB ends
+// with a conditional branch to the next block, optimize by reversing the
+// test and conditionally branching to SuccBB instead.
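+// For example (illustrative): instead of re-inserting 'Bcc NextBB; B SuccBB',
+// the test is reversed to 'Bncc SuccBB', letting CurMBB fall through to
+// NextBB.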
+static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
+ const TargetInstrInfo *TII) {
+ MachineFunction *MF = CurMBB->getParent();
+ MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ DebugLoc dl; // FIXME: this is nowhere
+ if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
+ MachineBasicBlock *NextBB = &*I;
+ if (TBB == NextBB && !Cond.empty() && !FBB) {
+ if (!TII->ReverseBranchCondition(Cond)) {
+ TII->RemoveBranch(*CurMBB);
+ TII->InsertBranch(*CurMBB, SuccBB, nullptr, Cond, dl);
+ return;
+ }
+ }
+ }
+ TII->InsertBranch(*CurMBB, SuccBB, nullptr,
+ SmallVector<MachineOperand, 0>(), dl);
+}
+
+bool
+BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
+ if (getHash() < o.getHash())
+ return true;
+ if (getHash() > o.getHash())
+ return false;
+ if (getBlock()->getNumber() < o.getBlock()->getNumber())
+ return true;
+ if (getBlock()->getNumber() > o.getBlock()->getNumber())
+ return false;
+ // _GLIBCXX_DEBUG checks strict weak ordering, which involves comparing
+ // an object with itself.
+#ifndef _GLIBCXX_DEBUG
+ llvm_unreachable("Predecessor appears twice");
+#else
+ return false;
+#endif
+}
+
+BlockFrequency
+BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
+ auto I = MergedBBFreq.find(MBB);
+
+ if (I != MergedBBFreq.end())
+ return I->second;
+
+ return MBFI.getBlockFreq(MBB);
+}
+
+void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
+ BlockFrequency F) {
+ MergedBBFreq[MBB] = F;
+}
+
+raw_ostream &
+BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
+}
+
+raw_ostream &
+BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI.printBlockFreq(OS, Freq);
+}
+
+/// CountTerminators - Count the number of terminators in the given
+/// block and set I to the position of the first non-terminator, if there
+/// is one, or MBB->end() otherwise.
+static unsigned CountTerminators(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator &I) {
+ I = MBB->end();
+ unsigned NumTerms = 0;
+ for (;;) {
+ if (I == MBB->begin()) {
+ I = MBB->end();
+ break;
+ }
+ --I;
+ if (!I->isTerminator()) break;
+ ++NumTerms;
+ }
+ return NumTerms;
+}
+
+/// ProfitableToMerge - Check if two machine basic blocks have a common tail
+/// and decide if it would be profitable to merge those tails. Return the
+/// length of the common tail and iterators to the first common instruction
+/// in each block.
+static bool
+ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
+ unsigned minCommonTailLength, unsigned &CommonTailLen,
+ MachineBasicBlock::iterator &I1,
+ MachineBasicBlock::iterator &I2, MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB,
+ DenseMap<const MachineBasicBlock *, int> &FuncletMembership) {
+ // It is never profitable to tail-merge blocks from two different funclets.
+ if (!FuncletMembership.empty()) {
+ auto Funclet1 = FuncletMembership.find(MBB1);
+ assert(Funclet1 != FuncletMembership.end());
+ auto Funclet2 = FuncletMembership.find(MBB2);
+ assert(Funclet2 != FuncletMembership.end());
+ if (Funclet1->second != Funclet2->second)
+ return false;
+ }
+
+ CommonTailLen = ComputeCommonTailLength(MBB1, MBB2, I1, I2);
+ if (CommonTailLen == 0)
+ return false;
+ DEBUG(dbgs() << "Common tail length of BB#" << MBB1->getNumber()
+ << " and BB#" << MBB2->getNumber() << " is " << CommonTailLen
+ << '\n');
+
+ // It's almost always profitable to merge any number of non-terminator
+ // instructions with the block that falls through into the common successor.
+ if (MBB1 == PredBB || MBB2 == PredBB) {
+ MachineBasicBlock::iterator I;
+ unsigned NumTerms = CountTerminators(MBB1 == PredBB ? MBB2 : MBB1, I);
+ if (CommonTailLen > NumTerms)
+ return true;
+ }
+
+ // If one of the blocks can be completely merged and happens to be in
+ // a position where the other could fall through into it, merge any number
+ // of instructions, because it can be done without a branch.
+ // TODO: If the blocks are not adjacent, move one of them so that they are?
+ if (MBB1->isLayoutSuccessor(MBB2) && I2 == MBB2->begin())
+ return true;
+ if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
+ return true;
+
+ // If both blocks have an unconditional branch temporarily stripped out,
+ // count that as an additional common instruction for the following
+ // heuristics.
+ unsigned EffectiveTailLen = CommonTailLen;
+ if (SuccBB && MBB1 != PredBB && MBB2 != PredBB &&
+ !MBB1->back().isBarrier() &&
+ !MBB2->back().isBarrier())
+ ++EffectiveTailLen;
+
+ // Check if the common tail is long enough to be worthwhile.
+ if (EffectiveTailLen >= minCommonTailLength)
+ return true;
+
+ // If we are optimizing for code size, 2 instructions in common is enough if
+ // we don't have to split a block. At worst we will be introducing 1 new
+ // branch instruction, which is likely to be smaller than the 2
+ // instructions that would be deleted in the merge.
+ MachineFunction *MF = MBB1->getParent();
+ return EffectiveTailLen >= 2 && MF->getFunction()->optForSize() &&
+ (I1 == MBB1->begin() || I2 == MBB2->begin());
+}
+
+/// ComputeSameTails - Look through all the blocks in MergePotentials that have
+/// hash CurHash (guaranteed to match the last element). Build the vector
+/// SameTails of all those that have the (same) largest number of instructions
+/// in common of any pair of these blocks. SameTails entries contain an
+/// iterator into MergePotentials (from which the MachineBasicBlock can be
+/// found) and a MachineBasicBlock::iterator into that MBB indicating the
+/// instruction where the matching code sequence begins.
+/// Order of elements in SameTails is the reverse of the order in which
+/// those blocks appear in MergePotentials (where they are not necessarily
+/// consecutive).
+unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
+ unsigned minCommonTailLength,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB) {
+ unsigned maxCommonTailLength = 0U;
+ SameTails.clear();
+ MachineBasicBlock::iterator TrialBBI1, TrialBBI2;
+ MPIterator HighestMPIter = std::prev(MergePotentials.end());
+ for (MPIterator CurMPIter = std::prev(MergePotentials.end()),
+ B = MergePotentials.begin();
+ CurMPIter != B && CurMPIter->getHash() == CurHash; --CurMPIter) {
+ for (MPIterator I = std::prev(CurMPIter); I->getHash() == CurHash; --I) {
+ unsigned CommonTailLen;
+ if (ProfitableToMerge(CurMPIter->getBlock(), I->getBlock(),
+ minCommonTailLength,
+ CommonTailLen, TrialBBI1, TrialBBI2,
+ SuccBB, PredBB,
+ FuncletMembership)) {
+ if (CommonTailLen > maxCommonTailLength) {
+ SameTails.clear();
+ maxCommonTailLength = CommonTailLen;
+ HighestMPIter = CurMPIter;
+ SameTails.push_back(SameTailElt(CurMPIter, TrialBBI1));
+ }
+ if (HighestMPIter == CurMPIter &&
+ CommonTailLen == maxCommonTailLength)
+ SameTails.push_back(SameTailElt(I, TrialBBI2));
+ }
+ if (I == B)
+ break;
+ }
+ }
+ return maxCommonTailLength;
+}
+
+/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
+/// MergePotentials, restoring branches at ends of blocks as appropriate.
+void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB) {
+ MPIterator CurMPIter, B;
+ for (CurMPIter = std::prev(MergePotentials.end()),
+ B = MergePotentials.begin();
+ CurMPIter->getHash() == CurHash; --CurMPIter) {
+ // Put the unconditional branch back, if we need one.
+ MachineBasicBlock *CurMBB = CurMPIter->getBlock();
+ if (SuccBB && CurMBB != PredBB)
+ FixTail(CurMBB, SuccBB, TII);
+ if (CurMPIter == B)
+ break;
+ }
+ if (CurMPIter->getHash() != CurHash)
+ CurMPIter++;
+ MergePotentials.erase(CurMPIter, MergePotentials.end());
+}
+
+/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
+/// only of the common tail. Create a block that does by splitting one.
+bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ MachineBasicBlock *SuccBB,
+ unsigned maxCommonTailLength,
+ unsigned &commonTailIndex) {
+ commonTailIndex = 0;
+ unsigned TimeEstimate = ~0U;
+ for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+ // Use PredBB if possible; that doesn't require a new branch.
+ if (SameTails[i].getBlock() == PredBB) {
+ commonTailIndex = i;
+ break;
+ }
+ // Otherwise, make a (fairly bogus) choice based on estimate of
+ // how long it will take the various blocks to execute.
+ unsigned t = EstimateRuntime(SameTails[i].getBlock()->begin(),
+ SameTails[i].getTailStartPos());
+ if (t <= TimeEstimate) {
+ TimeEstimate = t;
+ commonTailIndex = i;
+ }
+ }
+
+ MachineBasicBlock::iterator BBI =
+ SameTails[commonTailIndex].getTailStartPos();
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ // If the common tail includes any debug info we will take it pretty
+ // randomly from one of the inputs. Might be better to remove it?
+ DEBUG(dbgs() << "\nSplitting BB#" << MBB->getNumber() << ", size "
+ << maxCommonTailLength);
+
+ // If the split block unconditionally falls-thru to SuccBB, it will be
+ // merged. In control flow terms it should then take SuccBB's name. e.g. If
+ // SuccBB is an inner loop, the common tail is still part of the inner loop.
+ const BasicBlock *BB = (SuccBB && MBB->succ_size() == 1) ?
+ SuccBB->getBasicBlock() : MBB->getBasicBlock();
+ MachineBasicBlock *newMBB = SplitMBBAt(*MBB, BBI, BB);
+ if (!newMBB) {
+ DEBUG(dbgs() << "... failed!");
+ return false;
+ }
+
+ SameTails[commonTailIndex].setBlock(newMBB);
+ SameTails[commonTailIndex].setTailStartPos(newMBB->begin());
+
+ // If we split PredBB, newMBB is the new predecessor.
+ if (PredBB == MBB)
+ PredBB = newMBB;
+
+ return true;
+}
+
+static void
+mergeMMOsFromMemoryOperations(MachineBasicBlock::iterator MBBIStartPos,
+ MachineBasicBlock &MBBCommon) {
+ // Merge MMOs from memory operations in the common block.
+ MachineBasicBlock *MBB = MBBIStartPos->getParent();
+  // Note that CommonTailLen does not necessarily match the size of
+  // the common BB nor all its instructions, because of differences
+  // in debug instructions.
+ unsigned CommonTailLen = 0;
+ for (auto E = MBB->end(); MBBIStartPos != E; ++MBBIStartPos)
+ ++CommonTailLen;
+
+ MachineBasicBlock::reverse_iterator MBBI = MBB->rbegin();
+ MachineBasicBlock::reverse_iterator MBBIE = MBB->rend();
+ MachineBasicBlock::reverse_iterator MBBICommon = MBBCommon.rbegin();
+ MachineBasicBlock::reverse_iterator MBBIECommon = MBBCommon.rend();
+
+ while (CommonTailLen--) {
+ assert(MBBI != MBBIE && "Reached BB end within common tail length!");
+ (void)MBBIE;
+
+ if (MBBI->isDebugValue()) {
+ ++MBBI;
+ continue;
+ }
+
+ while ((MBBICommon != MBBIECommon) && MBBICommon->isDebugValue())
+ ++MBBICommon;
+
+ assert(MBBICommon != MBBIECommon &&
+ "Reached BB end within common tail length!");
+ assert(MBBICommon->isIdenticalTo(*MBBI) && "Expected matching MIIs!");
+
+ if (MBBICommon->mayLoad() || MBBICommon->mayStore())
+ MBBICommon->setMemRefs(MBBICommon->mergeMemRefsWith(*MBBI));
+
+ ++MBBI;
+ ++MBBICommon;
+ }
+}
+
+// See if any of the blocks in MergePotentials (which all have SuccBB as a
+// successor, or all have no successor if it is null) can be tail-merged.
+// If there is a successor, any blocks in MergePotentials that are not
+// tail-merged and are not immediately before Succ must have an unconditional
+// branch to Succ added (but the predecessor/successor lists need no
+// adjustment). The lone predecessor of Succ that falls through into Succ,
+// if any, is given in PredBB.
+bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB) {
+ bool MadeChange = false;
+
+ // Except for the special cases below, tail-merge if there are at least
+ // this many instructions in common.
+ unsigned minCommonTailLength = TailMergeSize;
+
+ DEBUG(dbgs() << "\nTryTailMergeBlocks: ";
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+ dbgs() << "BB#" << MergePotentials[i].getBlock()->getNumber()
+ << (i == e-1 ? "" : ", ");
+ dbgs() << "\n";
+ if (SuccBB) {
+ dbgs() << " with successor BB#" << SuccBB->getNumber() << '\n';
+ if (PredBB)
+ dbgs() << " which has fall-through from BB#"
+ << PredBB->getNumber() << "\n";
+ }
+ dbgs() << "Looking for common tails of at least "
+ << minCommonTailLength << " instruction"
+ << (minCommonTailLength == 1 ? "" : "s") << '\n';
+ );
+
+ // Sort by hash value so that blocks with identical end sequences sort
+ // together.
+ array_pod_sort(MergePotentials.begin(), MergePotentials.end());
+
+ // Walk through equivalence sets looking for actual exact matches.
+ while (MergePotentials.size() > 1) {
+ unsigned CurHash = MergePotentials.back().getHash();
+
+ // Build SameTails, identifying the set of blocks with this hash code
+ // and with the maximum number of instructions in common.
+ unsigned maxCommonTailLength = ComputeSameTails(CurHash,
+ minCommonTailLength,
+ SuccBB, PredBB);
+
+ // If we didn't find any pair that has at least minCommonTailLength
+ // instructions in common, remove all blocks with this hash code and retry.
+ if (SameTails.empty()) {
+ RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+ continue;
+ }
+
+ // If one of the blocks is the entire common tail (and not the entry
+ // block, which we can't jump to), we can treat all blocks with this same
+ // tail at once. Use PredBB if that is one of the possibilities, as that
+ // will not introduce any extra branches.
+ MachineBasicBlock *EntryBB =
+ &MergePotentials.front().getBlock()->getParent()->front();
+ unsigned commonTailIndex = SameTails.size();
+ // If there are two blocks, check to see if one can be made to fall through
+ // into the other.
+ if (SameTails.size() == 2 &&
+ SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
+ SameTails[1].tailIsWholeBlock())
+ commonTailIndex = 1;
+ else if (SameTails.size() == 2 &&
+ SameTails[1].getBlock()->isLayoutSuccessor(
+ SameTails[0].getBlock()) &&
+ SameTails[0].tailIsWholeBlock())
+ commonTailIndex = 0;
+ else {
+ // Otherwise just pick one, favoring the fall-through predecessor if
+ // there is one.
+ for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+ MachineBasicBlock *MBB = SameTails[i].getBlock();
+ if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
+ continue;
+ if (MBB == PredBB) {
+ commonTailIndex = i;
+ break;
+ }
+ if (SameTails[i].tailIsWholeBlock())
+ commonTailIndex = i;
+ }
+ }
+
+ if (commonTailIndex == SameTails.size() ||
+ (SameTails[commonTailIndex].getBlock() == PredBB &&
+ !SameTails[commonTailIndex].tailIsWholeBlock())) {
+ // None of the blocks consist entirely of the common tail.
+ // Split a block so that one does.
+ if (!CreateCommonTailOnlyBlock(PredBB, SuccBB,
+ maxCommonTailLength, commonTailIndex)) {
+ RemoveBlocksWithHash(CurHash, SuccBB, PredBB);
+ continue;
+ }
+ }
+
+ MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+ // Recompute common tail MBB's edge weights and block frequency.
+ setCommonTailEdgeWeights(*MBB);
+
+ // MBB is common tail. Adjust all other BB's to jump to this one.
+ // Traversal must be forwards so erases work.
+ DEBUG(dbgs() << "\nUsing common tail in BB#" << MBB->getNumber()
+ << " for ");
+ for (unsigned int i=0, e = SameTails.size(); i != e; ++i) {
+ if (commonTailIndex == i)
+ continue;
+ DEBUG(dbgs() << "BB#" << SameTails[i].getBlock()->getNumber()
+ << (i == e-1 ? "" : ", "));
+ // Merge MMOs from memory operations as needed.
+ mergeMMOsFromMemoryOperations(SameTails[i].getTailStartPos(), *MBB);
+ // Hack the end off BB i, making it jump to BB commonTailIndex instead.
+ ReplaceTailWithBranchTo(SameTails[i].getTailStartPos(), MBB);
+ // BB i is no longer a predecessor of SuccBB; remove it from the worklist.
+ MergePotentials.erase(SameTails[i].getMPIter());
+ }
+ DEBUG(dbgs() << "\n");
+ // We leave commonTailIndex in the worklist in case there are other blocks
+ // that match it with a smaller number of instructions.
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
+ bool MadeChange = false;
+ if (!EnableTailMerge) return MadeChange;
+
+ // First find blocks with no successors.
+ // Block placement does not create new tail merging opportunities for these
+ // blocks.
+ if (!AfterBlockPlacement) {
+ MergePotentials.clear();
+ for (MachineBasicBlock &MBB : MF) {
+ if (MergePotentials.size() == TailMergeThreshold)
+ break;
+ if (!TriedMerging.count(&MBB) && MBB.succ_empty())
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(MBB), &MBB));
+ }
+
+ // If this is a large problem, avoid visiting the same basic blocks
+ // multiple times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+ TriedMerging.insert(MergePotentials[i].getBlock());
+
+ // See if we can do any tail merging on those.
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(nullptr, nullptr);
+ }
+
+ // Look at blocks (IBB) with multiple predecessors (PBB).
+ // We change each predecessor to a canonical form, by
+ // (1) temporarily removing any unconditional branch from the predecessor
+ // to IBB, and
+ // (2) alter conditional branches so they branch to the other block
+ // not IBB; this may require adding back an unconditional branch to IBB
+ // later, where there wasn't one coming in. E.g.
+ // Bcc IBB
+ // fallthrough to QBB
+ // here becomes
+ // Bncc QBB
+ // with a conceptual B to IBB after that, which never actually exists.
+ // With those changes, we see whether the predecessors' tails match,
+ // and merge them if so. We change things out of canonical form and
+ // back to the way they were later in the process. (OptimizeBranches
+ // would undo some of this, but we can't use it, because we'd get into
+ // a compile-time infinite loop repeatedly doing and undoing the same
+ // transformations.)
+
+ for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+ I != E; ++I) {
+ if (I->pred_size() < 2) continue;
+ SmallPtrSet<MachineBasicBlock *, 8> UniquePreds;
+ MachineBasicBlock *IBB = &*I;
+ MachineBasicBlock *PredBB = &*std::prev(I);
+ MergePotentials.clear();
+ for (MachineBasicBlock *PBB : I->predecessors()) {
+ if (MergePotentials.size() == TailMergeThreshold)
+ break;
+
+ if (TriedMerging.count(PBB))
+ continue;
+
+      // Skip blocks that loop to themselves; we can't tail merge these.
+ if (PBB == IBB)
+ continue;
+
+ // Visit each predecessor only once.
+ if (!UniquePreds.insert(PBB).second)
+ continue;
+
+ // Skip blocks which may jump to a landing pad. Can't tail merge these.
+ if (PBB->hasEHPadSuccessor())
+ continue;
+
+ // Bail out if the loop header (IBB) is not the top of the loop chain
+ // after the block placement. Otherwise, the common tail of IBB's
+ // predecessors may become the loop top if block placement is called again
+ // and the predecessors may branch to this common tail.
+      // FIXME: Relax this check if the algorithm for finding the loop top is
+      // changed in MBP.
+ if (AfterBlockPlacement && MLI)
+ if (MachineLoop *ML = MLI->getLoopFor(IBB))
+ if (IBB == ML->getHeader() && ML == MLI->getLoopFor(PBB))
+ continue;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*PBB, TBB, FBB, Cond, true)) {
+ // Failing case: IBB is the target of a cbr, and we cannot reverse the
+ // branch.
+ SmallVector<MachineOperand, 4> NewCond(Cond);
+ if (!Cond.empty() && TBB == IBB) {
+ if (TII->ReverseBranchCondition(NewCond))
+ continue;
+ // This is the QBB case described above
+ if (!FBB) {
+ auto Next = ++PBB->getIterator();
+ if (Next != MF.end())
+ FBB = &*Next;
+ }
+ }
+
+ // Failing case: the only way IBB can be reached from PBB is via
+ // exception handling. Happens for landing pads. Would be nice to have
+ // a bit in the edge so we didn't have to do all this.
+ if (IBB->isEHPad()) {
+ MachineFunction::iterator IP = ++PBB->getIterator();
+ MachineBasicBlock *PredNextBB = nullptr;
+ if (IP != MF.end())
+ PredNextBB = &*IP;
+ if (!TBB) {
+ if (IBB != PredNextBB) // fallthrough
+ continue;
+ } else if (FBB) {
+ if (TBB != IBB && FBB != IBB) // cbr then ubr
+ continue;
+ } else if (Cond.empty()) {
+ if (TBB != IBB) // ubr
+ continue;
+ } else {
+ if (TBB != IBB && IBB != PredNextBB) // cbr
+ continue;
+ }
+ }
+
+ // Remove the unconditional branch at the end, if any.
+ if (TBB && (Cond.empty() || FBB)) {
+ DebugLoc dl; // FIXME: this is nowhere
+ TII->RemoveBranch(*PBB);
+ if (!Cond.empty())
+ // reinsert conditional branch only, for now
+ TII->InsertBranch(*PBB, (TBB == IBB) ? FBB : TBB, nullptr,
+ NewCond, dl);
+ }
+
+ MergePotentials.push_back(MergePotentialsElt(HashEndOfMBB(*PBB), PBB));
+ }
+ }
+
+ // If this is a large problem, avoid visiting the same basic blocks multiple
+ // times.
+ if (MergePotentials.size() == TailMergeThreshold)
+ for (unsigned i = 0, e = MergePotentials.size(); i != e; ++i)
+ TriedMerging.insert(MergePotentials[i].getBlock());
+
+ if (MergePotentials.size() >= 2)
+ MadeChange |= TryTailMergeBlocks(IBB, PredBB);
+
+ // Reinsert an unconditional branch if needed. The 1 below can occur as a
+ // result of removing blocks in TryTailMergeBlocks.
+ PredBB = &*std::prev(I); // this may have been changed in TryTailMergeBlocks
+ if (MergePotentials.size() == 1 &&
+ MergePotentials.begin()->getBlock() != PredBB)
+ FixTail(MergePotentials.begin()->getBlock(), IBB, TII);
+ }
+
+ return MadeChange;
+}
+
+void BranchFolder::setCommonTailEdgeWeights(MachineBasicBlock &TailMBB) {
+ SmallVector<BlockFrequency, 2> EdgeFreqLs(TailMBB.succ_size());
+ BlockFrequency AccumulatedMBBFreq;
+
+ // Aggregate edge frequency of successor edge j:
+ // edgeFreq(j) = sum (freq(bb) * edgeProb(bb, j)),
+ // where bb is a basic block that is in SameTails.
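+  // For example (illustrative numbers): with two merged predecessors of
+  // frequencies 8 and 4, each reaching successor j with probability 1/2,
+  // edgeFreq(j) = 8*1/2 + 4*1/2 = 6.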
+ for (const auto &Src : SameTails) {
+ const MachineBasicBlock *SrcMBB = Src.getBlock();
+ BlockFrequency BlockFreq = MBBFreqInfo.getBlockFreq(SrcMBB);
+ AccumulatedMBBFreq += BlockFreq;
+
+    // It is not necessary to recompute edge weights if TailMBB has fewer than
+    // two successors.
+ if (TailMBB.succ_size() <= 1)
+ continue;
+
+ auto EdgeFreq = EdgeFreqLs.begin();
+
+ for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
+ SuccI != SuccE; ++SuccI, ++EdgeFreq)
+ *EdgeFreq += BlockFreq * MBPI.getEdgeProbability(SrcMBB, *SuccI);
+ }
+
+ MBBFreqInfo.setBlockFreq(&TailMBB, AccumulatedMBBFreq);
+
+ if (TailMBB.succ_size() <= 1)
+ return;
+
+ auto SumEdgeFreq =
+ std::accumulate(EdgeFreqLs.begin(), EdgeFreqLs.end(), BlockFrequency(0))
+ .getFrequency();
+ auto EdgeFreq = EdgeFreqLs.begin();
+
+ if (SumEdgeFreq > 0) {
+ for (auto SuccI = TailMBB.succ_begin(), SuccE = TailMBB.succ_end();
+ SuccI != SuccE; ++SuccI, ++EdgeFreq) {
+ auto Prob = BranchProbability::getBranchProbability(
+ EdgeFreq->getFrequency(), SumEdgeFreq);
+ TailMBB.setSuccProbability(SuccI, Prob);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Branch Optimization
+//===----------------------------------------------------------------------===//
+
+bool BranchFolder::OptimizeBranches(MachineFunction &MF) {
+ bool MadeChange = false;
+
+  // Make sure blocks are numbered in order.
+ MF.RenumberBlocks();
+ // Renumbering blocks alters funclet membership, recalculate it.
+ FuncletMembership = getFuncletMembership(MF);
+
+ for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
+ I != E; ) {
+ MachineBasicBlock *MBB = &*I++;
+ MadeChange |= OptimizeBlock(MBB);
+
+ // If it is dead, remove it.
+ if (MBB->pred_empty()) {
+ RemoveDeadBlock(MBB);
+ MadeChange = true;
+ ++NumDeadBlocks;
+ }
+ }
+
+ return MadeChange;
+}
+
+// Blocks should be considered empty if they contain only debug info;
+// else the debug info would affect codegen.
+static bool IsEmptyBlock(MachineBasicBlock *MBB) {
+ return MBB->getFirstNonDebugInstr() == MBB->end();
+}
+
+// Blocks with only debug info and branches should be considered the same
+// as blocks with only branches.
+static bool IsBranchOnlyBlock(MachineBasicBlock *MBB) {
+ MachineBasicBlock::iterator I = MBB->getFirstNonDebugInstr();
+ assert(I != MBB->end() && "empty block!");
+ return I->isBranch();
+}
+
+/// IsBetterFallthrough - Return true if it would be clearly better to
+/// fall-through to MBB1 than to fall through into MBB2. This has to return
+/// a strict ordering; returning true for both (MBB1,MBB2) and (MBB2,MBB1) will
+/// result in infinite loops.
+static bool IsBetterFallthrough(MachineBasicBlock *MBB1,
+ MachineBasicBlock *MBB2) {
+ // Right now, we use a simple heuristic. If MBB2 ends with a call, and
+ // MBB1 doesn't, we prefer to fall through into MBB1. This allows us to
+ // optimize branches that branch to either a return block or an assert block
+ // into a fallthrough to the return.
+ MachineBasicBlock::iterator MBB1I = MBB1->getLastNonDebugInstr();
+ MachineBasicBlock::iterator MBB2I = MBB2->getLastNonDebugInstr();
+ if (MBB1I == MBB1->end() || MBB2I == MBB2->end())
+ return false;
+
+ // If there is a clear successor ordering we make sure that one block
+ // will fall through to the next
+ if (MBB1->isSuccessor(MBB2)) return true;
+ if (MBB2->isSuccessor(MBB1)) return false;
+
+ return MBB2I->isCall() && !MBB1I->isCall();
+}
+
+/// getBranchDebugLoc - Find and return, if any, the DebugLoc of the branch
+/// instructions on the block.
+static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I != MBB.end() && I->isBranch())
+ return I->getDebugLoc();
+ return DebugLoc();
+}
+
+/// OptimizeBlock - Analyze and optimize control flow related to the specified
+/// block. This is never called on the entry block.
+bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
+ bool MadeChange = false;
+ MachineFunction &MF = *MBB->getParent();
+ReoptimizeBlock:
+
+ MachineFunction::iterator FallThrough = MBB->getIterator();
+ ++FallThrough;
+
+ // Make sure MBB and FallThrough belong to the same funclet.
+ bool SameFunclet = true;
+ if (!FuncletMembership.empty() && FallThrough != MF.end()) {
+ auto MBBFunclet = FuncletMembership.find(MBB);
+ assert(MBBFunclet != FuncletMembership.end());
+ auto FallThroughFunclet = FuncletMembership.find(&*FallThrough);
+ assert(FallThroughFunclet != FuncletMembership.end());
+ SameFunclet = MBBFunclet->second == FallThroughFunclet->second;
+ }
+
+ // If this block is empty, make everyone use its fall-through, not the block
+ // explicitly. Landing pads should not do this since the landing-pad table
+ // points to this block. Blocks with their addresses taken shouldn't be
+ // optimized away.
+ if (IsEmptyBlock(MBB) && !MBB->isEHPad() && !MBB->hasAddressTaken() &&
+ SameFunclet) {
+ // Dead block? Leave for cleanup later.
+ if (MBB->pred_empty()) return MadeChange;
+
+ if (FallThrough == MF.end()) {
+ // TODO: Simplify preds to not branch here if possible!
+ } else if (FallThrough->isEHPad()) {
+      // Don't rewrite to a landing pad fallthrough. That could lead to the case
+ // where a BB jumps to more than one landing pad.
+ // TODO: Is it ever worth rewriting predecessors which don't already
+ // jump to a landing pad, and so can safely jump to the fallthrough?
+ } else if (MBB->isSuccessor(&*FallThrough)) {
+ // Rewrite all predecessors of the old block to go to the fallthrough
+ // instead.
+ while (!MBB->pred_empty()) {
+ MachineBasicBlock *Pred = *(MBB->pred_end()-1);
+ Pred->ReplaceUsesOfBlockWith(MBB, &*FallThrough);
+ }
+ // If MBB was the target of a jump table, update jump tables to go to the
+ // fallthrough instead.
+ if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+ MJTI->ReplaceMBBInJumpTables(MBB, &*FallThrough);
+ MadeChange = true;
+ }
+ return MadeChange;
+ }
+
+ // Check to see if we can simplify the terminator of the block before this
+ // one.
+ MachineBasicBlock &PrevBB = *std::prev(MachineFunction::iterator(MBB));
+
+ MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
+ SmallVector<MachineOperand, 4> PriorCond;
+ bool PriorUnAnalyzable =
+ TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
+ if (!PriorUnAnalyzable) {
+ // If the CFG for the prior block has extra edges, remove them.
+ MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
+ !PriorCond.empty());
+
+ // If the previous branch is conditional and both conditions go to the same
+ // destination, remove the branch, replacing it with an unconditional one or
+ // a fall-through.
+ if (PriorTBB && PriorTBB == PriorFBB) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->RemoveBranch(PrevBB);
+ PriorCond.clear();
+ if (PriorTBB != MBB)
+ TII->InsertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the previous block unconditionally falls through to this block and
+ // this block has no other predecessors, move the contents of this block
+ // into the prior block. This doesn't usually happen when SimplifyCFG
+ // has been used, but it can happen if tail merging splits a fall-through
+ // predecessor of a block.
+ // This has to check PrevBB->succ_size() because EH edges are ignored by
+ // AnalyzeBranch.
+ if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
+ PrevBB.succ_size() == 1 &&
+ !MBB->hasAddressTaken() && !MBB->isEHPad()) {
+ DEBUG(dbgs() << "\nMerging into block: " << PrevBB
+ << "From MBB: " << *MBB);
+ // Remove redundant DBG_VALUEs first.
+ if (PrevBB.begin() != PrevBB.end()) {
+ MachineBasicBlock::iterator PrevBBIter = PrevBB.end();
+ --PrevBBIter;
+ MachineBasicBlock::iterator MBBIter = MBB->begin();
+ // Check if DBG_VALUE at the end of PrevBB is identical to the
+ // DBG_VALUE at the beginning of MBB.
+ while (PrevBBIter != PrevBB.begin() && MBBIter != MBB->end()
+ && PrevBBIter->isDebugValue() && MBBIter->isDebugValue()) {
+ if (!MBBIter->isIdenticalTo(*PrevBBIter))
+ break;
+ MachineInstr &DuplicateDbg = *MBBIter;
+          ++MBBIter; --PrevBBIter;
+ DuplicateDbg.eraseFromParent();
+ }
+ }
+ PrevBB.splice(PrevBB.end(), MBB, MBB->begin(), MBB->end());
+ PrevBB.removeSuccessor(PrevBB.succ_begin());
+ assert(PrevBB.succ_empty());
+ PrevBB.transferSuccessors(MBB);
+ MadeChange = true;
+ return MadeChange;
+ }
+
+ // If the previous branch *only* branches to *this* block (conditional or
+ // not) remove the branch.
+ if (PriorTBB == MBB && !PriorFBB) {
+ TII->RemoveBranch(PrevBB);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the prior block branches somewhere else on the condition and here if
+ // the condition is false, remove the uncond second branch.
+ if (PriorFBB == MBB) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorTBB, nullptr, PriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+
+ // If the prior block branches here on true and somewhere else on false, and
+ // if the branch condition is reversible, reverse the branch to create a
+ // fall-through.
+ if (PriorTBB == MBB) {
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->ReverseBranchCondition(NewPriorCond)) {
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorFBB, nullptr, NewPriorCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+ }
+
+ // If this block has no successors (e.g. it is a return block or ends with
+ // a call to a no-return function like abort or __cxa_throw) and if the pred
+ // falls through into this block, and if it would otherwise fall through
+ // into the block after this, move this block to the end of the function.
+ //
+ // We consider it more likely that execution will stay in the function (e.g.
+    // due to loops) than it is to exit it. This helps with asserts in loops
+    // etc., moving the assert condition out of the loop body.
+ if (MBB->succ_empty() && !PriorCond.empty() && !PriorFBB &&
+ MachineFunction::iterator(PriorTBB) == FallThrough &&
+ !MBB->canFallThrough()) {
+ bool DoTransform = true;
+
+ // We have to be careful that the succs of PredBB aren't both no-successor
+ // blocks. If neither have successors and if PredBB is the second from
+ // last block in the function, we'd just keep swapping the two blocks for
+ // last. Only do the swap if one is clearly better to fall through than
+ // the other.
+ if (FallThrough == --MF.end() &&
+ !IsBetterFallthrough(PriorTBB, MBB))
+ DoTransform = false;
+
+ if (DoTransform) {
+ // Reverse the branch so we will fall through on the previous true cond.
+ SmallVector<MachineOperand, 4> NewPriorCond(PriorCond);
+ if (!TII->ReverseBranchCondition(NewPriorCond)) {
+ DEBUG(dbgs() << "\nMoving MBB: " << *MBB
+ << "To make fallthrough to: " << *PriorTBB << "\n");
+
+ DebugLoc dl = getBranchDebugLoc(PrevBB);
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, MBB, nullptr, NewPriorCond, dl);
+
+ // Move this block to the end of the function.
+ MBB->moveAfter(&MF.back());
+ MadeChange = true;
+ ++NumBranchOpts;
+ return MadeChange;
+ }
+ }
+ }
+ }
+
+ // Analyze the branch in the current block.
+ MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
+ SmallVector<MachineOperand, 4> CurCond;
+ bool CurUnAnalyzable =
+ TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+ if (!CurUnAnalyzable) {
+ // If the CFG for the prior block has extra edges, remove them.
+ MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
+
+ // If this is a two-way branch, and the FBB branches to this block, reverse
+ // the condition so the single-basic-block loop is faster. Instead of:
+ // Loop: xxx; jcc Out; jmp Loop
+ // we want:
+ // Loop: xxx; jncc Loop; jmp Out
+ if (CurTBB && CurFBB && CurFBB == MBB && CurTBB != MBB) {
+ SmallVector<MachineOperand, 4> NewCond(CurCond);
+ if (!TII->ReverseBranchCondition(NewCond)) {
+ DebugLoc dl = getBranchDebugLoc(*MBB);
+ TII->RemoveBranch(*MBB);
+ TII->InsertBranch(*MBB, CurFBB, CurTBB, NewCond, dl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ goto ReoptimizeBlock;
+ }
+ }
+
+ // If this branch is the only thing in its block, see if we can forward
+ // other blocks across it.
+ if (CurTBB && CurCond.empty() && !CurFBB &&
+ IsBranchOnlyBlock(MBB) && CurTBB != MBB &&
+ !MBB->hasAddressTaken() && !MBB->isEHPad()) {
+ DebugLoc dl = getBranchDebugLoc(*MBB);
+ // This block may contain just an unconditional branch. Because there can
+ // be 'non-branch terminators' in the block, try removing the branch and
+ // then seeing if the block is empty.
+ TII->RemoveBranch(*MBB);
+ // If the only things remaining in the block are debug info, remove these
+ // as well, so this will behave the same as an empty block in non-debug
+ // mode.
+ if (IsEmptyBlock(MBB)) {
+ // Make the block empty, losing the debug info (we could probably
+ // improve this in some cases.)
+ MBB->erase(MBB->begin(), MBB->end());
+ }
+ // If this block is just an unconditional branch to CurTBB, we can
+ // usually completely eliminate the block. The only case we cannot
+ // completely eliminate the block is when the block before this one
+ // falls through into MBB and we can't understand the prior block's branch
+ // condition.
+ if (MBB->empty()) {
+ bool PredHasNoFallThrough = !PrevBB.canFallThrough();
+ if (PredHasNoFallThrough || !PriorUnAnalyzable ||
+ !PrevBB.isSuccessor(MBB)) {
+ // If the prior block falls through into us, turn it into an
+ // explicit branch to us to make updates simpler.
+ if (!PredHasNoFallThrough && PrevBB.isSuccessor(MBB) &&
+ PriorTBB != MBB && PriorFBB != MBB) {
+ if (!PriorTBB) {
+ assert(PriorCond.empty() && !PriorFBB &&
+ "Bad branch analysis");
+ PriorTBB = MBB;
+ } else {
+ assert(!PriorFBB && "Machine CFG out of date!");
+ PriorFBB = MBB;
+ }
+ DebugLoc pdl = getBranchDebugLoc(PrevBB);
+ TII->RemoveBranch(PrevBB);
+ TII->InsertBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, pdl);
+ }
+
+ // Iterate through all the predecessors, revectoring each in turn.
+ size_t PI = 0;
+ bool DidChange = false;
+ bool HasBranchToSelf = false;
+ while (PI != MBB->pred_size()) {
+ MachineBasicBlock *PMBB = *(MBB->pred_begin() + PI);
+ if (PMBB == MBB) {
+ // If this block has an uncond branch to itself, leave it.
+ ++PI;
+ HasBranchToSelf = true;
+ } else {
+ DidChange = true;
+ PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
+ // If this change resulted in PMBB ending in a conditional
+ // branch where both conditions go to the same destination,
+ // change this to an unconditional branch (and fix the CFG).
+ MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr;
+ SmallVector<MachineOperand, 4> NewCurCond;
+ bool NewCurUnAnalyzable = TII->analyzeBranch(
+ *PMBB, NewCurTBB, NewCurFBB, NewCurCond, true);
+ if (!NewCurUnAnalyzable && NewCurTBB && NewCurTBB == NewCurFBB) {
+ DebugLoc pdl = getBranchDebugLoc(*PMBB);
+ TII->RemoveBranch(*PMBB);
+ NewCurCond.clear();
+ TII->InsertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
+ MadeChange = true;
+ ++NumBranchOpts;
+ PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false);
+ }
+ }
+ }
+
+ // Change any jumptables to go to the new MBB.
+ if (MachineJumpTableInfo *MJTI = MF.getJumpTableInfo())
+ MJTI->ReplaceMBBInJumpTables(MBB, CurTBB);
+ if (DidChange) {
+ ++NumBranchOpts;
+ MadeChange = true;
+ if (!HasBranchToSelf) return MadeChange;
+ }
+ }
+ }
+
+ // Add the branch back if the block is more than just an uncond branch.
+ TII->InsertBranch(*MBB, CurTBB, nullptr, CurCond, dl);
+ }
+ }
+
+ // If the prior block doesn't fall through into this block, and if this
+ // block doesn't fall through into some other block, see if we can find a
+ // place to move this block where a fall-through will happen.
+ if (!PrevBB.canFallThrough()) {
+
+ // Now we know that there was no fall-through into this block, check to
+ // see if it has a fall-through into its successor.
+ bool CurFallsThru = MBB->canFallThrough();
+
+ if (!MBB->isEHPad()) {
+ // Check all the predecessors of this block. If one of them has no fall
+ // throughs, move this block right after it.
+ for (MachineBasicBlock *PredBB : MBB->predecessors()) {
+ // Analyze the branch at the end of the pred.
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (PredBB != MBB && !PredBB->canFallThrough() &&
+ !TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
+ (!CurFallsThru || !CurTBB || !CurFBB) &&
+ (!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
+ // If the current block doesn't fall through, just move it.
+ // If the current block can fall through and does not end with a
+ // conditional branch, we need to append an unconditional jump to
+ // the (current) next block. To avoid a possible compile-time
+ // infinite loop, move blocks only backward in this case.
+ // Also, if there are already 2 branches here, we cannot add a third;
+ // this means we have the case
+ // Bcc next
+ // B elsewhere
+ // next:
+ if (CurFallsThru) {
+ MachineBasicBlock *NextBB = &*std::next(MBB->getIterator());
+ CurCond.clear();
+ TII->InsertBranch(*MBB, NextBB, nullptr, CurCond, DebugLoc());
+ }
+ MBB->moveAfter(PredBB);
+ MadeChange = true;
+ goto ReoptimizeBlock;
+ }
+ }
+ }
+
+ if (!CurFallsThru) {
+ // Check all successors to see if we can move this block before it.
+ for (MachineBasicBlock *SuccBB : MBB->successors()) {
+ // Analyze the branch at the end of the block before the succ.
+ MachineFunction::iterator SuccPrev = --SuccBB->getIterator();
+
+ // If this block doesn't already fall-through to that successor, and if
+ // the succ doesn't already have a block that can fall through into it,
+ // and if the successor isn't an EH destination, we can arrange for the
+ // fallthrough to happen.
+ if (SuccBB != MBB && &*SuccPrev != MBB &&
+ !SuccPrev->canFallThrough() && !CurUnAnalyzable &&
+ !SuccBB->isEHPad()) {
+ MBB->moveBefore(SuccBB);
+ MadeChange = true;
+ goto ReoptimizeBlock;
+ }
+ }
+
+ // Okay, there is no really great place to put this block. If, however,
+ // the block before this one would be a fall-through if this block were
+ // removed, move this block to the end of the function.
+ MachineBasicBlock *PrevTBB = nullptr, *PrevFBB = nullptr;
+ SmallVector<MachineOperand, 4> PrevCond;
+ // We're looking for cases where PrevBB could possibly fall through to
+ // FallThrough, but if FallThrough is an EH pad that wouldn't be useful, so
+ // skip over any EH pads here to give ourselves a chance to find a branch
+ // target from PrevBB.
+ while (FallThrough != MF.end() && FallThrough->isEHPad())
+ ++FallThrough;
+ // Now check to see if the current block is sitting between PrevBB and
+ // a block to which it could fall through.
+ if (FallThrough != MF.end() &&
+ !TII->analyzeBranch(PrevBB, PrevTBB, PrevFBB, PrevCond, true) &&
+ PrevBB.isSuccessor(&*FallThrough)) {
+ MBB->moveAfter(&MF.back());
+ MadeChange = true;
+ return MadeChange;
+ }
+ }
+ }
+
+ return MadeChange;
+}
+
+//===----------------------------------------------------------------------===//
+// Hoist Common Code
+//===----------------------------------------------------------------------===//
+
+/// HoistCommonCode - Hoist common instruction sequences at the start of basic
+/// blocks to their common predecessor.
+bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
+ bool MadeChange = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
+ MachineBasicBlock *MBB = &*I++;
+ MadeChange |= HoistCommonCodeInSuccs(MBB);
+ }
+
+ return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *TrueBB) {
+ for (MachineBasicBlock *SuccBB : BB->successors())
+ if (SuccBB != TrueBB)
+ return SuccBB;
+ return nullptr;
+}
+
+template <class Container>
+static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
+ Container &Set) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ Set.insert(*AI);
+ } else {
+ Set.insert(Reg);
+ }
+}
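// Illustrative sketch (not part of the patch above): addRegAndItsAliases leans
// on MCRegAliasIterator to expand a physical register into every register that
// overlaps it. The standalone toy below performs the same expansion with a
// hand-written alias table; the x86-flavoured register names are only for
// illustration.
#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Every register maps to the set of registers overlapping it, itself included.
  std::map<std::string, std::vector<std::string>> Aliases = {
      {"AL", {"AL", "AX", "EAX", "RAX"}},
      {"BL", {"BL", "BX", "EBX", "RBX"}},
  };

  std::set<std::string> Uses;
  for (const std::string &R : Aliases["AL"]) // like addRegAndItsAliases(AL, ...)
    Uses.insert(R);

  std::cout << "tracking " << Uses.size() << " overlapping registers\n";
  return 0;
}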
+
+/// findHoistingInsertPosAndDeps - Find the location to move common instructions
+/// in successors to. The location is usually just before the terminator;
+/// however, if the terminator is a conditional branch and its previous
+/// instruction is the flag setting instruction, the previous instruction is
+/// the preferred location. This function also gathers uses and defs of the
+/// instructions from the insertion point to the end of the block. The data is
+/// used by HoistCommonCodeInSuccs to ensure safety.
+static
+MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ SmallSet<unsigned,4> &Uses,
+ SmallSet<unsigned,4> &Defs) {
+ MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+ if (!TII->isUnpredicatedTerminator(*Loc))
+ return MBB->end();
+
+ for (const MachineOperand &MO : Loc->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isUse()) {
+ addRegAndItsAliases(Reg, TRI, Uses);
+ } else {
+ if (!MO.isDead())
+ // Don't try to hoist code in the rare case the terminator defines a
+ // register that is later used.
+ return MBB->end();
+
+ // If the terminator defines a register, make sure we don't hoist
+ // the instruction whose def might be clobbered by the terminator.
+ addRegAndItsAliases(Reg, TRI, Defs);
+ }
+ }
+
+ if (Uses.empty())
+ return Loc;
+ if (Loc == MBB->begin())
+ return MBB->end();
+
+ // The terminator is probably a conditional branch; try not to separate the
+ // branch from the condition-setting instruction.
+ MachineBasicBlock::iterator PI = Loc;
+ --PI;
+ while (PI != MBB->begin() && PI->isDebugValue())
+ --PI;
+
+ bool IsDef = false;
+ for (const MachineOperand &MO : PI->operands()) {
+ // If PI has a regmask operand, it is probably a call. Separate away.
+ if (MO.isRegMask())
+ return Loc;
+ if (!MO.isReg() || MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (Uses.count(Reg)) {
+ IsDef = true;
+ break;
+ }
+ }
+ if (!IsDef)
+ // The condition setting instruction is not just before the conditional
+ // branch.
+ return Loc;
+
+ // Be conservative: don't insert an instruction above something that may have
+ // side effects. And since it's potentially bad to separate a flag-setting
+ // instruction from the conditional branch, just abort the optimization
+ // completely.
+ // Also avoid moving code above a predicated instruction, since it's hard to
+ // reason about register liveness with predicated instructions.
+ bool DontMoveAcrossStore = true;
+ if (!PI->isSafeToMove(nullptr, DontMoveAcrossStore) || TII->isPredicated(*PI))
+ return MBB->end();
+
+
+ // Find out what registers are live. Note this routine is ignoring other live
+ // registers which are only used by instructions in successor blocks.
+ for (const MachineOperand &MO : PI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isUse()) {
+ addRegAndItsAliases(Reg, TRI, Uses);
+ } else {
+ if (Uses.erase(Reg)) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ Uses.erase(*SubRegs); // Use sub-registers to be conservative
+ }
+ }
+ addRegAndItsAliases(Reg, TRI, Defs);
+ }
+ }
+
+ return PI;
+}
+
+/// HoistCommonCodeInSuccs - If the successors of MBB have a common instruction
+/// sequence at their start, move those instructions to before the MBB
+/// terminator if it's legal.
+bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->analyzeBranch(*MBB, TBB, FBB, Cond, true) || !TBB || Cond.empty())
+ return false;
+
+ if (!FBB) FBB = findFalseBlock(MBB, TBB);
+ if (!FBB)
+ // Malformed bcc? True and false blocks are the same?
+ return false;
+
+ // Restrict the optimization to cases where MBB is the only predecessor;
+ // there it is an obvious win.
+ if (TBB->pred_size() > 1 || FBB->pred_size() > 1)
+ return false;
+
+ // Find a suitable position to hoist the common instructions to. Also figure
+ // out which registers are used or defined by instructions from the insertion
+ // point to the end of the block.
+ SmallSet<unsigned, 4> Uses, Defs;
+ MachineBasicBlock::iterator Loc =
+ findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
+ if (Loc == MBB->end())
+ return false;
+
+ bool HasDups = false;
+ SmallVector<unsigned, 4> LocalDefs;
+ SmallSet<unsigned, 4> LocalDefsSet;
+ MachineBasicBlock::iterator TIB = TBB->begin();
+ MachineBasicBlock::iterator FIB = FBB->begin();
+ MachineBasicBlock::iterator TIE = TBB->end();
+ MachineBasicBlock::iterator FIE = FBB->end();
+ while (TIB != TIE && FIB != FIE) {
+ // Skip dbg_value instructions. These do not count.
+ if (TIB->isDebugValue()) {
+ while (TIB != TIE && TIB->isDebugValue())
+ ++TIB;
+ if (TIB == TIE)
+ break;
+ }
+ if (FIB->isDebugValue()) {
+ while (FIB != FIE && FIB->isDebugValue())
+ ++FIB;
+ if (FIB == FIE)
+ break;
+ }
+ if (!TIB->isIdenticalTo(*FIB, MachineInstr::CheckKillDead))
+ break;
+
+ if (TII->isPredicated(*TIB))
+ // Hard to reason about register liveness with predicated instruction.
+ break;
+
+ bool IsSafe = true;
+ for (MachineOperand &MO : TIB->operands()) {
+ // Don't attempt to hoist instructions with register masks.
+ if (MO.isRegMask()) {
+ IsSafe = false;
+ break;
+ }
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ if (Uses.count(Reg)) {
+ // Avoid clobbering a register that's used by the instruction at
+ // the point of insertion.
+ IsSafe = false;
+ break;
+ }
+
+ if (Defs.count(Reg) && !MO.isDead()) {
+ // Don't hoist the instruction if the def would be clobbered by the
+ // instruction at the point of insertion. FIXME: This is overly
+ // conservative. It should be possible to hoist the instructions
+ // in BB2 in the following example:
+ // BB1:
+ // r1, eflag = op1 r2, r3
+ // brcc eflag
+ //
+ // BB2:
+ // r1 = op2, ...
+ // = op3, r1<kill>
+ IsSafe = false;
+ break;
+ }
+ } else if (!LocalDefsSet.count(Reg)) {
+ if (Defs.count(Reg)) {
+ // Use is defined by the instruction at the point of insertion.
+ IsSafe = false;
+ break;
+ }
+
+ if (MO.isKill() && Uses.count(Reg))
+ // Kills a register that's read by the instruction at the point of
+ // insertion. Remove the kill marker.
+ MO.setIsKill(false);
+ }
+ }
+ if (!IsSafe)
+ break;
+
+ bool DontMoveAcrossStore = true;
+ if (!TIB->isSafeToMove(nullptr, DontMoveAcrossStore))
+ break;
+
+ // Remove kills from LocalDefsSet; these registers had short live ranges.
+ for (const MachineOperand &MO : TIB->operands()) {
+ if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || !LocalDefsSet.count(Reg))
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ LocalDefsSet.erase(*AI);
+ } else {
+ LocalDefsSet.erase(Reg);
+ }
+ }
+
+ // Track local defs so we can update liveins.
+ for (const MachineOperand &MO : TIB->operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ LocalDefs.push_back(Reg);
+ addRegAndItsAliases(Reg, TRI, LocalDefsSet);
+ }
+
+ HasDups = true;
+ ++TIB;
+ ++FIB;
+ }
+
+ if (!HasDups)
+ return false;
+
+ MBB->splice(Loc, TBB, TBB->begin(), TIB);
+ FBB->erase(FBB->begin(), FIB);
+
+ // Update live-ins.
+ for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+ unsigned Def = LocalDefs[i];
+ if (LocalDefsSet.count(Def)) {
+ TBB->addLiveIn(Def);
+ FBB->addLiveIn(Def);
+ }
+ }
+
+ ++NumHoist;
+ return true;
+}
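// Illustrative sketch (not part of the patch above): the heart of
// HoistCommonCodeInSuccs is a lockstep walk over the two successor blocks that
// hoists their identical leading instructions into the predecessor. The toy
// below mimics that prefix walk on plain strings and leaves out all of the
// liveness and safety checks; the instruction text is made up.
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> TBB = {"r0 = load [sp+8]", "r1 = add r0, 4", "call foo"};
  std::vector<std::string> FBB = {"r0 = load [sp+8]", "r1 = add r0, 4", "call bar"};
  std::vector<std::string> Pred = {"brcc r2, TBB, FBB"}; // conditional terminator

  // Walk both blocks while their instructions are identical, as the TIB/FIB
  // loop above does.
  size_t N = 0;
  while (N < TBB.size() && N < FBB.size() && TBB[N] == FBB[N])
    ++N;

  // "Hoist": splice the common prefix in front of the predecessor's terminator
  // and erase it from both successors.
  Pred.insert(Pred.end() - 1, TBB.begin(), TBB.begin() + N);
  TBB.erase(TBB.begin(), TBB.begin() + N);
  FBB.erase(FBB.begin(), FBB.begin() + N);

  std::cout << "hoisted " << N << " instruction(s)\n";
  return 0;
}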
diff --git a/contrib/llvm/lib/CodeGen/BranchFolding.h b/contrib/llvm/lib/CodeGen/BranchFolding.h
new file mode 100644
index 000000000000..36a5a2e2c97c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BranchFolding.h
@@ -0,0 +1,160 @@
+//===-- BranchFolding.h - Fold machine code branch instructions -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_BRANCHFOLDING_H
+#define LLVM_LIB_CODEGEN_BRANCHFOLDING_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/Support/BlockFrequency.h"
+#include <vector>
+
+namespace llvm {
+ class MachineBlockFrequencyInfo;
+ class MachineBranchProbabilityInfo;
+ class MachineFunction;
+ class MachineModuleInfo;
+ class MachineLoopInfo;
+ class TargetInstrInfo;
+ class TargetRegisterInfo;
+
+ class LLVM_LIBRARY_VISIBILITY BranchFolder {
+ public:
+ class MBFIWrapper;
+
+ explicit BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
+ MBFIWrapper &MBFI,
+ const MachineBranchProbabilityInfo &MBPI);
+
+ bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
+ MachineLoopInfo *mli = nullptr,
+ bool AfterPlacement = false);
+
+ private:
+ class MergePotentialsElt {
+ unsigned Hash;
+ MachineBasicBlock *Block;
+ public:
+ MergePotentialsElt(unsigned h, MachineBasicBlock *b)
+ : Hash(h), Block(b) {}
+
+ unsigned getHash() const { return Hash; }
+ MachineBasicBlock *getBlock() const { return Block; }
+
+ void setBlock(MachineBasicBlock *MBB) {
+ Block = MBB;
+ }
+
+ bool operator<(const MergePotentialsElt &) const;
+ };
+ typedef std::vector<MergePotentialsElt>::iterator MPIterator;
+ std::vector<MergePotentialsElt> MergePotentials;
+ SmallPtrSet<const MachineBasicBlock*, 2> TriedMerging;
+ DenseMap<const MachineBasicBlock *, int> FuncletMembership;
+
+ class SameTailElt {
+ MPIterator MPIter;
+ MachineBasicBlock::iterator TailStartPos;
+ public:
+ SameTailElt(MPIterator mp, MachineBasicBlock::iterator tsp)
+ : MPIter(mp), TailStartPos(tsp) {}
+
+ MPIterator getMPIter() const {
+ return MPIter;
+ }
+ MergePotentialsElt &getMergePotentialsElt() const {
+ return *getMPIter();
+ }
+ MachineBasicBlock::iterator getTailStartPos() const {
+ return TailStartPos;
+ }
+ unsigned getHash() const {
+ return getMergePotentialsElt().getHash();
+ }
+ MachineBasicBlock *getBlock() const {
+ return getMergePotentialsElt().getBlock();
+ }
+ bool tailIsWholeBlock() const {
+ return TailStartPos == getBlock()->begin();
+ }
+
+ void setBlock(MachineBasicBlock *MBB) {
+ getMergePotentialsElt().setBlock(MBB);
+ }
+ void setTailStartPos(MachineBasicBlock::iterator Pos) {
+ TailStartPos = Pos;
+ }
+ };
+ std::vector<SameTailElt> SameTails;
+
+ bool AfterBlockPlacement;
+ bool EnableTailMerge;
+ bool EnableHoistCommonCode;
+ bool UpdateLiveIns;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineModuleInfo *MMI;
+ MachineLoopInfo *MLI;
+ LivePhysRegs LiveRegs;
+
+ public:
+ /// \brief This class keeps track of branch frequencies of newly created
+ /// blocks and tail-merged blocks.
+ class MBFIWrapper {
+ public:
+ MBFIWrapper(const MachineBlockFrequencyInfo &I) : MBFI(I) {}
+ BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const;
+ void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F);
+ raw_ostream &printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const;
+ raw_ostream &printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const;
+
+ private:
+ const MachineBlockFrequencyInfo &MBFI;
+ DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq;
+ };
+
+ private:
+ MBFIWrapper &MBBFreqInfo;
+ const MachineBranchProbabilityInfo &MBPI;
+
+ bool TailMergeBlocks(MachineFunction &MF);
+ bool TryTailMergeBlocks(MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB);
+ void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
+ void computeLiveIns(MachineBasicBlock &MBB);
+ void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
+ MachineBasicBlock *NewDest);
+ MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
+ MachineBasicBlock::iterator BBI1,
+ const BasicBlock *BB);
+ unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
+ MachineBasicBlock *SuccBB,
+ MachineBasicBlock *PredBB);
+ void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
+ MachineBasicBlock* PredBB);
+ bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
+ MachineBasicBlock *SuccBB,
+ unsigned maxCommonTailLength,
+ unsigned &commonTailIndex);
+
+ bool OptimizeBranches(MachineFunction &MF);
+ bool OptimizeBlock(MachineBasicBlock *MBB);
+ void RemoveDeadBlock(MachineBasicBlock *MBB);
+ bool OptimizeImpDefsBlock(MachineBasicBlock *MBB);
+
+ bool HoistCommonCode(MachineFunction &MF);
+ bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
+ };
+}
+
+#endif /* LLVM_LIB_CODEGEN_BRANCHFOLDING_H */
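// Illustrative sketch (not from the LLVM sources): how a machine-function pass
// inside lib/CodeGen might drive BranchFolder, going only by the constructor
// and OptimizeFunction signatures declared above. The analysis objects are
// assumed to be available from the surrounding pass; adjust as needed.
#include "BranchFolding.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/Target/TargetSubtargetInfo.h"
using namespace llvm;

static bool runBranchFolding(MachineFunction &MF,
                             const MachineBlockFrequencyInfo &MBFI,
                             const MachineBranchProbabilityInfo &MBPI,
                             MachineModuleInfo *MMI) {
  BranchFolder::MBFIWrapper FreqInfo(MBFI);
  // Enable tail merging and common-code hoisting with default heuristics.
  BranchFolder Folder(/*defaultEnableTailMerge=*/true, /*CommonHoist=*/true,
                      FreqInfo, MBPI);
  const TargetSubtargetInfo &STI = MF.getSubtarget();
  return Folder.OptimizeFunction(MF, STI.getInstrInfo(), STI.getRegisterInfo(),
                                 MMI);
}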
diff --git a/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp
new file mode 100644
index 000000000000..ff7c99de0420
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/BuiltinGCs.cpp
@@ -0,0 +1,139 @@
+//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the boilerplate required to define our various built in
+// gc lowering strategies.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCs.h"
+#include "llvm/CodeGen/GCStrategy.h"
+
+using namespace llvm;
+
+namespace {
+
+/// An example GC which attempts to be compatible with the Erlang/OTP garbage
+/// collector.
+///
+/// The frametable emitter is in ErlangGCPrinter.cpp.
+class ErlangGC : public GCStrategy {
+public:
+ ErlangGC() {
+ InitRoots = false;
+ NeededSafePoints = 1 << GC::PostCall;
+ UsesMetadata = true;
+ CustomRoots = false;
+ }
+};
+
+/// An example GC which attempts to be compatible with Objective Caml 3.10.0
+///
+/// The frametable emitter is in OcamlGCPrinter.cpp.
+class OcamlGC : public GCStrategy {
+public:
+ OcamlGC() {
+ NeededSafePoints = 1 << GC::PostCall;
+ UsesMetadata = true;
+ }
+};
+
+/// A GC strategy for uncooperative targets. This implements lowering for the
+/// llvm.gc* intrinsics for targets that do not natively support them (which
+/// includes the C backend). Note that the code generated is not quite as
+/// efficient as algorithms which generate stack maps to identify roots.
+///
+/// In order to support this particular transformation, all stack roots are
+/// co-allocated on the stack. This allows a fully target-independent stack map
+/// while introducing only minor runtime overhead.
+class ShadowStackGC : public GCStrategy {
+public:
+ ShadowStackGC() {
+ InitRoots = true;
+ CustomRoots = true;
+ }
+};
+
+/// A GCStrategy which serves as an example for the usage of a statepoint based
+/// lowering strategy. This GCStrategy is intended to be suitable as a default
+/// implementation usable with any collector which can consume the standard
+/// stackmap format generated by statepoints, uses the default addrspace to
+/// distinguish between gc managed and non-gc managed pointers, and has
+/// reasonable relocation semantics.
+class StatepointGC : public GCStrategy {
+public:
+ StatepointGC() {
+ UseStatepoints = true;
+ // These options are all gc.root specific, we specify them so that the
+ // gc.root lowering code doesn't run.
+ InitRoots = false;
+ NeededSafePoints = 0;
+ UsesMetadata = false;
+ CustomRoots = false;
+ }
+ Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+ // Method is only valid on pointer typed values.
+ const PointerType *PT = cast<PointerType>(Ty);
+ // For the sake of this example GC, we arbitrarily pick addrspace(1) as our
+ // GC managed heap. We know that a pointer into this heap needs to be
+ // updated and that no other pointer does. Note that addrspace(1) is used
+ // only as an example, it has no special meaning, and is not reserved for
+ // GC usage.
+ return (1 == PT->getAddressSpace());
+ }
+};
+
+/// A GCStrategy for the CoreCLR Runtime. The strategy is similar to
+/// Statepoint-example GC, but differs from it in certain aspects, such as:
+/// 1) Base-pointers need not be explicitly tracked and reported for
+/// interior pointers
+/// 2) Uses a different format for encoding stack-maps
+/// 3) Location of Safe-point polls: polls are only needed before loop-back
+/// edges and before tail-calls (not needed at function-entry)
+///
+/// The above differences in behavior are to be implemented in upcoming
+/// checkins.
+class CoreCLRGC : public GCStrategy {
+public:
+ CoreCLRGC() {
+ UseStatepoints = true;
+ // These options are all gc.root specific, we specify them so that the
+ // gc.root lowering code doesn't run.
+ InitRoots = false;
+ NeededSafePoints = 0;
+ UsesMetadata = false;
+ CustomRoots = false;
+ }
+ Optional<bool> isGCManagedPointer(const Type *Ty) const override {
+ // Method is only valid on pointer typed values.
+ const PointerType *PT = cast<PointerType>(Ty);
+ // We pick addrspace(1) as our GC managed heap.
+ return (1 == PT->getAddressSpace());
+ }
+};
+}
+
+// Register all the above so that they can be found at runtime. Note that
+// these static initializers are important since the registration list is
+// constructed from their storage.
+static GCRegistry::Add<ErlangGC> A("erlang",
+ "erlang-compatible garbage collector");
+static GCRegistry::Add<OcamlGC> B("ocaml", "ocaml 3.10-compatible GC");
+static GCRegistry::Add<ShadowStackGC>
+ C("shadow-stack", "Very portable GC for uncooperative code generators");
+static GCRegistry::Add<StatepointGC> D("statepoint-example",
+ "an example strategy for statepoint");
+static GCRegistry::Add<CoreCLRGC> E("coreclr", "CoreCLR-compatible GC");
+
+// Provide hooks to ensure the containing library is fully loaded.
+void llvm::linkErlangGC() {}
+void llvm::linkOcamlGC() {}
+void llvm::linkShadowStackGC() {}
+void llvm::linkStatepointExampleGC() {}
+void llvm::linkCoreCLRGC() {}
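// Illustrative sketch (not part of the file above): an out-of-tree collector
// can hook into the same registry with the pattern used by the built-in
// strategies. "MyGC" and the "mygc" name are made up for the example.
#include "llvm/CodeGen/GCStrategy.h"
#include "llvm/CodeGen/GCs.h"
using namespace llvm;

namespace {
class MyGC : public GCStrategy {
public:
  MyGC() {
    UseStatepoints = true;
    // Keep the gc.root lowering machinery switched off, as the
    // statepoint-based strategies above do.
    InitRoots = false;
    NeededSafePoints = 0;
    UsesMetadata = false;
    CustomRoots = false;
  }
};
}

// The static initializer is what places the strategy on the registry list, so
// this must live in an object file that actually gets linked into the tool.
static GCRegistry::Add<MyGC> X("mygc", "example user-defined GC strategy");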
diff --git a/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
new file mode 100644
index 000000000000..dc2d38a95f99
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -0,0 +1,236 @@
+//===------------------------ CalcSpillWeights.cpp ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "calcspillweights"
+
+void llvm::calculateSpillWeightsAndHints(LiveIntervals &LIS,
+ MachineFunction &MF,
+ VirtRegMap *VRM,
+ const MachineLoopInfo &MLI,
+ const MachineBlockFrequencyInfo &MBFI,
+ VirtRegAuxInfo::NormalizingFn norm) {
+ DEBUG(dbgs() << "********** Compute Spill Weights **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ VirtRegAuxInfo VRAI(MF, LIS, VRM, MLI, MBFI, norm);
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ VRAI.calculateSpillWeightAndHint(LIS.getInterval(Reg));
+ }
+}
+
+// Return the preferred allocation register for reg, given a COPY instruction.
+static unsigned copyHint(const MachineInstr *mi, unsigned reg,
+ const TargetRegisterInfo &tri,
+ const MachineRegisterInfo &mri) {
+ unsigned sub, hreg, hsub;
+ if (mi->getOperand(0).getReg() == reg) {
+ sub = mi->getOperand(0).getSubReg();
+ hreg = mi->getOperand(1).getReg();
+ hsub = mi->getOperand(1).getSubReg();
+ } else {
+ sub = mi->getOperand(1).getSubReg();
+ hreg = mi->getOperand(0).getReg();
+ hsub = mi->getOperand(0).getSubReg();
+ }
+
+ if (!hreg)
+ return 0;
+
+ if (TargetRegisterInfo::isVirtualRegister(hreg))
+ return sub == hsub ? hreg : 0;
+
+ const TargetRegisterClass *rc = mri.getRegClass(reg);
+
+ // Only allow physreg hints in rc.
+ if (sub == 0)
+ return rc->contains(hreg) ? hreg : 0;
+
+ // reg:sub should match the physreg hreg.
+ return tri.getMatchingSuperReg(hreg, sub, rc);
+}
+
+// Check if all values in LI are rematerializable
+static bool isRematerializable(const LiveInterval &LI,
+ const LiveIntervals &LIS,
+ VirtRegMap *VRM,
+ const TargetInstrInfo &TII) {
+ unsigned Reg = LI.reg;
+ unsigned Original = VRM ? VRM->getOriginal(Reg) : 0;
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I != E; ++I) {
+ const VNInfo *VNI = *I;
+ if (VNI->isUnused())
+ continue;
+ if (VNI->isPHIDef())
+ return false;
+
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ assert(MI && "Dead valno in interval");
+
+ // Trace copies introduced by live range splitting. The inline
+ // spiller can rematerialize through these copies, so the spill
+ // weight must reflect this.
+ if (VRM) {
+ while (MI->isFullCopy()) {
+ // The copy destination must match the interval register.
+ if (MI->getOperand(0).getReg() != Reg)
+ return false;
+
+ // Get the source register.
+ Reg = MI->getOperand(1).getReg();
+
+ // If the original (pre-splitting) registers match, this
+ // copy came from a split.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) ||
+ VRM->getOriginal(Reg) != Original)
+ return false;
+
+ // Follow the copy live-in value.
+ const LiveInterval &SrcLI = LIS.getInterval(Reg);
+ LiveQueryResult SrcQ = SrcLI.Query(VNI->def);
+ VNI = SrcQ.valueIn();
+ assert(VNI && "Copy from non-existing value");
+ if (VNI->isPHIDef())
+ return false;
+ MI = LIS.getInstructionFromIndex(VNI->def);
+ assert(MI && "Dead valno in interval");
+ }
+ }
+
+ if (!TII.isTriviallyReMaterializable(*MI, LIS.getAliasAnalysis()))
+ return false;
+ }
+ return true;
+}
+
+void
+VirtRegAuxInfo::calculateSpillWeightAndHint(LiveInterval &li) {
+ MachineRegisterInfo &mri = MF.getRegInfo();
+ const TargetRegisterInfo &tri = *MF.getSubtarget().getRegisterInfo();
+ MachineBasicBlock *mbb = nullptr;
+ MachineLoop *loop = nullptr;
+ bool isExiting = false;
+ float totalWeight = 0;
+ unsigned numInstr = 0; // Number of instructions using li
+ SmallPtrSet<MachineInstr*, 8> visited;
+
+ // Find the best physreg hint and the best virtreg hint.
+ float bestPhys = 0, bestVirt = 0;
+ unsigned hintPhys = 0, hintVirt = 0;
+
+ // Don't recompute a target specific hint.
+ bool noHint = mri.getRegAllocationHint(li.reg).first != 0;
+
+ // Don't recompute spill weight for an unspillable register.
+ bool Spillable = li.isSpillable();
+
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end();
+ I != E; ) {
+ MachineInstr *mi = &*(I++);
+ numInstr++;
+ if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugValue())
+ continue;
+ if (!visited.insert(mi).second)
+ continue;
+
+ float weight = 1.0f;
+ if (Spillable) {
+ // Get loop info for mi.
+ if (mi->getParent() != mbb) {
+ mbb = mi->getParent();
+ loop = Loops.getLoopFor(mbb);
+ isExiting = loop ? loop->isLoopExiting(mbb) : false;
+ }
+
+ // Calculate instr weight.
+ bool reads, writes;
+ std::tie(reads, writes) = mi->readsWritesVirtualRegister(li.reg);
+ weight = LiveIntervals::getSpillWeight(writes, reads, &MBFI, *mi);
+
+ // Give extra weight to what looks like a loop induction variable update.
+ if (writes && isExiting && LIS.isLiveOutOfMBB(li, mbb))
+ weight *= 3;
+
+ totalWeight += weight;
+ }
+
+ // Get allocation hints from copies.
+ if (noHint || !mi->isCopy())
+ continue;
+ unsigned hint = copyHint(mi, li.reg, tri, mri);
+ if (!hint)
+ continue;
+ // Force hweight onto the stack so that x86 doesn't add hidden precision,
+ // making the comparison incorrectly pass (i.e., 1 > 1 == true??).
+ //
+ // FIXME: we probably shouldn't use floats at all.
+ volatile float hweight = Hint[hint] += weight;
+ if (TargetRegisterInfo::isPhysicalRegister(hint)) {
+ if (hweight > bestPhys && mri.isAllocatable(hint)) {
+ bestPhys = hweight;
+ hintPhys = hint;
+ }
+ } else {
+ if (hweight > bestVirt) {
+ bestVirt = hweight;
+ hintVirt = hint;
+ }
+ }
+ }
+
+ Hint.clear();
+
+ // Always prefer the physreg hint.
+ if (unsigned hint = hintPhys ? hintPhys : hintVirt) {
+ mri.setRegAllocationHint(li.reg, 0, hint);
+ // Weakly boost the spill weight of hinted registers.
+ totalWeight *= 1.01F;
+ }
+
+ // If the live interval was already unspillable, leave it that way.
+ if (!Spillable)
+ return;
+
+ // Mark li as unspillable if all live ranges are tiny and the interval
+ // is not live at any reg mask. If the interval is live at a reg mask
+ // spilling may be required.
+ if (li.isZeroLength(LIS.getSlotIndexes()) &&
+ !li.isLiveAtIndexes(LIS.getRegMaskSlots())) {
+ li.markNotSpillable();
+ return;
+ }
+
+ // If all of the definitions of the interval are re-materializable,
+ // it is a preferred candidate for spilling.
+ // FIXME: this gets much more complicated once we support non-trivial
+ // re-materialization.
+ if (isRematerializable(li, LIS, VRM, *MF.getSubtarget().getInstrInfo()))
+ totalWeight *= 0.5F;
+
+ li.weight = normalize(totalWeight, li.getSize(), numInstr);
+}
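// Illustrative sketch (not part of the file above): a standalone toy that
// applies the same scaling factors calculateSpillWeightAndHint uses, just to
// show how they compose. The per-use base weight stands in for
// LiveIntervals::getSpillWeight, and the final value would normally go through
// the normalize() hook as well.
#include <cstdio>

int main() {
  float totalWeight = 0;
  // Three uses with a made-up base weight of 1.0f; one of them looks like a
  // loop induction variable update, so it is weighted three times as heavily.
  totalWeight += 1.0f;
  totalWeight += 1.0f * 3;
  totalWeight += 1.0f;

  totalWeight *= 1.01F; // a preferred-register hint was recorded
  totalWeight *= 0.5F;  // every definition is trivially re-materializable

  std::printf("toy spill weight before normalization: %f\n", totalWeight);
  return 0;
}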
diff --git a/contrib/llvm/lib/CodeGen/CallingConvLower.cpp b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
new file mode 100644
index 000000000000..7d67bcfe5469
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -0,0 +1,287 @@
+//===-- CallingConvLower.cpp - Calling Conventions ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CCState class, used for lowering and implementing
+// calling conventions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SaveAndRestore.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
+ SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
+ : CallingConv(CC), IsVarArg(isVarArg), MF(mf),
+ TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C),
+ CallOrPrologue(Unknown) {
+ // No stack is used.
+ StackOffset = 0;
+ MaxStackArgAlign = 1;
+
+ clearByValRegsInfo();
+ UsedRegs.resize((TRI.getNumRegs()+31)/32);
+}
+
+/// Allocate space on the stack large enough to pass an argument by value.
+/// The size and alignment information of the argument is encoded in
+/// its parameter attribute.
+void CCState::HandleByVal(unsigned ValNo, MVT ValVT,
+ MVT LocVT, CCValAssign::LocInfo LocInfo,
+ int MinSize, int MinAlign,
+ ISD::ArgFlagsTy ArgFlags) {
+ unsigned Align = ArgFlags.getByValAlign();
+ unsigned Size = ArgFlags.getByValSize();
+ if (MinSize > (int)Size)
+ Size = MinSize;
+ if (MinAlign > (int)Align)
+ Align = MinAlign;
+ ensureMaxAlignment(Align);
+ MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Align);
+ Size = unsigned(alignTo(Size, MinAlign));
+ unsigned Offset = AllocateStack(Size, Align);
+ addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+}
+
+/// Mark a register and all of its aliases as allocated.
+void CCState::MarkAllocated(unsigned Reg) {
+ for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
+ UsedRegs[*AI/32] |= 1 << (*AI&31);
+}
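// Illustrative sketch (not part of the file above): the UsedRegs bookkeeping
// packs one "allocated" bit per register into 32-bit words, which is what the
// (NumRegs + 31) / 32 sizing and the Reg / 32, Reg & 31 arithmetic above are
// doing. The same scheme on a plain std::vector, with a made-up register count:
#include <cassert>
#include <cstdint>
#include <vector>

int main() {
  const unsigned NumRegs = 100; // stand-in for TRI.getNumRegs()
  std::vector<uint32_t> UsedRegs((NumRegs + 31) / 32);

  auto markAllocated = [&](unsigned Reg) {
    UsedRegs[Reg / 32] |= 1u << (Reg & 31);
  };
  auto isAllocated = [&](unsigned Reg) {
    return (UsedRegs[Reg / 32] & (1u << (Reg & 31))) != 0;
  };

  markAllocated(37);
  assert(isAllocated(37) && !isAllocated(38));
  return 0;
}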
+
+/// Analyze an array of argument values,
+/// incorporating info about the formals into this state.
+void
+CCState::AnalyzeFormalArguments(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ unsigned NumArgs = Ins.size();
+
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT ArgVT = Ins[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Ins[i].Flags;
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Formal argument #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Analyze the return values of a function, returning true if the return can
+/// be performed without sret-demotion and false otherwise.
+bool CCState::CheckReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Determine which register each value should be copied into.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ MVT VT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this))
+ return false;
+ }
+ return true;
+}
+
+/// Analyze the returned values of a return,
+/// incorporating info about the result values into this state.
+void CCState::AnalyzeReturn(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ // Determine which register each value should be copied into.
+ for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
+ MVT VT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Return operand #" << i << " has unhandled type "
+ << EVT(VT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Analyze the outgoing arguments to a call,
+/// incorporating info about the passed values into this state.
+void CCState::AnalyzeCallOperands(const SmallVectorImpl<ISD::OutputArg> &Outs,
+ CCAssignFn Fn) {
+ unsigned NumOps = Outs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = Outs[i].VT;
+ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags;
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Same as above except it takes vectors of types and argument flags.
+void CCState::AnalyzeCallOperands(SmallVectorImpl<MVT> &ArgVTs,
+ SmallVectorImpl<ISD::ArgFlagsTy> &Flags,
+ CCAssignFn Fn) {
+ unsigned NumOps = ArgVTs.size();
+ for (unsigned i = 0; i != NumOps; ++i) {
+ MVT ArgVT = ArgVTs[i];
+ ISD::ArgFlagsTy ArgFlags = Flags[i];
+ if (Fn(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call operand #" << i << " has unhandled type "
+ << EVT(ArgVT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Analyze the return values of a call, incorporating info about the passed
+/// values into this state.
+void CCState::AnalyzeCallResult(const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn Fn) {
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ MVT VT = Ins[i].VT;
+ ISD::ArgFlagsTy Flags = Ins[i].Flags;
+ if (Fn(i, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call result #" << i << " has unhandled type "
+ << EVT(VT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+}
+
+/// Same as above except it's specialized for calls that produce a single value.
+void CCState::AnalyzeCallResult(MVT VT, CCAssignFn Fn) {
+ if (Fn(0, VT, VT, CCValAssign::Full, ISD::ArgFlagsTy(), *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call result has unhandled type "
+ << EVT(VT).getEVTString() << '\n';
+#endif
+ llvm_unreachable(nullptr);
+ }
+}
+
+static bool isValueTypeInRegForCC(CallingConv::ID CC, MVT VT) {
+ if (VT.isVector())
+ return true; // Assume -msse-regparm might be in effect.
+ if (!VT.isInteger())
+ return false;
+ if (CC == CallingConv::X86_VectorCall || CC == CallingConv::X86_FastCall)
+ return true;
+ return false;
+}
+
+void CCState::getRemainingRegParmsForType(SmallVectorImpl<MCPhysReg> &Regs,
+ MVT VT, CCAssignFn Fn) {
+ unsigned SavedStackOffset = StackOffset;
+ unsigned SavedMaxStackArgAlign = MaxStackArgAlign;
+ unsigned NumLocs = Locs.size();
+
+ // Set the 'inreg' flag if it is used for this calling convention.
+ ISD::ArgFlagsTy Flags;
+ if (isValueTypeInRegForCC(CallingConv, VT))
+ Flags.setInReg();
+
+ // Allocate something of this value type repeatedly until we get assigned a
+ // location in memory.
+ bool HaveRegParm = true;
+ while (HaveRegParm) {
+ if (Fn(0, VT, VT, CCValAssign::Full, Flags, *this)) {
+#ifndef NDEBUG
+ dbgs() << "Call has unhandled type " << EVT(VT).getEVTString()
+ << " while computing remaining regparms\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ HaveRegParm = Locs.back().isRegLoc();
+ }
+
+ // Copy all the registers from the value locations we added.
+ assert(NumLocs < Locs.size() && "CC assignment failed to add location");
+ for (unsigned I = NumLocs, E = Locs.size(); I != E; ++I)
+ if (Locs[I].isRegLoc())
+ Regs.push_back(MCPhysReg(Locs[I].getLocReg()));
+
+ // Clear the assigned values and stack memory. We leave the registers marked
+ // as allocated so that future queries don't return the same registers, i.e.
+ // when i64 and f64 are both passed in GPRs.
+ StackOffset = SavedStackOffset;
+ MaxStackArgAlign = SavedMaxStackArgAlign;
+ Locs.resize(NumLocs);
+}
+
+void CCState::analyzeMustTailForwardedRegisters(
+ SmallVectorImpl<ForwardedRegister> &Forwards, ArrayRef<MVT> RegParmTypes,
+ CCAssignFn Fn) {
+ // Oftentimes calling conventions will not use register parameters for
+ // variadic functions, so we need to assume we're not variadic so that we get
+ // all the registers that might be used in a non-variadic call.
+ SaveAndRestore<bool> SavedVarArg(IsVarArg, false);
+ SaveAndRestore<bool> SavedMustTail(AnalyzingMustTailForwardedRegs, true);
+
+ for (MVT RegVT : RegParmTypes) {
+ SmallVector<MCPhysReg, 8> RemainingRegs;
+ getRemainingRegParmsForType(RemainingRegs, RegVT, Fn);
+ const TargetLowering *TL = MF.getSubtarget().getTargetLowering();
+ const TargetRegisterClass *RC = TL->getRegClassFor(RegVT);
+ for (MCPhysReg PReg : RemainingRegs) {
+ unsigned VReg = MF.addLiveIn(PReg, RC);
+ Forwards.push_back(ForwardedRegister(VReg, PReg, RegVT));
+ }
+ }
+}
+
+bool CCState::resultsCompatible(CallingConv::ID CalleeCC,
+ CallingConv::ID CallerCC, MachineFunction &MF,
+ LLVMContext &C,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ CCAssignFn CalleeFn, CCAssignFn CallerFn) {
+ if (CalleeCC == CallerCC)
+ return true;
+ SmallVector<CCValAssign, 4> RVLocs1;
+ CCState CCInfo1(CalleeCC, false, MF, RVLocs1, C);
+ CCInfo1.AnalyzeCallResult(Ins, CalleeFn);
+
+ SmallVector<CCValAssign, 4> RVLocs2;
+ CCState CCInfo2(CallerCC, false, MF, RVLocs2, C);
+ CCInfo2.AnalyzeCallResult(Ins, CallerFn);
+
+ if (RVLocs1.size() != RVLocs2.size())
+ return false;
+ for (unsigned I = 0, E = RVLocs1.size(); I != E; ++I) {
+ const CCValAssign &Loc1 = RVLocs1[I];
+ const CCValAssign &Loc2 = RVLocs2[I];
+ if (Loc1.getLocInfo() != Loc2.getLocInfo())
+ return false;
+ bool RegLoc1 = Loc1.isRegLoc();
+ if (RegLoc1 != Loc2.isRegLoc())
+ return false;
+ if (RegLoc1) {
+ if (Loc1.getLocReg() != Loc2.getLocReg())
+ return false;
+ } else {
+ if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
+ return false;
+ }
+ }
+ return true;
+}
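// Illustrative sketch (not part of the file above): the usual lowering-time
// pattern for the CCState entry points defined in this file. CC_MyTarget is a
// hypothetical stand-in for a target's TableGen-generated CCAssignFn; the
// helper itself is not from LLVM.
#include "llvm/CodeGen/CallingConvLower.h"
using namespace llvm;

// Hypothetical assignment function, normally emitted into *GenCallingConv.inc.
bool CC_MyTarget(unsigned ValNo, MVT ValVT, MVT LocVT,
                 CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                 CCState &State);

static void lowerCallOperands(MachineFunction &MF, CallingConv::ID CC,
                              bool IsVarArg,
                              const SmallVectorImpl<ISD::OutputArg> &Outs,
                              LLVMContext &Ctx) {
  SmallVector<CCValAssign, 16> ArgLocs;
  CCState CCInfo(CC, IsVarArg, MF, ArgLocs, Ctx);
  CCInfo.AnalyzeCallOperands(Outs, CC_MyTarget);

  // Each location now says whether the operand lives in a register or at a
  // byte offset in the outgoing argument area.
  for (const CCValAssign &VA : ArgLocs) {
    if (VA.isRegLoc())
      (void)VA.getLocReg();
    else
      (void)VA.getLocMemOffset();
  }
}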
diff --git a/contrib/llvm/lib/CodeGen/CodeGen.cpp b/contrib/llvm/lib/CodeGen/CodeGen.cpp
new file mode 100644
index 000000000000..6679819fdef6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CodeGen.cpp
@@ -0,0 +1,94 @@
+//===-- CodeGen.cpp -------------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the common initialization routines for the
+// CodeGen library.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
+void llvm::initializeCodeGen(PassRegistry &Registry) {
+ initializeAtomicExpandPass(Registry);
+ initializeBranchFolderPassPass(Registry);
+ initializeCodeGenPreparePass(Registry);
+ initializeDeadMachineInstructionElimPass(Registry);
+ initializeDetectDeadLanesPass(Registry);
+ initializeDwarfEHPreparePass(Registry);
+ initializeEarlyIfConverterPass(Registry);
+ initializeExpandISelPseudosPass(Registry);
+ initializeExpandPostRAPass(Registry);
+ initializeFinalizeMachineBundlesPass(Registry);
+ initializeFuncletLayoutPass(Registry);
+ initializeGCMachineCodeAnalysisPass(Registry);
+ initializeGCModuleInfoPass(Registry);
+ initializeIfConverterPass(Registry);
+ initializeInterleavedAccessPass(Registry);
+ initializeLiveDebugVariablesPass(Registry);
+ initializeLiveIntervalsPass(Registry);
+ initializeLiveStacksPass(Registry);
+ initializeLiveVariablesPass(Registry);
+ initializeLocalStackSlotPassPass(Registry);
+ initializeLowerIntrinsicsPass(Registry);
+ initializeMachineBlockFrequencyInfoPass(Registry);
+ initializeMachineBlockPlacementPass(Registry);
+ initializeMachineBlockPlacementStatsPass(Registry);
+ initializeMachineCSEPass(Registry);
+ initializeImplicitNullChecksPass(Registry);
+ initializeMachineCombinerPass(Registry);
+ initializeMachineCopyPropagationPass(Registry);
+ initializeMachineDominatorTreePass(Registry);
+ initializeMachineFunctionPrinterPassPass(Registry);
+ initializeMachineLICMPass(Registry);
+ initializeMachineLoopInfoPass(Registry);
+ initializeMachineModuleInfoPass(Registry);
+ initializeMachinePostDominatorTreePass(Registry);
+ initializeMachineSchedulerPass(Registry);
+ initializeMachineSinkingPass(Registry);
+ initializeMachineVerifierPassPass(Registry);
+ initializeXRayInstrumentationPass(Registry);
+ initializePatchableFunctionPass(Registry);
+ initializeOptimizePHIsPass(Registry);
+ initializePEIPass(Registry);
+ initializePHIEliminationPass(Registry);
+ initializePeepholeOptimizerPass(Registry);
+ initializePostMachineSchedulerPass(Registry);
+ initializePostRAHazardRecognizerPass(Registry);
+ initializePostRASchedulerPass(Registry);
+ initializePreISelIntrinsicLoweringLegacyPassPass(Registry);
+ initializeProcessImplicitDefsPass(Registry);
+ initializeRegisterCoalescerPass(Registry);
+ initializeRenameIndependentSubregsPass(Registry);
+ initializeShrinkWrapPass(Registry);
+ initializeSlotIndexesPass(Registry);
+ initializeStackColoringPass(Registry);
+ initializeStackMapLivenessPass(Registry);
+ initializeLiveDebugValuesPass(Registry);
+ initializeSafeStackPass(Registry);
+ initializeStackProtectorPass(Registry);
+ initializeStackSlotColoringPass(Registry);
+ initializeTailDuplicatePassPass(Registry);
+ initializeTargetPassConfigPass(Registry);
+ initializeTwoAddressInstructionPassPass(Registry);
+ initializeUnpackMachineBundlesPass(Registry);
+ initializeUnreachableBlockElimLegacyPassPass(Registry);
+ initializeUnreachableMachineBlockElimPass(Registry);
+ initializeVirtRegMapPass(Registry);
+ initializeVirtRegRewriterPass(Registry);
+ initializeWinEHPreparePass(Registry);
+}
+
+void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
+ initializeCodeGen(*unwrap(R));
+}
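// Illustrative sketch (not part of the file above): a tool linking the CodeGen
// library typically runs this registration once, before building its pass
// pipeline, either through the C API entry point above or directly from C++:
#include "llvm/InitializePasses.h"
#include "llvm/PassRegistry.h"

int main() {
  llvm::PassRegistry &Registry = *llvm::PassRegistry::getPassRegistry();
  llvm::initializeCodeGen(Registry);
  // ...set up a TargetMachine and populate a pass manager here...
  return 0;
}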
diff --git a/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
new file mode 100644
index 000000000000..ede404149a1c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -0,0 +1,5656 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation. This works around limitations in its
+// basic-block-at-a-time approach. It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "codegenprepare"
+
+STATISTIC(NumBlocksElim, "Number of blocks eliminated");
+STATISTIC(NumPHIsElim, "Number of trivial PHIs eliminated");
+STATISTIC(NumGEPsElim, "Number of GEPs converted to casts");
+STATISTIC(NumCmpUses, "Number of uses of Cmp expressions replaced with uses of "
+ "sunken Cmps");
+STATISTIC(NumCastUses, "Number of uses of Cast expressions replaced with uses "
+ "of sunken Casts");
+STATISTIC(NumMemoryInsts, "Number of memory instructions whose address "
+ "computations were sunk");
+STATISTIC(NumExtsMoved, "Number of [s|z]ext instructions combined with loads");
+STATISTIC(NumExtUses, "Number of uses of [s|z]ext instructions optimized");
+STATISTIC(NumAndsAdded,
+ "Number of and mask instructions added to form ext loads");
+STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
+STATISTIC(NumRetsDup, "Number of return instructions duplicated");
+STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
+STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
+STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
+STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
+
+static cl::opt<bool> DisableBranchOpts(
+ "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
+ cl::desc("Disable branch optimizations in CodeGenPrepare"));
+
+static cl::opt<bool>
+ DisableGCOpts("disable-cgp-gc-opts", cl::Hidden, cl::init(false),
+ cl::desc("Disable GC optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> DisableSelectToBranch(
+ "disable-cgp-select2branch", cl::Hidden, cl::init(false),
+ cl::desc("Disable select to branch conversion."));
+
+static cl::opt<bool> AddrSinkUsingGEPs(
+ "addr-sink-using-gep", cl::Hidden, cl::init(false),
+ cl::desc("Address sinking in CGP using GEPs."));
+
+static cl::opt<bool> EnableAndCmpSinking(
+ "enable-andcmp-sinking", cl::Hidden, cl::init(true),
+ cl::desc("Enable sinkinig and/cmp into branches."));
+
+static cl::opt<bool> DisableStoreExtract(
+ "disable-cgp-store-extract", cl::Hidden, cl::init(false),
+ cl::desc("Disable store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> StressStoreExtract(
+ "stress-cgp-store-extract", cl::Hidden, cl::init(false),
+ cl::desc("Stress test store(extract) optimizations in CodeGenPrepare"));
+
+static cl::opt<bool> DisableExtLdPromotion(
+ "disable-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Disable ext(promotable(ld)) -> promoted(ext(ld)) optimization in "
+ "CodeGenPrepare"));
+
+static cl::opt<bool> StressExtLdPromotion(
+ "stress-cgp-ext-ld-promotion", cl::Hidden, cl::init(false),
+ cl::desc("Stress test ext(promotable(ld)) -> promoted(ext(ld)) "
+ "optimization in CodeGenPrepare"));
+
+static cl::opt<bool> DisablePreheaderProtect(
+ "disable-preheader-prot", cl::Hidden, cl::init(false),
+ cl::desc("Disable protection against removing loop preheaders"));
+
+namespace {
+typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
+typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
+typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+class TypePromotionTransaction;
+
+ class CodeGenPrepare : public FunctionPass {
+ const TargetMachine *TM;
+ const TargetLowering *TLI;
+ const TargetTransformInfo *TTI;
+ const TargetLibraryInfo *TLInfo;
+ const LoopInfo *LI;
+
+ /// As we scan instructions optimizing them, this is the next instruction
+ /// to optimize. Transforms that can invalidate this should update it.
+ BasicBlock::iterator CurInstIterator;
+
+ /// Keeps track of non-local addresses that have been sunk into a block.
+ /// This allows us to avoid inserting duplicate code for blocks with
+ /// multiple load/stores of the same address.
+ ValueMap<Value*, Value*> SunkAddrs;
+
+ /// Keeps track of all instructions inserted for the current function.
+ SetOfInstrs InsertedInsts;
+ /// Keeps track of the type each instruction had before its promotion,
+ /// for the current function.
+ InstrToOrigTy PromotedInsts;
+
+ /// True if CFG is modified in any way.
+ bool ModifiedDT;
+
+ /// True if optimizing for size.
+ bool OptSize;
+
+ /// DataLayout for the Function being processed.
+ const DataLayout *DL;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit CodeGenPrepare(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM), TLI(nullptr), TTI(nullptr), DL(nullptr) {
+ initializeCodeGenPreparePass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override;
+
+ const char *getPassName() const override { return "CodeGen Prepare"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: When we can selectively preserve passes, preserve the domtree.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ }
+
+ private:
+ bool eliminateFallThrough(Function &F);
+ bool eliminateMostlyEmptyBlocks(Function &F);
+ bool canMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
+ void eliminateMostlyEmptyBlock(BasicBlock *BB);
+ bool optimizeBlock(BasicBlock &BB, bool& ModifiedDT);
+ bool optimizeInst(Instruction *I, bool& ModifiedDT);
+ bool optimizeMemoryInst(Instruction *I, Value *Addr,
+ Type *AccessTy, unsigned AS);
+ bool optimizeInlineAsmInst(CallInst *CS);
+ bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
+ bool moveExtToFormExtLoad(Instruction *&I);
+ bool optimizeExtUses(Instruction *I);
+ bool optimizeLoadExt(LoadInst *I);
+ bool optimizeSelectInst(SelectInst *SI);
+ bool optimizeShuffleVectorInst(ShuffleVectorInst *SI);
+ bool optimizeSwitchInst(SwitchInst *CI);
+ bool optimizeExtractElementInst(Instruction *Inst);
+ bool dupRetToEnableTailCallOpts(BasicBlock *BB);
+ bool placeDbgValues(Function &F);
+ bool sinkAndCmp(Function &F);
+ bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
+ Instruction *&Inst,
+ const SmallVectorImpl<Instruction *> &Exts,
+ unsigned CreatedInstCost);
+ bool splitBranchCondition(Function &F);
+ bool simplifyOffsetableRelocate(Instruction &I);
+ void stripInvariantGroupMetadata(Instruction &I);
+ };
+}
+
+char CodeGenPrepare::ID = 0;
+INITIALIZE_TM_PASS(CodeGenPrepare, "codegenprepare",
+ "Optimize for code generation", false, false)
+
+FunctionPass *llvm::createCodeGenPreparePass(const TargetMachine *TM) {
+ return new CodeGenPrepare(TM);
+}
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool EverMadeChange = false;
+ // Clear per function information.
+ InsertedInsts.clear();
+ PromotedInsts.clear();
+
+ ModifiedDT = false;
+ if (TM)
+ TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ OptSize = F.optForSize();
+
+ /// This optimization identifies DIV instructions that can be
+ /// profitably bypassed and carried out with a shorter, faster divide.
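+ ///
+ /// For illustration (types and values are hypothetical, assuming a target
+ /// with slow 64-bit but fast 32-bit division), bypassSlowDivision rewrites
+ ///   %d = udiv i64 %a, %b
+ /// into a cheap runtime check on the high bits of %a and %b that selects
+ /// between the original 64-bit udiv and a truncated 32-bit udiv whose
+ /// result is zero-extended back to i64.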
+ if (!OptSize && TLI && TLI->isSlowDivBypassed()) {
+ const DenseMap<unsigned int, unsigned int> &BypassWidths =
+ TLI->getBypassSlowDivWidths();
+ BasicBlock* BB = &*F.begin();
+ while (BB != nullptr) {
+ // bypassSlowDivision may create new BBs, but we don't want to reapply the
+ // optimization to those blocks.
+ BasicBlock* Next = BB->getNextNode();
+ EverMadeChange |= bypassSlowDivision(BB, BypassWidths);
+ BB = Next;
+ }
+ }
+
+ // Eliminate blocks that contain only PHI nodes and an
+ // unconditional branch.
+ EverMadeChange |= eliminateMostlyEmptyBlocks(F);
+
+ // If llvm.dbg.value is far away from the value, then iSel may not be able to
+ // handle it properly. iSel will drop llvm.dbg.value if it cannot
+ // find a node corresponding to the value.
+ EverMadeChange |= placeDbgValues(F);
+
+ // If there is a mask, compare against zero, and branch that can be combined
+ // into a single target instruction, push the mask and compare into branch
+ // users. Do this before OptimizeBlock -> OptimizeInst ->
+ // OptimizeCmpExpression, which perturbs the pattern being searched for.
+ if (!DisableBranchOpts) {
+ EverMadeChange |= sinkAndCmp(F);
+ EverMadeChange |= splitBranchCondition(F);
+ }
+
+ bool MadeChange = true;
+ while (MadeChange) {
+ MadeChange = false;
+ for (Function::iterator I = F.begin(); I != F.end(); ) {
+ BasicBlock *BB = &*I++;
+ bool ModifiedDTOnIteration = false;
+ MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration);
+
+ // Restart BB iteration if the dominator tree of the Function was changed
+ if (ModifiedDTOnIteration)
+ break;
+ }
+ EverMadeChange |= MadeChange;
+ }
+
+ SunkAddrs.clear();
+
+ if (!DisableBranchOpts) {
+ MadeChange = false;
+ SmallPtrSet<BasicBlock*, 8> WorkList;
+ for (BasicBlock &BB : F) {
+ SmallVector<BasicBlock *, 2> Successors(succ_begin(&BB), succ_end(&BB));
+ MadeChange |= ConstantFoldTerminator(&BB, true);
+ if (!MadeChange) continue;
+
+ for (SmallVectorImpl<BasicBlock*>::iterator
+ II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+ if (pred_begin(*II) == pred_end(*II))
+ WorkList.insert(*II);
+ }
+
+ // Delete the dead blocks and any of their dead successors.
+ MadeChange |= !WorkList.empty();
+ while (!WorkList.empty()) {
+ BasicBlock *BB = *WorkList.begin();
+ WorkList.erase(BB);
+ SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
+
+ DeleteDeadBlock(BB);
+
+ for (SmallVectorImpl<BasicBlock*>::iterator
+ II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+ if (pred_begin(*II) == pred_end(*II))
+ WorkList.insert(*II);
+ }
+
+ // Merge pairs of basic blocks with unconditional branches, connected by
+ // a single edge.
+ if (EverMadeChange || MadeChange)
+ MadeChange |= eliminateFallThrough(F);
+
+ EverMadeChange |= MadeChange;
+ }
+
+ if (!DisableGCOpts) {
+ SmallVector<Instruction *, 2> Statepoints;
+ for (BasicBlock &BB : F)
+ for (Instruction &I : BB)
+ if (isStatepoint(I))
+ Statepoints.push_back(&I);
+ for (auto &I : Statepoints)
+ EverMadeChange |= simplifyOffsetableRelocate(*I);
+ }
+
+ return EverMadeChange;
+}
+
+/// Merge basic blocks which are connected by a single edge, where one of the
+/// basic blocks has a single successor pointing to the other basic block,
+/// which has a single predecessor.
+bool CodeGenPrepare::eliminateFallThrough(Function &F) {
+ bool Changed = false;
+ // Scan all of the blocks in the function, except for the entry block.
+ for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
+ BasicBlock *BB = &*I++;
+ // If the destination block has a single pred, then this is a trivial
+ // edge, just collapse it.
+ BasicBlock *SinglePred = BB->getSinglePredecessor();
+
+ // Don't merge if BB's address is taken.
+ if (!SinglePred || SinglePred == BB || BB->hasAddressTaken()) continue;
+
+ BranchInst *Term = dyn_cast<BranchInst>(SinglePred->getTerminator());
+ if (Term && !Term->isConditional()) {
+ Changed = true;
+ DEBUG(dbgs() << "To merge:\n"<< *SinglePred << "\n\n\n");
+ // Remember if SinglePred was the entry block of the function.
+ // If so, we will need to move BB back to the entry position.
+ bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+ MergeBasicBlockIntoOnlyPred(BB, nullptr);
+
+ if (isEntry && BB != &BB->getParent()->getEntryBlock())
+ BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+ // We have erased a block. Update the iterator.
+ I = BB->getIterator();
+ }
+ }
+ return Changed;
+}
+
+/// Eliminate blocks that contain only PHI nodes, debug info directives, and an
+/// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
+/// edges in ways that are non-optimal for isel. Start by eliminating these
+/// blocks so we can split them the way we want them.
+bool CodeGenPrepare::eliminateMostlyEmptyBlocks(Function &F) {
+ SmallPtrSet<BasicBlock *, 16> Preheaders;
+ SmallVector<Loop *, 16> LoopList(LI->begin(), LI->end());
+ while (!LoopList.empty()) {
+ Loop *L = LoopList.pop_back_val();
+ LoopList.insert(LoopList.end(), L->begin(), L->end());
+ if (BasicBlock *Preheader = L->getLoopPreheader())
+ Preheaders.insert(Preheader);
+ }
+
+ bool MadeChange = false;
+ // Note that this intentionally skips the entry block.
+ for (Function::iterator I = std::next(F.begin()), E = F.end(); I != E;) {
+ BasicBlock *BB = &*I++;
+
+ // If this block doesn't end with an uncond branch, ignore it.
+ BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!BI || !BI->isUnconditional())
+ continue;
+
+ // If the instruction before the branch (skipping debug info) isn't a phi
+ // node, then other stuff is happening here.
+ BasicBlock::iterator BBI = BI->getIterator();
+ if (BBI != BB->begin()) {
+ --BBI;
+ while (isa<DbgInfoIntrinsic>(BBI)) {
+ if (BBI == BB->begin())
+ break;
+ --BBI;
+ }
+ if (!isa<DbgInfoIntrinsic>(BBI) && !isa<PHINode>(BBI))
+ continue;
+ }
+
+ // Do not break infinite loops.
+ BasicBlock *DestBB = BI->getSuccessor(0);
+ if (DestBB == BB)
+ continue;
+
+ if (!canMergeBlocks(BB, DestBB))
+ continue;
+
+ // Do not delete loop preheaders if doing so would create a critical edge.
+ // Loop preheaders can be good locations to spill registers. If the
+ // preheader is deleted and we create a critical edge, registers may be
+ // spilled in the loop body instead.
+ if (!DisablePreheaderProtect && Preheaders.count(BB) &&
+ !(BB->getSinglePredecessor() && BB->getSinglePredecessor()->getSingleSuccessor()))
+ continue;
+
+ eliminateMostlyEmptyBlock(BB);
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+/// Return true if we can merge BB into DestBB if there is a single
+/// unconditional branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::canMergeBlocks(const BasicBlock *BB,
+ const BasicBlock *DestBB) const {
+ // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+ // the successor. If there are more complex conditions (e.g. preheaders),
+ // don't mess around with them.
+ BasicBlock::const_iterator BBI = BB->begin();
+ while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+ for (const User *U : PN->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != DestBB || !isa<PHINode>(UI))
+ return false;
+ // If User is inside DestBB block and it is a PHINode then check
+ // incoming value. If incoming value is not from BB then this is
+ // a complex condition (e.g. preheaders) we want to avoid here.
+ if (UI->getParent() == DestBB) {
+ if (const PHINode *UPN = dyn_cast<PHINode>(UI))
+ for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+ Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+ if (Insn && Insn->getParent() == BB &&
+ Insn->getParent() != UPN->getIncomingBlock(I))
+ return false;
+ }
+ }
+ }
+ }
+
+ // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+ // and DestBB may have conflicting incoming values for the block. If so, we
+ // can't merge the block.
+ const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+ if (!DestBBPN) return true; // no conflict.
+
+ // Collect the preds of BB.
+ SmallPtrSet<const BasicBlock*, 16> BBPreds;
+ if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ // It is faster to get preds from a PHI than with pred_iterator.
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ BBPreds.insert(BBPN->getIncomingBlock(i));
+ } else {
+ BBPreds.insert(pred_begin(BB), pred_end(BB));
+ }
+
+ // Walk the preds of DestBB.
+ for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+ BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+ if (BBPreds.count(Pred)) { // Common predecessor?
+ BBI = DestBB->begin();
+ while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+ const Value *V1 = PN->getIncomingValueForBlock(Pred);
+ const Value *V2 = PN->getIncomingValueForBlock(BB);
+
+ // If V2 is a phi node in BB, look up what the mapped value will be.
+ if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+ if (V2PN->getParent() == BB)
+ V2 = V2PN->getIncomingValueForBlock(Pred);
+
+ // If there is a conflict, bail out.
+ if (V1 != V2) return false;
+ }
+ }
+ }
+
+ return true;
+}
+
+
+/// Eliminate a basic block that has only phi's and an unconditional branch in
+/// it.
+void CodeGenPrepare::eliminateMostlyEmptyBlock(BasicBlock *BB) {
+ BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+ BasicBlock *DestBB = BI->getSuccessor(0);
+
+ DEBUG(dbgs() << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB);
+
+ // If the destination block has a single pred, then this is a trivial edge,
+ // just collapse it.
+ if (BasicBlock *SinglePred = DestBB->getSinglePredecessor()) {
+ if (SinglePred != DestBB) {
+ // Remember if SinglePred was the entry block of the function. If so, we
+ // will need to move BB back to the entry position.
+ bool isEntry = SinglePred == &SinglePred->getParent()->getEntryBlock();
+ MergeBasicBlockIntoOnlyPred(DestBB, nullptr);
+
+ if (isEntry && BB != &BB->getParent()->getEntryBlock())
+ BB->moveBefore(&BB->getParent()->getEntryBlock());
+
+ DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+ return;
+ }
+ }
+
+ // Otherwise, we have multiple predecessors of BB. Update the PHIs in DestBB
+ // to handle the new incoming edges it is about to have.
+ PHINode *PN;
+ for (BasicBlock::iterator BBI = DestBB->begin();
+ (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
+ // Remove the incoming value for BB, and remember it.
+ Value *InVal = PN->removeIncomingValue(BB, false);
+
+ // Two options: either the InVal is a phi node defined in BB or it is some
+ // value that dominates BB.
+ PHINode *InValPhi = dyn_cast<PHINode>(InVal);
+ if (InValPhi && InValPhi->getParent() == BB) {
+ // Add all of the input values of the input PHI as inputs of this phi.
+ for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
+ PN->addIncoming(InValPhi->getIncomingValue(i),
+ InValPhi->getIncomingBlock(i));
+ } else {
+ // Otherwise, add one instance of the dominating value for each edge that
+ // we will be adding.
+ if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+ for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+ PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
+ } else {
+ for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
+ PN->addIncoming(InVal, *PI);
+ }
+ }
+ }
+
+ // The PHIs are now updated, change everything that refers to BB to use
+ // DestBB and remove BB.
+ BB->replaceAllUsesWith(DestBB);
+ BB->eraseFromParent();
+ ++NumBlocksElim;
+
+ DEBUG(dbgs() << "AFTER:\n" << *DestBB << "\n\n\n");
+}
+
+// Computes a map of base pointer relocation instructions to corresponding
+// derived pointer relocation instructions given a vector of all relocate calls
+static void computeBaseDerivedRelocateMap(
+ const SmallVectorImpl<GCRelocateInst *> &AllRelocateCalls,
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>>
+ &RelocateInstMap) {
+ // Collect information in two maps: one primarily for locating the base object
+ // while filling the second map; the second map is the final structure holding
+ // a mapping between Base and corresponding Derived relocate calls
+ DenseMap<std::pair<unsigned, unsigned>, GCRelocateInst *> RelocateIdxMap;
+ for (auto *ThisRelocate : AllRelocateCalls) {
+ auto K = std::make_pair(ThisRelocate->getBasePtrIndex(),
+ ThisRelocate->getDerivedPtrIndex());
+ RelocateIdxMap.insert(std::make_pair(K, ThisRelocate));
+ }
+ for (auto &Item : RelocateIdxMap) {
+ std::pair<unsigned, unsigned> Key = Item.first;
+ if (Key.first == Key.second)
+ // Base relocation: nothing to insert
+ continue;
+
+ GCRelocateInst *I = Item.second;
+ auto BaseKey = std::make_pair(Key.first, Key.first);
+
+ // We're iterating over RelocateIdxMap so we cannot modify it.
+ auto MaybeBase = RelocateIdxMap.find(BaseKey);
+ if (MaybeBase == RelocateIdxMap.end())
+ // TODO: We might want to insert a new base object relocate and gep off
+ // that, if there are enough derived object relocates.
+ continue;
+
+ RelocateInstMap[MaybeBase->second].push_back(I);
+ }
+}
+
+// Accepts a GEP and extracts the operands into a vector provided they're all
+// small integer constants
+static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
+ SmallVectorImpl<Value *> &OffsetV) {
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+ // Only accept small constant integer operands
+ auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ if (!Op || Op->getZExtValue() > 20)
+ return false;
+ }
+
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++)
+ OffsetV.push_back(GEP->getOperand(i));
+ return true;
+}
+
+// Takes a RelocatedBase (base pointer relocation instruction) and Targets to
+// replace, computes a replacement, and applies it.
+static bool
+simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
+ const SmallVectorImpl<GCRelocateInst *> &Targets) {
+ bool MadeChange = false;
+ for (GCRelocateInst *ToReplace : Targets) {
+ assert(ToReplace->getBasePtrIndex() == RelocatedBase->getBasePtrIndex() &&
+ "Not relocating a derived object of the original base object");
+ if (ToReplace->getBasePtrIndex() == ToReplace->getDerivedPtrIndex()) {
+ // A duplicate relocate call. TODO: coalesce duplicates.
+ continue;
+ }
+
+ if (RelocatedBase->getParent() != ToReplace->getParent()) {
+ // Base and derived relocates are in different basic blocks.
+ // In this case transform is only valid when base dominates derived
+ // relocate. However it would be too expensive to check dominance
+ // for each such relocate, so we skip the whole transformation.
+ continue;
+ }
+
+ Value *Base = ToReplace->getBasePtr();
+ auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
+ if (!Derived || Derived->getPointerOperand() != Base)
+ continue;
+
+ SmallVector<Value *, 2> OffsetV;
+ if (!getGEPSmallConstantIntOffsetV(Derived, OffsetV))
+ continue;
+
+ // Create a Builder and replace the target callsite with a gep
+ assert(RelocatedBase->getNextNode() &&
+ "Should always have one since it's not a terminator");
+
+ // Insert after RelocatedBase
+ IRBuilder<> Builder(RelocatedBase->getNextNode());
+ Builder.SetCurrentDebugLocation(ToReplace->getDebugLoc());
+
+ // If gc_relocate does not match the actual type, cast it to the right type.
+ // In theory, there must be a bitcast after gc_relocate if the type does not
+ // match, and we should reuse it to get the derived pointer. But it could be
+ // cases like this:
+ // bb1:
+ // ...
+ // %g1 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
+ // br label %merge
+ //
+ // bb2:
+ // ...
+ // %g2 = call coldcc i8 addrspace(1)* @llvm.experimental.gc.relocate.p1i8(...)
+ // br label %merge
+ //
+ // merge:
+ // %p1 = phi i8 addrspace(1)* [ %g1, %bb1 ], [ %g2, %bb2 ]
+ // %cast = bitcast i8 addrspace(1)* %p1 to i32 addrspace(1)*
+ //
+ // In this case, we cannot find the bitcast anymore, so we insert a new bitcast
+ // whether or not one already exists. This way we handle all cases, and
+ // the extra bitcast should be optimized away in later passes.
+ Value *ActualRelocatedBase = RelocatedBase;
+ if (RelocatedBase->getType() != Base->getType()) {
+ ActualRelocatedBase =
+ Builder.CreateBitCast(RelocatedBase, Base->getType());
+ }
+ Value *Replacement = Builder.CreateGEP(
+ Derived->getSourceElementType(), ActualRelocatedBase, makeArrayRef(OffsetV));
+ Replacement->takeName(ToReplace);
+ // If the newly generated derived pointer's type does not match the original derived
+ // pointer's type, cast the new derived pointer to match it. Same reasoning as above.
+ Value *ActualReplacement = Replacement;
+ if (Replacement->getType() != ToReplace->getType()) {
+ ActualReplacement =
+ Builder.CreateBitCast(Replacement, ToReplace->getType());
+ }
+ ToReplace->replaceAllUsesWith(ActualReplacement);
+ ToReplace->eraseFromParent();
+
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+// Turns this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = relocate(%tok, i32 4, i32 4)
+// %ptr' = relocate(%tok, i32 4, i32 5)
+// %val = load %ptr'
+//
+// into this:
+//
+// %base = ...
+// %ptr = gep %base + 15
+// %tok = statepoint (%fun, i32 0, i32 0, i32 0, %base, %ptr)
+// %base' = gc.relocate(%tok, i32 4, i32 4)
+// %ptr' = gep %base' + 15
+// %val = load %ptr'
+bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+ bool MadeChange = false;
+ SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
+
+ for (auto *U : I.users())
+ if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
+ // Collect all the relocate calls associated with a statepoint
+ AllRelocateCalls.push_back(Relocate);
+
+ // We need at least one base pointer relocation + one derived pointer
+ // relocation to transform.
+ if (AllRelocateCalls.size() < 2)
+ return false;
+
+ // RelocateInstMap is a mapping from the base relocate instruction to the
+ // corresponding derived relocate instructions
+ DenseMap<GCRelocateInst *, SmallVector<GCRelocateInst *, 2>> RelocateInstMap;
+ computeBaseDerivedRelocateMap(AllRelocateCalls, RelocateInstMap);
+ if (RelocateInstMap.empty())
+ return false;
+
+ for (auto &Item : RelocateInstMap)
+ // Item.first is the RelocatedBase to offset against
+ // Item.second is the vector of Targets to replace
+ MadeChange |= simplifyRelocatesOffABase(Item.first, Item.second);
+ return MadeChange;
+}
+
+/// SinkCast - Sink the specified cast instruction into its user blocks
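+///
+/// Rough illustration (block names invented): a trunc defined in %entry but
+/// used only in %then and %else is re-created once in each of those blocks
+/// and the original trunc is erased, so no cross-block copy of its value
+/// needs to be created and coalesced.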
+static bool SinkCast(CastInst *CI) {
+ BasicBlock *DefBB = CI->getParent();
+
+ /// InsertedCasts - Only insert a cast in each block once.
+ DenseMap<BasicBlock*, CastInst*> InsertedCasts;
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Figure out which BB this cast is used in. For PHI's this is the
+ // appropriate predecessor block.
+ BasicBlock *UserBB = User->getParent();
+ if (PHINode *PN = dyn_cast<PHINode>(User)) {
+ UserBB = PN->getIncomingBlock(TheUse);
+ }
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // The first insertion point of a block containing an EH pad is after the
+ // pad. If the pad is the user, we cannot sink the cast past the pad.
+ if (User->isEHPad())
+ continue;
+
+ // If the block selected to receive the cast is an EH pad that does not
+ // allow non-PHI instructions before the terminator, we can't sink the
+ // cast.
+ if (UserBB->getTerminator()->isEHPad())
+ continue;
+
+ // If this user is in the same block as the cast, don't change the cast.
+ if (UserBB == DefBB) continue;
+
+ // If we have already inserted a cast into this block, use it.
+ CastInst *&InsertedCast = InsertedCasts[UserBB];
+
+ if (!InsertedCast) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedCast = CastInst::Create(CI->getOpcode(), CI->getOperand(0),
+ CI->getType(), "", &*InsertPt);
+ }
+
+ // Replace a use of the cast with a use of the new cast.
+ TheUse = InsertedCast;
+ MadeChange = true;
+ ++NumCastUses;
+ }
+
+ // If we removed all uses, nuke the cast.
+ if (CI->use_empty()) {
+ CI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// If the specified cast instruction is a noop copy (e.g. it's casting from
+/// one pointer type to another, i32->i8 on PPC), sink it into user blocks to
+/// reduce the number of virtual registers that must be created and coalesced.
+///
+/// Return true if any changes are made.
+///
+static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
+ const DataLayout &DL) {
+ // If this is a noop copy,
+ EVT SrcVT = TLI.getValueType(DL, CI->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, CI->getType());
+
+ // This is an fp<->int conversion?
+ if (SrcVT.isInteger() != DstVT.isInteger())
+ return false;
+
+ // If this is an extension, it will be a zero or sign extension, which
+ // isn't a noop.
+ if (SrcVT.bitsLT(DstVT)) return false;
+
+ // If these values will be promoted, find out what they will be promoted
+ // to. This helps us consider truncates on PPC as noop copies when they
+ // are.
+ if (TLI.getTypeAction(CI->getContext(), SrcVT) ==
+ TargetLowering::TypePromoteInteger)
+ SrcVT = TLI.getTypeToTransformTo(CI->getContext(), SrcVT);
+ if (TLI.getTypeAction(CI->getContext(), DstVT) ==
+ TargetLowering::TypePromoteInteger)
+ DstVT = TLI.getTypeToTransformTo(CI->getContext(), DstVT);
+
+ // If, after promotion, these are the same types, this is a noop copy.
+ if (SrcVT != DstVT)
+ return false;
+
+ return SinkCast(CI);
+}
+
+/// Try to combine CI into a call to the llvm.uadd.with.overflow intrinsic if
+/// possible.
+///
+/// Return true if any changes were made.
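+///
+/// A minimal sketch of the rewrite (value names invented); the matched
+/// pattern is an add plus an unsigned-overflow compare of its result:
+///   %add = add i32 %a, %b
+///   %cmp = icmp ult i32 %add, %a
+/// which becomes:
+///   %uadd.overflow = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
+///   %uadd = extractvalue { i32, i1 } %uadd.overflow, 0
+///   %overflow = extractvalue { i32, i1 } %uadd.overflow, 1
+/// with uses of %add and %cmp replaced by %uadd and %overflow respectively.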
+static bool CombineUAddWithOverflow(CmpInst *CI) {
+ Value *A, *B;
+ Instruction *AddI;
+ if (!match(CI,
+ m_UAddWithOverflow(m_Value(A), m_Value(B), m_Instruction(AddI))))
+ return false;
+
+ Type *Ty = AddI->getType();
+ if (!isa<IntegerType>(Ty))
+ return false;
+
+ // We don't want to move around uses of condition values this late, so we
+ // check if it is legal to create the call to the intrinsic in the basic
+ // block containing the icmp:
+
+ if (AddI->getParent() != CI->getParent() && !AddI->hasOneUse())
+ return false;
+
+#ifndef NDEBUG
+ // Someday m_UAddWithOverflow may get smarter, but this is a safe assumption
+ // for now:
+ if (AddI->hasOneUse())
+ assert(*AddI->user_begin() == CI && "expected!");
+#endif
+
+ Module *M = CI->getModule();
+ Value *F = Intrinsic::getDeclaration(M, Intrinsic::uadd_with_overflow, Ty);
+
+ auto *InsertPt = AddI->hasOneUse() ? CI : AddI;
+
+ auto *UAddWithOverflow =
+ CallInst::Create(F, {A, B}, "uadd.overflow", InsertPt);
+ auto *UAdd = ExtractValueInst::Create(UAddWithOverflow, 0, "uadd", InsertPt);
+ auto *Overflow =
+ ExtractValueInst::Create(UAddWithOverflow, 1, "overflow", InsertPt);
+
+ CI->replaceAllUsesWith(Overflow);
+ AddI->replaceAllUsesWith(UAdd);
+ CI->eraseFromParent();
+ AddI->eraseFromParent();
+ return true;
+}
+
+/// Sink the given CmpInst into user blocks to reduce the number of virtual
+/// registers that must be created and coalesced. This is a clear win except on
+/// targets with multiple condition code registers (PowerPC), where it might
+/// lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
+static bool SinkCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
+ BasicBlock *DefBB = CI->getParent();
+
+ // Avoid sinking soft-FP comparisons, since this can move them into a loop.
+ if (TLI && TLI->useSoftFloat() && isa<FCmpInst>(CI))
+ return false;
+
+ // Only insert a cmp in each block once.
+ DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = CI->user_begin(), E = CI->user_end();
+ UI != E; ) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(User))
+ continue;
+
+ // Figure out which BB this cmp is used in.
+ BasicBlock *UserBB = User->getParent();
+
+ // If this user is in the same block as the cmp, don't change the cmp.
+ if (UserBB == DefBB) continue;
+
+ // If we have already inserted a cmp into this block, use it.
+ CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+ if (!InsertedCmp) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedCmp =
+ CmpInst::Create(CI->getOpcode(), CI->getPredicate(),
+ CI->getOperand(0), CI->getOperand(1), "", &*InsertPt);
+ }
+
+ // Replace a use of the cmp with a use of the new cmp.
+ TheUse = InsertedCmp;
+ MadeChange = true;
+ ++NumCmpUses;
+ }
+
+ // If we removed all uses, nuke the cmp.
+ if (CI->use_empty()) {
+ CI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
+ if (SinkCmpExpression(CI, TLI))
+ return true;
+
+ if (CombineUAddWithOverflow(CI))
+ return true;
+
+ return false;
+}
+
+/// Check if the candidates could be combined with a shift instruction, which
+/// includes:
+/// 1. Truncate instruction
+/// 2. And instruction and the imm is a mask of the low bits:
+/// imm & (imm+1) == 0
+static bool isExtractBitsCandidateUse(Instruction *User) {
+ if (!isa<TruncInst>(User)) {
+ if (User->getOpcode() != Instruction::And ||
+ !isa<ConstantInt>(User->getOperand(1)))
+ return false;
+
+ const APInt &Cimm = cast<ConstantInt>(User->getOperand(1))->getValue();
+
+ if ((Cimm & (Cimm + 1)).getBoolValue())
+ return false;
+ }
+ return true;
+}
+
+/// Sink both the shift and truncate instructions to the BB of the truncate's user.
+static bool
+SinkShiftAndTruncate(BinaryOperator *ShiftI, Instruction *User, ConstantInt *CI,
+ DenseMap<BasicBlock *, BinaryOperator *> &InsertedShifts,
+ const TargetLowering &TLI, const DataLayout &DL) {
+ BasicBlock *UserBB = User->getParent();
+ DenseMap<BasicBlock *, CastInst *> InsertedTruncs;
+ TruncInst *TruncI = dyn_cast<TruncInst>(User);
+ bool MadeChange = false;
+
+ for (Value::user_iterator TruncUI = TruncI->user_begin(),
+ TruncE = TruncI->user_end();
+ TruncUI != TruncE;) {
+
+ Use &TruncTheUse = TruncUI.getUse();
+ Instruction *TruncUser = cast<Instruction>(*TruncUI);
+ // Preincrement use iterator so we don't invalidate it.
+
+ ++TruncUI;
+
+ int ISDOpcode = TLI.InstructionOpcodeToISD(TruncUser->getOpcode());
+ if (!ISDOpcode)
+ continue;
+
+ // If the use is actually a legal node, there will not be an
+ // implicit truncate.
+ // FIXME: always querying the result type is just an
+ // approximation; some nodes' legality is determined by the
+ // operand or other means. There's no good way to find out though.
+ if (TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, TruncUser->getType(), true)))
+ continue;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(TruncUser))
+ continue;
+
+ BasicBlock *TruncUserBB = TruncUser->getParent();
+
+ if (UserBB == TruncUserBB)
+ continue;
+
+ BinaryOperator *&InsertedShift = InsertedShifts[TruncUserBB];
+ CastInst *&InsertedTrunc = InsertedTruncs[TruncUserBB];
+
+ if (!InsertedShift && !InsertedTrunc) {
+ BasicBlock::iterator InsertPt = TruncUserBB->getFirstInsertionPt();
+ assert(InsertPt != TruncUserBB->end());
+ // Sink the shift
+ if (ShiftI->getOpcode() == Instruction::AShr)
+ InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ else
+ InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+
+ // Sink the trunc
+ BasicBlock::iterator TruncInsertPt = TruncUserBB->getFirstInsertionPt();
+ TruncInsertPt++;
+ assert(TruncInsertPt != TruncUserBB->end());
+
+ InsertedTrunc = CastInst::Create(TruncI->getOpcode(), InsertedShift,
+ TruncI->getType(), "", &*TruncInsertPt);
+
+ MadeChange = true;
+
+ TruncTheUse = InsertedTrunc;
+ }
+ }
+ return MadeChange;
+}
+
+/// Sink the shift *right* instruction into user blocks if the uses could
+/// potentially be combined with this shift instruction and generate BitExtract
+/// instruction. It will only be applied if the architecture supports BitExtract
+/// instruction. Here is an example:
+/// BB1:
+/// %x.extract.shift = lshr i64 %arg1, 32
+/// BB2:
+/// %x.extract.trunc = trunc i64 %x.extract.shift to i16
+/// ==>
+///
+/// BB2:
+/// %x.extract.shift.1 = lshr i64 %arg1, 32
+/// %x.extract.trunc = trunc i64 %x.extract.shift.1 to i16
+///
+/// CodeGen will recognize the pattern in BB2 and generate a BitExtract
+/// instruction.
+/// Return true if any changes are made.
+static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI,
+ const TargetLowering &TLI,
+ const DataLayout &DL) {
+ BasicBlock *DefBB = ShiftI->getParent();
+
+ /// Only insert instructions in each block once.
+ DenseMap<BasicBlock *, BinaryOperator *> InsertedShifts;
+
+ bool shiftIsLegal = TLI.isTypeLegal(TLI.getValueType(DL, ShiftI->getType()));
+
+ bool MadeChange = false;
+ for (Value::user_iterator UI = ShiftI->user_begin(), E = ShiftI->user_end();
+ UI != E;) {
+ Use &TheUse = UI.getUse();
+ Instruction *User = cast<Instruction>(*UI);
+ // Preincrement use iterator so we don't invalidate it.
+ ++UI;
+
+ // Don't bother for PHI nodes.
+ if (isa<PHINode>(User))
+ continue;
+
+ if (!isExtractBitsCandidateUse(User))
+ continue;
+
+ BasicBlock *UserBB = User->getParent();
+
+ if (UserBB == DefBB) {
+ // If the shift and truncate instructions are in the same BB, the use of
+ // the truncate (TruncUse) may still introduce another truncate if it is not
+ // legal. In this case, we would like to sink both the shift and truncate
+ // instructions to the BB of TruncUse.
+ // for example:
+ // BB1:
+ // i64 shift.result = lshr i64 opnd, imm
+ // trunc.result = trunc shift.result to i16
+ //
+ // BB2:
+ // ----> We will have an implicit truncate here if the architecture does
+ // not have i16 compare.
+ // cmp i16 trunc.result, opnd2
+ //
+ if (isa<TruncInst>(User) && shiftIsLegal
+ // If the type of the truncate is legal, no truncate will be
+ // introduced in other basic blocks.
+ &&
+ (!TLI.isTypeLegal(TLI.getValueType(DL, User->getType()))))
+ MadeChange =
+ SinkShiftAndTruncate(ShiftI, User, CI, InsertedShifts, TLI, DL);
+
+ continue;
+ }
+ // If we have already inserted a shift into this block, use it.
+ BinaryOperator *&InsertedShift = InsertedShifts[UserBB];
+
+ if (!InsertedShift) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+
+ if (ShiftI->getOpcode() == Instruction::AShr)
+ InsertedShift = BinaryOperator::CreateAShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+ else
+ InsertedShift = BinaryOperator::CreateLShr(ShiftI->getOperand(0), CI,
+ "", &*InsertPt);
+
+ MadeChange = true;
+ }
+
+ // Replace a use of the shift with a use of the new shift.
+ TheUse = InsertedShift;
+ }
+
+ // If we removed all uses, nuke the shift.
+ if (ShiftI->use_empty())
+ ShiftI->eraseFromParent();
+
+ return MadeChange;
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+//cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+//else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+//cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+//else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+ Value *Ptr = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ Type *EltTy = CI->getType()->getVectorElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask = isa<Constant>(Mask) &&
+ cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
+ // Bitcast %addr from i8* to EltTy*
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load,
+ Builder.getInt32(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_load = icmp eq i1 %mask_1, true
+ // br i1 %to_load, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+// <16 x i1> %mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptr = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+ assert(VecType && "Unexpected data type in masked store intrinsic");
+
+ Type *EltTy = VecType->getElementType();
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ // Short-cut if the mask is all-true.
+ bool IsAllOnesMask = isa<Constant>(Mask) &&
+ cast<Constant>(Mask)->isAllOnesValue();
+
+ if (IsAllOnesMask) {
+ Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+ CI->eraseFromParent();
+ return;
+ }
+
+ // Adjust alignment for the scalar instruction.
+ AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8);
+ // Bitcast %addr from i8* to EltTy*
+ Type *NewPtrType =
+ EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+ Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+ unsigned VectorWidth = VecType->getNumElements();
+
+ if (isa<ConstantVector>(Mask)) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+ // %to_store = icmp eq i1 %mask_1, true
+ // br i1 %to_store, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1));
+
+ // Create "cond" block
+ //
+ // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %store i32 %OneElt, i32* %EltAddr
+ //
+ BasicBlock *CondBlock =
+ IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+ Value *Gep =
+ Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+ Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock =
+ CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
+// <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, loading the elements one by one if
+// the appropriate mask bit is set.
+//
+// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> %Mask, i32 0
+// % ToLoad0 = icmp eq i1 % Mask0, true
+// br i1 % ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// % Load0 = load i32, i32* % Ptr0, align 4
+// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
+// % Mask1 = extractelement <16 x i1> %Mask, i32 1
+// % ToLoad1 = icmp eq i1 % Mask1, true
+// br i1 % ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// % Load1 = load i32, i32* % Ptr1, align 4
+// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
+// br label %else2
+// . . .
+// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src
+// ret <16 x i32> %Result
+static void scalarizeMaskedGather(CallInst *CI) {
+ Value *Ptrs = CI->getArgOperand(0);
+ Value *Alignment = CI->getArgOperand(1);
+ Value *Mask = CI->getArgOperand(2);
+ Value *Src0 = CI->getArgOperand(3);
+
+ VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+
+ assert(VecType && "Unexpected return type of masked load intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ BasicBlock *CondBlock = nullptr;
+ BasicBlock *PrevIfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ Value *UndefVal = UndefValue::get(VecType);
+
+ // The result vector
+ Value *VResult = UndefVal;
+ unsigned VectorWidth = VecType->getNumElements();
+
+ // Shorten the way if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+ "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load,
+ Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+ }
+ Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+ return;
+ }
+
+ PHINode *Phi = nullptr;
+ Value *PrevPhi = UndefVal;
+
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+ // Fill the "else" block, created in the previous iteration
+ //
+ // %Mask1 = extractelement <16 x i1> %Mask, i32 1
+ // %ToLoad1 = icmp eq i1 %Mask1, true
+ // br i1 %ToLoad1, label %cond.load, label %else
+ //
+ if (Idx > 0) {
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ PrevPhi = Phi;
+ VResult = Phi;
+ }
+
+ Value *Predicate = Builder.CreateExtractElement(Mask,
+ Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToLoad" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // %EltAddr = getelementptr i32* %1, i32 0
+ // %Elt = load i32* %EltAddr
+ // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+ //
+ CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal,
+ "Load" + Twine(Idx));
+ VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx),
+ "Res" + Twine(Idx));
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ PrevIfBlock = IfBlock;
+ IfBlock = NewIfBlock;
+ }
+
+ Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+ Phi->addIncoming(VResult, CondBlock);
+ Phi->addIncoming(PrevPhi, PrevIfBlock);
+ Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+ CI->replaceAllUsesWith(NewI);
+ CI->eraseFromParent();
+}
+
+// Translate a masked scatter intrinsic, like
+// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4,
+// <16 x i1> %Mask)
+// to a chain of basic blocks that store the elements one by one if
+// the appropriate mask bit is set.
+//
+// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> % Mask, i32 0
+// % ToStore0 = icmp eq i1 % Mask0, true
+// br i1 %ToStore0, label %cond.store, label %else
+//
+// cond.store:
+// % Elt0 = extractelement <16 x i32> %Src, i32 0
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// store i32 %Elt0, i32* % Ptr0, align 4
+// br label %else
+//
+// else:
+// % Mask1 = extractelement <16 x i1> % Mask, i32 1
+// % ToStore1 = icmp eq i1 % Mask1, true
+// br i1 % ToStore1, label %cond.store1, label %else2
+//
+// cond.store1:
+// % Elt1 = extractelement <16 x i32> %Src, i32 1
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// store i32 % Elt1, i32* % Ptr1, align 4
+// br label %else2
+// . . .
+static void scalarizeMaskedScatter(CallInst *CI) {
+ Value *Src = CI->getArgOperand(0);
+ Value *Ptrs = CI->getArgOperand(1);
+ Value *Alignment = CI->getArgOperand(2);
+ Value *Mask = CI->getArgOperand(3);
+
+ assert(isa<VectorType>(Src->getType()) &&
+ "Unexpected data type in masked scatter intrinsic");
+ assert(isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
+
+ IRBuilder<> Builder(CI->getContext());
+ Instruction *InsertPt = CI;
+ BasicBlock *IfBlock = CI->getParent();
+ Builder.SetInsertPoint(InsertPt);
+ Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+ unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ unsigned VectorWidth = Src->getType()->getVectorNumElements();
+
+ // Shorten the way if the mask is a vector of constants.
+ bool IsConstMask = isa<ConstantVector>(Mask);
+
+ if (IsConstMask) {
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+ continue;
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+ }
+ CI->eraseFromParent();
+ return;
+ }
+ for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+ // Fill the "else" block, created in the previous iteration
+ //
+ // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx
+ // % ToStore = icmp eq i1 % Mask1, true
+ // br i1 % ToStore, label %cond.store, label %else
+ //
+ Value *Predicate = Builder.CreateExtractElement(Mask,
+ Builder.getInt32(Idx),
+ "Mask" + Twine(Idx));
+ Value *Cmp =
+ Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+ ConstantInt::get(Predicate->getType(), 1),
+ "ToStore" + Twine(Idx));
+
+ // Create "cond" block
+ //
+ // % Elt1 = extractelement <16 x i32> %Src, i32 1
+ // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+ // %store i32 % Elt1, i32* % Ptr1
+ //
+ BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store");
+ Builder.SetInsertPoint(InsertPt);
+
+ Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx),
+ "Elt" + Twine(Idx));
+ Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx),
+ "Ptr" + Twine(Idx));
+ Builder.CreateAlignedStore(OneElt, Ptr, AlignVal);
+
+ // Create "else" block, fill it in the next iteration
+ BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else");
+ Builder.SetInsertPoint(InsertPt);
+ Instruction *OldBr = IfBlock->getTerminator();
+ BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+ OldBr->eraseFromParent();
+ IfBlock = NewIfBlock;
+ }
+ CI->eraseFromParent();
+}
+
+/// If counting leading or trailing zeros is an expensive operation and a zero
+/// input is defined, add a check for zero to avoid calling the intrinsic.
+///
+/// We want to transform:
+/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 false)
+///
+/// into:
+/// entry:
+/// %cmpz = icmp eq i64 %A, 0
+/// br i1 %cmpz, label %cond.end, label %cond.false
+/// cond.false:
+/// %z = call i64 @llvm.cttz.i64(i64 %A, i1 true)
+/// br label %cond.end
+/// cond.end:
+/// %ctz = phi i64 [ 64, %entry ], [ %z, %cond.false ]
+///
+/// If the transform is performed, return true and set ModifiedDT to true.
+static bool despeculateCountZeros(IntrinsicInst *CountZeros,
+ const TargetLowering *TLI,
+ const DataLayout *DL,
+ bool &ModifiedDT) {
+ if (!TLI || !DL)
+ return false;
+
+ // If a zero input is undefined, it doesn't make sense to despeculate that.
+ if (match(CountZeros->getOperand(1), m_One()))
+ return false;
+
+ // If it's cheap to speculate, there's nothing to do.
+ auto IntrinsicID = CountZeros->getIntrinsicID();
+ if ((IntrinsicID == Intrinsic::cttz && TLI->isCheapToSpeculateCttz()) ||
+ (IntrinsicID == Intrinsic::ctlz && TLI->isCheapToSpeculateCtlz()))
+ return false;
+
+ // Only handle legal scalar cases. Anything else requires too much work.
+ Type *Ty = CountZeros->getType();
+ unsigned SizeInBits = Ty->getPrimitiveSizeInBits();
+ if (Ty->isVectorTy() || SizeInBits > DL->getLargestLegalIntTypeSizeInBits())
+ return false;
+
+ // The intrinsic will be sunk behind a compare against zero and branch.
+ BasicBlock *StartBlock = CountZeros->getParent();
+ BasicBlock *CallBlock = StartBlock->splitBasicBlock(CountZeros, "cond.false");
+
+ // Create another block after the count zero intrinsic. A PHI will be added
+ // in this block to select the result of the intrinsic or the bit-width
+ // constant if the input to the intrinsic is zero.
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(CountZeros));
+ BasicBlock *EndBlock = CallBlock->splitBasicBlock(SplitPt, "cond.end");
+
+ // Set up a builder to create a compare, conditional branch, and PHI.
+ IRBuilder<> Builder(CountZeros->getContext());
+ Builder.SetInsertPoint(StartBlock->getTerminator());
+ Builder.SetCurrentDebugLocation(CountZeros->getDebugLoc());
+
+ // Replace the unconditional branch that was created by the first split with
+ // a compare against zero and a conditional branch.
+ Value *Zero = Constant::getNullValue(Ty);
+ Value *Cmp = Builder.CreateICmpEQ(CountZeros->getOperand(0), Zero, "cmpz");
+ Builder.CreateCondBr(Cmp, EndBlock, CallBlock);
+ StartBlock->getTerminator()->eraseFromParent();
+
+ // Create a PHI in the end block to select either the output of the intrinsic
+ // or the bit width of the operand.
+ Builder.SetInsertPoint(&EndBlock->front());
+ PHINode *PN = Builder.CreatePHI(Ty, 2, "ctz");
+ CountZeros->replaceAllUsesWith(PN);
+ Value *BitWidth = Builder.getInt(APInt(SizeInBits, SizeInBits));
+ PN->addIncoming(BitWidth, StartBlock);
+ PN->addIncoming(CountZeros, CallBlock);
+
+ // We are explicitly handling the zero case, so we can set the intrinsic's
+ // undefined zero argument to 'true'. This will also prevent reprocessing the
+ // intrinsic; we only despeculate when a zero input is defined.
+ CountZeros->setArgOperand(1, Builder.getTrue());
+ ModifiedDT = true;
+ return true;
+}
+
+bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
+ BasicBlock *BB = CI->getParent();
+
+ // Lower inline assembly if we can.
+ // If we found an inline asm expression, and if the target knows how to
+ // lower it to normal LLVM code, do so now.
+ if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
+ if (TLI->ExpandInlineAsm(CI)) {
+ // Avoid invalidating the iterator.
+ CurInstIterator = BB->begin();
+ // Avoid processing instructions out of order, which could cause
+ // reuse before a value is defined.
+ SunkAddrs.clear();
+ return true;
+ }
+ // Sink address computing for memory operands into the block.
+ if (optimizeInlineAsmInst(CI))
+ return true;
+ }
+
+ // Align the pointer arguments to this call if the target thinks it's a good
+ // idea
+ unsigned MinSize, PrefAlign;
+ if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+ for (auto &Arg : CI->arg_operands()) {
+ // We want to align both objects whose address is used directly and
+ // objects whose address is used in casts and GEPs, though it only makes
+ // sense for GEPs if the offset is a multiple of the desired alignment and
+ // if size - offset meets the size threshold.
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ APInt Offset(DL->getPointerSizeInBits(
+ cast<PointerType>(Arg->getType())->getAddressSpace()),
+ 0);
+ Value *Val = Arg->stripAndAccumulateInBoundsConstantOffsets(*DL, Offset);
+ uint64_t Offset2 = Offset.getLimitedValue();
+ if ((Offset2 & (PrefAlign-1)) != 0)
+ continue;
+ AllocaInst *AI;
+ if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
+ DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
+ AI->setAlignment(PrefAlign);
+ // Global variables can only be aligned if they are defined in this
+ // object (i.e. they are uniquely initialized in this object), and
+ // over-aligning global variables that have an explicit section is
+ // forbidden.
+ GlobalVariable *GV;
+ if ((GV = dyn_cast<GlobalVariable>(Val)) && GV->canIncreaseAlignment() &&
+ GV->getPointerAlignment(*DL) < PrefAlign &&
+ DL->getTypeAllocSize(GV->getValueType()) >=
+ MinSize + Offset2)
+ GV->setAlignment(PrefAlign);
+ }
+ // If this is a memcpy (or similar) then we may be able to improve the
+ // alignment
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
+ unsigned Align = getKnownAlignment(MI->getDest(), *DL);
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI))
+ Align = std::min(Align, getKnownAlignment(MTI->getSource(), *DL));
+ if (Align > MI->getAlignment())
+ MI->setAlignment(ConstantInt::get(MI->getAlignmentType(), Align));
+ }
+ }
+
+ // If we have a cold call site, try to sink addressing computation into the
+ // cold block. This interacts with our handling for loads and stores to
+ // ensure that we can fold all uses of a potential addressing computation
+ // into their uses. TODO: generalize this to work over profiling data
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ for (auto &Arg : CI->arg_operands()) {
+ if (!Arg->getType()->isPointerTy())
+ continue;
+ unsigned AS = Arg->getType()->getPointerAddressSpace();
+ return optimizeMemoryInst(CI, Arg, Arg->getType(), AS);
+ }
+
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ if (II) {
+ switch (II->getIntrinsicID()) {
+ default: break;
+ case Intrinsic::objectsize: {
+ // Lower all uses of llvm.objectsize.*
+ uint64_t Size;
+ Type *ReturnTy = CI->getType();
+ Constant *RetVal = nullptr;
+ ConstantInt *Op1 = cast<ConstantInt>(II->getArgOperand(1));
+ ObjSizeMode Mode = Op1->isZero() ? ObjSizeMode::Max : ObjSizeMode::Min;
+ if (getObjectSize(II->getArgOperand(0),
+ Size, *DL, TLInfo, false, Mode)) {
+ RetVal = ConstantInt::get(ReturnTy, Size);
+ } else {
+ RetVal = ConstantInt::get(ReturnTy,
+ Mode == ObjSizeMode::Min ? 0 : -1ULL);
+ }
+ // Substituting this can cause recursive simplifications, which can
+ // invalidate our iterator. Use a WeakVH to hold onto it in case this
+ // happens.
+ Value *CurValue = &*CurInstIterator;
+ WeakVH IterHandle(CurValue);
+
+ replaceAndRecursivelySimplify(CI, RetVal, TLInfo, nullptr);
+
+ // If the iterator instruction was recursively deleted, start over at the
+ // start of the block.
+ if (IterHandle != CurValue) {
+ CurInstIterator = BB->begin();
+ SunkAddrs.clear();
+ }
+ return true;
+ }
+ case Intrinsic::masked_load: {
+ // Scalarize unsupported vector masked load
+ if (!TTI->isLegalMaskedLoad(CI->getType())) {
+ scalarizeMaskedLoad(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_store: {
+ if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedStore(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_gather: {
+ if (!TTI->isLegalMaskedGather(CI->getType())) {
+ scalarizeMaskedGather(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::masked_scatter: {
+ if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) {
+ scalarizeMaskedScatter(CI);
+ ModifiedDT = true;
+ return true;
+ }
+ return false;
+ }
+ case Intrinsic::aarch64_stlxr:
+ case Intrinsic::aarch64_stxr: {
+ ZExtInst *ExtVal = dyn_cast<ZExtInst>(CI->getArgOperand(0));
+ if (!ExtVal || !ExtVal->hasOneUse() ||
+ ExtVal->getParent() == CI->getParent())
+ return false;
+ // Sink a zext feeding stlxr/stxr before it, so it can be folded into it.
+ ExtVal->moveBefore(CI);
+ // Mark this instruction as "inserted by CGP", so that other
+ // optimizations don't touch it.
+ InsertedInsts.insert(ExtVal);
+ return true;
+ }
+ case Intrinsic::invariant_group_barrier:
+ II->replaceAllUsesWith(II->getArgOperand(0));
+ II->eraseFromParent();
+ return true;
+
+ case Intrinsic::cttz:
+ case Intrinsic::ctlz:
+ // If counting zeros is expensive, try to avoid it.
+ return despeculateCountZeros(II, TLI, DL, ModifiedDT);
+ }
+
+ if (TLI) {
+ // Unknown address space.
+ // TODO: Target hook to pick which address space the intrinsic cares
+ // about?
+ unsigned AddrSpace = ~0u;
+ SmallVector<Value*, 2> PtrOps;
+ Type *AccessTy;
+ if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
+ while (!PtrOps.empty())
+ if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
+ return true;
+ }
+ }
+
+ // From here on out we're working with named functions.
+ if (!CI->getCalledFunction()) return false;
+
+ // Lower all default uses of _chk calls. This is very similar
+ // to what InstCombineCalls does, but here we are only lowering calls
+ // to fortified library functions (e.g. __memcpy_chk) that pass the default
+ // "don't know" value as the object size. Anything else should be left alone.
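+ // For example, a "__memcpy_chk(dst, src, n, (size_t)-1)" call, where the -1
+ // says the object size is unknown, is simplified to a plain
+ // "memcpy(dst, src, n)" here.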
+ FortifiedLibCallSimplifier Simplifier(TLInfo, true);
+ if (Value *V = Simplifier.optimizeCall(CI)) {
+ CI->replaceAllUsesWith(V);
+ CI->eraseFromParent();
+ return true;
+ }
+ return false;
+}
+
+/// Look for opportunities to duplicate return instructions to the predecessor
+/// to enable tail call optimizations. The case it is currently looking for is:
+/// @code
+/// bb0:
+/// %tmp0 = tail call i32 @f0()
+/// br label %return
+/// bb1:
+/// %tmp1 = tail call i32 @f1()
+/// br label %return
+/// bb2:
+/// %tmp2 = tail call i32 @f2()
+/// br label %return
+/// return:
+/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
+/// ret i32 %retval
+/// @endcode
+///
+/// =>
+///
+/// @code
+/// bb0:
+/// %tmp0 = tail call i32 @f0()
+/// ret i32 %tmp0
+/// bb1:
+/// %tmp1 = tail call i32 @f1()
+/// ret i32 %tmp1
+/// bb2:
+/// %tmp2 = tail call i32 @f2()
+/// ret i32 %tmp2
+/// @endcode
+bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
+ if (!TLI)
+ return false;
+
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+ if (!RI)
+ return false;
+
+ PHINode *PN = nullptr;
+ BitCastInst *BCI = nullptr;
+ Value *V = RI->getReturnValue();
+ if (V) {
+ BCI = dyn_cast<BitCastInst>(V);
+ if (BCI)
+ V = BCI->getOperand(0);
+
+ PN = dyn_cast<PHINode>(V);
+ if (!PN)
+ return false;
+ }
+
+ if (PN && PN->getParent() != BB)
+ return false;
+
+ // It's not safe to eliminate the sign / zero extension of the return value.
+ // See llvm::isInTailCallPosition().
+ const Function *F = BB->getParent();
+ AttributeSet CallerAttrs = F->getAttributes();
+ if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ return false;
+
+ // Make sure there are no instructions between the PHI and return, or that the
+ // return is the first instruction in the block.
+ if (PN) {
+ BasicBlock::iterator BI = BB->begin();
+ do { ++BI; } while (isa<DbgInfoIntrinsic>(BI));
+ if (&*BI == BCI)
+ // Also skip over the bitcast.
+ ++BI;
+ if (&*BI != RI)
+ return false;
+ } else {
+ BasicBlock::iterator BI = BB->begin();
+ while (isa<DbgInfoIntrinsic>(BI)) ++BI;
+ if (&*BI != RI)
+ return false;
+ }
+
+ /// Only dup the ReturnInst if the CallInst is likely to be emitted as a tail
+ /// call.
+ SmallVector<CallInst*, 4> TailCalls;
+ if (PN) {
+ for (unsigned I = 0, E = PN->getNumIncomingValues(); I != E; ++I) {
+ CallInst *CI = dyn_cast<CallInst>(PN->getIncomingValue(I));
+ // Make sure the phi value is indeed produced by the tail call.
+ if (CI && CI->hasOneUse() && CI->getParent() == PN->getIncomingBlock(I) &&
+ TLI->mayBeEmittedAsTailCall(CI))
+ TailCalls.push_back(CI);
+ }
+ } else {
+ SmallPtrSet<BasicBlock*, 4> VisitedBBs;
+ for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+ if (!VisitedBBs.insert(*PI).second)
+ continue;
+
+ BasicBlock::InstListType &InstList = (*PI)->getInstList();
+ BasicBlock::InstListType::reverse_iterator RI = InstList.rbegin();
+ BasicBlock::InstListType::reverse_iterator RE = InstList.rend();
+ do { ++RI; } while (RI != RE && isa<DbgInfoIntrinsic>(&*RI));
+ if (RI == RE)
+ continue;
+
+ CallInst *CI = dyn_cast<CallInst>(&*RI);
+ if (CI && CI->use_empty() && TLI->mayBeEmittedAsTailCall(CI))
+ TailCalls.push_back(CI);
+ }
+ }
+
+ bool Changed = false;
+ for (unsigned i = 0, e = TailCalls.size(); i != e; ++i) {
+ CallInst *CI = TailCalls[i];
+ CallSite CS(CI);
+
+ // Conservatively require the attributes of the call to match those of the
+ // return. Ignore noalias because it doesn't affect the call sequence.
+ AttributeSet CalleeAttrs = CS.getAttributes();
+ if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+ removeAttribute(Attribute::NoAlias) !=
+ AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
+ removeAttribute(Attribute::NoAlias))
+ continue;
+
+ // Make sure the call instruction is followed by an unconditional branch to
+ // the return block.
+ BasicBlock *CallBB = CI->getParent();
+ BranchInst *BI = dyn_cast<BranchInst>(CallBB->getTerminator());
+ if (!BI || !BI->isUnconditional() || BI->getSuccessor(0) != BB)
+ continue;
+
+ // Duplicate the return into CallBB.
+ (void)FoldReturnIntoUncondBranch(RI, BB, CallBB);
+ ModifiedDT = Changed = true;
+ ++NumRetsDup;
+ }
+
+ // If we eliminated all predecessors of the block, delete the block now.
+ if (Changed && !BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
+ BB->eraseFromParent();
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// Memory Optimization
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+/// This is an extended version of TargetLowering::AddrMode
+/// which holds actual Value*'s for register values.
+struct ExtAddrMode : public TargetLowering::AddrMode {
+ Value *BaseReg;
+ Value *ScaledReg;
+ ExtAddrMode() : BaseReg(nullptr), ScaledReg(nullptr) {}
+ void print(raw_ostream &OS) const;
+ void dump() const;
+
+ bool operator==(const ExtAddrMode& O) const {
+ return (BaseReg == O.BaseReg) && (ScaledReg == O.ScaledReg) &&
+ (BaseGV == O.BaseGV) && (BaseOffs == O.BaseOffs) &&
+ (HasBaseReg == O.HasBaseReg) && (Scale == O.Scale);
+ }
+};
+
+#ifndef NDEBUG
+static inline raw_ostream &operator<<(raw_ostream &OS, const ExtAddrMode &AM) {
+ AM.print(OS);
+ return OS;
+}
+#endif
+
+void ExtAddrMode::print(raw_ostream &OS) const {
+ bool NeedPlus = false;
+ OS << "[";
+ if (BaseGV) {
+ OS << (NeedPlus ? " + " : "")
+ << "GV:";
+ BaseGV->printAsOperand(OS, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+
+ if (BaseOffs) {
+ OS << (NeedPlus ? " + " : "")
+ << BaseOffs;
+ NeedPlus = true;
+ }
+
+ if (BaseReg) {
+ OS << (NeedPlus ? " + " : "")
+ << "Base:";
+ BaseReg->printAsOperand(OS, /*PrintType=*/false);
+ NeedPlus = true;
+ }
+ if (Scale) {
+ OS << (NeedPlus ? " + " : "")
+ << Scale << "*";
+ ScaledReg->printAsOperand(OS, /*PrintType=*/false);
+ }
+
+ OS << ']';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ExtAddrMode::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+#endif
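+// A fully populated ExtAddrMode prints as, e.g. (operand names illustrative):
+//   [GV:@g + 16 + Base:%base + 4*%idx]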
+
+/// \brief This class provides transaction-based operations on the IR.
+/// Every change made through this class is recorded in the internal state and
+/// can be undone (rollback) until commit is called.
+class TypePromotionTransaction {
+
+ /// \brief This represents the common interface of the individual transaction.
+ /// Each class implements the logic for doing one specific modification on
+ /// the IR via the TypePromotionTransaction.
+ class TypePromotionAction {
+ protected:
+ /// The Instruction modified.
+ Instruction *Inst;
+
+ public:
+ /// \brief Constructor of the action.
+ /// The constructor performs the related action on the IR.
+ TypePromotionAction(Instruction *Inst) : Inst(Inst) {}
+
+ virtual ~TypePromotionAction() {}
+
+ /// \brief Undo the modification done by this action.
+ /// When this method is called, the IR must be in the same state as it was
+ /// before this action was applied.
+ /// \pre Undoing the action works if and only if the IR is in the exact same
+ /// state as it was directly after this action was applied.
+ virtual void undo() = 0;
+
+ /// \brief Commit every change made by this action.
+ /// When the action's effects on the IR are to be kept, it is important to
+ /// call this function; otherwise hidden information may be kept forever.
+ virtual void commit() {
+ // Nothing to be done, this action is not doing anything.
+ }
+ };
+
+ /// \brief Utility to remember the position of an instruction.
+ class InsertionHandler {
+ /// Position of an instruction.
+ /// Either the instruction:
+ /// - is the first in its basic block: BB is used, or
+ /// - has a previous instruction: PrevInst is used.
+ union {
+ Instruction *PrevInst;
+ BasicBlock *BB;
+ } Point;
+ /// Remember whether or not the instruction had a previous instruction.
+ bool HasPrevInstruction;
+
+ public:
+ /// \brief Record the position of \p Inst.
+ InsertionHandler(Instruction *Inst) {
+ BasicBlock::iterator It = Inst->getIterator();
+ HasPrevInstruction = (It != (Inst->getParent()->begin()));
+ if (HasPrevInstruction)
+ Point.PrevInst = &*--It;
+ else
+ Point.BB = Inst->getParent();
+ }
+
+ /// \brief Insert \p Inst at the recorded position.
+ void insert(Instruction *Inst) {
+ if (HasPrevInstruction) {
+ if (Inst->getParent())
+ Inst->removeFromParent();
+ Inst->insertAfter(Point.PrevInst);
+ } else {
+ Instruction *Position = &*Point.BB->getFirstInsertionPt();
+ if (Inst->getParent())
+ Inst->moveBefore(Position);
+ else
+ Inst->insertBefore(Position);
+ }
+ }
+ };
+
+ /// \brief Move an instruction before another.
+ class InstructionMoveBefore : public TypePromotionAction {
+ /// Original position of the instruction.
+ InsertionHandler Position;
+
+ public:
+ /// \brief Move \p Inst before \p Before.
+ InstructionMoveBefore(Instruction *Inst, Instruction *Before)
+ : TypePromotionAction(Inst), Position(Inst) {
+ DEBUG(dbgs() << "Do: move: " << *Inst << "\nbefore: " << *Before << "\n");
+ Inst->moveBefore(Before);
+ }
+
+ /// \brief Move the instruction back to its original position.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: moveBefore: " << *Inst << "\n");
+ Position.insert(Inst);
+ }
+ };
+
+ /// \brief Set the operand of an instruction with a new value.
+ class OperandSetter : public TypePromotionAction {
+ /// Original operand of the instruction.
+ Value *Origin;
+ /// Index of the modified instruction.
+ unsigned Idx;
+
+ public:
+ /// \brief Set \p Idx operand of \p Inst with \p NewVal.
+ OperandSetter(Instruction *Inst, unsigned Idx, Value *NewVal)
+ : TypePromotionAction(Inst), Idx(Idx) {
+ DEBUG(dbgs() << "Do: setOperand: " << Idx << "\n"
+ << "for:" << *Inst << "\n"
+ << "with:" << *NewVal << "\n");
+ Origin = Inst->getOperand(Idx);
+ Inst->setOperand(Idx, NewVal);
+ }
+
+ /// \brief Restore the original value of the instruction.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: setOperand:" << Idx << "\n"
+ << "for: " << *Inst << "\n"
+ << "with: " << *Origin << "\n");
+ Inst->setOperand(Idx, Origin);
+ }
+ };
+
+ /// \brief Hide the operands of an instruction.
+ /// Behave as if this instruction were not using any of its operands.
+ class OperandsHider : public TypePromotionAction {
+ /// The list of original operands.
+ SmallVector<Value *, 4> OriginalValues;
+
+ public:
+ /// \brief Remove \p Inst from the uses of the operands of \p Inst.
+ OperandsHider(Instruction *Inst) : TypePromotionAction(Inst) {
+ DEBUG(dbgs() << "Do: OperandsHider: " << *Inst << "\n");
+ unsigned NumOpnds = Inst->getNumOperands();
+ OriginalValues.reserve(NumOpnds);
+ for (unsigned It = 0; It < NumOpnds; ++It) {
+ // Save the current operand.
+ Value *Val = Inst->getOperand(It);
+ OriginalValues.push_back(Val);
+ // Set a dummy one.
+ // We could use OperandSetter here, but that would imply an overhead
+ // that we are not willing to pay.
+ Inst->setOperand(It, UndefValue::get(Val->getType()));
+ }
+ }
+
+ /// \brief Restore the original list of uses.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: OperandsHider: " << *Inst << "\n");
+ for (unsigned It = 0, EndIt = OriginalValues.size(); It != EndIt; ++It)
+ Inst->setOperand(It, OriginalValues[It]);
+ }
+ };
+
+ /// \brief Build a truncate instruction.
+ class TruncBuilder : public TypePromotionAction {
+ Value *Val;
+ public:
+ /// \brief Build a truncate instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// trunc Opnd to Ty.
+ TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
+ IRBuilder<> Builder(Opnd);
+ Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
+ DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
+ }
+
+ /// \brief Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// \brief Remove the built instruction.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: TruncBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// \brief Build a sign extension instruction.
+ class SExtBuilder : public TypePromotionAction {
+ Value *Val;
+ public:
+ /// \brief Build a sign extension instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// sext Opnd to Ty.
+ SExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
+ : TypePromotionAction(InsertPt) {
+ IRBuilder<> Builder(InsertPt);
+ Val = Builder.CreateSExt(Opnd, Ty, "promoted");
+ DEBUG(dbgs() << "Do: SExtBuilder: " << *Val << "\n");
+ }
+
+ /// \brief Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// \brief Remove the built instruction.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: SExtBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// \brief Build a zero extension instruction.
+ class ZExtBuilder : public TypePromotionAction {
+ Value *Val;
+ public:
+ /// \brief Build a zero extension instruction of \p Opnd producing a \p Ty
+ /// result.
+ /// zext Opnd to Ty.
+ ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
+ : TypePromotionAction(InsertPt) {
+ IRBuilder<> Builder(InsertPt);
+ Val = Builder.CreateZExt(Opnd, Ty, "promoted");
+ DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
+ }
+
+ /// \brief Get the built value.
+ Value *getBuiltValue() { return Val; }
+
+ /// \brief Remove the built instruction.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: ZExtBuilder: " << *Val << "\n");
+ if (Instruction *IVal = dyn_cast<Instruction>(Val))
+ IVal->eraseFromParent();
+ }
+ };
+
+ /// \brief Mutate an instruction to another type.
+ class TypeMutator : public TypePromotionAction {
+ /// Record the original type.
+ Type *OrigTy;
+
+ public:
+ /// \brief Mutate the type of \p Inst into \p NewTy.
+ TypeMutator(Instruction *Inst, Type *NewTy)
+ : TypePromotionAction(Inst), OrigTy(Inst->getType()) {
+ DEBUG(dbgs() << "Do: MutateType: " << *Inst << " with " << *NewTy
+ << "\n");
+ Inst->mutateType(NewTy);
+ }
+
+ /// \brief Mutate the instruction back to its original type.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: MutateType: " << *Inst << " with " << *OrigTy
+ << "\n");
+ Inst->mutateType(OrigTy);
+ }
+ };
+
+ /// \brief Replace the uses of an instruction by another instruction.
+ class UsesReplacer : public TypePromotionAction {
+ /// Helper structure to keep track of the replaced uses.
+ struct InstructionAndIdx {
+ /// The instruction using the replaced instruction.
+ Instruction *Inst;
+ /// The operand index at which the replaced instruction is used by Inst.
+ unsigned Idx;
+ InstructionAndIdx(Instruction *Inst, unsigned Idx)
+ : Inst(Inst), Idx(Idx) {}
+ };
+
+ /// Keep track of the original uses (pair Instruction, Index).
+ SmallVector<InstructionAndIdx, 4> OriginalUses;
+ typedef SmallVectorImpl<InstructionAndIdx>::iterator use_iterator;
+
+ public:
+ /// \brief Replace all the uses of \p Inst with \p New.
+ UsesReplacer(Instruction *Inst, Value *New) : TypePromotionAction(Inst) {
+ DEBUG(dbgs() << "Do: UsersReplacer: " << *Inst << " with " << *New
+ << "\n");
+ // Record the original uses.
+ for (Use &U : Inst->uses()) {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ OriginalUses.push_back(InstructionAndIdx(UserI, U.getOperandNo()));
+ }
+ // Now, we can replace the uses.
+ Inst->replaceAllUsesWith(New);
+ }
+
+ /// \brief Reassign the original uses of Inst back to Inst.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: UsersReplacer: " << *Inst << "\n");
+ for (use_iterator UseIt = OriginalUses.begin(),
+ EndIt = OriginalUses.end();
+ UseIt != EndIt; ++UseIt) {
+ UseIt->Inst->setOperand(UseIt->Idx, Inst);
+ }
+ }
+ };
+
+ /// \brief Remove an instruction from the IR.
+ class InstructionRemover : public TypePromotionAction {
+ /// Original position of the instruction.
+ InsertionHandler Inserter;
+ /// Helper structure to hide all the links to the instruction. In other
+ /// words, this helps to pretend the instruction was removed.
+ OperandsHider Hider;
+ /// Keep track of the uses replaced, if any.
+ UsesReplacer *Replacer;
+
+ public:
+ /// \brief Remove all references to \p Inst and optionally replace all its
+ /// uses with \p New.
+ /// \pre If !Inst->use_empty(), then New != nullptr
+ InstructionRemover(Instruction *Inst, Value *New = nullptr)
+ : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
+ Replacer(nullptr) {
+ if (New)
+ Replacer = new UsesReplacer(Inst, New);
+ DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+ Inst->removeFromParent();
+ }
+
+ ~InstructionRemover() override { delete Replacer; }
+
+ /// \brief Really remove the instruction.
+ void commit() override { delete Inst; }
+
+ /// \brief Resurrect the instruction and reassign it to the proper uses if
+ /// a new value was provided when building this action.
+ void undo() override {
+ DEBUG(dbgs() << "Undo: InstructionRemover: " << *Inst << "\n");
+ Inserter.insert(Inst);
+ if (Replacer)
+ Replacer->undo();
+ Hider.undo();
+ }
+ };
+
+public:
+ /// Restoration point.
+ /// The restoration point is a pointer to an action instead of an iterator
+ /// because the iterator may be invalidated but not the pointer.
+ typedef const TypePromotionAction *ConstRestorationPt;
+ /// Commit every change made in this transaction.
+ void commit();
+ /// Undo all the changes made after the given point.
+ void rollback(ConstRestorationPt Point);
+ /// Get the current restoration point.
+ ConstRestorationPt getRestorationPoint() const;
+
+ /// \name API for IR modification with state keeping to support rollback.
+ /// @{
+ /// Same as Instruction::setOperand.
+ void setOperand(Instruction *Inst, unsigned Idx, Value *NewVal);
+ /// Same as Instruction::eraseFromParent.
+ void eraseInstruction(Instruction *Inst, Value *NewVal = nullptr);
+ /// Same as Value::replaceAllUsesWith.
+ void replaceAllUsesWith(Instruction *Inst, Value *New);
+ /// Same as Value::mutateType.
+ void mutateType(Instruction *Inst, Type *NewTy);
+ /// Same as IRBuilder::createTrunc.
+ Value *createTrunc(Instruction *Opnd, Type *Ty);
+ /// Same as IRBuilder::createSExt.
+ Value *createSExt(Instruction *Inst, Value *Opnd, Type *Ty);
+ /// Same as IRBuilder::createZExt.
+ Value *createZExt(Instruction *Inst, Value *Opnd, Type *Ty);
+ /// Same as Instruction::moveBefore.
+ void moveBefore(Instruction *Inst, Instruction *Before);
+ /// @}
+
+private:
+ /// The ordered list of actions made so far.
+ SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
+ typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
+};
+
+void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
+ Value *NewVal) {
+ Actions.push_back(
+ make_unique<TypePromotionTransaction::OperandSetter>(Inst, Idx, NewVal));
+}
+
+void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
+ Value *NewVal) {
+ Actions.push_back(
+ make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal));
+}
+
+void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
+ Value *New) {
+ Actions.push_back(make_unique<TypePromotionTransaction::UsesReplacer>(Inst, New));
+}
+
+void TypePromotionTransaction::mutateType(Instruction *Inst, Type *NewTy) {
+ Actions.push_back(make_unique<TypePromotionTransaction::TypeMutator>(Inst, NewTy));
+}
+
+Value *TypePromotionTransaction::createTrunc(Instruction *Opnd,
+ Type *Ty) {
+ std::unique_ptr<TruncBuilder> Ptr(new TruncBuilder(Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+Value *TypePromotionTransaction::createSExt(Instruction *Inst,
+ Value *Opnd, Type *Ty) {
+ std::unique_ptr<SExtBuilder> Ptr(new SExtBuilder(Inst, Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+Value *TypePromotionTransaction::createZExt(Instruction *Inst,
+ Value *Opnd, Type *Ty) {
+ std::unique_ptr<ZExtBuilder> Ptr(new ZExtBuilder(Inst, Opnd, Ty));
+ Value *Val = Ptr->getBuiltValue();
+ Actions.push_back(std::move(Ptr));
+ return Val;
+}
+
+void TypePromotionTransaction::moveBefore(Instruction *Inst,
+ Instruction *Before) {
+ Actions.push_back(
+ make_unique<TypePromotionTransaction::InstructionMoveBefore>(Inst, Before));
+}
+
+TypePromotionTransaction::ConstRestorationPt
+TypePromotionTransaction::getRestorationPoint() const {
+ return !Actions.empty() ? Actions.back().get() : nullptr;
+}
+
+void TypePromotionTransaction::commit() {
+ for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
+ ++It)
+ (*It)->commit();
+ Actions.clear();
+}
+
+void TypePromotionTransaction::rollback(
+ TypePromotionTransaction::ConstRestorationPt Point) {
+ while (!Actions.empty() && Point != Actions.back().get()) {
+ std::unique_ptr<TypePromotionAction> Curr = Actions.pop_back_val();
+ Curr->undo();
+ }
+}
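+// Typical use of the transaction machinery above (schematic; the
+// profitability check is illustrative):
+//   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+//       TPT.getRestorationPoint();
+//   TPT.setOperand(Inst, 0, NewVal);   // recorded action
+//   if (!StillProfitable)
+//     TPT.rollback(LastKnownGood);     // undo everything after the point
+//   else
+//     TPT.commit();                    // make the recorded changes permanent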
+
+/// \brief A helper class for matching addressing modes.
+///
+/// This encapsulates the logic for matching the target-legal addressing modes.
+class AddressingModeMatcher {
+ SmallVectorImpl<Instruction*> &AddrModeInsts;
+ const TargetMachine &TM;
+ const TargetLowering &TLI;
+ const DataLayout &DL;
+
+ /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
+ /// the memory instruction that we're computing this address for.
+ Type *AccessTy;
+ unsigned AddrSpace;
+ Instruction *MemoryInst;
+
+ /// This is the addressing mode that we're building up. This is
+ /// part of the return value of this addressing mode matching stuff.
+ ExtAddrMode &AddrMode;
+
+ /// The instructions inserted by other CodeGenPrepare optimizations.
+ const SetOfInstrs &InsertedInsts;
+ /// A map from the instructions to their type before promotion.
+ InstrToOrigTy &PromotedInsts;
+ /// The ongoing transaction where every action should be registered.
+ TypePromotionTransaction &TPT;
+
+ /// This is set to true when we should not do profitability checks.
+ /// When true, IsProfitableToFoldIntoAddressingMode always returns true.
+ bool IgnoreProfitability;
+
+ AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
+ const TargetMachine &TM, Type *AT, unsigned AS,
+ Instruction *MI, ExtAddrMode &AM,
+ const SetOfInstrs &InsertedInsts,
+ InstrToOrigTy &PromotedInsts,
+ TypePromotionTransaction &TPT)
+ : AddrModeInsts(AMI), TM(TM),
+ TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
+ ->getTargetLowering()),
+ DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
+ MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
+ PromotedInsts(PromotedInsts), TPT(TPT) {
+ IgnoreProfitability = false;
+ }
+public:
+
+ /// Find the maximal addressing mode that a load/store of V can fold,
+ /// given an access type of AccessTy. This returns a list of involved
+ /// instructions in AddrModeInsts.
+ /// \p InsertedInsts The instructions inserted by other CodeGenPrepare
+ /// optimizations.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ /// \p TPT is the ongoing transaction where every action should be registered.
+ static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
+ Instruction *MemoryInst,
+ SmallVectorImpl<Instruction*> &AddrModeInsts,
+ const TargetMachine &TM,
+ const SetOfInstrs &InsertedInsts,
+ InstrToOrigTy &PromotedInsts,
+ TypePromotionTransaction &TPT) {
+ ExtAddrMode Result;
+
+ bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
+ MemoryInst, Result, InsertedInsts,
+ PromotedInsts, TPT).matchAddr(V, 0);
+ (void)Success; assert(Success && "Couldn't select *anything*?");
+ return Result;
+ }
+private:
+ bool matchScaledValue(Value *ScaleReg, int64_t Scale, unsigned Depth);
+ bool matchAddr(Value *V, unsigned Depth);
+ bool matchOperationAddr(User *Operation, unsigned Opcode, unsigned Depth,
+ bool *MovedAway = nullptr);
+ bool isProfitableToFoldIntoAddressingMode(Instruction *I,
+ ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter);
+ bool valueAlreadyLiveAtInst(Value *Val, Value *KnownLive1, Value *KnownLive2);
+ bool isPromotionProfitable(unsigned NewCost, unsigned OldCost,
+ Value *PromotedOperand) const;
+};
+
+/// Try adding ScaleReg*Scale to the current addressing mode.
+/// Return true and update AddrMode if this addr mode is legal for the target,
+/// false if not.
+bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
+ unsigned Depth) {
+ // If Scale is 1, then this is the same as adding ScaleReg to the addressing
+ // mode. Just process that directly.
+ if (Scale == 1)
+ return matchAddr(ScaleReg, Depth);
+
+ // If the scale is 0, it takes nothing to add this.
+ if (Scale == 0)
+ return true;
+
+ // If we already have a scale of this value, we can add to it, otherwise, we
+ // need an available scale field.
+ if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
+ return false;
+
+ ExtAddrMode TestAddrMode = AddrMode;
+
+ // Add scale to turn X*4+X*3 -> X*7. This could also do things like
+ // [A+B + A*7] -> [B+A*8].
+ TestAddrMode.Scale += Scale;
+ TestAddrMode.ScaledReg = ScaleReg;
+
+ // If the new address isn't legal, bail out.
+ if (!TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace))
+ return false;
+
+ // It was legal, so commit it.
+ AddrMode = TestAddrMode;
+
+ // Okay, we decided that we can add ScaleReg+Scale to AddrMode. Check now
+ // to see if ScaleReg is actually X+C. If so, we can turn this into adding
+ // X*Scale + C*Scale to addr mode.
+ ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
+ if (isa<Instruction>(ScaleReg) && // not a constant expr.
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+ TestAddrMode.ScaledReg = AddLHS;
+ TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+
+ // If this addressing mode is legal, commit it and remember that we folded
+ // this instruction.
+ if (TLI.isLegalAddressingMode(DL, TestAddrMode, AccessTy, AddrSpace)) {
+ AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
+ AddrMode = TestAddrMode;
+ return true;
+ }
+ }
+
+ // Otherwise, not (x+c)*scale, just return what we have.
+ return true;
+}
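+// For example (names illustrative), matching "%idx = add i64 %i, 3" as the
+// scaled register with Scale == 4 sets ScaledReg = %i, Scale = 4, and adds
+// 3 * 4 = 12 to BaseOffs, provided the target accepts the resulting mode.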
+
+/// This is a little filter, which returns true if an addressing computation
+/// involving I might be folded into a load/store accessing it.
+/// This doesn't need to be perfect, but needs to accept at least
+/// the set of instructions that MatchOperationAddr can.
+static bool MightBeFoldableInst(Instruction *I) {
+ switch (I->getOpcode()) {
+ case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
+ // Don't touch identity bitcasts.
+ if (I->getType() == I->getOperand(0)->getType())
+ return false;
+ return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return true;
+ case Instruction::IntToPtr:
+ // We know the input is intptr_t, so this is foldable.
+ return true;
+ case Instruction::Add:
+ return true;
+ case Instruction::Mul:
+ case Instruction::Shl:
+ // Can only handle X*C and X << C.
+ return isa<ConstantInt>(I->getOperand(1));
+ case Instruction::GetElementPtr:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// \brief Check whether or not \p Val is a legal instruction for \p TLI.
+/// \note \p Val is assumed to be the product of some type promotion.
+/// Therefore if \p Val has an undefined state in \p TLI, this is assumed
+/// to be legal, as the non-promoted value would have had the same state.
+static bool isPromotedInstructionLegal(const TargetLowering &TLI,
+ const DataLayout &DL, Value *Val) {
+ Instruction *PromotedInst = dyn_cast<Instruction>(Val);
+ if (!PromotedInst)
+ return false;
+ int ISDOpcode = TLI.InstructionOpcodeToISD(PromotedInst->getOpcode());
+ // If the ISDOpcode is undefined, it was undefined before the promotion.
+ if (!ISDOpcode)
+ return true;
+ // Otherwise, check if the promoted instruction is legal or not.
+ return TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, PromotedInst->getType()));
+}
+
+/// \brief Helper class to perform type promotion.
+class TypePromotionHelper {
+ /// \brief Utility function to check whether or not a sign or zero extension
+ /// of \p Inst with \p ConsideredExtType can be moved through \p Inst by
+ /// either using the operands of \p Inst or promoting \p Inst.
+ /// The type of the extension is defined by \p IsSExt.
+ /// In other words, check if:
+ /// ext (Ty Inst opnd1 opnd2 ... opndN) to ConsideredExtType.
+ /// #1 Promotion applies:
+ /// ConsideredExtType Inst (ext opnd1 to ConsideredExtType, ...).
+ /// #2 Operand reuses:
+ /// ext opnd1 to ConsideredExtType.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ static bool canGetThrough(const Instruction *Inst, Type *ConsideredExtType,
+ const InstrToOrigTy &PromotedInsts, bool IsSExt);
+
+ /// \brief Utility function to determine if \p OpIdx should be promoted when
+ /// promoting \p Inst.
+ static bool shouldExtOperand(const Instruction *Inst, int OpIdx) {
+ return !(isa<SelectInst>(Inst) && OpIdx == 0);
+ }
+
+ /// \brief Utility function to promote the operand of \p Ext when this
+ /// operand is a promotable trunc or sext or zext.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ /// \p CreatedInstsCost[out] contains the cost of all instructions
+ /// created to promote the operand of Ext.
+ /// Newly added extensions are inserted in \p Exts.
+ /// Newly added truncates are inserted in \p Truncs.
+ /// Should never be called directly.
+ /// \return The promoted value which is used instead of Ext.
+ static Value *promoteOperandForTruncAndAnyExt(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI);
+
+ /// \brief Utility function to promote the operand of \p Ext when this
+ /// operand is promotable and is not a supported trunc or sext.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ /// \p CreatedInstsCost[out] contains the cost of all the instructions
+ /// created to promote the operand of Ext.
+ /// Newly added extensions are inserted in \p Exts.
+ /// Newly added truncates are inserted in \p Truncs.
+ /// Should never be called directly.
+ /// \return The promoted value which is used instead of Ext.
+ static Value *promoteOperandForOther(Instruction *Ext,
+ TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts,
+ unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs,
+ const TargetLowering &TLI, bool IsSExt);
+
+ /// \see promoteOperandForOther.
+ static Value *signExtendOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+ Exts, Truncs, TLI, true);
+ }
+
+ /// \see promoteOperandForOther.
+ static Value *zeroExtendOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ return promoteOperandForOther(Ext, TPT, PromotedInsts, CreatedInstsCost,
+ Exts, Truncs, TLI, false);
+ }
+
+public:
+ /// Type for the utility function that promotes the operand of Ext.
+ typedef Value *(*Action)(Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts,
+ unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs,
+ const TargetLowering &TLI);
+ /// \brief Given a sign/zero extend instruction \p Ext, return the appropriate
+ /// action to promote the operand of \p Ext instead of using Ext.
+ /// \return NULL if no promotable action is possible with the current
+ /// sign extension.
+ /// \p InsertedInsts keeps track of all the instructions inserted by the
+ /// other CodeGenPrepare optimizations. This information is important
+ /// because we do not want to promote these instructions as CodeGenPrepare
+ /// will reinsert them later, thus creating an infinite create/remove loop.
+ /// \p PromotedInsts maps the instructions to their type before promotion.
+ static Action getAction(Instruction *Ext, const SetOfInstrs &InsertedInsts,
+ const TargetLowering &TLI,
+ const InstrToOrigTy &PromotedInsts);
+};
+
+bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
+ Type *ConsideredExtType,
+ const InstrToOrigTy &PromotedInsts,
+ bool IsSExt) {
+ // The promotion helper does not know how to deal with vector types yet.
+ // To be able to fix that, we would need to fix the places where we
+ // statically extend, e.g., constants and such.
+ if (Inst->getType()->isVectorTy())
+ return false;
+
+ // We can always get through zext.
+ if (isa<ZExtInst>(Inst))
+ return true;
+
+ // sext(sext) is ok too.
+ if (IsSExt && isa<SExtInst>(Inst))
+ return true;
+
+ // We can get through binary operator, if it is legal. In other words, the
+ // binary operator must have a nuw or nsw flag.
+ const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
+ if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+ ((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
+ (IsSExt && BinOp->hasNoSignedWrap())))
+ return true;
+
+ // Check if we can do the following simplification.
+ // ext(trunc(opnd)) --> ext(opnd)
+ if (!isa<TruncInst>(Inst))
+ return false;
+
+ Value *OpndVal = Inst->getOperand(0);
+ // Check if we can use this operand in the extension.
+ // If the type is larger than the result type of the extension, we cannot.
+ if (!OpndVal->getType()->isIntegerTy() ||
+ OpndVal->getType()->getIntegerBitWidth() >
+ ConsideredExtType->getIntegerBitWidth())
+ return false;
+
+ // If the operand of the truncate is not an instruction, we will not have
+ // any information on the dropped bits.
+ // (Actually we could for constant but it is not worth the extra logic).
+ Instruction *Opnd = dyn_cast<Instruction>(OpndVal);
+ if (!Opnd)
+ return false;
+
+ // Check that the original (pre-extension) type is narrow enough, i.e., that
+ // the trunc just drops extended bits of the same kind as the extension.
+ // #1 get the type of the operand and check the kind of the extended bits.
+ const Type *OpndType;
+ InstrToOrigTy::const_iterator It = PromotedInsts.find(Opnd);
+ if (It != PromotedInsts.end() && It->second.getInt() == IsSExt)
+ OpndType = It->second.getPointer();
+ else if ((IsSExt && isa<SExtInst>(Opnd)) || (!IsSExt && isa<ZExtInst>(Opnd)))
+ OpndType = Opnd->getOperand(0)->getType();
+ else
+ return false;
+
+ // #2 check that the truncate just drops extended bits.
+ return Inst->getType()->getIntegerBitWidth() >=
+ OpndType->getIntegerBitWidth();
+}
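+// For example (schematic IR), with
+//   %w = sext i16 %v to i64
+//   %t = trunc i64 %w to i32
+//   %e = sext i32 %t to i64
+// the truncate only drops bits produced by the inner sext, so the outer sext
+// can get through it and end up using %w directly.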
+
+TypePromotionHelper::Action TypePromotionHelper::getAction(
+ Instruction *Ext, const SetOfInstrs &InsertedInsts,
+ const TargetLowering &TLI, const InstrToOrigTy &PromotedInsts) {
+ assert((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ "Unexpected instruction type");
+ Instruction *ExtOpnd = dyn_cast<Instruction>(Ext->getOperand(0));
+ Type *ExtTy = Ext->getType();
+ bool IsSExt = isa<SExtInst>(Ext);
+ // If the operand of the extension is not an instruction, we cannot
+ // get through.
+ // If it is, check whether we can get through.
+ if (!ExtOpnd || !canGetThrough(ExtOpnd, ExtTy, PromotedInsts, IsSExt))
+ return nullptr;
+
+ // Do not promote if the operand has been added by codegenprepare.
+ // Otherwise, it means we are undoing an optimization that is likely to be
+ // redone, thus causing a potential infinite loop.
+ if (isa<TruncInst>(ExtOpnd) && InsertedInsts.count(ExtOpnd))
+ return nullptr;
+
+ // SExt or Trunc instructions.
+ // Return the related handler.
+ if (isa<SExtInst>(ExtOpnd) || isa<TruncInst>(ExtOpnd) ||
+ isa<ZExtInst>(ExtOpnd))
+ return promoteOperandForTruncAndAnyExt;
+
+ // Regular instruction.
+ // Abort early if we will have to insert non-free instructions.
+ if (!ExtOpnd->hasOneUse() && !TLI.isTruncateFree(ExtTy, ExtOpnd->getType()))
+ return nullptr;
+ return IsSExt ? signExtendOperandForOther : zeroExtendOperandForOther;
+}
+
+Value *TypePromotionHelper::promoteOperandForTruncAndAnyExt(
+ llvm::Instruction *SExt, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI) {
+ // By construction, the operand of SExt is an instruction. Otherwise we cannot
+ // get through it and this method should not be called.
+ Instruction *SExtOpnd = cast<Instruction>(SExt->getOperand(0));
+ Value *ExtVal = SExt;
+ bool HasMergedNonFreeExt = false;
+ if (isa<ZExtInst>(SExtOpnd)) {
+ // Replace s|zext(zext(opnd))
+ // => zext(opnd).
+ HasMergedNonFreeExt = !TLI.isExtFree(SExtOpnd);
+ Value *ZExt =
+ TPT.createZExt(SExt, SExtOpnd->getOperand(0), SExt->getType());
+ TPT.replaceAllUsesWith(SExt, ZExt);
+ TPT.eraseInstruction(SExt);
+ ExtVal = ZExt;
+ } else {
+ // Replace z|sext(trunc(opnd)) or sext(sext(opnd))
+ // => z|sext(opnd).
+ TPT.setOperand(SExt, 0, SExtOpnd->getOperand(0));
+ }
+ CreatedInstsCost = 0;
+
+ // Remove dead code.
+ if (SExtOpnd->use_empty())
+ TPT.eraseInstruction(SExtOpnd);
+
+ // Check if the extension is still needed.
+ Instruction *ExtInst = dyn_cast<Instruction>(ExtVal);
+ if (!ExtInst || ExtInst->getType() != ExtInst->getOperand(0)->getType()) {
+ if (ExtInst) {
+ if (Exts)
+ Exts->push_back(ExtInst);
+ CreatedInstsCost = !TLI.isExtFree(ExtInst) && !HasMergedNonFreeExt;
+ }
+ return ExtVal;
+ }
+
+ // At this point we have: ext ty opnd to ty.
+ // Reassign the uses of ExtInst to the opnd and remove ExtInst.
+ Value *NextVal = ExtInst->getOperand(0);
+ TPT.eraseInstruction(ExtInst, NextVal);
+ return NextVal;
+}
+
+Value *TypePromotionHelper::promoteOperandForOther(
+ Instruction *Ext, TypePromotionTransaction &TPT,
+ InstrToOrigTy &PromotedInsts, unsigned &CreatedInstsCost,
+ SmallVectorImpl<Instruction *> *Exts,
+ SmallVectorImpl<Instruction *> *Truncs, const TargetLowering &TLI,
+ bool IsSExt) {
+ // By construction, the operand of Ext is an instruction. Otherwise we cannot
+ // get through it and this method should not be called.
+ Instruction *ExtOpnd = cast<Instruction>(Ext->getOperand(0));
+ CreatedInstsCost = 0;
+ if (!ExtOpnd->hasOneUse()) {
+ // ExtOpnd will be promoted.
+ // All its uses, but Ext, will need to use a truncated value of the
+ // promoted version.
+ // Create the truncate now.
+ Value *Trunc = TPT.createTrunc(Ext, ExtOpnd->getType());
+ if (Instruction *ITrunc = dyn_cast<Instruction>(Trunc)) {
+ ITrunc->removeFromParent();
+ // Insert it just after the definition.
+ ITrunc->insertAfter(ExtOpnd);
+ if (Truncs)
+ Truncs->push_back(ITrunc);
+ }
+
+ TPT.replaceAllUsesWith(ExtOpnd, Trunc);
+ // Restore the operand of Ext (which has been replaced by the previous call
+ // to replaceAllUsesWith) to avoid creating a cycle trunc <-> sext.
+ TPT.setOperand(Ext, 0, ExtOpnd);
+ }
+
+ // Get through the Instruction:
+ // 1. Update its type.
+ // 2. Replace the uses of Ext by Inst.
+ // 3. Extend each operand that needs to be extended.
+
+ // Remember the original type of the instruction before promotion.
+ // This is useful to know that the high bits are sign extended bits.
+ PromotedInsts.insert(std::pair<Instruction *, TypeIsSExt>(
+ ExtOpnd, TypeIsSExt(ExtOpnd->getType(), IsSExt)));
+ // Step #1.
+ TPT.mutateType(ExtOpnd, Ext->getType());
+ // Step #2.
+ TPT.replaceAllUsesWith(Ext, ExtOpnd);
+ // Step #3.
+ Instruction *ExtForOpnd = Ext;
+
+ DEBUG(dbgs() << "Propagate Ext to operands\n");
+ for (int OpIdx = 0, EndOpIdx = ExtOpnd->getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "Operand:\n" << *(ExtOpnd->getOperand(OpIdx)) << '\n');
+ if (ExtOpnd->getOperand(OpIdx)->getType() == Ext->getType() ||
+ !shouldExtOperand(ExtOpnd, OpIdx)) {
+ DEBUG(dbgs() << "No need to propagate\n");
+ continue;
+ }
+ // Check if we can statically extend the operand.
+ Value *Opnd = ExtOpnd->getOperand(OpIdx);
+ if (const ConstantInt *Cst = dyn_cast<ConstantInt>(Opnd)) {
+ DEBUG(dbgs() << "Statically extend\n");
+ unsigned BitWidth = Ext->getType()->getIntegerBitWidth();
+ APInt CstVal = IsSExt ? Cst->getValue().sext(BitWidth)
+ : Cst->getValue().zext(BitWidth);
+ TPT.setOperand(ExtOpnd, OpIdx, ConstantInt::get(Ext->getType(), CstVal));
+ continue;
+ }
+ // UndefValues are typed, so we have to statically extend them.
+ if (isa<UndefValue>(Opnd)) {
+ DEBUG(dbgs() << "Statically extend\n");
+ TPT.setOperand(ExtOpnd, OpIdx, UndefValue::get(Ext->getType()));
+ continue;
+ }
+
+ // Otherwise we have to explicitly extend the operand.
+ // Check whether Ext has already been reused to extend an operand.
+ if (!ExtForOpnd) {
+ // If so, create a new extension.
+ DEBUG(dbgs() << "More operands to ext\n");
+ Value *ValForExtOpnd = IsSExt ? TPT.createSExt(Ext, Opnd, Ext->getType())
+ : TPT.createZExt(Ext, Opnd, Ext->getType());
+ if (!isa<Instruction>(ValForExtOpnd)) {
+ TPT.setOperand(ExtOpnd, OpIdx, ValForExtOpnd);
+ continue;
+ }
+ ExtForOpnd = cast<Instruction>(ValForExtOpnd);
+ }
+ if (Exts)
+ Exts->push_back(ExtForOpnd);
+ TPT.setOperand(ExtForOpnd, 0, Opnd);
+
+ // Move the sign extension before the insertion point.
+ TPT.moveBefore(ExtForOpnd, ExtOpnd);
+ TPT.setOperand(ExtOpnd, OpIdx, ExtForOpnd);
+ CreatedInstsCost += !TLI.isExtFree(ExtForOpnd);
+ // If more sext are required, new instructions will have to be created.
+ ExtForOpnd = nullptr;
+ }
+ if (ExtForOpnd == Ext) {
+ DEBUG(dbgs() << "Extension is useless now\n");
+ TPT.eraseInstruction(Ext);
+ }
+ return ExtOpnd;
+}
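+// Schematic example of the promotion performed above (sext of an "add nsw"
+// from i32 to i64; value names are illustrative):
+//   %a = add nsw i32 %x, 1
+//   %e = sext i32 %a to i64
+// becomes, roughly:
+//   %px = sext i32 %x to i64
+//   %pa = add nsw i64 %px, 1
+// with the uses of %e rewired to %pa; if the original add had other users, a
+// trunc of %pa back to i32 is inserted for them.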
+
+/// Check whether or not promoting an instruction to a wider type is profitable.
+/// \p NewCost gives the cost of extension instructions created by the
+/// promotion.
+/// \p OldCost gives the cost of extension instructions before the promotion
+/// plus the number of instructions that have been matched into the
+/// addressing mode thanks to the promotion.
+/// \p PromotedOperand is the value that has been promoted.
+/// \return True if the promotion is profitable, false otherwise.
+bool AddressingModeMatcher::isPromotionProfitable(
+ unsigned NewCost, unsigned OldCost, Value *PromotedOperand) const {
+ DEBUG(dbgs() << "OldCost: " << OldCost << "\tNewCost: " << NewCost << '\n');
+ // The cost of the new extensions is greater than the cost of the
+ // old extension plus what we folded.
+ // This is not profitable.
+ if (NewCost > OldCost)
+ return false;
+ if (NewCost < OldCost)
+ return true;
+ // The promotion is neutral but it may help folding the sign extension in
+ // loads for instance.
+ // Check that we did not create an illegal instruction.
+ return isPromotedInstructionLegal(TLI, DL, PromotedOperand);
+}
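+// For instance, if the promotion created one non-free extension (NewCost = 1)
+// while the original extension cost 1 and one extra instruction was folded
+// into the addressing mode (OldCost = 1 + 1 = 2), the promotion is profitable.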
+
+/// Given an instruction or constant expr, see if we can fold the operation
+/// into the addressing mode. If so, update the addressing mode and return
+/// true, otherwise return false without modifying AddrMode.
+/// If \p MovedAway is not NULL, it records whether or not \p AddrInst has to
+/// be folded into the addressing mode on success.
+/// If \p MovedAway == true, \p AddrInst will not be part of the addressing
+/// mode because it has been moved away.
+/// Thus \p AddrInst must not be added to the matched instructions.
+/// This can happen when \p AddrInst is a sext, since it may be moved away.
+/// Therefore, \p AddrInst may not be valid when \p MovedAway is true and it
+/// must not be referenced anymore.
+bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
+ unsigned Depth,
+ bool *MovedAway) {
+ // Avoid exponential behavior on extremely deep expression trees.
+ if (Depth >= 5) return false;
+
+ // By default, all matched instructions stay in place.
+ if (MovedAway)
+ *MovedAway = false;
+
+ switch (Opcode) {
+ case Instruction::PtrToInt:
+ // PtrToInt is always a noop, as we know that the int type is pointer sized.
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ case Instruction::IntToPtr: {
+ auto AS = AddrInst->getType()->getPointerAddressSpace();
+ auto PtrTy = MVT::getIntegerVT(DL.getPointerSizeInBits(AS));
+ // This inttoptr is a no-op if the integer type is pointer sized.
+ if (TLI.getValueType(DL, AddrInst->getOperand(0)->getType()) == PtrTy)
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ }
+ case Instruction::BitCast:
+ // BitCast is always a noop, and we can handle it as long as it is
+ // int->int or pointer->pointer (we don't want int<->fp or something).
+ if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
+ AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
+ // Don't touch identity bitcasts. These were probably put here by LSR,
+ // and we don't want to mess around with them. Assume it knows what it
+ // is doing.
+ AddrInst->getOperand(0)->getType() != AddrInst->getType())
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ case Instruction::AddrSpaceCast: {
+ unsigned SrcAS
+ = AddrInst->getOperand(0)->getType()->getPointerAddressSpace();
+ unsigned DestAS = AddrInst->getType()->getPointerAddressSpace();
+ if (TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+ return matchAddr(AddrInst->getOperand(0), Depth);
+ return false;
+ }
+ case Instruction::Add: {
+ // Check to see if we can merge in the RHS then the LHS. If so, we win.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+ // Start a transaction at this point.
+ // The LHS may match but not the RHS.
+ // Therefore, we need a higher level restoration point to undo partially
+ // matched operation.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+
+ if (matchAddr(AddrInst->getOperand(1), Depth+1) &&
+ matchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+
+ // Restore the old addr mode info.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+
+ // Otherwise this was over-aggressive. Try merging in the LHS then the RHS.
+ if (matchAddr(AddrInst->getOperand(0), Depth+1) &&
+ matchAddr(AddrInst->getOperand(1), Depth+1))
+ return true;
+
+ // Otherwise we definitely can't merge the ADD in.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+ break;
+ }
+ //case Instruction::Or:
+ // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+ //break;
+ case Instruction::Mul:
+ case Instruction::Shl: {
+ // Can only handle X*C and X << C.
+ ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+ if (!RHS)
+ return false;
+ int64_t Scale = RHS->getSExtValue();
+ if (Opcode == Instruction::Shl)
+ Scale = 1LL << Scale;
+
+ return matchScaledValue(AddrInst->getOperand(0), Scale, Depth);
+ }
+ case Instruction::GetElementPtr: {
+ // Scan the GEP. We check whether it contains constant offsets and at most
+ // one variable offset.
+ int VariableOperand = -1;
+ unsigned VariableScale = 0;
+
+ int64_t ConstantOffset = 0;
+ gep_type_iterator GTI = gep_type_begin(AddrInst);
+ for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ unsigned Idx =
+ cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+ ConstantOffset += SL->getElementOffset(Idx);
+ } else {
+ uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ ConstantOffset += CI->getSExtValue()*TypeSize;
+ } else if (TypeSize) { // Scales of zero don't do anything.
+ // We only allow one variable index at the moment.
+ if (VariableOperand != -1)
+ return false;
+
+ // Remember the variable index.
+ VariableOperand = i;
+ VariableScale = TypeSize;
+ }
+ }
+ }
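+ // For example (names illustrative), for
+ //   getelementptr %struct.S, %struct.S* %p, i64 %i, i32 1
+ // the constant i32 index contributes the byte offset of field 1 to
+ // ConstantOffset, while %i is the single variable index with a scale of
+ // the allocation size of %struct.S.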
+
+ // A common case is for the GEP to only do a constant offset. In this case,
+ // just add it to the disp field and check validity.
+ if (VariableOperand == -1) {
+ AddrMode.BaseOffs += ConstantOffset;
+ if (ConstantOffset == 0 ||
+ TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace)) {
+ // Check to see if we can fold the base pointer in too.
+ if (matchAddr(AddrInst->getOperand(0), Depth+1))
+ return true;
+ }
+ AddrMode.BaseOffs -= ConstantOffset;
+ return false;
+ }
+
+ // Save the valid addressing mode in case we can't match.
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // See if the scale and offset amount is valid for this target.
+ AddrMode.BaseOffs += ConstantOffset;
+
+ // Match the base operand of the GEP.
+ if (!matchAddr(AddrInst->getOperand(0), Depth+1)) {
+ // If it couldn't be matched, just stuff the value in a register.
+ if (AddrMode.HasBaseReg) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ }
+
+ // Match the remaining variable portion of the GEP.
+ if (!matchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
+ Depth)) {
+ // If it couldn't be matched, try stuffing the base into a register
+ // instead of matching it, and retrying the match of the scale.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ if (AddrMode.HasBaseReg)
+ return false;
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = AddrInst->getOperand(0);
+ AddrMode.BaseOffs += ConstantOffset;
+ if (!matchScaledValue(AddrInst->getOperand(VariableOperand),
+ VariableScale, Depth)) {
+ // If even that didn't work, bail.
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ return false;
+ }
+ }
+
+ return true;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt: {
+ Instruction *Ext = dyn_cast<Instruction>(AddrInst);
+ if (!Ext)
+ return false;
+
+ // Try to move this ext out of the way of the addressing mode.
+ // Ask for a method for doing so.
+ TypePromotionHelper::Action TPH =
+ TypePromotionHelper::getAction(Ext, InsertedInsts, TLI, PromotedInsts);
+ if (!TPH)
+ return false;
+
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ unsigned CreatedInstsCost = 0;
+ unsigned ExtCost = !TLI.isExtFree(Ext);
+ Value *PromotedOperand =
+ TPH(Ext, TPT, PromotedInsts, CreatedInstsCost, nullptr, nullptr, TLI);
+ // SExt has been moved away.
+ // Thus either it will be rematched later in the recursive calls or it is
+ // gone. Anyway, we must not fold it into the addressing mode at this point.
+ // E.g.,
+ // op = add opnd, 1
+ // idx = ext op
+ // addr = gep base, idx
+ // is now:
+ // promotedOpnd = ext opnd <- no match here
+ // op = promoted_add promotedOpnd, 1 <- match (later in recursive calls)
+ // addr = gep base, op <- match
+ if (MovedAway)
+ *MovedAway = true;
+
+ assert(PromotedOperand &&
+ "TypePromotionHelper should have filtered out those cases");
+
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ if (!matchAddr(PromotedOperand, Depth) ||
+ // The total of the new cost is equal to the cost of the created
+ // instructions.
+ // The total of the old cost is equal to the cost of the extension plus
+ // what we have saved in the addressing mode.
+ !isPromotionProfitable(CreatedInstsCost,
+ ExtCost + (AddrModeInsts.size() - OldSize),
+ PromotedOperand)) {
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ DEBUG(dbgs() << "Sign extension does not pay off: rollback\n");
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+ return true;
+ }
+ }
+ return false;
+}
+
+/// If we can, try to add the value of 'Addr' into the current addressing mode.
+/// If Addr can't be added to AddrMode this returns false and leaves AddrMode
+/// unmodified. This assumes that Addr is either a pointer type or intptr_t
+/// for the target.
+///
+bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
+ // Start a transaction at this point that we will rollback if the matching
+ // fails.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+ // If this is a global variable, try to fold it into the addressing mode.
+ if (!AddrMode.BaseGV) {
+ AddrMode.BaseGV = GV;
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseGV = nullptr;
+ }
+ } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+ ExtAddrMode BackupAddrMode = AddrMode;
+ unsigned OldSize = AddrModeInsts.size();
+
+ // Check to see if it is possible to fold this operation.
+ bool MovedAway = false;
+ if (matchOperationAddr(I, I->getOpcode(), Depth, &MovedAway)) {
+ // This instruction may have been moved away. If so, there is nothing
+ // to check here.
+ if (MovedAway)
+ return true;
+ // Okay, it's possible to fold this. Check to see if it is actually
+ // *profitable* to do so. We use a simple cost model to avoid increasing
+ // register pressure too much.
+ if (I->hasOneUse() ||
+ isProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
+ AddrModeInsts.push_back(I);
+ return true;
+ }
+
+ // It isn't profitable to do this, roll back.
+ //cerr << "NOT FOLDING: " << *I;
+ AddrMode = BackupAddrMode;
+ AddrModeInsts.resize(OldSize);
+ TPT.rollback(LastKnownGood);
+ }
+ } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+ if (matchOperationAddr(CE, CE->getOpcode(), Depth))
+ return true;
+ TPT.rollback(LastKnownGood);
+ } else if (isa<ConstantPointerNull>(Addr)) {
+ // Null pointer gets folded without affecting the addressing mode.
+ return true;
+ }
+
+ // Worst case, the target should support [reg] addressing modes. :)
+ if (!AddrMode.HasBaseReg) {
+ AddrMode.HasBaseReg = true;
+ AddrMode.BaseReg = Addr;
+ // Still check for legality in case the target supports [imm] but not [i+r].
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.HasBaseReg = false;
+ AddrMode.BaseReg = nullptr;
+ }
+
+ // If the base register is already taken, see if we can do [r+r].
+ if (AddrMode.Scale == 0) {
+ AddrMode.Scale = 1;
+ AddrMode.ScaledReg = Addr;
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.Scale = 0;
+ AddrMode.ScaledReg = nullptr;
+ }
+ // Couldn't match.
+ TPT.rollback(LastKnownGood);
+ return false;
+}
+
+/// Check to see if all uses of OpVal by the specified inline asm call are due
+/// to memory operands. If so, return true, otherwise return false.
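+/// For example (illustrative), an operand bound to an indirect memory
+/// constraint (e.g. "=*m") counts as a memory operand, whereas one bound to a
+/// register constraint like "r" does not, so its address computation cannot
+/// be folded.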
+static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
+ const TargetMachine &TM) {
+ const Function *F = CI->getParent()->getParent();
+ const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
+ const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+ ImmutableCallSite(CI));
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI->ComputeConstraintToUse(OpInfo, SDValue());
+
+ // If this asm operand is our Value*, and if it isn't an indirect memory
+ // operand, we can't fold it!
+ if (OpInfo.CallOperandVal == OpVal &&
+ (OpInfo.ConstraintType != TargetLowering::C_Memory ||
+ !OpInfo.isIndirect))
+ return false;
+ }
+
+ return true;
+}
+
+/// Recursively walk all the uses of I until we find a memory use.
+/// If we find an obviously non-foldable instruction, return true.
+/// Add the ultimately found memory instructions to MemoryUses.
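+/// For example (illustrative), a use as the pointer operand of a load or
+/// store is a memory use, while a use as the value operand of a store is not
+/// foldable and stops the walk.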
+static bool FindAllMemoryUses(
+ Instruction *I,
+ SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
+ SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
+ // If we already considered this instruction, we're done.
+ if (!ConsideredInsts.insert(I).second)
+ return false;
+
+ // If this is an obviously unfoldable instruction, bail out.
+ if (!MightBeFoldableInst(I))
+ return true;
+
+ const bool OptSize = I->getFunction()->optForSize();
+
+ // Loop over all the uses, recursively processing them.
+ for (Use &U : I->uses()) {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(UserI)) {
+ MemoryUses.push_back(std::make_pair(LI, U.getOperandNo()));
+ continue;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
+ unsigned opNo = U.getOperandNo();
+ if (opNo == 0) return true; // Storing addr, not into addr.
+ MemoryUses.push_back(std::make_pair(SI, opNo));
+ continue;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst.
+ if (!OptSize && CI->hasFnAttr(Attribute::Cold))
+ continue;
+
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+ if (!IA) return true;
+
+ // If this is a memory operand, we're cool, otherwise bail out.
+ if (!IsOperandAMemoryOperand(CI, IA, I, TM))
+ return true;
+ continue;
+ }
+
+ if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
+ return true;
+ }
+
+ return false;
+}
+
+/// Return true if Val is already known to be live at the use site that we're
+/// folding it into. If so, there is no cost to include it in the addressing
+/// mode. KnownLive1 and KnownLive2 are two values that we know are live at the
+/// instruction already.
+bool AddressingModeMatcher::valueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
+ Value *KnownLive2) {
+ // If Val is either of the known-live values, we know it is live!
+ if (Val == nullptr || Val == KnownLive1 || Val == KnownLive2)
+ return true;
+
+ // All values other than instructions and arguments (e.g. constants) are live.
+ if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
+
+ // If Val is a constant-sized alloca in the entry block, it is live; this is
+ // true because it is just a reference to the stack/frame pointer, which is
+ // live for the whole function.
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
+ if (AI->isStaticAlloca())
+ return true;
+
+ // Check to see if this value is already used in the memory instruction's
+ // block. If so, it's already live into the block at the very least, so we
+ // can reasonably fold it.
+ return Val->isUsedInBasicBlock(MemoryInst->getParent());
+}
+
+/// It is possible for the addressing mode of the machine to fold the specified
+/// instruction into a load or store that ultimately uses it.
+/// However, the specified instruction has multiple uses.
+/// Given this, it may actually increase register pressure to fold it
+/// into the load. For example, consider this code:
+///
+/// X = ...
+/// Y = X+1
+/// use(Y) -> nonload/store
+/// Z = Y+1
+/// load Z
+///
+/// In this case, Y has multiple uses, and can be folded into the load of Z
+/// (yielding load [X+2]). However, doing this will cause both "X" and "X+1" to
+/// be live at the use(Y) line. If we don't fold Y into load Z, we use one
+/// fewer register. Since Y can't be folded into "use(Y)" we don't increase the
+/// number of computations either.
+///
+/// Note that this (like most of CodeGenPrepare) is just a rough heuristic. If
+/// X was live across 'load Z' for other reasons, we actually *would* want to
+/// fold the addressing mode in the Z case. This would make Y die earlier.
+bool AddressingModeMatcher::
+isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
+ ExtAddrMode &AMAfter) {
+ if (IgnoreProfitability) return true;
+
+ // AMBefore is the addressing mode before this instruction was folded into it,
+ // and AMAfter is the addressing mode after the instruction was folded. Get
+ // the set of registers referenced by AMAfter and subtract out those
+ // referenced by AMBefore: this is the set of values which folding in this
+ // address extends the lifetime of.
+ //
+ // Note that there are only two potential values being referenced here,
+ // BaseReg and ScaleReg (global addresses are always available, as are any
+ // folded immediates).
+ Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
+
+ // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
+ // lifetime wasn't extended by adding this instruction.
+ if (valueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ BaseReg = nullptr;
+ if (valueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
+ ScaledReg = nullptr;
+
+ // If folding this instruction (and its subexprs) didn't extend any live
+ // ranges, we're ok with it.
+ if (!BaseReg && !ScaledReg)
+ return true;
+
+ // If all uses of this instruction can have the address mode sunk into them,
+ // we can remove the addressing mode and effectively trade one live register
+ // for another (at worst). In this context, folding an addressing mode into
+ // the use is just a particularly nice way of sinking it.
+ SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
+ SmallPtrSet<Instruction*, 16> ConsideredInsts;
+ if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
+ return false; // Has a non-memory, non-foldable use!
+
+ // Now that we know that all uses of this instruction are part of a chain of
+ // computation involving only operations that could theoretically be folded
+ // into a memory use, loop over each of these memory operation uses and see
+ // if they could *actually* fold the instruction. The assumption is that
+ // addressing modes are cheap and that duplicating the computation involved
+ // many times is worthwhile, even on a fastpath. For sinking candidates
+ // (i.e. cold call sites), this serves as a way to prevent excessive code
+ // growth, since most architectures have some reasonably small and fast way
+ // to compute an effective address (e.g., LEA on x86).
+ SmallVector<Instruction*, 32> MatchedAddrModeInsts;
+ for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
+ Instruction *User = MemoryUses[i].first;
+ unsigned OpNo = MemoryUses[i].second;
+
+ // Get the access type of this use. If the use isn't a pointer, we don't
+ // know what it accesses.
+ Value *Address = User->getOperand(OpNo);
+ PointerType *AddrTy = dyn_cast<PointerType>(Address->getType());
+ if (!AddrTy)
+ return false;
+ Type *AddressAccessTy = AddrTy->getElementType();
+ unsigned AS = AddrTy->getAddressSpace();
+
+ // Do a match against the root of this address, ignoring profitability. This
+ // will tell us if the addressing mode for the memory operation will
+ // *actually* cover the shared instruction.
+ ExtAddrMode Result;
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
+ MemoryInst, Result, InsertedInsts,
+ PromotedInsts, TPT);
+ Matcher.IgnoreProfitability = true;
+ bool Success = Matcher.matchAddr(Address, 0);
+ (void)Success; assert(Success && "Couldn't select *anything*?");
+
+ // The match was only to check profitability; the changes made are not
+ // part of the original matcher. Therefore, they should be dropped,
+ // otherwise the original matcher will not present the right state.
+ TPT.rollback(LastKnownGood);
+
+ // If the match didn't cover I, then it won't be shared by it.
+ if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
+ I) == MatchedAddrModeInsts.end())
+ return false;
+
+ MatchedAddrModeInsts.clear();
+ }
+
+ return true;
+}
+
+} // end anonymous namespace
+
+/// Return true if the specified values are defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() != BB;
+ return false;
+}
+
+/// Sink addressing mode computation immediately before MemoryInst if doing so
+/// can be done without increasing register pressure. The need for the
+/// register pressure constraint means this can end up being an all or nothing
+/// decision for all uses of the same addressing computation.
+///
+/// Load and Store Instructions often have addressing modes that can do
+/// significant amounts of computation. As such, instruction selection will try
+/// to get the load or store to do as much computation as possible for the
+/// program. The problem is that isel can only see within a single block. As
+/// such, we sink as much legal addressing mode work into the block as possible.
+///
+/// This method is used to optimize both load/store and inline asms with memory
+/// operands. It's also used to sink addressing computations feeding into cold
+/// call sites into their (cold) basic block.
+///
+/// The motivation for handling sinking into cold blocks is that doing so can
+/// both enable other address mode sinking (by satisfying the register pressure
+/// constraint above), and reduce register pressure globally (by removing the
+/// addressing mode computation from the fast path entirely).
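+///
+/// Illustrative example (hypothetical IR): if the address is computed in a
+/// predecessor block,
+///   bb1:  %addr = getelementptr i32, i32* %base, i64 %idx
+///   bb2:  %v = load i32, i32* %addr
+/// the computation is re-emitted as a "sunkaddr" just before the load in bb2
+/// so instruction selection can fold it into the load's addressing mode.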
+bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
+ Type *AccessTy, unsigned AddrSpace) {
+ Value *Repl = Addr;
+
+ // Try to collapse single-value PHI nodes. This is necessary to undo
+ // unprofitable PRE transformations.
+ SmallVector<Value*, 8> worklist;
+ SmallPtrSet<Value*, 16> Visited;
+ worklist.push_back(Addr);
+
+ // Use a worklist to iteratively look through PHI nodes, and ensure that
+ // the addressing mode obtained from the non-PHI roots of the graph
+ // are equivalent.
+ Value *Consensus = nullptr;
+ unsigned NumUsesConsensus = 0;
+ bool IsNumUsesConsensusValid = false;
+ SmallVector<Instruction*, 16> AddrModeInsts;
+ ExtAddrMode AddrMode;
+ TypePromotionTransaction TPT;
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ while (!worklist.empty()) {
+ Value *V = worklist.back();
+ worklist.pop_back();
+
+ // Break use-def graph loops.
+ if (!Visited.insert(V).second) {
+ Consensus = nullptr;
+ break;
+ }
+
+ // For a PHI node, push all of its incoming values.
+ if (PHINode *P = dyn_cast<PHINode>(V)) {
+ for (Value *IncValue : P->incoming_values())
+ worklist.push_back(IncValue);
+ continue;
+ }
+
+ // For non-PHIs, determine the addressing mode being computed. Note that
+ // the result may differ depending on what other uses our candidate
+ // addressing instructions might have.
+ SmallVector<Instruction*, 16> NewAddrModeInsts;
+ ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
+ V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
+ InsertedInsts, PromotedInsts, TPT);
+
+ // This check is broken into two cases with very similar code to avoid using
+ // getNumUses() as much as possible. Some values have a lot of uses, so
+ // calling getNumUses() unconditionally caused a significant compile-time
+ // regression.
+ if (!Consensus) {
+ Consensus = V;
+ AddrMode = NewAddrMode;
+ AddrModeInsts = NewAddrModeInsts;
+ continue;
+ } else if (NewAddrMode == AddrMode) {
+ if (!IsNumUsesConsensusValid) {
+ NumUsesConsensus = Consensus->getNumUses();
+ IsNumUsesConsensusValid = true;
+ }
+
+ // Ensure that the obtained addressing mode is equivalent to that obtained
+ // for all other roots of the PHI traversal. Also, when choosing one
+ // such root as representative, select the one with the most uses in order
+ // to keep the cost modeling heuristics in AddressingModeMatcher
+ // applicable.
+ unsigned NumUses = V->getNumUses();
+ if (NumUses > NumUsesConsensus) {
+ Consensus = V;
+ NumUsesConsensus = NumUses;
+ AddrModeInsts = NewAddrModeInsts;
+ }
+ continue;
+ }
+
+ Consensus = nullptr;
+ break;
+ }
+
+ // If the addressing mode couldn't be determined, or if multiple different
+ // ones were determined, bail out now.
+ if (!Consensus) {
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+ TPT.commit();
+
+ // Check to see if any of the instructions subsumed by this addr mode are
+ // non-local to I's BB.
+ bool AnyNonLocal = false;
+ for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+ if (IsNonLocalValue(AddrModeInsts[i], MemoryInst->getParent())) {
+ AnyNonLocal = true;
+ break;
+ }
+ }
+
+ // If all the instructions matched are already in this BB, don't do anything.
+ if (!AnyNonLocal) {
+ DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode << "\n");
+ return false;
+ }
+
+ // Insert this computation right after this user. Since our caller is
+ // scanning from the top of the BB to the bottom, any reuse of the expression
+ // is guaranteed to happen later.
+ IRBuilder<> Builder(MemoryInst);
+
+ // Now that we have determined the addressing expression we want to use and
+ // know that we have to sink it into this block, check to see if we have
+ // already done this for some other load/store instr in this block. If so,
+ // reuse the computation.
+ Value *&SunkAddr = SunkAddrs[Addr];
+ if (SunkAddr) {
+ DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
+ << *MemoryInst << "\n");
+ if (SunkAddr->getType() != Addr->getType())
+ SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ } else if (AddrSinkUsingGEPs ||
+ (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
+ TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
+ ->useAA())) {
+ // By default, we use the GEP-based method when AA is used later. This
+ // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
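+ // Illustrative shape of the sunk address (hypothetical): it is rebuilt as
+ //   %sunkaddr = getelementptr i8, i8* %base, i64 <scaled index + offset>
+ // rather than as a ptrtoint/add/inttoptr sequence, which is friendlier to AA.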
+ DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+ << *MemoryInst << "\n");
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
+ Value *ResultPtr = nullptr, *ResultIndex = nullptr;
+
+ // First, find the pointer.
+ if (AddrMode.BaseReg && AddrMode.BaseReg->getType()->isPointerTy()) {
+ ResultPtr = AddrMode.BaseReg;
+ AddrMode.BaseReg = nullptr;
+ }
+
+ if (AddrMode.Scale && AddrMode.ScaledReg->getType()->isPointerTy()) {
+ // We can't add more than one pointer together, nor can we scale a
+ // pointer (both of which seem meaningless).
+ if (ResultPtr || AddrMode.Scale != 1)
+ return false;
+
+ ResultPtr = AddrMode.ScaledReg;
+ AddrMode.Scale = 0;
+ }
+
+ if (AddrMode.BaseGV) {
+ if (ResultPtr)
+ return false;
+
+ ResultPtr = AddrMode.BaseGV;
+ }
+
+ // If the real base value actually came from an inttoptr, then the matcher
+ // will look through it and provide only the integer value. In that case,
+ // use it here.
+ if (!ResultPtr && AddrMode.BaseReg) {
+ ResultPtr =
+ Builder.CreateIntToPtr(AddrMode.BaseReg, Addr->getType(), "sunkaddr");
+ AddrMode.BaseReg = nullptr;
+ } else if (!ResultPtr && AddrMode.Scale == 1) {
+ ResultPtr =
+ Builder.CreateIntToPtr(AddrMode.ScaledReg, Addr->getType(), "sunkaddr");
+ AddrMode.Scale = 0;
+ }
+
+ if (!ResultPtr &&
+ !AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
+ SunkAddr = Constant::getNullValue(Addr->getType());
+ } else if (!ResultPtr) {
+ return false;
+ } else {
+ Type *I8PtrTy =
+ Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
+ Type *I8Ty = Builder.getInt8Ty();
+
+ // Start with the base register. Do this first so that subsequent address
+ // matching finds it last, which will prevent it from trying to match it
+ // as the scaled value in case it happens to be a mul. That would be
+ // problematic if we've sunk a different mul for the scale, because then
+ // we'd end up sinking both muls.
+ if (AddrMode.BaseReg) {
+ Value *V = AddrMode.BaseReg;
+ if (V->getType() != IntPtrTy)
+ V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
+
+ ResultIndex = V;
+ }
+
+ // Add the scale value.
+ if (AddrMode.Scale) {
+ Value *V = AddrMode.ScaledReg;
+ if (V->getType() == IntPtrTy) {
+ // done.
+ } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth()) {
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
+ } else {
+ // It is only safe to sign extend the BaseReg if we know that the math
+ // required to create it did not overflow before we extend it. Since
+ // the original IR value was tossed in favor of a constant back when
+ // the AddrMode was created we need to bail out gracefully if widths
+ // do not match instead of extending it.
+ Instruction *I = dyn_cast_or_null<Instruction>(ResultIndex);
+ if (I && (ResultIndex != AddrMode.BaseReg))
+ I->eraseFromParent();
+ return false;
+ }
+
+ if (AddrMode.Scale != 1)
+ V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+ "sunkaddr");
+ if (ResultIndex)
+ ResultIndex = Builder.CreateAdd(ResultIndex, V, "sunkaddr");
+ else
+ ResultIndex = V;
+ }
+
+ // Add in the Base Offset if present.
+ if (AddrMode.BaseOffs) {
+ Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ if (ResultIndex) {
+ // We need to add this separately from the scale above to help with
+ // SDAG consecutive load/store merging.
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
+ }
+
+ ResultIndex = V;
+ }
+
+ if (!ResultIndex) {
+ SunkAddr = ResultPtr;
+ } else {
+ if (ResultPtr->getType() != I8PtrTy)
+ ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+ SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
+ }
+
+ if (SunkAddr->getType() != Addr->getType())
+ SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+ }
+ } else {
+ DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
+ << *MemoryInst << "\n");
+ Type *IntPtrTy = DL->getIntPtrType(Addr->getType());
+ Value *Result = nullptr;
+
+ // Start with the base register. Do this first so that subsequent address
+ // matching finds it last, which will prevent it from trying to match it
+ // as the scaled value in case it happens to be a mul. That would be
+ // problematic if we've sunk a different mul for the scale, because then
+ // we'd end up sinking both muls.
+ if (AddrMode.BaseReg) {
+ Value *V = AddrMode.BaseReg;
+ if (V->getType()->isPointerTy())
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+ if (V->getType() != IntPtrTy)
+ V = Builder.CreateIntCast(V, IntPtrTy, /*isSigned=*/true, "sunkaddr");
+ Result = V;
+ }
+
+ // Add the scale value.
+ if (AddrMode.Scale) {
+ Value *V = AddrMode.ScaledReg;
+ if (V->getType() == IntPtrTy) {
+ // done.
+ } else if (V->getType()->isPointerTy()) {
+ V = Builder.CreatePtrToInt(V, IntPtrTy, "sunkaddr");
+ } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+ cast<IntegerType>(V->getType())->getBitWidth()) {
+ V = Builder.CreateTrunc(V, IntPtrTy, "sunkaddr");
+ } else {
+ // It is only safe to sign extend the BaseReg if we know that the math
+ // required to create it did not overflow before we extend it. Since
+ // the original IR value was tossed in favor of a constant back when
+ // the AddrMode was created we need to bail out gracefully if widths
+ // do not match instead of extending it.
+ Instruction *I = dyn_cast_or_null<Instruction>(Result);
+ if (I && (Result != AddrMode.BaseReg))
+ I->eraseFromParent();
+ return false;
+ }
+ if (AddrMode.Scale != 1)
+ V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
+ "sunkaddr");
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ // Add in the BaseGV if present.
+ if (AddrMode.BaseGV) {
+ Value *V = Builder.CreatePtrToInt(AddrMode.BaseGV, IntPtrTy, "sunkaddr");
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ // Add in the Base Offset if present.
+ if (AddrMode.BaseOffs) {
+ Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+ if (Result)
+ Result = Builder.CreateAdd(Result, V, "sunkaddr");
+ else
+ Result = V;
+ }
+
+ if (!Result)
+ SunkAddr = Constant::getNullValue(Addr->getType());
+ else
+ SunkAddr = Builder.CreateIntToPtr(Result, Addr->getType(), "sunkaddr");
+ }
+
+ MemoryInst->replaceUsesOfWith(Repl, SunkAddr);
+
+ // If we have no uses, recursively delete the value and all dead instructions
+ // using it.
+ if (Repl->use_empty()) {
+ // This can cause recursive deletion, which can invalidate our iterator.
+ // Use a WeakVH to hold onto it in case this happens.
+ Value *CurValue = &*CurInstIterator;
+ WeakVH IterHandle(CurValue);
+ BasicBlock *BB = CurInstIterator->getParent();
+
+ RecursivelyDeleteTriviallyDeadInstructions(Repl, TLInfo);
+
+ if (IterHandle != CurValue) {
+ // If the iterator instruction was recursively deleted, start over at the
+ // start of the block.
+ CurInstIterator = BB->begin();
+ SunkAddrs.clear();
+ }
+ }
+ ++NumMemoryInsts;
+ return true;
+}
+
+/// If there are any memory operands, use optimizeMemoryInst to sink their
+/// address computing into the block when possible / profitable.
+bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
+ bool MadeChange = false;
+
+ const TargetRegisterInfo *TRI =
+ TM->getSubtargetImpl(*CS->getParent()->getParent())->getRegisterInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(*DL, TRI, CS);
+ unsigned ArgNo = 0;
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI->ComputeConstraintToUse(OpInfo, SDValue());
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ OpInfo.isIndirect) {
+ Value *OpVal = CS->getArgOperand(ArgNo++);
+ MadeChange |= optimizeMemoryInst(CS, OpVal, OpVal->getType(), ~0u);
+ } else if (OpInfo.Type == InlineAsm::isInput)
+ ArgNo++;
+ }
+
+ return MadeChange;
+}
+
+/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// sign extensions.
+static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
+ assert(!Inst->use_empty() && "Input must have at least one use");
+ const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+ bool IsSExt = isa<SExtInst>(FirstUser);
+ Type *ExtTy = FirstUser->getType();
+ for (const User *U : Inst->users()) {
+ const Instruction *UI = cast<Instruction>(U);
+ if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
+ return false;
+ Type *CurTy = UI->getType();
+ // Same input and output types: Same instruction after CSE.
+ if (CurTy == ExtTy)
+ continue;
+
+ // If IsSExt is true, we are in this situation:
+ // a = Inst
+ // b = sext ty1 a to ty2
+ // c = sext ty1 a to ty3
+ // Assuming ty2 is shorter than ty3, this could be turned into:
+ // a = Inst
+ // b = sext ty1 a to ty2
+ // c = sext ty2 b to ty3
+ // However, the last sext is not free.
+ if (IsSExt)
+ return false;
+
+ // This is a ZExt; maybe extending from one type to the other is free.
+ // In that case, we would not account for a different use.
+ Type *NarrowTy;
+ Type *LargeTy;
+ if (ExtTy->getScalarType()->getIntegerBitWidth() >
+ CurTy->getScalarType()->getIntegerBitWidth()) {
+ NarrowTy = CurTy;
+ LargeTy = ExtTy;
+ } else {
+ NarrowTy = ExtTy;
+ LargeTy = CurTy;
+ }
+
+ if (!TLI.isZExtFree(NarrowTy, LargeTy))
+ return false;
+ }
+ // All uses are the same or can be derived from one another for free.
+ return true;
+}
+
+/// \brief Try to form ExtLd by promoting \p Exts until they reach a
+/// load instruction.
+/// If an ext(load) can be formed, it is returned via \p LI for the load
+/// and \p Inst for the extension.
+/// Otherwise LI == nullptr and Inst == nullptr.
+/// When some promotion happened, \p TPT contains the proper state to
+/// revert them.
+///
+/// \return true when promoting was necessary to expose the ext(load)
+/// opportunity, false otherwise.
+///
+/// Example:
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Thanks to the promotion, we can match zext(load i32*) to i64.
+bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
+ LoadInst *&LI, Instruction *&Inst,
+ const SmallVectorImpl<Instruction *> &Exts,
+ unsigned CreatedInstsCost = 0) {
+ // Iterate over all the extensions to see if any of them forms an ext(load).
+ for (auto I : Exts) {
+ // Check if we directly have ext(load).
+ if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
+ Inst = I;
+ // No promotion happened here.
+ return false;
+ }
+ // Check whether or not we want to do any promotion.
+ if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+ continue;
+ // Get the action to perform the promotion.
+ TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
+ I, InsertedInsts, *TLI, PromotedInsts);
+ // Check if we can promote.
+ if (!TPH)
+ continue;
+ // Save the current state.
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ SmallVector<Instruction *, 4> NewExts;
+ unsigned NewCreatedInstsCost = 0;
+ unsigned ExtCost = !TLI->isExtFree(I);
+ // Promote.
+ Value *PromotedVal = TPH(I, TPT, PromotedInsts, NewCreatedInstsCost,
+ &NewExts, nullptr, *TLI);
+ assert(PromotedVal &&
+ "TypePromotionHelper should have filtered out those cases");
+
+ // We would be able to merge only one extension into a load.
+ // Therefore, if we have more than 1 new extension we heuristically
+ // cut this search path, because it means we degrade the code quality.
+ // With exactly 2, the transformation is neutral, because we will merge
+ // one extension but leave one. However, we optimistically keep going,
+ // because the new extension may be removed too.
+ long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
+ TotalCreatedInstsCost -= ExtCost;
+ if (!StressExtLdPromotion &&
+ (TotalCreatedInstsCost > 1 ||
+ !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
+ // The promotion is not profitable, rollback to the previous state.
+ TPT.rollback(LastKnownGood);
+ continue;
+ }
+ // The promotion is profitable.
+ // Check if it exposes an ext(load).
+ (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
+ if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
+ // If we have created a new extension, i.e., we now have two
+ // extensions, we must make sure one of them is merged with
+ // the load; otherwise we may degrade the code quality.
+ (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
+ // Promotion happened.
+ return true;
+ // If this does not help to expose an ext(load) then, rollback.
+ TPT.rollback(LastKnownGood);
+ }
+ // None of the extensions can form an ext(load).
+ LI = nullptr;
+ Inst = nullptr;
+ return false;
+}
+
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
+/// \p I[in/out] the extension may be modified during the process if some
+/// promotions apply.
+///
+bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
+ // Try to promote a chain of computation if it allows us to form
+ // an extended load.
+ TypePromotionTransaction TPT;
+ TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+ TPT.getRestorationPoint();
+ SmallVector<Instruction *, 1> Exts;
+ Exts.push_back(I);
+ // Look for a load being extended.
+ LoadInst *LI = nullptr;
+ Instruction *OldExt = I;
+ bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
+ if (!LI || !I) {
+ assert(!HasPromoted && !LI && "If we did not match any load instruction "
+ "the code must remain the same");
+ I = OldExt;
+ return false;
+ }
+
+ // If they're already in the same block, there's nothing to do.
+ // Make the cheap checks first if we did not promote.
+ // If we promoted, we need to check if it is indeed profitable.
+ if (!HasPromoted && LI->getParent() == I->getParent())
+ return false;
+
+ EVT VT = TLI->getValueType(*DL, I->getType());
+ EVT LoadVT = TLI->getValueType(*DL, LI->getType());
+
+ // If the load has other users and the truncate is not free, this probably
+ // isn't worthwhile.
+ if (!LI->hasOneUse() && TLI &&
+ (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+ !TLI->isTruncateFree(I->getType(), LI->getType())) {
+ I = OldExt;
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+
+ // Check whether the target supports casts folded into loads.
+ unsigned LType;
+ if (isa<ZExtInst>(I))
+ LType = ISD::ZEXTLOAD;
+ else {
+ assert(isa<SExtInst>(I) && "Unexpected ext type!");
+ LType = ISD::SEXTLOAD;
+ }
+ if (TLI && !TLI->isLoadExtLegal(LType, VT, LoadVT)) {
+ I = OldExt;
+ TPT.rollback(LastKnownGood);
+ return false;
+ }
+
+ // Move the extend into the same block as the load, so that SelectionDAG
+ // can fold it.
+ TPT.commit();
+ I->removeFromParent();
+ I->insertAfter(LI);
+ ++NumExtsMoved;
+ return true;
+}
+
+bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
+ BasicBlock *DefBB = I->getParent();
+
+ // If the result of a {s|z}ext and its source are both live out, rewrite all
+ // other uses of the source with the result of the extension.
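+ // Illustrative example (hypothetical IR): for
+ //   bb0: %s = ...
+ //        %e = zext i32 %s to i64
+ //   bb1: use of %s
+ // we insert "%t = trunc i64 %e to i32" in bb1 and rewrite that use of %s,
+ // so only %e has to stay live across the block boundary.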
+ Value *Src = I->getOperand(0);
+ if (Src->hasOneUse())
+ return false;
+
+ // Only do this xform if truncating is free.
+ if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+ return false;
+
+ // Only safe to perform the optimization if the source is also defined in
+ // this block.
+ if (!isa<Instruction>(Src) || DefBB != cast<Instruction>(Src)->getParent())
+ return false;
+
+ bool DefIsLiveOut = false;
+ for (User *U : I->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = UI->getParent();
+ if (UserBB == DefBB) continue;
+ DefIsLiveOut = true;
+ break;
+ }
+ if (!DefIsLiveOut)
+ return false;
+
+ // Make sure none of the uses are PHI nodes.
+ for (User *U : Src->users()) {
+ Instruction *UI = cast<Instruction>(U);
+ BasicBlock *UserBB = UI->getParent();
+ if (UserBB == DefBB) continue;
+ // Be conservative. We don't want this xform to end up introducing
+ // reloads just before load / store instructions.
+ if (isa<PHINode>(UI) || isa<LoadInst>(UI) || isa<StoreInst>(UI))
+ return false;
+ }
+
+ // InsertedTruncs - Only insert one trunc in each block.
+ DenseMap<BasicBlock*, Instruction*> InsertedTruncs;
+
+ bool MadeChange = false;
+ for (Use &U : Src->uses()) {
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ // Figure out which BB this ext is used in.
+ BasicBlock *UserBB = User->getParent();
+ if (UserBB == DefBB) continue;
+
+ // Both src and def are live in this block. Rewrite the use.
+ Instruction *&InsertedTrunc = InsertedTruncs[UserBB];
+
+ if (!InsertedTrunc) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedTrunc = new TruncInst(I, Src->getType(), "", &*InsertPt);
+ InsertedInsts.insert(InsertedTrunc);
+ }
+
+ // Replace a use of the {s|z}ext source with a use of the result.
+ U = InsertedTrunc;
+ ++NumExtUses;
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+// Find loads whose uses only use some of the loaded value's bits. Add an "and"
+// just after the load if the target can fold this into one extload instruction,
+// with the hope of eliminating some of the other later "and" instructions using
+// the loaded value. "and"s that are made trivially redundant by the insertion
+// of the new "and" are removed by this function, while others (e.g. those whose
+// path from the load goes through a phi) are left for isel to potentially
+// remove.
+//
+// For example:
+//
+// b0:
+// x = load i32
+// ...
+// b1:
+// y = and x, 0xff
+// z = use y
+//
+// becomes:
+//
+// b0:
+// x = load i32
+// x' = and x, 0xff
+// ...
+// b1:
+// z = use x'
+//
+// whereas:
+//
+// b0:
+// x1 = load i32
+// ...
+// b1:
+// x2 = load i32
+// ...
+// b2:
+// x = phi x1, x2
+// y = and x, 0xff
+//
+// becomes (after a call to optimizeLoadExt for each load):
+//
+// b0:
+// x1 = load i32
+// x1' = and x1, 0xff
+// ...
+// b1:
+// x2 = load i32
+// x2' = and x2, 0xff
+// ...
+// b2:
+// x = phi x1', x2'
+// y = and x, 0xff
+//
+
+bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
+
+ if (!Load->isSimple() ||
+ !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
+ return false;
+
+ // Skip loads we've already transformed or have no reason to transform.
+ if (Load->hasOneUse()) {
+ User *LoadUser = *Load->user_begin();
+ if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
+ !isa<PHINode>(LoadUser))
+ return false;
+ }
+
+ // Look at all uses of Load, looking through phis, to determine how many bits
+ // of the loaded value are needed.
+ SmallVector<Instruction *, 8> WorkList;
+ SmallPtrSet<Instruction *, 16> Visited;
+ SmallVector<Instruction *, 8> AndsToMaybeRemove;
+ for (auto *U : Load->users())
+ WorkList.push_back(cast<Instruction>(U));
+
+ EVT LoadResultVT = TLI->getValueType(*DL, Load->getType());
+ unsigned BitWidth = LoadResultVT.getSizeInBits();
+ APInt DemandBits(BitWidth, 0);
+ APInt WidestAndBits(BitWidth, 0);
+
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.back();
+ WorkList.pop_back();
+
+ // Break use-def graph loops.
+ if (!Visited.insert(I).second)
+ continue;
+
+ // For a PHI node, push all of its users.
+ if (auto *Phi = dyn_cast<PHINode>(I)) {
+ for (auto *U : Phi->users())
+ WorkList.push_back(cast<Instruction>(U));
+ continue;
+ }
+
+ switch (I->getOpcode()) {
+ case llvm::Instruction::And: {
+ auto *AndC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!AndC)
+ return false;
+ APInt AndBits = AndC->getValue();
+ DemandBits |= AndBits;
+ // Keep track of the widest and mask we see.
+ if (AndBits.ugt(WidestAndBits))
+ WidestAndBits = AndBits;
+ if (AndBits == WidestAndBits && I->getOperand(0) == Load)
+ AndsToMaybeRemove.push_back(I);
+ break;
+ }
+
+ case llvm::Instruction::Shl: {
+ auto *ShlC = dyn_cast<ConstantInt>(I->getOperand(1));
+ if (!ShlC)
+ return false;
+ uint64_t ShiftAmt = ShlC->getLimitedValue(BitWidth - 1);
+ auto ShlDemandBits = APInt::getAllOnesValue(BitWidth).lshr(ShiftAmt);
+ DemandBits |= ShlDemandBits;
+ break;
+ }
+
+ case llvm::Instruction::Trunc: {
+ EVT TruncVT = TLI->getValueType(*DL, I->getType());
+ unsigned TruncBitWidth = TruncVT.getSizeInBits();
+ auto TruncBits = APInt::getAllOnesValue(TruncBitWidth).zext(BitWidth);
+ DemandBits |= TruncBits;
+ break;
+ }
+
+ default:
+ return false;
+ }
+ }
+
+ uint32_t ActiveBits = DemandBits.getActiveBits();
+ // Avoid hoisting (and (load x) 1) since it is unlikely to be folded by the
+ // target even if isLoadExtLegal says an i1 EXTLOAD is valid. For example,
+ // for the AArch64 target isLoadExtLegal(ZEXTLOAD, i32, i1) returns true, but
+ // (and (load x) 1) is not matched as a single instruction, rather as a LDR
+ // followed by an AND.
+ // TODO: Look into removing this restriction by fixing backends to either
+ // return false for isLoadExtLegal for i1 or have them select this pattern to
+ // a single instruction.
+ //
+ // Also avoid hoisting if we didn't see any ands with the exact DemandBits
+ // mask, since these are the only ands that will be removed by isel.
+ if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+ WidestAndBits != DemandBits)
+ return false;
+
+ LLVMContext &Ctx = Load->getType()->getContext();
+ Type *TruncTy = Type::getIntNTy(Ctx, ActiveBits);
+ EVT TruncVT = TLI->getValueType(*DL, TruncTy);
+
+ // Reject cases that won't be matched as extloads.
+ if (!LoadResultVT.bitsGT(TruncVT) || !TruncVT.isRound() ||
+ !TLI->isLoadExtLegal(ISD::ZEXTLOAD, LoadResultVT, TruncVT))
+ return false;
+
+ IRBuilder<> Builder(Load->getNextNode());
+ auto *NewAnd = dyn_cast<Instruction>(
+ Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+
+ // Replace all uses of load with new and (except for the use of load in the
+ // new and itself).
+ Load->replaceAllUsesWith(NewAnd);
+ NewAnd->setOperand(0, Load);
+
+ // Remove any and instructions that are now redundant.
+ for (auto *And : AndsToMaybeRemove)
+ // Check that the and mask is the same as the one we decided to put on the
+ // new and.
+ if (cast<ConstantInt>(And->getOperand(1))->getValue() == DemandBits) {
+ And->replaceAllUsesWith(NewAnd);
+ if (&*CurInstIterator == And)
+ CurInstIterator = std::next(And->getIterator());
+ And->eraseFromParent();
+ ++NumAndUses;
+ }
+
+ ++NumAndsAdded;
+ return true;
+}
+
+/// Check if V (an operand of a select instruction) is an expensive instruction
+/// that is only used once.
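+/// E.g. (illustrative), a single-use floating-point division feeding only one
+/// arm of the select is a typical candidate, provided the target's cost model
+/// reports it as expensive: it can then be sunk into the matching branch and
+/// skipped on the other path.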
+static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
+ auto *I = dyn_cast<Instruction>(V);
+ // If it's safe to speculatively execute, then it should not have side
+ // effects; therefore, it's safe to sink and possibly *not* execute.
+ return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
+ TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
+}
+
+/// Returns true if a SelectInst should be turned into an explicit branch.
+static bool isFormingBranchFromSelectProfitable(const TargetTransformInfo *TTI,
+ const TargetLowering *TLI,
+ SelectInst *SI) {
+ // If even a predictable select is cheap, then a branch can't be cheaper.
+ if (!TLI->isPredictableSelectExpensive())
+ return false;
+
+ // FIXME: This should use the same heuristics as IfConversion to determine
+ // whether a select is better represented as a branch.
+
+ // If metadata tells us that the select condition is obviously predictable,
+ // then we want to replace the select with a branch.
+ uint64_t TrueWeight, FalseWeight;
+ if (SI->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t Max = std::max(TrueWeight, FalseWeight);
+ uint64_t Sum = TrueWeight + FalseWeight;
+ if (Sum != 0) {
+ auto Probability = BranchProbability::getBranchProbability(Max, Sum);
+ if (Probability > TLI->getPredictableBranchThreshold())
+ return true;
+ }
+ }
+
+ CmpInst *Cmp = dyn_cast<CmpInst>(SI->getCondition());
+
+ // If a branch is predictable, an out-of-order CPU can avoid blocking on its
+ // comparison condition. If the compare has more than one use, there's
+ // probably another cmov or setcc around, so it's not worth emitting a branch.
+ if (!Cmp || !Cmp->hasOneUse())
+ return false;
+
+ // If either operand of the select is expensive and only needed on one side
+ // of the select, we should form a branch.
+ if (sinkSelectOperand(TTI, SI->getTrueValue()) ||
+ sinkSelectOperand(TTI, SI->getFalseValue()))
+ return true;
+
+ return false;
+}
+
+/// If we have a SelectInst that will likely profit from branch prediction,
+/// turn it into a branch.
+bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
+ bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1);
+
+ // Can we convert the 'select' to a branch (control flow)?
+ if (DisableSelectToBranch || OptSize || !TLI || VectorCond ||
+ SI->getMetadata(LLVMContext::MD_unpredictable))
+ return false;
+
+ TargetLowering::SelectSupportKind SelectKind;
+ if (VectorCond)
+ SelectKind = TargetLowering::VectorMaskSelect;
+ else if (SI->getType()->isVectorTy())
+ SelectKind = TargetLowering::ScalarCondVectorVal;
+ else
+ SelectKind = TargetLowering::ScalarValSelect;
+
+ if (TLI->isSelectSupported(SelectKind) &&
+ !isFormingBranchFromSelectProfitable(TTI, TLI, SI))
+ return false;
+
+ ModifiedDT = true;
+
+ // Transform a sequence like this:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // %sel = select i1 %cmp, i32 %c, i32 %d
+ //
+ // Into:
+ // start:
+ // %cmp = cmp uge i32 %a, %b
+ // br i1 %cmp, label %select.true, label %select.false
+ // select.true:
+ // br label %select.end
+ // select.false:
+ // br label %select.end
+ // select.end:
+ // %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
+ //
+ // In addition, we may sink instructions that produce %c or %d from
+ // the entry block into the destination(s) of the new branch.
+ // If the true or false blocks do not contain a sunken instruction, that
+ // block and its branch may be optimized away. In that case, one side of the
+ // first branch will point directly to select.end, and the corresponding PHI
+ // predecessor block will be the start block.
+
+ // First, we split the block containing the select into 2 blocks.
+ BasicBlock *StartBlock = SI->getParent();
+ BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(SI));
+ BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+
+ // Delete the unconditional branch that was just created by the split.
+ StartBlock->getTerminator()->eraseFromParent();
+
+ // These are the new basic blocks for the conditional branch.
+ // At least one will become an actual new basic block.
+ BasicBlock *TrueBlock = nullptr;
+ BasicBlock *FalseBlock = nullptr;
+
+ // Sink expensive instructions into the conditional blocks to avoid executing
+ // them speculatively.
+ if (sinkSelectOperand(TTI, SI->getTrueValue())) {
+ TrueBlock = BasicBlock::Create(SI->getContext(), "select.true.sink",
+ EndBlock->getParent(), EndBlock);
+ auto *TrueBranch = BranchInst::Create(EndBlock, TrueBlock);
+ auto *TrueInst = cast<Instruction>(SI->getTrueValue());
+ TrueInst->moveBefore(TrueBranch);
+ }
+ if (sinkSelectOperand(TTI, SI->getFalseValue())) {
+ FalseBlock = BasicBlock::Create(SI->getContext(), "select.false.sink",
+ EndBlock->getParent(), EndBlock);
+ auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock);
+ auto *FalseInst = cast<Instruction>(SI->getFalseValue());
+ FalseInst->moveBefore(FalseBranch);
+ }
+
+ // If there was nothing to sink, then arbitrarily choose the 'false' side
+ // for a new input value to the PHI.
+ if (TrueBlock == FalseBlock) {
+ assert(TrueBlock == nullptr &&
+ "Unexpected basic block transform while optimizing select");
+
+ FalseBlock = BasicBlock::Create(SI->getContext(), "select.false",
+ EndBlock->getParent(), EndBlock);
+ BranchInst::Create(EndBlock, FalseBlock);
+ }
+
+ // Insert the real conditional branch based on the original condition.
+ // If we did not create a new block for one of the 'true' or 'false' paths
+ // of the condition, it means that side of the branch goes to the end block
+ // directly and the path originates from the start block from the point of
+ // view of the new PHI.
+ if (TrueBlock == nullptr) {
+ BranchInst::Create(EndBlock, FalseBlock, SI->getCondition(), SI);
+ TrueBlock = StartBlock;
+ } else if (FalseBlock == nullptr) {
+ BranchInst::Create(TrueBlock, EndBlock, SI->getCondition(), SI);
+ FalseBlock = StartBlock;
+ } else {
+ BranchInst::Create(TrueBlock, FalseBlock, SI->getCondition(), SI);
+ }
+
+ // The select itself is replaced with a PHI Node.
+ PHINode *PN = PHINode::Create(SI->getType(), 2, "", &EndBlock->front());
+ PN->takeName(SI);
+ PN->addIncoming(SI->getTrueValue(), TrueBlock);
+ PN->addIncoming(SI->getFalseValue(), FalseBlock);
+
+ SI->replaceAllUsesWith(PN);
+ SI->eraseFromParent();
+
+ // Instruct OptimizeBlock to skip to the next block.
+ CurInstIterator = StartBlock->end();
+ ++NumSelectsExpanded;
+ return true;
+}
+
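+/// Return true if the shuffle mask reads a single source lane, e.g. a mask of
+/// <0, 0, 0, 0> or <1, undef, 1, 1> (illustrative), i.e. a broadcast/splat.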
+static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
+ SmallVector<int, 16> Mask(SVI->getShuffleMask());
+ int SplatElem = -1;
+ for (unsigned i = 0; i < Mask.size(); ++i) {
+ if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
+ return false;
+ SplatElem = Mask[i];
+ }
+
+ return true;
+}
+
+/// Some targets have expensive vector shifts if the lanes aren't all the same
+/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
+/// it's often worth sinking a shufflevector splat down to its use so that
+/// codegen can spot all lanes are identical.
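+///
+/// Illustrative example (hypothetical IR): a splat such as
+///   %amt = shufflevector <4 x i32> %v, <4 x i32> undef,
+///                        <4 x i32> zeroinitializer
+/// that is used by a shift in another block is re-emitted in that block, so
+/// the backend can see that all the shift amounts are identical.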
+bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
+ BasicBlock *DefBB = SVI->getParent();
+
+ // Only do this xform if variable vector shifts are particularly expensive.
+ if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+ return false;
+
+ // We only expect better codegen by sinking a shuffle if we can recognise a
+ // constant splat.
+ if (!isBroadcastShuffle(SVI))
+ return false;
+
+ // InsertedShuffles - Only insert a shuffle in each block once.
+ DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
+
+ bool MadeChange = false;
+ for (User *U : SVI->users()) {
+ Instruction *UI = cast<Instruction>(U);
+
+ // Figure out which BB this shuffle is used in.
+ BasicBlock *UserBB = UI->getParent();
+ if (UserBB == DefBB) continue;
+
+ // For now only apply this when the splat is used by a shift instruction.
+ if (!UI->isShift()) continue;
+
+ // Everything checks out, sink the shuffle if the user's block doesn't
+ // already have a copy.
+ Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+
+ if (!InsertedShuffle) {
+ BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
+ assert(InsertPt != UserBB->end());
+ InsertedShuffle =
+ new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
+ SVI->getOperand(2), "", &*InsertPt);
+ }
+
+ UI->replaceUsesOfWith(SVI, InsertedShuffle);
+ MadeChange = true;
+ }
+
+ // If we removed all uses, nuke the shuffle.
+ if (SVI->use_empty()) {
+ SVI->eraseFromParent();
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
+ if (!TLI || !DL)
+ return false;
+
+ Value *Cond = SI->getCondition();
+ Type *OldType = Cond->getType();
+ LLVMContext &Context = Cond->getContext();
+ MVT RegType = TLI->getRegisterType(Context, TLI->getValueType(*DL, OldType));
+ unsigned RegWidth = RegType.getSizeInBits();
+
+ if (RegWidth <= cast<IntegerType>(OldType)->getBitWidth())
+ return false;
+
+ // If the register width is greater than the type width, expand the condition
+ // of the switch instruction and each case constant to the width of the
+ // register. By widening the type of the switch condition, subsequent
+ // comparisons (for case comparisons) will not need to be extended to the
+ // preferred register width, so we will potentially eliminate N-1 extends,
+ // where N is the number of cases in the switch.
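+ // Illustrative example (hypothetical IR, assuming a 64-bit register width):
+ //   switch i8 %c, label %def [ i8 1, label %a
+ //                              i8 2, label %b ]
+ // becomes
+ //   %wide = zext i8 %c to i64
+ //   switch i64 %wide, label %def [ i64 1, label %a
+ //                                  i64 2, label %b ]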
+ auto *NewType = Type::getIntNTy(Context, RegWidth);
+
+ // Zero-extend the switch condition and case constants unless the switch
+ // condition is a function argument that is already being sign-extended.
+ // In that case, we can avoid an unnecessary mask/extension by sign-extending
+ // everything instead.
+ Instruction::CastOps ExtType = Instruction::ZExt;
+ if (auto *Arg = dyn_cast<Argument>(Cond))
+ if (Arg->hasSExtAttr())
+ ExtType = Instruction::SExt;
+
+ auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
+ ExtInst->insertBefore(SI);
+ SI->setCondition(ExtInst);
+ for (SwitchInst::CaseIt Case : SI->cases()) {
+ APInt NarrowConst = Case.getCaseValue()->getValue();
+ APInt WideConst = (ExtType == Instruction::ZExt) ?
+ NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
+ Case.setValue(ConstantInt::get(Context, WideConst));
+ }
+
+ return true;
+}
+
+namespace {
+/// \brief Helper class to promote a scalar operation to a vector one.
+/// This class is used to move an extractelement transition downward.
+/// E.g.,
+/// a = vector_op <2 x i32>
+/// b = extractelement <2 x i32> a, i32 0
+/// c = scalar_op b
+/// store c
+///
+/// =>
+/// a = vector_op <2 x i32>
+/// c = vector_op a (equivalent to scalar_op on the related lane)
+/// * d = extractelement <2 x i32> c, i32 0
+/// * store d
+/// Assuming both extractelement and store can be combined, we get rid of the
+/// transition.
+class VectorPromoteHelper {
+ /// DataLayout associated with the current module.
+ const DataLayout &DL;
+
+ /// Used to perform some checks on the legality of vector operations.
+ const TargetLowering &TLI;
+
+ /// Used to estimate the cost of the promoted chain.
+ const TargetTransformInfo &TTI;
+
+ /// The transition being moved downwards.
+ Instruction *Transition;
+ /// The sequence of instructions to be promoted.
+ SmallVector<Instruction *, 4> InstsToBePromoted;
+ /// Cost of combining a store and an extract.
+ unsigned StoreExtractCombineCost;
+ /// Instruction that will be combined with the transition.
+ Instruction *CombineInst;
+
+ /// \brief The instruction that represents the current end of the transition.
+ /// Since we are faking the promotion until we reach the end of the chain
+ /// of computation, we need a way to get the current end of the transition.
+ Instruction *getEndOfTransition() const {
+ if (InstsToBePromoted.empty())
+ return Transition;
+ return InstsToBePromoted.back();
+ }
+
+ /// \brief Return the index of the original value in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the original value,
+ /// c, is at index 0.
+ unsigned getTransitionOriginalValueIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 0;
+ }
+
+ /// \brief Return the index of the index in the transition.
+ /// E.g., for "extractelement <2 x i32> c, i32 0" the index
+ /// is at index 1.
+ unsigned getTransitionIdx() const {
+ assert(isa<ExtractElementInst>(Transition) &&
+ "Other kind of transitions are not supported yet");
+ return 1;
+ }
+
+ /// \brief Get the type of the transition.
+ /// This is the type of the original value.
+ /// E.g., for "extractelement <2 x i32> c, i32 1" the type of the
+ /// transition is <2 x i32>.
+ Type *getTransitionType() const {
+ return Transition->getOperand(getTransitionOriginalValueIdx())->getType();
+ }
+
+ /// \brief Promote \p ToBePromoted by moving \p Def downward past it.
+ /// I.e., we have the following sequence:
+ /// Def = Transition <ty1> a to <ty2>
+ /// b = ToBePromoted <ty2> Def, ...
+ /// =>
+ /// b = ToBePromoted <ty1> a, ...
+ /// Def = Transition <ty1> ToBePromoted to <ty2>
+ void promoteImpl(Instruction *ToBePromoted);
+
+ /// \brief Check whether or not it is profitable to promote all the
+ /// instructions enqueued to be promoted.
+ bool isProfitableToPromote() {
+ Value *ValIdx = Transition->getOperand(getTransitionOriginalValueIdx());
+ unsigned Index = isa<ConstantInt>(ValIdx)
+ ? cast<ConstantInt>(ValIdx)->getZExtValue()
+ : -1;
+ Type *PromotedType = getTransitionType();
+
+ StoreInst *ST = cast<StoreInst>(CombineInst);
+ unsigned AS = ST->getPointerAddressSpace();
+ unsigned Align = ST->getAlignment();
+ // Check if this store is supported.
+ if (!TLI.allowsMisalignedMemoryAccesses(
+ TLI.getValueType(DL, ST->getValueOperand()->getType()), AS,
+ Align)) {
+ // If this is not supported, there is no way we can combine
+ // the extract with the store.
+ return false;
+ }
+
+ // The scalar chain of computation has to pay for the transition
+ // scalar to vector.
+ // The vector chain has to account for the combining cost.
+ uint64_t ScalarCost =
+ TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
+ uint64_t VectorCost = StoreExtractCombineCost;
+ for (const auto &Inst : InstsToBePromoted) {
+ // Compute the cost.
+ // By construction, all instructions being promoted are arithmetic ones.
+ // Moreover, one argument is a constant that can be viewed as a splat
+ // constant.
+ Value *Arg0 = Inst->getOperand(0);
+ bool IsArg0Constant = isa<UndefValue>(Arg0) || isa<ConstantInt>(Arg0) ||
+ isa<ConstantFP>(Arg0);
+ TargetTransformInfo::OperandValueKind Arg0OVK =
+ IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+ : TargetTransformInfo::OK_AnyValue;
+ TargetTransformInfo::OperandValueKind Arg1OVK =
+ !IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
+ : TargetTransformInfo::OK_AnyValue;
+ ScalarCost += TTI.getArithmeticInstrCost(
+ Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+ VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+ Arg0OVK, Arg1OVK);
+ }
+ DEBUG(dbgs() << "Estimated cost of computation to be promoted:\nScalar: "
+ << ScalarCost << "\nVector: " << VectorCost << '\n');
+ return ScalarCost > VectorCost;
+ }
+
+ /// \brief Generate a constant vector with \p Val with the same
+ /// number of elements as the transition.
+ /// \p UseSplat defines whether or not \p Val should be replicated
+ /// across the whole vector.
+ /// In other words, if UseSplat == true, we generate <Val, Val, ..., Val>,
+ /// otherwise we generate a vector with as many undef as possible:
+ /// <undef, ..., undef, Val, undef, ..., undef> where \p Val is only
+ /// used at the index of the extract.
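+ /// E.g. (illustrative), for a <4 x i32> transition extracting index 1 and
+ /// Val == 7, UseSplat yields <7, 7, 7, 7>; otherwise we generate
+ /// <undef, 7, undef, undef>.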
+ Value *getConstantVector(Constant *Val, bool UseSplat) const {
+ unsigned ExtractIdx = UINT_MAX;
+ if (!UseSplat) {
+ // If we cannot determine where the constant must be, we have to
+ // use a splat constant.
+ Value *ValExtractIdx = Transition->getOperand(getTransitionIdx());
+ if (ConstantInt *CstVal = dyn_cast<ConstantInt>(ValExtractIdx))
+ ExtractIdx = CstVal->getSExtValue();
+ else
+ UseSplat = true;
+ }
+
+ unsigned End = getTransitionType()->getVectorNumElements();
+ if (UseSplat)
+ return ConstantVector::getSplat(End, Val);
+
+ SmallVector<Constant *, 4> ConstVec;
+ UndefValue *UndefVal = UndefValue::get(Val->getType());
+ for (unsigned Idx = 0; Idx != End; ++Idx) {
+ if (Idx == ExtractIdx)
+ ConstVec.push_back(Val);
+ else
+ ConstVec.push_back(UndefVal);
+ }
+ return ConstantVector::get(ConstVec);
+ }
+
+ /// \brief Check if promoting the operand at \p OperandIdx of \p Use to a
+ /// vector type can trigger undefined behavior.
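+ /// E.g. (illustrative), if the promoted value ends up as the divisor of a
+ /// udiv, the other lanes of the vector divisor may be zero or undef, which
+ /// would introduce undefined behavior.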
+ static bool canCauseUndefinedBehavior(const Instruction *Use,
+ unsigned OperandIdx) {
+ // It is not safe to introduce undef when the operand is on
+ // the right hand side of a division-like instruction.
+ if (OperandIdx != 1)
+ return false;
+ switch (Use->getOpcode()) {
+ default:
+ return false;
+ case Instruction::SDiv:
+ case Instruction::UDiv:
+ case Instruction::SRem:
+ case Instruction::URem:
+ return true;
+ case Instruction::FDiv:
+ case Instruction::FRem:
+ return !Use->hasNoNaNs();
+ }
+ llvm_unreachable(nullptr);
+ }
+
+public:
+ VectorPromoteHelper(const DataLayout &DL, const TargetLowering &TLI,
+ const TargetTransformInfo &TTI, Instruction *Transition,
+ unsigned CombineCost)
+ : DL(DL), TLI(TLI), TTI(TTI), Transition(Transition),
+ StoreExtractCombineCost(CombineCost), CombineInst(nullptr) {
+ assert(Transition && "Do not know how to promote null");
+ }
+
+ /// \brief Check if we can promote \p ToBePromoted to the transition type.
+ bool canPromote(const Instruction *ToBePromoted) const {
+ // We could support CastInst too.
+ return isa<BinaryOperator>(ToBePromoted);
+ }
+
+ /// \brief Check if it is profitable to promote \p ToBePromoted
+ /// by moving the transition downward through it.
+ bool shouldPromote(const Instruction *ToBePromoted) const {
+ // Promote only if all the operands can be statically expanded.
+ // Indeed, we do not want to introduce any new kind of transitions.
+ for (const Use &U : ToBePromoted->operands()) {
+ const Value *Val = U.get();
+ if (Val == getEndOfTransition()) {
+ // If the use is a division and the transition is on the rhs,
+ // we cannot promote the operation, otherwise we may create a
+ // division by zero.
+ if (canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()))
+ return false;
+ continue;
+ }
+ if (!isa<ConstantInt>(Val) && !isa<UndefValue>(Val) &&
+ !isa<ConstantFP>(Val))
+ return false;
+ }
+ // Check that the resulting operation is legal.
+ int ISDOpcode = TLI.InstructionOpcodeToISD(ToBePromoted->getOpcode());
+ if (!ISDOpcode)
+ return false;
+ return StressStoreExtract ||
+ TLI.isOperationLegalOrCustom(
+ ISDOpcode, TLI.getValueType(DL, getTransitionType(), true));
+ }
+
+ /// \brief Check whether or not \p Use can be combined
+ /// with the transition.
+ /// I.e., is it possible to do Use(Transition) => AnotherUse?
+ bool canCombine(const Instruction *Use) { return isa<StoreInst>(Use); }
+
+ /// \brief Record \p ToBePromoted as part of the chain to be promoted.
+ void enqueueForPromotion(Instruction *ToBePromoted) {
+ InstsToBePromoted.push_back(ToBePromoted);
+ }
+
+ /// \brief Set the instruction that will be combined with the transition.
+ void recordCombineInstruction(Instruction *ToBeCombined) {
+ assert(canCombine(ToBeCombined) && "Unsupported instruction to combine");
+ CombineInst = ToBeCombined;
+ }
+
+ /// \brief Promote all the instructions enqueued for promotion if it is
+ /// profitable.
+ /// \return True if the promotion happened, false otherwise.
+ bool promote() {
+ // Check if there is something to promote.
+ // Right now, if we do not have anything to combine with,
+ // we assume the promotion is not profitable.
+ if (InstsToBePromoted.empty() || !CombineInst)
+ return false;
+
+ // Check cost.
+ if (!StressStoreExtract && !isProfitableToPromote())
+ return false;
+
+ // Promote.
+ for (auto &ToBePromoted : InstsToBePromoted)
+ promoteImpl(ToBePromoted);
+ InstsToBePromoted.clear();
+ return true;
+ }
+};
+} // End of anonymous namespace.
+
+void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
+ // At this point, we know that all the operands of ToBePromoted but Def
+ // can be statically promoted.
+ // For Def, we need to use its parameter in ToBePromoted:
+ // b = ToBePromoted ty1 a
+ // Def = Transition ty1 b to ty2
+ // Move the transition down.
+ // 1. Replace all uses of the promoted operation by the transition.
+ // = ... b => = ... Def.
+ assert(ToBePromoted->getType() == Transition->getType() &&
+ "The type of the result of the transition does not match "
+ "the final type");
+ ToBePromoted->replaceAllUsesWith(Transition);
+ // 2. Update the type of the uses.
+ // b = ToBePromoted ty2 Def => b = ToBePromoted ty1 Def.
+ Type *TransitionTy = getTransitionType();
+ ToBePromoted->mutateType(TransitionTy);
+ // 3. Update all the operands of the promoted operation with promoted
+ // operands.
+ // b = ToBePromoted ty1 Def => b = ToBePromoted ty1 a.
+ for (Use &U : ToBePromoted->operands()) {
+ Value *Val = U.get();
+ Value *NewVal = nullptr;
+ if (Val == Transition)
+ NewVal = Transition->getOperand(getTransitionOriginalValueIdx());
+ else if (isa<UndefValue>(Val) || isa<ConstantInt>(Val) ||
+ isa<ConstantFP>(Val)) {
+ // Use a splat constant if it is not safe to use undef.
+ NewVal = getConstantVector(
+ cast<Constant>(Val),
+ isa<UndefValue>(Val) ||
+ canCauseUndefinedBehavior(ToBePromoted, U.getOperandNo()));
+ } else
+ llvm_unreachable("Did you modify shouldPromote and forget to update "
+ "this?");
+ ToBePromoted->setOperand(U.getOperandNo(), NewVal);
+ }
+ Transition->removeFromParent();
+ Transition->insertAfter(ToBePromoted);
+ Transition->setOperand(getTransitionOriginalValueIdx(), ToBePromoted);
+}
+
+/// Some targets can do store(extractelement) with one instruction.
+/// Try to push the extractelement towards the stores when the target
+/// has this feature and this is profitable.
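+/// As a purely illustrative sketch (hypothetical IR, assuming a <2 x i32>
+/// input and a target that folds extract + store), a chain such as
+/// \code
+///   %e = extractelement <2 x i32> %v, i32 1
+///   %a = add i32 %e, 7
+///   store i32 %a, i32* %p
+/// \endcode
+/// may be rewritten so the extract sits right next to the store:
+/// \code
+///   %v.a = add <2 x i32> %v, <i32 undef, i32 7>
+///   %e = extractelement <2 x i32> %v.a, i32 1
+///   store i32 %e, i32* %p
+/// \endcode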
+bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
+ unsigned CombineCost = UINT_MAX;
+ if (DisableStoreExtract || !TLI ||
+ (!StressStoreExtract &&
+ !TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
+ Inst->getOperand(1), CombineCost)))
+ return false;
+
+ // At this point we know that Inst is a vector to scalar transition.
+ // Try to move it down the def-use chain, until:
+ // - We can combine the transition with its single use
+ // => we got rid of the transition.
+ // - We escape the current basic block
+ //    => we would need to check that we are moving it to a cheaper place and
+ // we do not do that for now.
+ BasicBlock *Parent = Inst->getParent();
+ DEBUG(dbgs() << "Found an interesting transition: " << *Inst << '\n');
+ VectorPromoteHelper VPH(*DL, *TLI, *TTI, Inst, CombineCost);
+ // If the transition has more than one use, assume this is not going to be
+ // beneficial.
+ while (Inst->hasOneUse()) {
+ Instruction *ToBePromoted = cast<Instruction>(*Inst->user_begin());
+ DEBUG(dbgs() << "Use: " << *ToBePromoted << '\n');
+
+ if (ToBePromoted->getParent() != Parent) {
+ DEBUG(dbgs() << "Instruction to promote is in a different block ("
+ << ToBePromoted->getParent()->getName()
+ << ") than the transition (" << Parent->getName() << ").\n");
+ return false;
+ }
+
+ if (VPH.canCombine(ToBePromoted)) {
+ DEBUG(dbgs() << "Assume " << *Inst << '\n'
+ << "will be combined with: " << *ToBePromoted << '\n');
+ VPH.recordCombineInstruction(ToBePromoted);
+ bool Changed = VPH.promote();
+ NumStoreExtractExposed += Changed;
+ return Changed;
+ }
+
+ DEBUG(dbgs() << "Try promoting.\n");
+ if (!VPH.canPromote(ToBePromoted) || !VPH.shouldPromote(ToBePromoted))
+ return false;
+
+ DEBUG(dbgs() << "Promoting is possible... Enqueue for promotion!\n");
+
+ VPH.enqueueForPromotion(ToBePromoted);
+ Inst = ToBePromoted;
+ }
+ return false;
+}
+
+bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
+ // Bail out if we inserted the instruction to prevent optimizations from
+ // stepping on each other's toes.
+ if (InsertedInsts.count(I))
+ return false;
+
+ if (PHINode *P = dyn_cast<PHINode>(I)) {
+ // It is possible for very late stage optimizations (such as SimplifyCFG)
+ // to introduce PHI nodes too late to be cleaned up. If we detect such a
+ // trivial PHI, go ahead and zap it here.
+ if (Value *V = SimplifyInstruction(P, *DL, TLInfo, nullptr)) {
+ P->replaceAllUsesWith(V);
+ P->eraseFromParent();
+ ++NumPHIsElim;
+ return true;
+ }
+ return false;
+ }
+
+ if (CastInst *CI = dyn_cast<CastInst>(I)) {
+ // If the source of the cast is a constant, then this should have
+ // already been constant folded. The only reason NOT to constant fold
+ // it is if something (e.g. LSR) was careful to place the constant
+ // evaluation in a block other than the one that uses it (e.g. to hoist
+ // the address of globals out of a loop). If this is the case, we don't
+ // want to forward-subst the cast.
+ if (isa<Constant>(CI->getOperand(0)))
+ return false;
+
+ if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
+ return true;
+
+ if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
+ // Sink a zext or sext into its user blocks if the target type doesn't
+ // fit in one register.
+ if (TLI &&
+ TLI->getTypeAction(CI->getContext(),
+ TLI->getValueType(*DL, CI->getType())) ==
+ TargetLowering::TypeExpandInteger) {
+ return SinkCast(CI);
+ } else {
+ bool MadeChange = moveExtToFormExtLoad(I);
+ return MadeChange | optimizeExtUses(I);
+ }
+ }
+ return false;
+ }
+
+ if (CmpInst *CI = dyn_cast<CmpInst>(I))
+ if (!TLI || !TLI->hasMultipleConditionRegisters())
+ return OptimizeCmpExpression(CI, TLI);
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ stripInvariantGroupMetadata(*LI);
+ if (TLI) {
+ bool Modified = optimizeLoadExt(LI);
+ unsigned AS = LI->getPointerAddressSpace();
+ Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ return Modified;
+ }
+ return false;
+ }
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ stripInvariantGroupMetadata(*SI);
+ if (TLI) {
+ unsigned AS = SI->getPointerAddressSpace();
+ return optimizeMemoryInst(I, SI->getOperand(1),
+ SI->getOperand(0)->getType(), AS);
+ }
+ return false;
+ }
+
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
+
+ if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
+ BinOp->getOpcode() == Instruction::LShr)) {
+ ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
+ if (TLI && CI && TLI->hasExtractBitsInsn())
+ return OptimizeExtractBits(BinOp, CI, *TLI, *DL);
+
+ return false;
+ }
+
+ if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEPI->hasAllZeroIndices()) {
+ // The GEP operand must be a pointer, so must its result -> BitCast.
+ Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(),
+ GEPI->getName(), GEPI);
+ GEPI->replaceAllUsesWith(NC);
+ GEPI->eraseFromParent();
+ ++NumGEPsElim;
+ optimizeInst(NC, ModifiedDT);
+ return true;
+ }
+ return false;
+ }
+
+ if (CallInst *CI = dyn_cast<CallInst>(I))
+ return optimizeCallInst(CI, ModifiedDT);
+
+ if (SelectInst *SI = dyn_cast<SelectInst>(I))
+ return optimizeSelectInst(SI);
+
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+ return optimizeShuffleVectorInst(SVI);
+
+ if (auto *Switch = dyn_cast<SwitchInst>(I))
+ return optimizeSwitchInst(Switch);
+
+ if (isa<ExtractElementInst>(I))
+ return optimizeExtractElementInst(I);
+
+ return false;
+}
+
+/// Given an OR instruction, check to see if this is a bitreverse
+/// idiom. If so, insert the new intrinsic and return true.
+static bool makeBitReverse(Instruction &I, const DataLayout &DL,
+ const TargetLowering &TLI) {
+ if (!I.getType()->isIntegerTy() ||
+ !TLI.isOperationLegalOrCustom(ISD::BITREVERSE,
+ TLI.getValueType(DL, I.getType(), true)))
+ return false;
+
+ SmallVector<Instruction*, 4> Insts;
+ if (!recognizeBSwapOrBitReverseIdiom(&I, false, true, Insts))
+ return false;
+ Instruction *LastInst = Insts.back();
+ I.replaceAllUsesWith(LastInst);
+ RecursivelyDeleteTriviallyDeadInstructions(&I);
+ return true;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.
+bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool& ModifiedDT) {
+ SunkAddrs.clear();
+ bool MadeChange = false;
+
+ CurInstIterator = BB.begin();
+ while (CurInstIterator != BB.end()) {
+ MadeChange |= optimizeInst(&*CurInstIterator++, ModifiedDT);
+ if (ModifiedDT)
+ return true;
+ }
+
+ bool MadeBitReverse = true;
+ while (TLI && MadeBitReverse) {
+ MadeBitReverse = false;
+ for (auto &I : reverse(BB)) {
+ if (makeBitReverse(I, *DL, *TLI)) {
+ MadeBitReverse = MadeChange = true;
+ ModifiedDT = true;
+ break;
+ }
+ }
+ }
+ MadeChange |= dupRetToEnableTailCallOpts(&BB);
+
+ return MadeChange;
+}
+
+// If llvm.dbg.value is far away from the value, ISel may not be able to
+// handle it properly. ISel will drop llvm.dbg.value if it cannot
+// find a node corresponding to the value.
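+//
+// For example (illustrative IR), a dbg.value that has drifted away from its
+// value:
+//   %x = add i32 %a, %b
+//   ...                                   ; many unrelated instructions
+//   call void @llvm.dbg.value(metadata i32 %x, ...)
+// is moved to sit right after the defining instruction:
+//   %x = add i32 %a, %b
+//   call void @llvm.dbg.value(metadata i32 %x, ...)
+//   ...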
+bool CodeGenPrepare::placeDbgValues(Function &F) {
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ Instruction *PrevNonDbgInst = nullptr;
+ for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); BI != BE;) {
+ Instruction *Insn = &*BI++;
+ DbgValueInst *DVI = dyn_cast<DbgValueInst>(Insn);
+ // Leave dbg.values that refer to an alloca alone. These
+ // intrinsics describe the address of a variable (= the alloca)
+ // being taken. They should not be moved next to the alloca
+ // (and to the beginning of the scope), but rather stay close to
+ // where said address is used.
+ if (!DVI || (DVI->getValue() && isa<AllocaInst>(DVI->getValue()))) {
+ PrevNonDbgInst = Insn;
+ continue;
+ }
+
+ Instruction *VI = dyn_cast_or_null<Instruction>(DVI->getValue());
+ if (VI && VI != PrevNonDbgInst && !VI->isTerminator()) {
+ // If VI is a phi in a block with an EHPad terminator, we can't insert
+ // after it.
+ if (isa<PHINode>(VI) && VI->getParent()->getTerminator()->isEHPad())
+ continue;
+ DEBUG(dbgs() << "Moving Debug Value before :\n" << *DVI << ' ' << *VI);
+ DVI->removeFromParent();
+ if (isa<PHINode>(VI))
+ DVI->insertBefore(&*VI->getParent()->getFirstInsertionPt());
+ else
+ DVI->insertAfter(VI);
+ MadeChange = true;
+ ++NumDbgValueMoved;
+ }
+ }
+ }
+ return MadeChange;
+}
+
+// If there is a sequence that branches based on comparing a single bit
+// against zero that can be combined into a single instruction, and the
+// target supports folding these into a single instruction, sink the
+// mask and compare into the branch uses. Do this before OptimizeBlock ->
+// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
+// searched for.
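+//
+// For example (illustrative IR), if another block also branches on %cmpVal:
+//   bb1:
+//     br i1 %cmpVal, label %t, label %f
+// the "and; icmp" pair is rematerialized next to that branch so it can be
+// folded there:
+//   bb1:
+//     %andVal1 = and i32 %val, 4
+//     %cmpVal1 = icmp eq i32 %andVal1, 0
+//     br i1 %cmpVal1, label %t, label %f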
+bool CodeGenPrepare::sinkAndCmp(Function &F) {
+ if (!EnableAndCmpSinking)
+ return false;
+ if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
+ return false;
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Does this BB end with the following?
+ // %andVal = and %val, #single-bit-set
+ // %cmpVal = icmp %andVal, 0
+ // br i1 %cmpVal, label %dest1, label %dest2
+ BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
+ if (!Brcc || !Brcc->isConditional())
+ continue;
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
+ if (!Cmp || Cmp->getParent() != &BB)
+ continue;
+ ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
+ if (!Zero || !Zero->isZero())
+ continue;
+ Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
+ if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
+ continue;
+ ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
+ if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
+ continue;
+ DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
+
+ // Push the "and; icmp" for any users that are conditional branches.
+ // Since there can only be one branch use per BB, we don't need to keep
+ // track of which BBs we insert into.
+ for (Use &TheUse : Cmp->uses()) {
+ // Find brcc use.
+ BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
+ if (!BrccUser || !BrccUser->isConditional())
+ continue;
+ BasicBlock *UserBB = BrccUser->getParent();
+ if (UserBB == &BB) continue;
+ DEBUG(dbgs() << "found Brcc use\n");
+
+ // Sink the "and; icmp" to use.
+ MadeChange = true;
+ BinaryOperator *NewAnd =
+ BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
+ BrccUser);
+ CmpInst *NewCmp =
+ CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
+ "", BrccUser);
+ TheUse = NewCmp;
+ ++NumAndCmpsMoved;
+ DEBUG(BrccUser->getParent()->dump());
+ }
+ }
+ return MadeChange;
+}
+
+/// \brief Scale down both weights to fit into uint32_t.
+static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
+ uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
+ uint32_t Scale = (NewMax / UINT32_MAX) + 1;
+ NewTrue = NewTrue / Scale;
+ NewFalse = NewFalse / Scale;
+}
+
+/// \brief Some targets prefer to split a conditional branch like:
+/// \code
+/// %0 = icmp ne i32 %a, 0
+/// %1 = icmp ne i32 %b, 0
+/// %or.cond = or i1 %0, %1
+/// br i1 %or.cond, label %TrueBB, label %FalseBB
+/// \endcode
+/// into multiple branch instructions like:
+/// \code
+/// bb1:
+/// %0 = icmp ne i32 %a, 0
+/// br i1 %0, label %TrueBB, label %bb2
+/// bb2:
+/// %1 = icmp ne i32 %b, 0
+/// br i1 %1, label %TrueBB, label %FalseBB
+/// \endcode
+/// This usually allows instruction selection to do even further optimizations
+/// and combine the compare with the branch instruction. Currently this is
+/// applied for targets which have "cheap" jump instructions.
+///
+/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
+///
+bool CodeGenPrepare::splitBranchCondition(Function &F) {
+ if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
+ return false;
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ // Does this BB end with the following?
+ // %cond1 = icmp|fcmp|binary instruction ...
+ // %cond2 = icmp|fcmp|binary instruction ...
+ // %cond.or = or|and i1 %cond1, %cond2
+ // br i1 %cond.or, label %dest1, label %dest2
+ BinaryOperator *LogicOp;
+ BasicBlock *TBB, *FBB;
+ if (!match(BB.getTerminator(), m_Br(m_OneUse(m_BinOp(LogicOp)), TBB, FBB)))
+ continue;
+
+ auto *Br1 = cast<BranchInst>(BB.getTerminator());
+ if (Br1->getMetadata(LLVMContext::MD_unpredictable))
+ continue;
+
+ unsigned Opc;
+ Value *Cond1, *Cond2;
+ if (match(LogicOp, m_And(m_OneUse(m_Value(Cond1)),
+ m_OneUse(m_Value(Cond2)))))
+ Opc = Instruction::And;
+ else if (match(LogicOp, m_Or(m_OneUse(m_Value(Cond1)),
+ m_OneUse(m_Value(Cond2)))))
+ Opc = Instruction::Or;
+ else
+ continue;
+
+ if (!match(Cond1, m_CombineOr(m_Cmp(), m_BinOp())) ||
+ !match(Cond2, m_CombineOr(m_Cmp(), m_BinOp())) )
+ continue;
+
+ DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
+
+ // Create a new BB.
+ auto TmpBB =
+ BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
+ BB.getParent(), BB.getNextNode());
+
+ // Update the original basic block to use the first condition directly in
+ // the branch instruction, and remove the no longer needed and/or instruction.
+ Br1->setCondition(Cond1);
+ LogicOp->eraseFromParent();
+
+ // Depending on the condition we have to replace either the true or the
+ // false successor of the original branch instruction.
+ if (Opc == Instruction::And)
+ Br1->setSuccessor(0, TmpBB);
+ else
+ Br1->setSuccessor(1, TmpBB);
+
+ // Fill in the new basic block.
+ auto *Br2 = IRBuilder<>(TmpBB).CreateCondBr(Cond2, TBB, FBB);
+ if (auto *I = dyn_cast<Instruction>(Cond2)) {
+ I->removeFromParent();
+ I->insertBefore(Br2);
+ }
+
+ // Update PHI nodes in both successors. The original BB needs to be
+ // replaced in one successor's PHI nodes, because the branch now comes from
+ // the newly generated BB (TmpBB). In the other successor we need to add one
+ // incoming edge to the PHI nodes, because both branch instructions now
+ // target the same successor. Depending on the original branch condition
+ // (and/or) we have to swap the successors (TrueDest, FalseDest), so that
+ // we perform the correct update for the PHI nodes.
+ // This doesn't change the successor order of the just created branch
+ // instruction (or any other instruction).
+ if (Opc == Instruction::Or)
+ std::swap(TBB, FBB);
+
+ // Replace the old BB with the new BB.
+ for (auto &I : *TBB) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+ int i;
+ while ((i = PN->getBasicBlockIndex(&BB)) >= 0)
+ PN->setIncomingBlock(i, TmpBB);
+ }
+
+ // Add another incoming edge from the new BB.
+ for (auto &I : *FBB) {
+ PHINode *PN = dyn_cast<PHINode>(&I);
+ if (!PN)
+ break;
+ auto *Val = PN->getIncomingValueForBlock(&BB);
+ PN->addIncoming(Val, TmpBB);
+ }
+
+ // Update the branch weights (from SelectionDAGBuilder::
+ // FindMergedConditions).
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // BB1:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+ // = TrueProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
+ // weights to A and A+2B, and set TmpBB's weights to A and 2B. This choice
+ // assumes that
+ // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+ // Another choice is to assume TrueProb for BB1 equals to TrueProb for
+ // TmpBB, but the math is more complicated.
+ uint64_t TrueWeight, FalseWeight;
+ if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t NewTrueWeight = TrueWeight;
+ uint64_t NewFalseWeight = TrueWeight + 2 * FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+ NewTrueWeight = TrueWeight;
+ NewFalseWeight = 2 * FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+ }
+ } else {
+ // Codegen X & Y as:
+ // BB1:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+ // = FalseProb for original BB.
+ // Assuming the original weights are A and B, one choice is to set BB1's
+ // weights to 2A+B and B, and set TmpBB's weights to 2A and B. This choice
+ // assumes that
+ // FalseProb for BB1 == TrueProb for BB1 * FalseProb for TmpBB.
+ uint64_t TrueWeight, FalseWeight;
+ if (Br1->extractProfMetadata(TrueWeight, FalseWeight)) {
+ uint64_t NewTrueWeight = 2 * TrueWeight + FalseWeight;
+ uint64_t NewFalseWeight = FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br1->setMetadata(LLVMContext::MD_prof, MDBuilder(Br1->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+
+ NewTrueWeight = 2 * TrueWeight;
+ NewFalseWeight = FalseWeight;
+ scaleWeights(NewTrueWeight, NewFalseWeight);
+ Br2->setMetadata(LLVMContext::MD_prof, MDBuilder(Br2->getContext())
+ .createBranchWeights(NewTrueWeight, NewFalseWeight));
+ }
+ }
+
+ // Note: No point in getting fancy here, since the DT info is never
+ // available to CodeGenPrepare.
+ ModifiedDT = true;
+
+ MadeChange = true;
+
+ DEBUG(dbgs() << "After branch condition splitting\n"; BB.dump();
+ TmpBB->dump());
+ }
+ return MadeChange;
+}
+
+void CodeGenPrepare::stripInvariantGroupMetadata(Instruction &I) {
+ if (auto *InvariantMD = I.getMetadata(LLVMContext::MD_invariant_group))
+ I.dropUnknownNonDebugMetadata(InvariantMD->getMetadataID());
+}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
new file mode 100644
index 000000000000..a0189a172bfc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -0,0 +1,681 @@
+//===----- CriticalAntiDepBreaker.cpp - Anti-dep breaker -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during post-RA scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+CriticalAntiDepBreaker::CriticalAntiDepBreaker(MachineFunction &MFi,
+ const RegisterClassInfo &RCI)
+ : AntiDepBreaker(), MF(MFi), MRI(MF.getRegInfo()),
+ TII(MF.getSubtarget().getInstrInfo()),
+ TRI(MF.getSubtarget().getRegisterInfo()), RegClassInfo(RCI),
+ Classes(TRI->getNumRegs(), nullptr), KillIndices(TRI->getNumRegs(), 0),
+ DefIndices(TRI->getNumRegs(), 0), KeepRegs(TRI->getNumRegs(), false) {}
+
+CriticalAntiDepBreaker::~CriticalAntiDepBreaker() {
+}
+
+void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
+ const unsigned BBSize = BB->size();
+ for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i) {
+ // Clear out the register class data.
+ Classes[i] = nullptr;
+
+ // Initialize the indices to indicate that no registers are live.
+ KillIndices[i] = ~0u;
+ DefIndices[i] = BBSize;
+ }
+
+ // Clear "do not change" set.
+ KeepRegs.reset();
+
+ bool IsReturnBlock = BB->isReturnBlock();
+
+ // Examine the live-in regs of all successors.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ for (const auto &LI : (*SI)->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
+ }
+ }
+
+ // Mark live-out callee-saved registers. In a return block this is
+ // all callee-saved registers. In a non-return block this is any
+ // callee-saved register that is not saved in the prolog.
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ BitVector Pristine = MFI->getPristineRegs(MF);
+ for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+ if (!IsReturnBlock && !Pristine.test(*I)) continue;
+ for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
+ unsigned Reg = *AI;
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = BBSize;
+ DefIndices[Reg] = ~0u;
+ }
+ }
+}
+
+void CriticalAntiDepBreaker::FinishBlock() {
+ RegRefs.clear();
+ KeepRegs.reset();
+}
+
+void CriticalAntiDepBreaker::Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) {
+ // Kill instructions can define registers but are really nops, and there might
+ // be a real definition earlier that needs to be paired with uses dominated by
+ // this kill.
+
+ // FIXME: It may be possible to remove the isKill() restriction once PR18663
+ // has been properly fixed. There can be value in processing kills as seen in
+ // the AggressiveAntiDepBreaker class.
+ if (MI.isDebugValue() || MI.isKill())
+ return;
+ assert(Count < InsertPosIndex && "Instruction index out of expected range!");
+
+ for (unsigned Reg = 0; Reg != TRI->getNumRegs(); ++Reg) {
+ if (KillIndices[Reg] != ~0u) {
+ // If Reg is currently live, then mark that it can't be renamed as
+ // we don't know the extent of its live-range anymore (now that it
+ // has been scheduled).
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ KillIndices[Reg] = Count;
+ } else if (DefIndices[Reg] < InsertPosIndex && DefIndices[Reg] >= Count) {
+ // Any register which was defined within the previous scheduling region
+ // may have been rescheduled and its lifetime may overlap with registers
+ // in ways not reflected in our current liveness state. For each such
+ // register, adjust the liveness state to be conservatively correct.
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ // Move the def index to the end of the previous region, to reflect
+ // that the def could theoretically have been scheduled at the end.
+ DefIndices[Reg] = InsertPosIndex;
+ }
+ }
+
+ PrescanInstruction(MI);
+ ScanInstruction(MI, Count);
+}
+
+/// CriticalPathStep - Return the next SUnit after SU on the bottom-up
+/// critical path.
+static const SDep *CriticalPathStep(const SUnit *SU) {
+ const SDep *Next = nullptr;
+ unsigned NextDepth = 0;
+ // Find the predecessor edge with the greatest depth.
+ for (SUnit::const_pred_iterator P = SU->Preds.begin(), PE = SU->Preds.end();
+ P != PE; ++P) {
+ const SUnit *PredSU = P->getSUnit();
+ unsigned PredLatency = P->getLatency();
+ unsigned PredTotalLatency = PredSU->getDepth() + PredLatency;
+ // In the case of a latency tie, prefer an anti-dependency edge over
+ // other types of edges.
+ if (NextDepth < PredTotalLatency ||
+ (NextDepth == PredTotalLatency && P->getKind() == SDep::Anti)) {
+ NextDepth = PredTotalLatency;
+ Next = &*P;
+ }
+ }
+ return Next;
+}
+
+void CriticalAntiDepBreaker::PrescanInstruction(MachineInstr &MI) {
+ // It's not safe to change register allocation for source operands of
+ // instructions that have special allocation requirements. Also assume all
+ // registers used in a call must not be changed (ABI).
+ // FIXME: The issue with predicated instructions is more complex. We are being
+ // conservative here because the kill markers cannot be trusted after
+ // if-conversion:
+ // %R6<def> = LDR %SP, %reg0, 92, pred:14, pred:%reg0; mem:LD4[FixedStack14]
+ // ...
+ // STR %R0, %R6<kill>, %reg0, 0, pred:0, pred:%CPSR; mem:ST4[%395]
+ // %R6<def> = LDR %SP, %reg0, 100, pred:0, pred:%CPSR; mem:LD4[FixedStack12]
+ // STR %R0, %R6<kill>, %reg0, 0, pred:14, pred:%reg0; mem:ST4[%396](align=8)
+ //
+ // The first R6 kill is not really a kill since it's killed by a predicated
+ // instruction which may not be executed. The second R6 def may or may not
+ // re-define R6 so it's not safe to change it since the last R6 use cannot be
+ // changed.
+ bool Special =
+ MI.isCall() || MI.hasExtraSrcRegAllocReq() || TII->isPredicated(MI);
+
+ // Scan the register operands for this instruction and update
+ // Classes and RegRefs.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ const TargetRegisterClass *NewRC = nullptr;
+
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ // Now check for aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, false); AI.isValid(); ++AI) {
+ // If an alias of the reg is used during the live range, give up.
+ // Note that this allows us to skip checking if AntiDepReg
+ // overlaps with any of the aliases, among other things.
+ unsigned AliasReg = *AI;
+ if (Classes[AliasReg]) {
+ Classes[AliasReg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+
+ // If we're still willing to consider this register, note the reference.
+ if (Classes[Reg] != reinterpret_cast<TargetRegisterClass *>(-1))
+ RegRefs.insert(std::make_pair(Reg, &MO));
+
+ // If this reg is tied and live (Classes[Reg] is set to -1), we can't change
+ // it or any of its sub or super regs. We need to use KeepRegs to mark the
+ // reg because not all uses of the same reg within an instruction are
+ // necessarily tagged as tied.
+ // Example: an x86 "xor %eax, %eax" will have one source operand tied to the
+ // def register but not the second (see PR20020 for details).
+ // FIXME: can this check be relaxed to account for undef uses
+ // of a register? In the above 'xor' example, the uses of %eax are undef, so
+ // earlier instructions could still replace %eax even though the 'xor'
+ // itself can't be changed.
+ if (MI.isRegTiedToUseOperand(i) &&
+ Classes[Reg] == reinterpret_cast<TargetRegisterClass *>(-1)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs) {
+ KeepRegs.set(*SubRegs);
+ }
+ for (MCSuperRegIterator SuperRegs(Reg, TRI);
+ SuperRegs.isValid(); ++SuperRegs) {
+ KeepRegs.set(*SuperRegs);
+ }
+ }
+
+ if (MO.isUse() && Special) {
+ if (!KeepRegs.test(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ KeepRegs.set(*SubRegs);
+ }
+ }
+ }
+}
+
+void CriticalAntiDepBreaker::ScanInstruction(MachineInstr &MI, unsigned Count) {
+ // Update liveness.
+ // Proceeding upwards, registers that are defined but not used in this
+ // instruction are now dead.
+ assert(!MI.isKill() && "Attempting to scan a kill instruction");
+
+ if (!TII->isPredicated(MI)) {
+ // Predicated defs are modeled as read + write, i.e. similar to two
+ // address updates.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+
+ if (MO.isRegMask())
+ for (unsigned i = 0, e = TRI->getNumRegs(); i != e; ++i)
+ if (MO.clobbersPhysReg(i)) {
+ DefIndices[i] = Count;
+ KillIndices[i] = ~0u;
+ KeepRegs.reset(i);
+ Classes[i] = nullptr;
+ RegRefs.erase(i);
+ }
+
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isDef()) continue;
+
+ // Ignore two-addr defs.
+ if (MI.isRegTiedToUseOperand(i))
+ continue;
+
+ // If we've already marked this reg as unchangeable, don't remove
+ // it or any of its subregs from KeepRegs.
+ bool Keep = KeepRegs.test(Reg);
+
+ // For the reg itself and all subregs: update the def to current;
+ // reset the kill state, any restrictions, and references.
+ for (MCSubRegIterator SRI(Reg, TRI, true); SRI.isValid(); ++SRI) {
+ unsigned SubregReg = *SRI;
+ DefIndices[SubregReg] = Count;
+ KillIndices[SubregReg] = ~0u;
+ Classes[SubregReg] = nullptr;
+ RegRefs.erase(SubregReg);
+ if (!Keep)
+ KeepRegs.reset(SubregReg);
+ }
+ // Conservatively mark super-registers as unusable.
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+ Classes[*SR] = reinterpret_cast<TargetRegisterClass *>(-1);
+ }
+ }
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isUse()) continue;
+
+ const TargetRegisterClass *NewRC = nullptr;
+ if (i < MI.getDesc().getNumOperands())
+ NewRC = TII->getRegClass(MI.getDesc(), i, TRI, MF);
+
+ // For now, only allow the register to be changed if its register
+ // class is consistent across all uses.
+ if (!Classes[Reg] && NewRC)
+ Classes[Reg] = NewRC;
+ else if (!NewRC || Classes[Reg] != NewRC)
+ Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
+
+ RegRefs.insert(std::make_pair(Reg, &MO));
+
+ // If it wasn't previously live but now it is, this is a kill.
+ // Repeat for all aliases.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (KillIndices[AliasReg] == ~0u) {
+ KillIndices[AliasReg] = Count;
+ DefIndices[AliasReg] = ~0u;
+ }
+ }
+ }
+}
+
+// Check all machine operands that reference the antidependent register and must
+// be replaced by NewReg. Return true if any of their parent instructions may
+// clobber the new register.
+//
+// Note: AntiDepReg may be referenced by a two-address instruction such that
+// its use operand is tied to a def operand. We guard against the case in which
+// the two-address instruction also defines NewReg, as may happen with
+// pre/postincrement loads. In this case, both the use and def operands are in
+// RegRefs because the def is inserted by PrescanInstruction and not erased
+// during ScanInstruction. So checking for an instruction with definitions of
+// both NewReg and AntiDepReg covers it.
+bool
+CriticalAntiDepBreaker::isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned NewReg)
+{
+ for (RegRefIter I = RegRefBegin; I != RegRefEnd; ++I ) {
+ MachineOperand *RefOper = I->second;
+
+ // Don't allow the instruction defining AntiDepReg to earlyclobber its
+ // operands, in case they may be assigned to NewReg. In this case antidep
+ // breaking must fail, but it's too rare to bother optimizing.
+ if (RefOper->isDef() && RefOper->isEarlyClobber())
+ return true;
+
+ // Handle cases in which this instruction defines NewReg.
+ MachineInstr *MI = RefOper->getParent();
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &CheckOper = MI->getOperand(i);
+
+ if (CheckOper.isRegMask() && CheckOper.clobbersPhysReg(NewReg))
+ return true;
+
+ if (!CheckOper.isReg() || !CheckOper.isDef() ||
+ CheckOper.getReg() != NewReg)
+ continue;
+
+ // Don't allow the instruction to define NewReg and AntiDepReg.
+ // When AntiDepReg is renamed it will be an illegal op.
+ if (RefOper->isDef())
+ return true;
+
+ // Don't allow an instruction using AntiDepReg to be earlyclobbered by
+ // NewReg.
+ if (CheckOper.isEarlyClobber())
+ return true;
+
+ // Don't allow inline asm to define NewReg at all. Who knows what it's
+ // doing with it.
+ if (MI->isInlineAsm())
+ return true;
+ }
+ }
+ return false;
+}
+
+unsigned CriticalAntiDepBreaker::
+findSuitableFreeRegister(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned AntiDepReg,
+ unsigned LastNewReg,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<unsigned> &Forbid)
+{
+ ArrayRef<MCPhysReg> Order = RegClassInfo.getOrder(RC);
+ for (unsigned i = 0; i != Order.size(); ++i) {
+ unsigned NewReg = Order[i];
+ // Don't replace a register with itself.
+ if (NewReg == AntiDepReg) continue;
+ // Don't replace a register with one that was recently used to repair
+ // an anti-dependence with this AntiDepReg, because that would
+ // re-introduce that anti-dependence.
+ if (NewReg == LastNewReg) continue;
+ // If any instructions that define AntiDepReg also define the NewReg, it's
+ // not suitable. For example, instructions with multiple definitions can
+ // result in this condition.
+ if (isNewRegClobberedByRefs(RegRefBegin, RegRefEnd, NewReg)) continue;
+ // If NewReg is dead and NewReg's most recent def is not before
+ // AntiDepReg's kill, it's safe to replace AntiDepReg with NewReg.
+ assert(((KillIndices[AntiDepReg] == ~0u) != (DefIndices[AntiDepReg] == ~0u))
+ && "Kill and Def maps aren't consistent for AntiDepReg!");
+ assert(((KillIndices[NewReg] == ~0u) != (DefIndices[NewReg] == ~0u))
+ && "Kill and Def maps aren't consistent for NewReg!");
+ if (KillIndices[NewReg] != ~0u ||
+ Classes[NewReg] == reinterpret_cast<TargetRegisterClass *>(-1) ||
+ KillIndices[AntiDepReg] > DefIndices[NewReg])
+ continue;
+ // If NewReg overlaps any of the forbidden registers, we can't use it.
+ bool Forbidden = false;
+ for (SmallVectorImpl<unsigned>::iterator it = Forbid.begin(),
+ ite = Forbid.end(); it != ite; ++it)
+ if (TRI->regsOverlap(NewReg, *it)) {
+ Forbidden = true;
+ break;
+ }
+ if (Forbidden) continue;
+ return NewReg;
+ }
+
+ // No registers are free and available!
+ return 0;
+}
+
+unsigned CriticalAntiDepBreaker::
+BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) {
+ // The code below assumes that there is at least one instruction,
+ // so just duck out immediately if the block is empty.
+ if (SUnits.empty()) return 0;
+
+ // Keep a map of the MachineInstr*'s back to the SUnit representing them.
+ // This is used for updating debug information.
+ //
+ // FIXME: Replace this with the existing map in ScheduleDAGInstrs::MISUnitMap
+ DenseMap<MachineInstr*,const SUnit*> MISUnitMap;
+
+ // Find the node at the bottom of the critical path.
+ const SUnit *Max = nullptr;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ const SUnit *SU = &SUnits[i];
+ MISUnitMap[SU->getInstr()] = SU;
+ if (!Max || SU->getDepth() + SU->Latency > Max->getDepth() + Max->Latency)
+ Max = SU;
+ }
+
+#ifndef NDEBUG
+ {
+ DEBUG(dbgs() << "Critical path has total latency "
+ << (Max->getDepth() + Max->Latency) << "\n");
+ DEBUG(dbgs() << "Available regs:");
+ for (unsigned Reg = 0; Reg < TRI->getNumRegs(); ++Reg) {
+ if (KillIndices[Reg] == ~0u)
+ DEBUG(dbgs() << " " << TRI->getName(Reg));
+ }
+ DEBUG(dbgs() << '\n');
+ }
+#endif
+
+ // Track progress along the critical path through the SUnit graph as we walk
+ // the instructions.
+ const SUnit *CriticalPathSU = Max;
+ MachineInstr *CriticalPathMI = CriticalPathSU->getInstr();
+
+ // Consider this pattern:
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // A = ...
+ // ... = A
+ // There are three anti-dependencies here, and without special care,
+ // we'd break all of them using the same register:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // B = ...
+ // ... = B
+ // because at each anti-dependence, B is the first register that
+ // isn't A which is free. This re-introduces anti-dependencies
+ // at all but one of the original anti-dependencies that we were
+ // trying to break. To avoid this, keep track of the most recent
+ // register that each register was replaced with, and avoid
+ // using it to repair an anti-dependence on the same register.
+ // This lets us produce this:
+ // A = ...
+ // ... = A
+ // B = ...
+ // ... = B
+ // C = ...
+ // ... = C
+ // B = ...
+ // ... = B
+ // This still has an anti-dependence on B, but at least it isn't on the
+ // original critical path.
+ //
+ // TODO: If we tracked more than one register here, we could potentially
+ // fix that remaining critical edge too. This is a little more involved,
+ // because unlike the most recent register, less recent registers should
+ // still be considered, though only if no other registers are available.
+ std::vector<unsigned> LastNewReg(TRI->getNumRegs(), 0);
+
+ // Attempt to break anti-dependence edges on the critical path. Walk the
+ // instructions from the bottom up, tracking information about liveness
+ // as we go to help determine which registers are available.
+ unsigned Broken = 0;
+ unsigned Count = InsertPosIndex - 1;
+ for (MachineBasicBlock::iterator I = End, E = Begin; I != E; --Count) {
+ MachineInstr &MI = *--I;
+ // Kill instructions can define registers but are really nops, and there
+ // might be a real definition earlier that needs to be paired with uses
+ // dominated by this kill.
+
+ // FIXME: It may be possible to remove the isKill() restriction once PR18663
+ // has been properly fixed. There can be value in processing kills as seen
+ // in the AggressiveAntiDepBreaker class.
+ if (MI.isDebugValue() || MI.isKill())
+ continue;
+
+ // Check if this instruction has a dependence on the critical path that
+ // is an anti-dependence that we may be able to break. If it is, set
+ // AntiDepReg to the non-zero register associated with the anti-dependence.
+ //
+ // We limit our attention to the critical path as a heuristic to avoid
+ // breaking anti-dependence edges that aren't going to significantly
+ // impact the overall schedule. There are a limited number of registers
+ // and we want to save them for the important edges.
+ //
+ // TODO: Instructions with multiple defs could have multiple
+ // anti-dependencies. The current code here only knows how to break one
+ // edge per instruction. Note that we'd have to be able to break all of
+ // the anti-dependencies in an instruction in order to be effective.
+ unsigned AntiDepReg = 0;
+ if (&MI == CriticalPathMI) {
+ if (const SDep *Edge = CriticalPathStep(CriticalPathSU)) {
+ const SUnit *NextSU = Edge->getSUnit();
+
+ // Only consider anti-dependence edges.
+ if (Edge->getKind() == SDep::Anti) {
+ AntiDepReg = Edge->getReg();
+ assert(AntiDepReg != 0 && "Anti-dependence on reg0?");
+ if (!MRI.isAllocatable(AntiDepReg))
+ // Don't break anti-dependencies on non-allocatable registers.
+ AntiDepReg = 0;
+ else if (KeepRegs.test(AntiDepReg))
+ // Don't break anti-dependencies if a use down below requires
+ // this exact register.
+ AntiDepReg = 0;
+ else {
+ // If the SUnit has other dependencies on the SUnit that it
+ // anti-depends on, don't bother breaking the anti-dependency
+ // since those edges would prevent such units from being
+ // scheduled past each other regardless.
+ //
+ // Also, if there are dependencies on other SUnits with the
+ // same register as the anti-dependency, don't attempt to
+ // break it.
+ for (SUnit::const_pred_iterator P = CriticalPathSU->Preds.begin(),
+ PE = CriticalPathSU->Preds.end(); P != PE; ++P)
+ if (P->getSUnit() == NextSU ?
+ (P->getKind() != SDep::Anti || P->getReg() != AntiDepReg) :
+ (P->getKind() == SDep::Data && P->getReg() == AntiDepReg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ }
+ }
+ CriticalPathSU = NextSU;
+ CriticalPathMI = CriticalPathSU->getInstr();
+ } else {
+ // We've reached the end of the critical path.
+ CriticalPathSU = nullptr;
+ CriticalPathMI = nullptr;
+ }
+ }
+
+ PrescanInstruction(MI);
+
+ SmallVector<unsigned, 2> ForbidRegs;
+
+ // If MI's defs have a special allocation requirement, don't allow
+ // any def registers to be changed. Also assume all registers
+ // defined in a call must not be changed (ABI).
+ if (MI.isCall() || MI.hasExtraDefRegAllocReq() || TII->isPredicated(MI))
+ // If this instruction's defs have special allocation requirement, don't
+ // break this anti-dependency.
+ AntiDepReg = 0;
+ else if (AntiDepReg) {
+ // If this instruction has a use of AntiDepReg, breaking it
+ // is invalid. If the instruction defines other registers,
+ // save a list of them so that we don't pick a new register
+ // that overlaps any of them.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (MO.isUse() && TRI->regsOverlap(AntiDepReg, Reg)) {
+ AntiDepReg = 0;
+ break;
+ }
+ if (MO.isDef() && Reg != AntiDepReg)
+ ForbidRegs.push_back(Reg);
+ }
+ }
+
+ // Determine AntiDepReg's register class, if it is live and is
+ // consistently used within a single class.
+ const TargetRegisterClass *RC = AntiDepReg != 0 ? Classes[AntiDepReg]
+ : nullptr;
+ assert((AntiDepReg == 0 || RC != nullptr) &&
+ "Register should be live if it's causing an anti-dependence!");
+ if (RC == reinterpret_cast<TargetRegisterClass *>(-1))
+ AntiDepReg = 0;
+
+ // Look for a suitable register to use to break the anti-dependence.
+ //
+ // TODO: Instead of picking the first free register, consider which might
+ // be the best.
+ if (AntiDepReg != 0) {
+ std::pair<std::multimap<unsigned, MachineOperand *>::iterator,
+ std::multimap<unsigned, MachineOperand *>::iterator>
+ Range = RegRefs.equal_range(AntiDepReg);
+ if (unsigned NewReg = findSuitableFreeRegister(Range.first, Range.second,
+ AntiDepReg,
+ LastNewReg[AntiDepReg],
+ RC, ForbidRegs)) {
+ DEBUG(dbgs() << "Breaking anti-dependence edge on "
+ << TRI->getName(AntiDepReg)
+ << " with " << RegRefs.count(AntiDepReg) << " references"
+ << " using " << TRI->getName(NewReg) << "!\n");
+
+ // Update the references to the old register to refer to the new
+ // register.
+ for (std::multimap<unsigned, MachineOperand *>::iterator
+ Q = Range.first, QE = Range.second; Q != QE; ++Q) {
+ Q->second->setReg(NewReg);
+ // If the SU for the instruction being updated has debug information
+ // related to the anti-dependency register, make sure to update that
+ // as well.
+ const SUnit *SU = MISUnitMap[Q->second->getParent()];
+ if (!SU) continue;
+ for (DbgValueVector::iterator DVI = DbgValues.begin(),
+ DVE = DbgValues.end(); DVI != DVE; ++DVI)
+ if (DVI->second == Q->second->getParent())
+ UpdateDbgValue(*DVI->first, AntiDepReg, NewReg);
+ }
+
+ // We just went back in time and modified history; the
+ // liveness information for the anti-dependence reg is now
+ // inconsistent. Set the state as if it were dead.
+ Classes[NewReg] = Classes[AntiDepReg];
+ DefIndices[NewReg] = DefIndices[AntiDepReg];
+ KillIndices[NewReg] = KillIndices[AntiDepReg];
+ assert(((KillIndices[NewReg] == ~0u) !=
+ (DefIndices[NewReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for NewReg!");
+
+ Classes[AntiDepReg] = nullptr;
+ DefIndices[AntiDepReg] = KillIndices[AntiDepReg];
+ KillIndices[AntiDepReg] = ~0u;
+ assert(((KillIndices[AntiDepReg] == ~0u) !=
+ (DefIndices[AntiDepReg] == ~0u)) &&
+ "Kill and Def maps aren't consistent for AntiDepReg!");
+
+ RegRefs.erase(AntiDepReg);
+ LastNewReg[AntiDepReg] = NewReg;
+ ++Broken;
+ }
+ }
+
+ ScanInstruction(MI, Count);
+ }
+
+ return Broken;
+}
diff --git a/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
new file mode 100644
index 000000000000..678779fa1a26
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -0,0 +1,106 @@
+//=- llvm/CodeGen/CriticalAntiDepBreaker.h - Anti-Dep Support -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the CriticalAntiDepBreaker class, which
+// implements register anti-dependence breaking along a block's
+// critical path during post-RA scheduling.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
+#define LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
+
+#include "AntiDepBreaker.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+namespace llvm {
+class RegisterClassInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+class MachineFunction;
+
+class LLVM_LIBRARY_VISIBILITY CriticalAntiDepBreaker : public AntiDepBreaker {
+ MachineFunction& MF;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const RegisterClassInfo &RegClassInfo;
+
+ /// The set of allocatable registers.
+ /// We'll be ignoring anti-dependencies on non-allocatable registers,
+ /// because they may not be safe to break.
+ const BitVector AllocatableSet;
+
+ /// For live regs that are only used in one register class in a
+ /// live range, the register class. If the register is not live, the
+ /// corresponding value is null. If the register is live but used in
+ /// multiple register classes, the corresponding value is -1 casted to a
+ /// pointer.
+ std::vector<const TargetRegisterClass*> Classes;
+
+ /// Map registers to all their references within a live range.
+ std::multimap<unsigned, MachineOperand *> RegRefs;
+ typedef std::multimap<unsigned, MachineOperand *>::const_iterator
+ RegRefIter;
+
+ /// The index of the most recent kill (proceeding bottom-up),
+ /// or ~0u if the register is not live.
+ std::vector<unsigned> KillIndices;
+
+ /// The index of the most recent complete def (proceeding
+ /// bottom up), or ~0u if the register is live.
+ std::vector<unsigned> DefIndices;
+
+ /// A set of registers which are live and cannot be changed to
+ /// break anti-dependencies.
+ BitVector KeepRegs;
+
+ public:
+ CriticalAntiDepBreaker(MachineFunction& MFi, const RegisterClassInfo&);
+ ~CriticalAntiDepBreaker() override;
+
+ /// Initialize anti-dep breaking for a new basic block.
+ void StartBlock(MachineBasicBlock *BB) override;
+
+ /// Identify anti-dependencies along the critical path
+ /// of the ScheduleDAG and break them by renaming registers.
+ unsigned BreakAntiDependencies(const std::vector<SUnit>& SUnits,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned InsertPosIndex,
+ DbgValueVector &DbgValues) override;
+
+ /// Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ void Observe(MachineInstr &MI, unsigned Count,
+ unsigned InsertPosIndex) override;
+
+ /// Finish anti-dep breaking for a basic block.
+ void FinishBlock() override;
+
+ private:
+ void PrescanInstruction(MachineInstr &MI);
+ void ScanInstruction(MachineInstr &MI, unsigned Count);
+ bool isNewRegClobberedByRefs(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned NewReg);
+ unsigned findSuitableFreeRegister(RegRefIter RegRefBegin,
+ RegRefIter RegRefEnd,
+ unsigned AntiDepReg,
+ unsigned LastNewReg,
+ const TargetRegisterClass *RC,
+ SmallVectorImpl<unsigned> &Forbid);
+ };
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
new file mode 100644
index 000000000000..2386af9e6877
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DFAPacketizer.cpp
@@ -0,0 +1,325 @@
+//=- llvm/CodeGen/DFAPacketizer.cpp - DFA Packetizer for VLIW -*- C++ -*-=====//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This class implements a deterministic finite automaton (DFA) based
+// packetizing mechanism for VLIW architectures. It provides APIs to
+// determine whether there exists a legal mapping of instructions to
+// functional unit assignments in a packet. The DFA is auto-generated from
+// the target's Schedule.td file.
+//
+// A DFA consists of 3 major elements: states, inputs, and transitions. For
+// the packetizing mechanism, the input is the set of instruction classes for
+// a target. The state models all possible combinations of functional unit
+// consumption for a given set of instructions in a packet. A transition
+// models the addition of an instruction to a packet. In the DFA constructed
+// by this class, if an instruction can be added to a packet, then a valid
+// transition exists from the corresponding state. Invalid transitions
+// indicate that the instruction cannot be added to the current packet.
+//
+//===----------------------------------------------------------------------===//
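+//
+// A typical use of this class (sketch only; everything except the
+// DFAPacketizer API below is illustrative) packs instructions greedily:
+//
+//   DFAPacketizer *RT = TII->CreateTargetScheduleState(MF.getSubtarget());
+//   for (MachineInstr &MI : InstrRange) {
+//     if (!RT->canReserveResources(MI))
+//       break;                        // the current packet is full
+//     RT->reserveResources(MI);       // add MI and advance the DFA state
+//   }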
+
+#define DEBUG_TYPE "packets"
+
+#include "llvm/CodeGen/DFAPacketizer.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+// --------------------------------------------------------------------
+// Definitions shared between DFAPacketizer.cpp and DFAPacketizerEmitter.cpp
+
+namespace {
+ DFAInput addDFAFuncUnits(DFAInput Inp, unsigned FuncUnits) {
+ return (Inp << DFA_MAX_RESOURCES) | FuncUnits;
+ }
+
+ /// Return the DFAInput for an instruction class input vector.
+ /// This function is used in both DFAPacketizer.cpp and in
+ /// DFAPacketizerEmitter.cpp.
+ DFAInput getDFAInsnInput(const std::vector<unsigned> &InsnClass) {
+ DFAInput InsnInput = 0;
+ assert((InsnClass.size() <= DFA_MAX_RESTERMS) &&
+ "Exceeded maximum number of DFA terms");
+ for (auto U : InsnClass)
+ InsnInput = addDFAFuncUnits(InsnInput, U);
+ return InsnInput;
+ }
+}
+// --------------------------------------------------------------------
+
+DFAPacketizer::DFAPacketizer(const InstrItineraryData *I,
+ const DFAStateInput (*SIT)[2],
+ const unsigned *SET):
+ InstrItins(I), CurrentState(0), DFAStateInputTable(SIT),
+ DFAStateEntryTable(SET) {
+ // Make sure DFA types are large enough for the number of terms & resources.
+ static_assert((DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <=
+ (8 * sizeof(DFAInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAInput");
+ static_assert(
+ (DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) <= (8 * sizeof(DFAStateInput)),
+ "(DFA_MAX_RESTERMS * DFA_MAX_RESOURCES) too big for DFAStateInput");
+}
+
+
+// Read the DFA transition table and update CachedTable.
+//
+// Format of the transition tables:
+// DFAStateInputTable[][2] = pairs of <Input, Transition> for all valid
+// transitions
+// DFAStateEntryTable[i] = Index of the first entry in DFAStateInputTable
+// for the ith state
+//
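+// For example, with purely illustrative values:
+//   DFAStateEntryTable = { 0, 2, 5, ... }
+//     => state 0 owns DFAStateInputTable[0..1], state 1 owns [2..4], ...
+//   DFAStateInputTable = { {In0, S1}, {In1, S2},             // from state 0
+//                          {In0, S3}, {In2, S1}, {In3, S4},  // from state 1
+//                          ... }
+//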
+void DFAPacketizer::ReadTable(unsigned int state) {
+ unsigned ThisState = DFAStateEntryTable[state];
+ unsigned NextStateInTable = DFAStateEntryTable[state+1];
+ // Early exit in case CachedTable already contains this
+ // state's transitions.
+ if (CachedTable.count(UnsignPair(state, DFAStateInputTable[ThisState][0])))
+ return;
+
+ for (unsigned i = ThisState; i < NextStateInTable; i++)
+ CachedTable[UnsignPair(state, DFAStateInputTable[i][0])] =
+ DFAStateInputTable[i][1];
+}
+
+
+// Return the DFAInput for an instruction class.
+DFAInput DFAPacketizer::getInsnInput(unsigned InsnClass) {
+ // Note: this logic must match that in DFAPacketizerDefs.h for input vectors.
+ DFAInput InsnInput = 0;
+ unsigned i = 0;
+ (void)i;
+ for (const InstrStage *IS = InstrItins->beginStage(InsnClass),
+ *IE = InstrItins->endStage(InsnClass); IS != IE; ++IS) {
+ InsnInput = addDFAFuncUnits(InsnInput, IS->getUnits());
+ assert((i++ < DFA_MAX_RESTERMS) && "Exceeded maximum number of DFA inputs");
+ }
+ return InsnInput;
+}
+
+
+// Return the DFAInput for an instruction class input vector.
+DFAInput DFAPacketizer::getInsnInput(const std::vector<unsigned> &InsnClass) {
+ return getDFAInsnInput(InsnClass);
+}
+
+
+// Check if the resources occupied by an MCInstrDesc are available in the
+// current state.
+bool DFAPacketizer::canReserveResources(const llvm::MCInstrDesc *MID) {
+ unsigned InsnClass = MID->getSchedClass();
+ DFAInput InsnInput = getInsnInput(InsnClass);
+ UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
+ ReadTable(CurrentState);
+ return CachedTable.count(StateTrans) != 0;
+}
+
+
+// Reserve the resources occupied by an MCInstrDesc and change the current
+// state to reflect that change.
+void DFAPacketizer::reserveResources(const llvm::MCInstrDesc *MID) {
+ unsigned InsnClass = MID->getSchedClass();
+ DFAInput InsnInput = getInsnInput(InsnClass);
+ UnsignPair StateTrans = UnsignPair(CurrentState, InsnInput);
+ ReadTable(CurrentState);
+ assert(CachedTable.count(StateTrans) != 0);
+ CurrentState = CachedTable[StateTrans];
+}
+
+
+// Check if the resources occupied by a machine instruction are available
+// in the current state.
+bool DFAPacketizer::canReserveResources(llvm::MachineInstr &MI) {
+ const llvm::MCInstrDesc &MID = MI.getDesc();
+ return canReserveResources(&MID);
+}
+
+
+// Reserve the resources occupied by a machine instruction and change the
+// current state to reflect that change.
+void DFAPacketizer::reserveResources(llvm::MachineInstr &MI) {
+ const llvm::MCInstrDesc &MID = MI.getDesc();
+ reserveResources(&MID);
+}
+
+
+namespace llvm {
+// This class extends ScheduleDAGInstrs and overrides the schedule method
+// to build the dependence graph.
+class DefaultVLIWScheduler : public ScheduleDAGInstrs {
+private:
+ AliasAnalysis *AA;
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+public:
+ DefaultVLIWScheduler(MachineFunction &MF, MachineLoopInfo &MLI,
+ AliasAnalysis *AA);
+ // Actual scheduling work.
+ void schedule() override;
+
+ /// DefaultVLIWScheduler takes ownership of the Mutation object.
+ void addMutation(std::unique_ptr<ScheduleDAGMutation> Mutation) {
+ Mutations.push_back(std::move(Mutation));
+ }
+protected:
+ void postprocessDAG();
+};
+}
+
+
+DefaultVLIWScheduler::DefaultVLIWScheduler(MachineFunction &MF,
+ MachineLoopInfo &MLI,
+ AliasAnalysis *AA)
+ : ScheduleDAGInstrs(MF, &MLI), AA(AA) {
+ CanHandleTerminators = true;
+}
+
+
+/// Apply each ScheduleDAGMutation step in order.
+void DefaultVLIWScheduler::postprocessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+
+void DefaultVLIWScheduler::schedule() {
+ // Build the scheduling graph.
+ buildSchedGraph(AA);
+ postprocessDAG();
+}
+
+
+VLIWPacketizerList::VLIWPacketizerList(MachineFunction &mf,
+ MachineLoopInfo &mli, AliasAnalysis *aa)
+ : MF(mf), TII(mf.getSubtarget().getInstrInfo()), AA(aa) {
+ ResourceTracker = TII->CreateTargetScheduleState(MF.getSubtarget());
+ VLIWScheduler = new DefaultVLIWScheduler(MF, mli, AA);
+}
+
+
+VLIWPacketizerList::~VLIWPacketizerList() {
+ if (VLIWScheduler)
+ delete VLIWScheduler;
+ if (ResourceTracker)
+ delete ResourceTracker;
+}
+
+
+// End the current packet, bundle packet instructions and reset DFA state.
+void VLIWPacketizerList::endPacket(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator MI) {
+ if (CurrentPacketMIs.size() > 1) {
+ MachineInstr &MIFirst = *CurrentPacketMIs.front();
+ finalizeBundle(*MBB, MIFirst.getIterator(), MI.getInstrIterator());
+ }
+ CurrentPacketMIs.clear();
+ ResourceTracker->clearResources();
+ DEBUG(dbgs() << "End packet\n");
+}
+
+
+// Bundle machine instructions into packets.
+void VLIWPacketizerList::PacketizeMIs(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator BeginItr,
+ MachineBasicBlock::iterator EndItr) {
+ assert(VLIWScheduler && "VLIW Scheduler is not initialized!");
+ VLIWScheduler->startBlock(MBB);
+ VLIWScheduler->enterRegion(MBB, BeginItr, EndItr,
+ std::distance(BeginItr, EndItr));
+ VLIWScheduler->schedule();
+
+ DEBUG({
+ dbgs() << "Scheduling DAG of the packetize region\n";
+ for (SUnit &SU : VLIWScheduler->SUnits)
+ SU.dumpAll(VLIWScheduler);
+ });
+
+ // Generate MI -> SU map.
+ MIToSUnit.clear();
+ for (SUnit &SU : VLIWScheduler->SUnits)
+ MIToSUnit[SU.getInstr()] = &SU;
+
+ // The main packetizer loop.
+ for (; BeginItr != EndItr; ++BeginItr) {
+ MachineInstr &MI = *BeginItr;
+ initPacketizerState();
+
+ // End the current packet if needed.
+ if (isSoloInstruction(MI)) {
+ endPacket(MBB, MI);
+ continue;
+ }
+
+ // Ignore pseudo instructions.
+ if (ignorePseudoInstruction(MI, MBB))
+ continue;
+
+ SUnit *SUI = MIToSUnit[&MI];
+ assert(SUI && "Missing SUnit Info!");
+
+ // Ask DFA if machine resource is available for MI.
+ DEBUG(dbgs() << "Checking resources for adding MI to packet " << MI);
+
+ bool ResourceAvail = ResourceTracker->canReserveResources(MI);
+ DEBUG({
+ if (ResourceAvail)
+ dbgs() << " Resources are available for adding MI to packet\n";
+ else
+ dbgs() << " Resources NOT available\n";
+ });
+ if (ResourceAvail && shouldAddToPacket(MI)) {
+ // Dependency check for MI with instructions in CurrentPacketMIs.
+ for (auto MJ : CurrentPacketMIs) {
+ SUnit *SUJ = MIToSUnit[MJ];
+ assert(SUJ && "Missing SUnit Info!");
+
+ DEBUG(dbgs() << " Checking against MJ " << *MJ);
+        // Is it legal to packetize SUI and SUJ together?
+ if (!isLegalToPacketizeTogether(SUI, SUJ)) {
+ DEBUG(dbgs() << " Not legal to add MI, try to prune\n");
+ // Allow packetization if dependency can be pruned.
+ if (!isLegalToPruneDependencies(SUI, SUJ)) {
+ // End the packet if dependency cannot be pruned.
+ DEBUG(dbgs() << " Could not prune dependencies for adding MI\n");
+ endPacket(MBB, MI);
+ break;
+ }
+ DEBUG(dbgs() << " Pruned dependence for adding MI\n");
+ }
+ }
+ } else {
+ DEBUG(if (ResourceAvail)
+ dbgs() << "Resources are available, but instruction should not be "
+ "added to packet\n " << MI);
+      // End the packet if resources are not available, or if the instruction
+      // should not be added to the current packet.
+ endPacket(MBB, MI);
+ }
+
+ // Add MI to the current packet.
+ DEBUG(dbgs() << "* Adding MI to packet " << MI << '\n');
+ BeginItr = addToPacket(MI);
+ } // For all instructions in the packetization range.
+
+ // End any packet left behind.
+ endPacket(MBB, EndItr);
+ VLIWScheduler->exitRegion();
+ VLIWScheduler->finishBlock();
+}
+
+
+// Add a DAG mutation object to the ordered list.
+void VLIWPacketizerList::addMutation(
+ std::unique_ptr<ScheduleDAGMutation> Mutation) {
+ VLIWScheduler->addMutation(std::move(Mutation));
+}
diff --git a/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
new file mode 100644
index 000000000000..0b8dc7a86ada
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -0,0 +1,181 @@
+//===- DeadMachineInstructionElim.cpp - Remove dead machine instructions --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level dead-code-elimination pass.
+//
+//===----------------------------------------------------------------------===//
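+//
+// Editorial sketch (not part of the original source; opcodes are
+// illustrative): a chain such as
+//
+//   %vreg1<def> = ADDri %vreg0, 1    ; only used by the next instruction
+//   %vreg2<def> = ADDri %vreg1, 2    ; never used
+//
+// is cleaned up by the bottom-up scan: deleting the second instruction
+// leaves %vreg1 without non-debug uses, so the first one is deleted when
+// the scan reaches it.
+//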
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen-dce"
+
+STATISTIC(NumDeletes, "Number of dead instructions deleted");
+
+namespace {
+ class DeadMachineInstructionElim : public MachineFunctionPass {
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ BitVector LivePhysRegs;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ DeadMachineInstructionElim() : MachineFunctionPass(ID) {
+ initializeDeadMachineInstructionElimPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ bool isDead(const MachineInstr *MI) const;
+ };
+}
+char DeadMachineInstructionElim::ID = 0;
+char &llvm::DeadMachineInstructionElimID = DeadMachineInstructionElim::ID;
+
+INITIALIZE_PASS(DeadMachineInstructionElim, "dead-mi-elimination",
+ "Remove dead machine instructions", false, false)
+
+bool DeadMachineInstructionElim::isDead(const MachineInstr *MI) const {
+ // Technically speaking inline asm without side effects and no defs can still
+ // be deleted. But there is so much bad inline asm code out there, we should
+ // let them be.
+ if (MI->isInlineAsm())
+ return false;
+
+ // Don't delete frame allocation labels.
+ if (MI->getOpcode() == TargetOpcode::LOCAL_ESCAPE)
+ return false;
+
+ // Don't delete instructions with side effects.
+ bool SawStore = false;
+ if (!MI->isSafeToMove(nullptr, SawStore) && !MI->isPHI())
+ return false;
+
+ // Examine each operand.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Don't delete live physreg defs, or any reserved register defs.
+ if (LivePhysRegs.test(Reg) || MRI->isReserved(Reg))
+ return false;
+ } else {
+ if (!MRI->use_nodbg_empty(Reg))
+ // This def has a non-debug use. Don't delete the instruction!
+ return false;
+ }
+ }
+ }
+
+ // If there are no defs with uses, the instruction is dead.
+ return true;
+}
+
+bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ bool AnyChanges = false;
+ MRI = &MF.getRegInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Loop over all instructions in all blocks, from bottom to top, so that it's
+ // more likely that chains of dependent but ultimately dead instructions will
+ // be cleaned up.
+ for (MachineBasicBlock &MBB : make_range(MF.rbegin(), MF.rend())) {
+ // Start out assuming that reserved registers are live out of this block.
+ LivePhysRegs = MRI->getReservedRegs();
+
+    // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
+ // live across blocks, but some targets (x86) can have flags live out of a
+ // block.
+ for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
+ E = MBB.succ_end(); S != E; S++)
+ for (const auto &LI : (*S)->liveins())
+ LivePhysRegs.set(LI.PhysReg);
+
+ // Now scan the instructions and delete dead ones, tracking physreg
+ // liveness as we go.
+ for (MachineBasicBlock::reverse_iterator MII = MBB.rbegin(),
+ MIE = MBB.rend(); MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+
+ // If the instruction is dead, delete it!
+ if (isDead(MI)) {
+ DEBUG(dbgs() << "DeadMachineInstructionElim: DELETING: " << *MI);
+ // It is possible that some DBG_VALUE instructions refer to this
+ // instruction. They get marked as undef and will be deleted
+ // in the live debug variable analysis.
+ MI->eraseFromParentAndMarkDBGValuesForRemoval();
+ AnyChanges = true;
+ ++NumDeletes;
+ MIE = MBB.rend();
+ // MII is now pointing to the next instruction to process,
+ // so don't increment it.
+ continue;
+ }
+
+ // Record the physreg defs.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Check the subreg set, not the alias set, because a def
+ // of a super-register may still be partially live after
+ // this def.
+ for (MCSubRegIterator SR(Reg, TRI,/*IncludeSelf=*/true);
+ SR.isValid(); ++SR)
+ LivePhysRegs.reset(*SR);
+ }
+ } else if (MO.isRegMask()) {
+ // Register mask of preserved registers. All clobbers are dead.
+ LivePhysRegs.clearBitsNotInMask(MO.getRegMask());
+ }
+ }
+ // Record the physreg uses, after the defs, in case a physreg is
+ // both defined and used in the same instruction.
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isUse()) {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ LivePhysRegs.set(*AI);
+ }
+ }
+ }
+
+ // We didn't delete the current instruction, so increment MII to
+ // the next one.
+ ++MII;
+ }
+ }
+
+ LivePhysRegs.clear();
+ return AnyChanges;
+}
diff --git a/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
new file mode 100644
index 000000000000..1d9e79c055e0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DetectDeadLanes.cpp
@@ -0,0 +1,602 @@
+//===- DetectDeadLanes.cpp - SubRegister Lane Usage Analysis --*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Analysis that tracks defined/used subregister lanes across COPY instructions
+/// and instructions that get lowered to a COPY (PHI, REG_SEQUENCE,
+/// INSERT_SUBREG, EXTRACT_SUBREG).
+/// The information is used to detect dead definitions and the usage of
+/// (completely) undefined values and mark the operands as such.
+/// This pass is necessary because the dead/undef status is not obvious anymore
+/// when subregisters are involved.
+///
+/// Example:
+///    %vreg0 = some definition
+///    %vreg1 = IMPLICIT_DEF
+///    %vreg2 = REG_SEQUENCE %vreg0, sub0, %vreg1, sub1
+///    %vreg3 = EXTRACT_SUBREG %vreg2, sub1
+///           = use %vreg3
+/// The %vreg0 definition is dead and %vreg3 contains an undefined value.
+//
+//===----------------------------------------------------------------------===//
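+//
+// Editorial sketch (not part of the original source): for the example above,
+// assuming sub0 maps to lane mask 0x1 and sub1 to 0x2, the analysis roughly
+// computes for %vreg2
+//
+//   DefinedLanes = 0x1   (only sub0 is written by a real definition)
+//   UsedLanes    = 0x2   (only sub1 is read, via the EXTRACT_SUBREG)
+//
+// The intersection is empty, so the %vreg0 def is marked dead and the read
+// of %vreg3 is marked undef.
+//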
+
+#include <deque>
+#include <vector>
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "detect-dead-lanes"
+
+namespace {
+
+/// Contains a bitmask of which lanes of a given virtual register are
+/// defined and which ones are actually used.
+struct VRegInfo {
+ LaneBitmask UsedLanes;
+ LaneBitmask DefinedLanes;
+};
+
+class DetectDeadLanes : public MachineFunctionPass {
+public:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+ DetectDeadLanes() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override { return "Detect Dead Lanes"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ /// Add used lane bits on the register used by operand \p MO. This translates
+ /// the bitmask based on the operands subregister, and puts the register into
+ /// the worklist if any new bits were added.
+ void addUsedLanesOnOperand(const MachineOperand &MO, LaneBitmask UsedLanes);
+
+ /// Given a bitmask \p UsedLanes for the used lanes on a def output of a
+ /// COPY-like instruction determine the lanes used on the use operands
+ /// and call addUsedLanesOnOperand() for them.
+ void transferUsedLanesStep(const MachineInstr &MI, LaneBitmask UsedLanes);
+
+  /// Given a use register operand \p Use and a mask of defined lanes, check
+ /// if the operand belongs to a lowersToCopies() instruction, transfer the
+ /// mask to the def and put the instruction into the worklist.
+ void transferDefinedLanesStep(const MachineOperand &Use,
+ LaneBitmask DefinedLanes);
+
+ /// Given a mask \p DefinedLanes of lanes defined at operand \p OpNum
+  /// of a COPY-like instruction, determine which lanes are defined at the output
+ /// operand \p Def.
+ LaneBitmask transferDefinedLanes(const MachineOperand &Def, unsigned OpNum,
+ LaneBitmask DefinedLanes) const;
+
+ /// Given a mask \p UsedLanes used from the output of instruction \p MI
+ /// determine which lanes are used from operand \p MO of this instruction.
+ LaneBitmask transferUsedLanes(const MachineInstr &MI, LaneBitmask UsedLanes,
+ const MachineOperand &MO) const;
+
+ bool runOnce(MachineFunction &MF);
+
+ LaneBitmask determineInitialDefinedLanes(unsigned Reg);
+ LaneBitmask determineInitialUsedLanes(unsigned Reg);
+
+ bool isUndefRegAtInput(const MachineOperand &MO,
+ const VRegInfo &RegInfo) const;
+
+ bool isUndefInput(const MachineOperand &MO, bool *CrossCopy) const;
+
+ const MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+
+ void PutInWorklist(unsigned RegIdx) {
+ if (WorklistMembers.test(RegIdx))
+ return;
+ WorklistMembers.set(RegIdx);
+ Worklist.push_back(RegIdx);
+ }
+
+ VRegInfo *VRegInfos;
+ /// Worklist containing virtreg indexes.
+ std::deque<unsigned> Worklist;
+ BitVector WorklistMembers;
+ /// This bitvector is set for each vreg index where the vreg is defined
+ /// by an instruction where lowersToCopies()==true.
+ BitVector DefinedByCopy;
+};
+
+} // end anonymous namespace
+
+char DetectDeadLanes::ID = 0;
+char &llvm::DetectDeadLanesID = DetectDeadLanes::ID;
+
+INITIALIZE_PASS(DetectDeadLanes, "detect-dead-lanes", "Detect Dead Lanes",
+ false, false)
+
+/// Returns true if \p MI will get lowered to a series of COPY instructions.
+/// We call this a COPY-like instruction.
+static bool lowersToCopies(const MachineInstr &MI) {
+ // Note: We could support instructions with MCInstrDesc::isRegSequenceLike(),
+ // isExtractSubRegLike(), isInsertSubregLike() in the future even though they
+ // are not lowered to a COPY.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::EXTRACT_SUBREG:
+ return true;
+ }
+ return false;
+}
+
+static bool isCrossCopy(const MachineRegisterInfo &MRI,
+ const MachineInstr &MI,
+ const TargetRegisterClass *DstRC,
+ const MachineOperand &MO) {
+ assert(lowersToCopies(MI));
+ unsigned SrcReg = MO.getReg();
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(SrcReg);
+ if (DstRC == SrcRC)
+ return false;
+
+ unsigned SrcSubIdx = MO.getSubReg();
+
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned DstSubIdx = 0;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::INSERT_SUBREG:
+ if (MI.getOperandNo(&MO) == 2)
+ DstSubIdx = MI.getOperand(3).getImm();
+ break;
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned OpNum = MI.getOperandNo(&MO);
+ DstSubIdx = MI.getOperand(OpNum+1).getImm();
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubReg = MI.getOperand(2).getImm();
+ SrcSubIdx = TRI.composeSubRegIndices(SubReg, SrcSubIdx);
+ }
+ }
+
+ unsigned PreA, PreB; // Unused.
+ if (SrcSubIdx && DstSubIdx)
+ return !TRI.getCommonSuperRegClass(SrcRC, SrcSubIdx, DstRC, DstSubIdx, PreA,
+ PreB);
+ if (SrcSubIdx)
+ return !TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSubIdx);
+ if (DstSubIdx)
+ return !TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSubIdx);
+ return !TRI.getCommonSubClass(SrcRC, DstRC);
+}
+
+void DetectDeadLanes::addUsedLanesOnOperand(const MachineOperand &MO,
+ LaneBitmask UsedLanes) {
+ if (!MO.readsReg())
+ return;
+ unsigned MOReg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(MOReg))
+ return;
+
+ unsigned MOSubReg = MO.getSubReg();
+ if (MOSubReg != 0)
+ UsedLanes = TRI->composeSubRegIndexLaneMask(MOSubReg, UsedLanes);
+ UsedLanes &= MRI->getMaxLaneMaskForVReg(MOReg);
+
+ unsigned MORegIdx = TargetRegisterInfo::virtReg2Index(MOReg);
+ VRegInfo &MORegInfo = VRegInfos[MORegIdx];
+ LaneBitmask PrevUsedLanes = MORegInfo.UsedLanes;
+ // Any change at all?
+ if ((UsedLanes & ~PrevUsedLanes) == 0)
+ return;
+
+ // Set UsedLanes and remember instruction for further propagation.
+ MORegInfo.UsedLanes = PrevUsedLanes | UsedLanes;
+ if (DefinedByCopy.test(MORegIdx))
+ PutInWorklist(MORegIdx);
+}
+
+void DetectDeadLanes::transferUsedLanesStep(const MachineInstr &MI,
+ LaneBitmask UsedLanes) {
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ LaneBitmask UsedOnMO = transferUsedLanes(MI, UsedLanes, MO);
+ addUsedLanesOnOperand(MO, UsedOnMO);
+ }
+}
+
+LaneBitmask DetectDeadLanes::transferUsedLanes(const MachineInstr &MI,
+ LaneBitmask UsedLanes,
+ const MachineOperand &MO) const {
+ unsigned OpNum = MI.getOperandNo(&MO);
+ assert(lowersToCopies(MI) && DefinedByCopy[
+ TargetRegisterInfo::virtReg2Index(MI.getOperand(0).getReg())]);
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ return UsedLanes;
+ case TargetOpcode::REG_SEQUENCE: {
+ assert(OpNum % 2 == 1);
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ return TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ LaneBitmask MO2UsedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ if (OpNum == 2)
+ return MO2UsedLanes;
+
+ const MachineOperand &Def = MI.getOperand(0);
+ unsigned DefReg = Def.getReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LaneBitmask MO1UsedLanes;
+ if (RC->CoveredBySubRegs)
+ MO1UsedLanes = UsedLanes & ~TRI->getSubRegIndexLaneMask(SubIdx);
+ else
+ MO1UsedLanes = RC->LaneMask;
+
+ assert(OpNum == 1);
+ return MO1UsedLanes;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ assert(OpNum == 1);
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ return TRI->composeSubRegIndexLaneMask(SubIdx, UsedLanes);
+ }
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+}
+
+void DetectDeadLanes::transferDefinedLanesStep(const MachineOperand &Use,
+ LaneBitmask DefinedLanes) {
+ if (!Use.readsReg())
+ return;
+ // Check whether the operand writes a vreg and is part of a COPY-like
+ // instruction.
+ const MachineInstr &MI = *Use.getParent();
+ if (MI.getDesc().getNumDefs() != 1)
+ return;
+ // FIXME: PATCHPOINT instructions announce a Def that does not always exist,
+ // they really need to be modeled differently!
+ if (MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ return;
+ const MachineOperand &Def = *MI.defs().begin();
+ unsigned DefReg = Def.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ return;
+ unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ if (!DefinedByCopy.test(DefRegIdx))
+ return;
+
+ unsigned OpNum = MI.getOperandNo(&Use);
+ DefinedLanes =
+ TRI->reverseComposeSubRegIndexLaneMask(Use.getSubReg(), DefinedLanes);
+ DefinedLanes = transferDefinedLanes(Def, OpNum, DefinedLanes);
+
+ VRegInfo &RegInfo = VRegInfos[DefRegIdx];
+ LaneBitmask PrevDefinedLanes = RegInfo.DefinedLanes;
+ // Any change at all?
+ if ((DefinedLanes & ~PrevDefinedLanes) == 0)
+ return;
+
+ RegInfo.DefinedLanes = PrevDefinedLanes | DefinedLanes;
+ PutInWorklist(DefRegIdx);
+}
+
+LaneBitmask DetectDeadLanes::transferDefinedLanes(const MachineOperand &Def,
+ unsigned OpNum, LaneBitmask DefinedLanes) const {
+ const MachineInstr &MI = *Def.getParent();
+ // Translate DefinedLanes if necessary.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::REG_SEQUENCE: {
+ unsigned SubIdx = MI.getOperand(OpNum + 1).getImm();
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ break;
+ }
+ case TargetOpcode::INSERT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(3).getImm();
+ if (OpNum == 2) {
+ DefinedLanes = TRI->composeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ DefinedLanes &= TRI->getSubRegIndexLaneMask(SubIdx);
+ } else {
+ assert(OpNum == 1 && "INSERT_SUBREG must have two operands");
+ // Ignore lanes defined by operand 2.
+ DefinedLanes &= ~TRI->getSubRegIndexLaneMask(SubIdx);
+ }
+ break;
+ }
+ case TargetOpcode::EXTRACT_SUBREG: {
+ unsigned SubIdx = MI.getOperand(2).getImm();
+ assert(OpNum == 1 && "EXTRACT_SUBREG must have one register operand only");
+ DefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(SubIdx, DefinedLanes);
+ break;
+ }
+ case TargetOpcode::COPY:
+ case TargetOpcode::PHI:
+ break;
+ default:
+ llvm_unreachable("function must be called with COPY-like instruction");
+ }
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ DefinedLanes &= MRI->getMaxLaneMaskForVReg(Def.getReg());
+ return DefinedLanes;
+}
+
+LaneBitmask DetectDeadLanes::determineInitialDefinedLanes(unsigned Reg) {
+ // Live-In or unused registers have no definition but are considered fully
+ // defined.
+ if (!MRI->hasOneDef(Reg))
+ return ~0u;
+
+ const MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &DefMI = *Def.getParent();
+ if (lowersToCopies(DefMI)) {
+    // Start optimistically with no used or defined lanes for copy
+ // instructions. The following dataflow analysis will add more bits.
+ unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ DefinedByCopy.set(RegIdx);
+ PutInWorklist(RegIdx);
+
+ if (Def.isDead())
+ return 0;
+
+ // COPY/PHI can copy across unrelated register classes (example: float/int)
+ // with incompatible subregister structure. Do not include these in the
+ // dataflow analysis since we cannot transfer lanemasks in a meaningful way.
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+
+ // Determine initially DefinedLanes.
+ LaneBitmask DefinedLanes = 0;
+ for (const MachineOperand &MO : DefMI.uses()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+
+ LaneBitmask MODefinedLanes;
+ if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ MODefinedLanes = ~0u;
+ } else if (isCrossCopy(*MRI, DefMI, DefRC, MO)) {
+ MODefinedLanes = ~0u;
+ } else {
+ assert(TargetRegisterInfo::isVirtualRegister(MOReg));
+ if (MRI->hasOneDef(MOReg)) {
+ const MachineOperand &MODef = *MRI->def_begin(MOReg);
+ const MachineInstr &MODefMI = *MODef.getParent();
+ // Bits from copy-like operations will be added later.
+ if (lowersToCopies(MODefMI) || MODefMI.isImplicitDef())
+ continue;
+ }
+ unsigned MOSubReg = MO.getSubReg();
+ MODefinedLanes = MRI->getMaxLaneMaskForVReg(MOReg);
+ MODefinedLanes = TRI->reverseComposeSubRegIndexLaneMask(
+ MOSubReg, MODefinedLanes);
+ }
+
+ unsigned OpNum = DefMI.getOperandNo(&MO);
+ DefinedLanes |= transferDefinedLanes(Def, OpNum, MODefinedLanes);
+ }
+ return DefinedLanes;
+ }
+ if (DefMI.isImplicitDef() || Def.isDead())
+ return 0;
+
+ assert(Def.getSubReg() == 0 &&
+ "Should not have subregister defs in machine SSA phase");
+ return MRI->getMaxLaneMaskForVReg(Reg);
+}
+
+LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
+ LaneBitmask UsedLanes = 0;
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ if (!MO.readsReg())
+ continue;
+
+ const MachineInstr &UseMI = *MO.getParent();
+ if (UseMI.isKill())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (lowersToCopies(UseMI)) {
+ assert(UseMI.getDesc().getNumDefs() == 1);
+ const MachineOperand &Def = *UseMI.defs().begin();
+ unsigned DefReg = Def.getReg();
+ // The used lanes of COPY-like instruction operands are determined by the
+ // following dataflow analysis.
+ if (TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ // But ignore copies across incompatible register classes.
+ bool CrossCopy = false;
+ if (lowersToCopies(UseMI)) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
+ if (CrossCopy)
+ DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI);
+ }
+
+ if (!CrossCopy)
+ continue;
+ }
+ }
+
+ // Shortcut: All lanes are used.
+ if (SubReg == 0)
+ return MRI->getMaxLaneMaskForVReg(Reg);
+
+ UsedLanes |= TRI->getSubRegIndexLaneMask(SubReg);
+ }
+ return UsedLanes;
+}
+
+bool DetectDeadLanes::isUndefRegAtInput(const MachineOperand &MO,
+ const VRegInfo &RegInfo) const {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ return (RegInfo.DefinedLanes & RegInfo.UsedLanes & Mask) == 0;
+}
+
+bool DetectDeadLanes::isUndefInput(const MachineOperand &MO,
+ bool *CrossCopy) const {
+ if (!MO.isUse())
+ return false;
+ const MachineInstr &MI = *MO.getParent();
+ if (!lowersToCopies(MI))
+ return false;
+ const MachineOperand &Def = MI.getOperand(0);
+ unsigned DefReg = Def.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg))
+ return false;
+ unsigned DefRegIdx = TargetRegisterInfo::virtReg2Index(DefReg);
+ if (!DefinedByCopy.test(DefRegIdx))
+ return false;
+
+ const VRegInfo &DefRegInfo = VRegInfos[DefRegIdx];
+ LaneBitmask UsedLanes = transferUsedLanes(MI, DefRegInfo.UsedLanes, MO);
+ if (UsedLanes != 0)
+ return false;
+
+ unsigned MOReg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(MOReg)) {
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
+ *CrossCopy = isCrossCopy(*MRI, MI, DstRC, MO);
+ }
+ return true;
+}
+
+bool DetectDeadLanes::runOnce(MachineFunction &MF) {
+ // First pass: Populate defs/uses of vregs with initial values
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+
+ // Determine used/defined lanes and add copy instructions to worklist.
+ VRegInfo &Info = VRegInfos[RegIdx];
+ Info.DefinedLanes = determineInitialDefinedLanes(Reg);
+ Info.UsedLanes = determineInitialUsedLanes(Reg);
+ }
+
+ // Iterate as long as defined lanes/used lanes keep changing.
+ while (!Worklist.empty()) {
+ unsigned RegIdx = Worklist.front();
+ Worklist.pop_front();
+ WorklistMembers.reset(RegIdx);
+ VRegInfo &Info = VRegInfos[RegIdx];
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+
+ // Transfer UsedLanes to operands of DefMI (backwards dataflow).
+ MachineOperand &Def = *MRI->def_begin(Reg);
+ const MachineInstr &MI = *Def.getParent();
+ transferUsedLanesStep(MI, Info.UsedLanes);
+ // Transfer DefinedLanes to users of Reg (forward dataflow).
+ for (const MachineOperand &MO : MRI->use_nodbg_operands(Reg))
+ transferDefinedLanesStep(MO, Info.DefinedLanes);
+ }
+
+ DEBUG(
+ dbgs() << "Defined/Used lanes:\n";
+ for (unsigned RegIdx = 0; RegIdx < NumVirtRegs; ++RegIdx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(RegIdx);
+ const VRegInfo &Info = VRegInfos[RegIdx];
+ dbgs() << PrintReg(Reg, nullptr)
+ << " Used: " << PrintLaneMask(Info.UsedLanes)
+ << " Def: " << PrintLaneMask(Info.DefinedLanes) << '\n';
+ }
+ dbgs() << "\n";
+ );
+
+ bool Again = false;
+ // Mark operands as dead/unused.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ unsigned RegIdx = TargetRegisterInfo::virtReg2Index(Reg);
+ const VRegInfo &RegInfo = VRegInfos[RegIdx];
+ if (MO.isDef() && !MO.isDead() && RegInfo.UsedLanes == 0) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as dead in " << MI);
+ MO.setIsDead();
+ }
+ if (MO.readsReg()) {
+ bool CrossCopy = false;
+ if (isUndefRegAtInput(MO, RegInfo)) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
+ << MI);
+ MO.setIsUndef();
+ } else if (isUndefInput(MO, &CrossCopy)) {
+ DEBUG(dbgs() << "Marking operand '" << MO << "' as undef in "
+ << MI);
+ MO.setIsUndef();
+ if (CrossCopy)
+ Again = true;
+ }
+ }
+ }
+ }
+ }
+
+ return Again;
+}
+
+bool DetectDeadLanes::runOnMachineFunction(MachineFunction &MF) {
+ // Don't bother if we won't track subregister liveness later. This pass is
+ // required for correctness if subregister liveness is enabled because the
+ // register coalescer cannot deal with hidden dead defs. However without
+ // subregister liveness enabled, the expected benefits of this pass are small
+  // so we save compile time.
+ if (!MF.getSubtarget().enableSubRegLiveness()) {
+ DEBUG(dbgs() << "Skipping Detect dead lanes pass\n");
+ return false;
+ }
+
+ MRI = &MF.getRegInfo();
+ TRI = MRI->getTargetRegisterInfo();
+
+ unsigned NumVirtRegs = MRI->getNumVirtRegs();
+ VRegInfos = new VRegInfo[NumVirtRegs];
+ WorklistMembers.resize(NumVirtRegs);
+ DefinedByCopy.resize(NumVirtRegs);
+
+ bool Again;
+ do {
+ Again = runOnce(MF);
+ } while(Again);
+
+ DefinedByCopy.clear();
+ WorklistMembers.clear();
+ delete[] VRegInfos;
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
new file mode 100644
index 000000000000..eae78a950d9a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -0,0 +1,264 @@
+//===-- DwarfEHPrepare - Prepare exception handling for code generation ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass mulches exception handling code into a form adapted to code
+// generation. Required if using dwarf exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dwarfehprepare"
+
+STATISTIC(NumResumesLowered, "Number of resume calls lowered");
+
+namespace {
+ class DwarfEHPrepare : public FunctionPass {
+ const TargetMachine *TM;
+
+ // RewindFunction - _Unwind_Resume or the target equivalent.
+ Constant *RewindFunction;
+
+ DominatorTree *DT;
+ const TargetLowering *TLI;
+
+ bool InsertUnwindResumeCalls(Function &Fn);
+ Value *GetExceptionObject(ResumeInst *RI);
+ size_t
+ pruneUnreachableResumes(Function &Fn,
+ SmallVectorImpl<ResumeInst *> &Resumes,
+ SmallVectorImpl<LandingPadInst *> &CleanupLPads);
+
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ // INITIALIZE_TM_PASS requires a default constructor, but it isn't used in
+ // practice.
+ DwarfEHPrepare()
+ : FunctionPass(ID), TM(nullptr), RewindFunction(nullptr), DT(nullptr),
+ TLI(nullptr) {}
+
+ DwarfEHPrepare(const TargetMachine *TM)
+ : FunctionPass(ID), TM(TM), RewindFunction(nullptr), DT(nullptr),
+ TLI(nullptr) {}
+
+ bool runOnFunction(Function &Fn) override;
+
+ bool doFinalization(Module &M) override {
+ RewindFunction = nullptr;
+ return false;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ const char *getPassName() const override {
+ return "Exception handling preparation";
+ }
+ };
+} // end anonymous namespace
+
+char DwarfEHPrepare::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(DwarfEHPrepare, "dwarfehprepare",
+ "Prepare DWARF exceptions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_TM_PASS_END(DwarfEHPrepare, "dwarfehprepare",
+ "Prepare DWARF exceptions", false, false)
+
+FunctionPass *llvm::createDwarfEHPass(const TargetMachine *TM) {
+ return new DwarfEHPrepare(TM);
+}
+
+void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+}
+
+/// GetExceptionObject - Return the exception object from the value passed into
+/// the 'resume' instruction (typically an aggregate). Clean up any dead
+/// instructions, including the 'resume' instruction.
+Value *DwarfEHPrepare::GetExceptionObject(ResumeInst *RI) {
+ Value *V = RI->getOperand(0);
+ Value *ExnObj = nullptr;
+ InsertValueInst *SelIVI = dyn_cast<InsertValueInst>(V);
+ LoadInst *SelLoad = nullptr;
+ InsertValueInst *ExcIVI = nullptr;
+ bool EraseIVIs = false;
+
+ if (SelIVI) {
+ if (SelIVI->getNumIndices() == 1 && *SelIVI->idx_begin() == 1) {
+ ExcIVI = dyn_cast<InsertValueInst>(SelIVI->getOperand(0));
+ if (ExcIVI && isa<UndefValue>(ExcIVI->getOperand(0)) &&
+ ExcIVI->getNumIndices() == 1 && *ExcIVI->idx_begin() == 0) {
+ ExnObj = ExcIVI->getOperand(1);
+ SelLoad = dyn_cast<LoadInst>(SelIVI->getOperand(1));
+ EraseIVIs = true;
+ }
+ }
+ }
+
+ if (!ExnObj)
+ ExnObj = ExtractValueInst::Create(RI->getOperand(0), 0, "exn.obj", RI);
+
+ RI->eraseFromParent();
+
+ if (EraseIVIs) {
+ if (SelIVI->use_empty())
+ SelIVI->eraseFromParent();
+ if (ExcIVI->use_empty())
+ ExcIVI->eraseFromParent();
+ if (SelLoad && SelLoad->use_empty())
+ SelLoad->eraseFromParent();
+ }
+
+ return ExnObj;
+}
+
+/// Replace resumes that are not reachable from a cleanup landing pad with
+/// unreachable and then simplify those blocks.
+size_t DwarfEHPrepare::pruneUnreachableResumes(
+ Function &Fn, SmallVectorImpl<ResumeInst *> &Resumes,
+ SmallVectorImpl<LandingPadInst *> &CleanupLPads) {
+ BitVector ResumeReachable(Resumes.size());
+ size_t ResumeIndex = 0;
+ for (auto *RI : Resumes) {
+ for (auto *LP : CleanupLPads) {
+ if (isPotentiallyReachable(LP, RI, DT)) {
+ ResumeReachable.set(ResumeIndex);
+ break;
+ }
+ }
+ ++ResumeIndex;
+ }
+
+ // If everything is reachable, there is no change.
+ if (ResumeReachable.all())
+ return Resumes.size();
+
+ const TargetTransformInfo &TTI =
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(Fn);
+ LLVMContext &Ctx = Fn.getContext();
+
+ // Otherwise, insert unreachable instructions and call simplifycfg.
+ size_t ResumesLeft = 0;
+ for (size_t I = 0, E = Resumes.size(); I < E; ++I) {
+ ResumeInst *RI = Resumes[I];
+ if (ResumeReachable[I]) {
+ Resumes[ResumesLeft++] = RI;
+ } else {
+ BasicBlock *BB = RI->getParent();
+ new UnreachableInst(Ctx, RI);
+ RI->eraseFromParent();
+ SimplifyCFG(BB, TTI, 1);
+ }
+ }
+ Resumes.resize(ResumesLeft);
+ return ResumesLeft;
+}
+
+/// InsertUnwindResumeCalls - Convert the ResumeInsts that are still present
+/// into calls to the appropriate _Unwind_Resume function.
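+///
+/// Editorial sketch (not part of the original source): conceptually, each
+/// surviving
+///
+///   resume { i8*, i32 } %lpad.val
+///
+/// is rewritten into
+///
+///   %exn.obj = extractvalue { i8*, i32 } %lpad.val, 0
+///   call void @_Unwind_Resume(i8* %exn.obj)
+///   unreachable
+///
+/// with the call placed in a shared block fed by a phi when more than one
+/// resume remains.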
+bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
+ SmallVector<ResumeInst*, 16> Resumes;
+ SmallVector<LandingPadInst*, 16> CleanupLPads;
+ for (BasicBlock &BB : Fn) {
+ if (auto *RI = dyn_cast<ResumeInst>(BB.getTerminator()))
+ Resumes.push_back(RI);
+ if (auto *LP = BB.getLandingPadInst())
+ if (LP->isCleanup())
+ CleanupLPads.push_back(LP);
+ }
+
+ if (Resumes.empty())
+ return false;
+
+  // Check the personality; don't do anything if it's funclet-based.
+ EHPersonality Pers = classifyEHPersonality(Fn.getPersonalityFn());
+ if (isFuncletEHPersonality(Pers))
+ return false;
+
+ LLVMContext &Ctx = Fn.getContext();
+
+ size_t ResumesLeft = pruneUnreachableResumes(Fn, Resumes, CleanupLPads);
+ if (ResumesLeft == 0)
+ return true; // We pruned them all.
+
+ // Find the rewind function if we didn't already.
+ if (!RewindFunction) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx),
+ Type::getInt8PtrTy(Ctx), false);
+ const char *RewindName = TLI->getLibcallName(RTLIB::UNWIND_RESUME);
+ RewindFunction = Fn.getParent()->getOrInsertFunction(RewindName, FTy);
+ }
+
+ // Create the basic block where the _Unwind_Resume call will live.
+ if (ResumesLeft == 1) {
+ // Instead of creating a new BB and PHI node, just append the call to
+ // _Unwind_Resume to the end of the single resume block.
+ ResumeInst *RI = Resumes.front();
+ BasicBlock *UnwindBB = RI->getParent();
+ Value *ExnObj = GetExceptionObject(RI);
+
+ // Call the _Unwind_Resume function.
+ CallInst *CI = CallInst::Create(RewindFunction, ExnObj, "", UnwindBB);
+ CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+
+ // We never expect _Unwind_Resume to return.
+ new UnreachableInst(Ctx, UnwindBB);
+ return true;
+ }
+
+ BasicBlock *UnwindBB = BasicBlock::Create(Ctx, "unwind_resume", &Fn);
+ PHINode *PN = PHINode::Create(Type::getInt8PtrTy(Ctx), ResumesLeft,
+ "exn.obj", UnwindBB);
+
+ // Extract the exception object from the ResumeInst and add it to the PHI node
+ // that feeds the _Unwind_Resume call.
+ for (ResumeInst *RI : Resumes) {
+ BasicBlock *Parent = RI->getParent();
+ BranchInst::Create(UnwindBB, Parent);
+
+ Value *ExnObj = GetExceptionObject(RI);
+ PN->addIncoming(ExnObj, Parent);
+
+ ++NumResumesLowered;
+ }
+
+ // Call the function.
+ CallInst *CI = CallInst::Create(RewindFunction, PN, "", UnwindBB);
+ CI->setCallingConv(TLI->getLibcallCallingConv(RTLIB::UNWIND_RESUME));
+
+ // We never expect _Unwind_Resume to return.
+ new UnreachableInst(Ctx, UnwindBB);
+ return true;
+}
+
+bool DwarfEHPrepare::runOnFunction(Function &Fn) {
+ assert(TM && "DWARF EH preparation requires a target machine");
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ bool Changed = InsertUnwindResumeCalls(Fn);
+ DT = nullptr;
+ TLI = nullptr;
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
new file mode 100644
index 000000000000..8c96124451f3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -0,0 +1,817 @@
+//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Early if-conversion is for out-of-order CPUs that don't have a lot of
+// predicable instructions. The goal is to eliminate conditional branches that
+// may mispredict.
+//
+// Instructions from both sides of the branch are executed speculatively, and a
+// cmov instruction selects the result.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "early-ifcvt"
+
+// Absolute maximum number of instructions allowed per speculated block.
+// This bypasses all other heuristics, so it should be set fairly high.
+static cl::opt<unsigned>
+BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden,
+ cl::desc("Maximum number of instructions per speculated block."));
+
+// Stress testing mode - disable heuristics.
+static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden,
+ cl::desc("Turn all knobs to 11"));
+
+STATISTIC(NumDiamondsSeen, "Number of diamonds");
+STATISTIC(NumDiamondsConv, "Number of diamonds converted");
+STATISTIC(NumTrianglesSeen, "Number of triangles");
+STATISTIC(NumTrianglesConv, "Number of triangles converted");
+
+//===----------------------------------------------------------------------===//
+// SSAIfConv
+//===----------------------------------------------------------------------===//
+//
+// The SSAIfConv class performs if-conversion on SSA form machine code after
+// determining if it is possible. The class contains no heuristics; external
+// code should be used to determine when if-conversion is a good idea.
+//
+// SSAIfConv can convert both triangles and diamonds:
+//
+//   Triangle: Head              Diamond: Head
+//                |  \                       /  \_
+//                |   \                     /    |
+//                |  [TF]BB              FBB    TBB
+//                |   /                     \    /
+//                |  /                       \  /
+//              Tail                         Tail
+//
+// Instructions in the conditional blocks TBB and/or FBB are spliced into the
+// Head block, and phis in the Tail block are converted to select instructions.
+//
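+// Editorial sketch (not part of the original source): on a triangle, a tail
+// phi such as
+//
+//   %vreg2<def> = PHI %vreg0, <BB#Head>, %vreg1, <BB#TBB>
+//
+// becomes, once TBB's instructions have been hoisted into Head, a target
+// select inserted before Head's terminator via TargetInstrInfo::insertSelect,
+// choosing %vreg1 when the branch condition holds and %vreg0 otherwise.
+//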
+namespace {
+class SSAIfConv {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+public:
+ /// The block containing the conditional branch.
+ MachineBasicBlock *Head;
+
+ /// The block containing phis after the if-then-else.
+ MachineBasicBlock *Tail;
+
+ /// The 'true' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *TBB;
+
+ /// The 'false' conditional block as determined by AnalyzeBranch.
+ MachineBasicBlock *FBB;
+
+ /// isTriangle - When there is no 'else' block, either TBB or FBB will be
+ /// equal to Tail.
+ bool isTriangle() const { return TBB == Tail || FBB == Tail; }
+
+ /// Returns the Tail predecessor for the True side.
+ MachineBasicBlock *getTPred() const { return TBB == Tail ? Head : TBB; }
+
+ /// Returns the Tail predecessor for the False side.
+ MachineBasicBlock *getFPred() const { return FBB == Tail ? Head : FBB; }
+
+ /// Information about each phi in the Tail block.
+ struct PHIInfo {
+ MachineInstr *PHI;
+ unsigned TReg, FReg;
+ // Latencies from Cond+Branch, TReg, and FReg to DstReg.
+ int CondCycles, TCycles, FCycles;
+
+ PHIInfo(MachineInstr *phi)
+ : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {}
+ };
+
+ SmallVector<PHIInfo, 8> PHIs;
+
+private:
+ /// The branch condition determined by AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond;
+
+ /// Instructions in Head that define values used by the conditional blocks.
+ /// The hoisted instructions must be inserted after these instructions.
+ SmallPtrSet<MachineInstr*, 8> InsertAfter;
+
+ /// Register units clobbered by the conditional blocks.
+ BitVector ClobberedRegUnits;
+
+ // Scratch pad for findInsertionPoint.
+ SparseSet<unsigned> LiveRegUnits;
+
+  /// Insertion point in Head for speculatively executed instructions from TBB
+ /// and FBB.
+ MachineBasicBlock::iterator InsertionPoint;
+
+ /// Return true if all non-terminator instructions in MBB can be safely
+ /// speculated.
+ bool canSpeculateInstrs(MachineBasicBlock *MBB);
+
+ /// Find a valid insertion point in Head.
+ bool findInsertionPoint();
+
+ /// Replace PHI instructions in Tail with selects.
+ void replacePHIInstrs();
+
+ /// Insert selects and rewrite PHI operands to use them.
+ void rewritePHIOperands();
+
+public:
+ /// runOnMachineFunction - Initialize per-function data structures.
+ void runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ LiveRegUnits.clear();
+ LiveRegUnits.setUniverse(TRI->getNumRegUnits());
+ ClobberedRegUnits.clear();
+ ClobberedRegUnits.resize(TRI->getNumRegUnits());
+ }
+
+ /// canConvertIf - If the sub-CFG headed by MBB can be if-converted,
+ /// initialize the internal state, and return true.
+ bool canConvertIf(MachineBasicBlock *MBB);
+
+ /// convertIf - If-convert the last block passed to canConvertIf(), assuming
+ /// it is possible. Add any erased blocks to RemovedBlocks.
+ void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks);
+};
+} // end anonymous namespace
+
+
+/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely
+/// be speculated. The terminators are not considered.
+///
+/// If instructions use any values that are defined in the head basic block,
+/// the defining instructions are added to InsertAfter.
+///
+/// Any clobbered regunits are added to ClobberedRegUnits.
+///
+bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) {
+ // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to
+ // get right.
+ if (!MBB->livein_empty()) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n");
+ return false;
+ }
+
+ unsigned InstrCount = 0;
+
+ // Check all instructions, except the terminators. It is assumed that
+ // terminators never have side effects or define any used register values.
+ for (MachineBasicBlock::iterator I = MBB->begin(),
+ E = MBB->getFirstTerminator(); I != E; ++I) {
+ if (I->isDebugValue())
+ continue;
+
+ if (++InstrCount > BlockInstrLimit && !Stress) {
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than "
+ << BlockInstrLimit << " instructions.\n");
+ return false;
+ }
+
+ // There shouldn't normally be any phis in a single-predecessor block.
+ if (I->isPHI()) {
+ DEBUG(dbgs() << "Can't hoist: " << *I);
+ return false;
+ }
+
+ // Don't speculate loads. Note that it may be possible and desirable to
+ // speculate GOT or constant pool loads that are guaranteed not to trap,
+ // but we don't support that for now.
+ if (I->mayLoad()) {
+ DEBUG(dbgs() << "Won't speculate load: " << *I);
+ return false;
+ }
+
+ // We never speculate stores, so an AA pointer isn't necessary.
+ bool DontMoveAcrossStore = true;
+ if (!I->isSafeToMove(nullptr, DontMoveAcrossStore)) {
+ DEBUG(dbgs() << "Can't speculate: " << *I);
+ return false;
+ }
+
+ // Check for any dependencies on Head instructions.
+ for (const MachineOperand &MO : I->operands()) {
+ if (MO.isRegMask()) {
+ DEBUG(dbgs() << "Won't speculate regmask: " << *I);
+ return false;
+ }
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ // Remember clobbered regunits.
+ if (MO.isDef() && TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ ClobberedRegUnits.set(*Units);
+
+ if (!MO.readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI || DefMI->getParent() != Head)
+ continue;
+ if (InsertAfter.insert(DefMI).second)
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI);
+ if (DefMI->isTerminator()) {
+ DEBUG(dbgs() << "Can't insert instructions below terminator.\n");
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+
+/// Find an insertion point in Head for the speculated instructions. The
+/// insertion point must be:
+///
+/// 1. Before any terminators.
+/// 2. After any instructions in InsertAfter.
+/// 3. Not have any clobbered regunits live.
+///
+/// This function sets InsertionPoint and returns true when successful, it
+/// returns false if no valid insertion point could be found.
+///
+bool SSAIfConv::findInsertionPoint() {
+ // Keep track of live regunits before the current position.
+ // Only track RegUnits that are also in ClobberedRegUnits.
+ LiveRegUnits.clear();
+ SmallVector<unsigned, 8> Reads;
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ MachineBasicBlock::iterator I = Head->end();
+ MachineBasicBlock::iterator B = Head->begin();
+ while (I != B) {
+ --I;
+    // Some of the conditional code depends on I.
+ if (InsertAfter.count(&*I)) {
+ DEBUG(dbgs() << "Can't insert code after " << *I);
+ return false;
+ }
+
+ // Update live regunits.
+ for (const MachineOperand &MO : I->operands()) {
+ // We're ignoring regmask operands. That is conservatively correct.
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // I clobbers Reg, so it isn't live before I.
+ if (MO.isDef())
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ LiveRegUnits.erase(*Units);
+ // Unless I reads Reg.
+ if (MO.readsReg())
+ Reads.push_back(Reg);
+ }
+ // Anything read by I is live before I.
+ while (!Reads.empty())
+ for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid();
+ ++Units)
+ if (ClobberedRegUnits.test(*Units))
+ LiveRegUnits.insert(*Units);
+
+ // We can't insert before a terminator.
+ if (I != FirstTerm && I->isTerminator())
+ continue;
+
+ // Some of the clobbered registers are live before I, not a valid insertion
+ // point.
+ if (!LiveRegUnits.empty()) {
+ DEBUG({
+ dbgs() << "Would clobber";
+ for (SparseSet<unsigned>::const_iterator
+ i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i)
+ dbgs() << ' ' << PrintRegUnit(*i, TRI);
+ dbgs() << " live before " << *I;
+ });
+ continue;
+ }
+
+ // This is a valid insertion point.
+ InsertionPoint = I;
+ DEBUG(dbgs() << "Can insert before " << *I);
+ return true;
+ }
+ DEBUG(dbgs() << "No legal insertion point found.\n");
+ return false;
+}
+
+
+
+/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is
+/// a potential candidate for if-conversion. Fill out the internal state.
+///
+bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) {
+ Head = MBB;
+ TBB = FBB = Tail = nullptr;
+
+ if (Head->succ_size() != 2)
+ return false;
+ MachineBasicBlock *Succ0 = Head->succ_begin()[0];
+ MachineBasicBlock *Succ1 = Head->succ_begin()[1];
+
+ // Canonicalize so Succ0 has MBB as its single predecessor.
+ if (Succ0->pred_size() != 1)
+ std::swap(Succ0, Succ1);
+
+ if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1)
+ return false;
+
+ Tail = Succ0->succ_begin()[0];
+
+ // This is not a triangle.
+ if (Tail != Succ1) {
+ // Check for a diamond. We won't deal with any critical edges.
+ if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 ||
+ Succ1->succ_begin()[0] != Tail)
+ return false;
+ DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber()
+ << " -> BB#" << Succ0->getNumber()
+ << "/BB#" << Succ1->getNumber()
+ << " -> BB#" << Tail->getNumber() << '\n');
+
+ // Live-in physregs are tricky to get right when speculating code.
+ if (!Tail->livein_empty()) {
+ DEBUG(dbgs() << "Tail has live-ins.\n");
+ return false;
+ }
+ } else {
+ DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber()
+ << " -> BB#" << Succ0->getNumber()
+ << " -> BB#" << Tail->getNumber() << '\n');
+ }
+
+ // This is a triangle or a diamond.
+ // If Tail doesn't have any phis, there must be side effects.
+ if (Tail->empty() || !Tail->front().isPHI()) {
+ DEBUG(dbgs() << "No phis in tail.\n");
+ return false;
+ }
+
+ // The branch we're looking to eliminate must be analyzable.
+ Cond.clear();
+ if (TII->analyzeBranch(*Head, TBB, FBB, Cond)) {
+ DEBUG(dbgs() << "Branch not analyzable.\n");
+ return false;
+ }
+
+ // This is weird, probably some sort of degenerate CFG.
+ if (!TBB) {
+ DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
+ return false;
+ }
+
+ // AnalyzeBranch doesn't set FBB on a fall-through branch.
+ // Make sure it is always set.
+ FBB = TBB == Succ0 ? Succ1 : Succ0;
+
+ // Any phis in the tail block must be convertible to selects.
+ PHIs.clear();
+ MachineBasicBlock *TPred = getTPred();
+ MachineBasicBlock *FPred = getFPred();
+ for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end();
+ I != E && I->isPHI(); ++I) {
+ PHIs.push_back(&*I);
+ PHIInfo &PI = PHIs.back();
+ // Find PHI operands corresponding to TPred and FPred.
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) {
+ if (PI.PHI->getOperand(i+1).getMBB() == TPred)
+ PI.TReg = PI.PHI->getOperand(i).getReg();
+ if (PI.PHI->getOperand(i+1).getMBB() == FPred)
+ PI.FReg = PI.PHI->getOperand(i).getReg();
+ }
+ assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI");
+ assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI");
+
+ // Get target information.
+ if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
+ PI.CondCycles, PI.TCycles, PI.FCycles)) {
+ DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
+ return false;
+ }
+ }
+
+ // Check that the conditional instructions can be speculated.
+ InsertAfter.clear();
+ ClobberedRegUnits.reset();
+ if (TBB != Tail && !canSpeculateInstrs(TBB))
+ return false;
+ if (FBB != Tail && !canSpeculateInstrs(FBB))
+ return false;
+
+ // Try to find a valid insertion point for the speculated instructions in the
+ // head basic block.
+ if (!findInsertionPoint())
+ return false;
+
+ if (isTriangle())
+ ++NumTrianglesSeen;
+ else
+ ++NumDiamondsSeen;
+ return true;
+}
+
+/// replacePHIInstrs - Completely replace PHI instructions with selects.
+/// This is possible when the only Tail predecessors are the if-converted
+/// blocks.
+void SSAIfConv::replacePHIInstrs() {
+ assert(Tail->pred_size() == 2 && "Cannot replace PHIs");
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ unsigned DstReg = PI.PHI->getOperand(0).getReg();
+ TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg);
+ DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ PI.PHI->eraseFromParent();
+ PI.PHI = nullptr;
+ }
+}
+
+/// rewritePHIOperands - When there are additional Tail predecessors, insert
+/// select instructions in Head and rewrite PHI operands to use the selects.
+/// Keep the PHI instructions in Tail to handle the other predecessors.
+void SSAIfConv::rewritePHIOperands() {
+ MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator();
+ assert(FirstTerm != Head->end() && "No terminators");
+ DebugLoc HeadDL = FirstTerm->getDebugLoc();
+
+ // Convert all PHIs to select instructions inserted before FirstTerm.
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
+ PHIInfo &PI = PHIs[i];
+ unsigned DstReg = 0;
+
+ DEBUG(dbgs() << "If-converting " << *PI.PHI);
+ if (PI.TReg == PI.FReg) {
+ // We do not need the select instruction if both incoming values are
+ // equal.
+ DstReg = PI.TReg;
+ } else {
+ unsigned PHIDst = PI.PHI->getOperand(0).getReg();
+ DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst));
+ TII->insertSelect(*Head, FirstTerm, HeadDL,
+ DstReg, Cond, PI.TReg, PI.FReg);
+ DEBUG(dbgs() << " --> " << *std::prev(FirstTerm));
+ }
+
+ // Rewrite PHI operands TPred -> (DstReg, Head), remove FPred.
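+    // Walk the operand list from the end so removing a (Reg, MBB) pair does
+    // not shift the pairs still to be visited.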
+ for (unsigned i = PI.PHI->getNumOperands(); i != 1; i -= 2) {
+ MachineBasicBlock *MBB = PI.PHI->getOperand(i-1).getMBB();
+ if (MBB == getTPred()) {
+ PI.PHI->getOperand(i-1).setMBB(Head);
+ PI.PHI->getOperand(i-2).setReg(DstReg);
+ } else if (MBB == getFPred()) {
+ PI.PHI->RemoveOperand(i-1);
+ PI.PHI->RemoveOperand(i-2);
+ }
+ }
+ DEBUG(dbgs() << " --> " << *PI.PHI);
+ }
+}
+
+/// convertIf - Execute the if conversion after canConvertIf has determined the
+/// feasibility.
+///
+/// Any basic blocks erased will be added to RemovedBlocks.
+///
+void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) {
+ assert(Head && Tail && TBB && FBB && "Call canConvertIf first.");
+
+ // Update statistics.
+ if (isTriangle())
+ ++NumTrianglesConv;
+ else
+ ++NumDiamondsConv;
+
+ // Move all instructions into Head, except for the terminators.
+ if (TBB != Tail)
+ Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator());
+ if (FBB != Tail)
+ Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator());
+
+ // Are there extra Tail predecessors?
+ bool ExtraPreds = Tail->pred_size() != 2;
+ if (ExtraPreds)
+ rewritePHIOperands();
+ else
+ replacePHIInstrs();
+
+ // Fix up the CFG, temporarily leave Head without any successors.
+ Head->removeSuccessor(TBB);
+ Head->removeSuccessor(FBB, true);
+ if (TBB != Tail)
+ TBB->removeSuccessor(Tail, true);
+ if (FBB != Tail)
+ FBB->removeSuccessor(Tail, true);
+
+ // Fix up Head's terminators.
+ // It should become a single branch or a fallthrough.
+ DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc();
+ TII->RemoveBranch(*Head);
+
+ // Erase the now empty conditional blocks. It is likely that Head can fall
+ // through to Tail, and we can join the two blocks.
+ if (TBB != Tail) {
+ RemovedBlocks.push_back(TBB);
+ TBB->eraseFromParent();
+ }
+ if (FBB != Tail) {
+ RemovedBlocks.push_back(FBB);
+ FBB->eraseFromParent();
+ }
+
+ assert(Head->succ_empty() && "Additional head successors?");
+ if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) {
+ // Splice Tail onto the end of Head.
+ DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber()
+ << " into head BB#" << Head->getNumber() << '\n');
+ Head->splice(Head->end(), Tail,
+ Tail->begin(), Tail->end());
+ Head->transferSuccessorsAndUpdatePHIs(Tail);
+ RemovedBlocks.push_back(Tail);
+ Tail->eraseFromParent();
+ } else {
+ // We need a branch to Tail, let code placement work it out later.
+ DEBUG(dbgs() << "Converting to unconditional branch.\n");
+ SmallVector<MachineOperand, 0> EmptyCond;
+ TII->InsertBranch(*Head, Tail, nullptr, EmptyCond, HeadDL);
+ Head->addSuccessor(Tail);
+ }
+ DEBUG(dbgs() << *Head);
+}
+
+
+//===----------------------------------------------------------------------===//
+// EarlyIfConverter Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+class EarlyIfConverter : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MCSchedModel SchedModel;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+ SSAIfConv IfConv;
+
+public:
+ static char ID;
+ EarlyIfConverter() : MachineFunctionPass(ID) {}
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override { return "Early If-Conversion"; }
+
+private:
+ bool tryConvertIf(MachineBasicBlock*);
+ void updateDomTree(ArrayRef<MachineBasicBlock*> Removed);
+ void updateLoops(ArrayRef<MachineBasicBlock*> Removed);
+ void invalidateTraces();
+ bool shouldConvertIf();
+};
+} // end anonymous namespace
+
+char EarlyIfConverter::ID = 0;
+char &llvm::EarlyIfConverterID = EarlyIfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(EarlyIfConverter,
+ "early-ifcvt", "Early If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(EarlyIfConverter,
+ "early-ifcvt", "Early If Converter", false, false)
+
+void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Update the dominator tree after if-conversion erased some blocks.
+void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) {
+  // convertIf can remove TBB and FBB, and Tail can be merged into Head.
+ // TBB and FBB should not dominate any blocks.
+ // Tail children should be transferred to Head.
+ MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head);
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i) {
+ MachineDomTreeNode *Node = DomTree->getNode(Removed[i]);
+ assert(Node != HeadNode && "Cannot erase the head node");
+ while (Node->getNumChildren()) {
+ assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
+ DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ }
+ DomTree->eraseNode(Removed[i]);
+ }
+}
+
+/// Update LoopInfo after if-conversion.
+void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) {
+ if (!Loops)
+ return;
+ // If-conversion doesn't change loop structure, and it doesn't mess with back
+ // edges, so updating LoopInfo is simply removing the dead blocks.
+ for (unsigned i = 0, e = Removed.size(); i != e; ++i)
+ Loops->removeBlock(Removed[i]);
+}
+
+/// Invalidate MachineTraceMetrics before if-conversion.
+void EarlyIfConverter::invalidateTraces() {
+ Traces->verifyAnalysis();
+ Traces->invalidate(IfConv.Head);
+ Traces->invalidate(IfConv.Tail);
+ Traces->invalidate(IfConv.TBB);
+ Traces->invalidate(IfConv.FBB);
+ Traces->verifyAnalysis();
+}
+
+// Adjust cycles with downward saturation.
+static unsigned adjCycles(unsigned Cyc, int Delta) {
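+  // A negative Delta would wrap the unsigned addition; clamp the result at
+  // zero instead of underflowing.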
+ if (Delta < 0 && Cyc + Delta > Cyc)
+ return 0;
+ return Cyc + Delta;
+}
+
+/// Apply cost model and heuristics to the if-conversion in IfConv.
+/// Return true if the conversion is a good idea.
+///
+bool EarlyIfConverter::shouldConvertIf() {
+ // Stress testing mode disables all cost considerations.
+ if (Stress)
+ return true;
+
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+ MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred());
+ MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred());
+ DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace);
+ unsigned MinCrit = std::min(TBBTrace.getCriticalPath(),
+ FBBTrace.getCriticalPath());
+
+ // Set a somewhat arbitrary limit on the critical path extension we accept.
+ unsigned CritLimit = SchedModel.MispredictPenalty/2;
+
+ // If-conversion only makes sense when there is unexploited ILP. Compute the
+ // maximum-ILP resource length of the trace after if-conversion. Compare it
+ // to the shortest critical path.
+ SmallVector<const MachineBasicBlock*, 1> ExtraBlocks;
+ if (IfConv.TBB != IfConv.Tail)
+ ExtraBlocks.push_back(IfConv.TBB);
+ unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks);
+ DEBUG(dbgs() << "Resource length " << ResLength
+ << ", minimal critical path " << MinCrit << '\n');
+ if (ResLength > MinCrit + CritLimit) {
+ DEBUG(dbgs() << "Not enough available ILP.\n");
+ return false;
+ }
+
+ // Assume that the depth of the first head terminator will also be the depth
+ // of the select instruction inserted, as determined by the flag dependency.
+ // TBB / FBB data dependencies may delay the select even more.
+ MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head);
+ unsigned BranchDepth =
+ HeadTrace.getInstrCycles(*IfConv.Head->getFirstTerminator()).Depth;
+ DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n');
+
+ // Look at all the tail phis, and compute the critical path extension caused
+ // by inserting select instructions.
+ MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail);
+ for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) {
+ SSAIfConv::PHIInfo &PI = IfConv.PHIs[i];
+ unsigned Slack = TailTrace.getInstrSlack(*PI.PHI);
+ unsigned MaxDepth = Slack + TailTrace.getInstrCycles(*PI.PHI).Depth;
+ DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI);
+
+ // The condition is pulled into the critical path.
+ unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles);
+ if (CondDepth > MaxDepth) {
+ unsigned Extra = CondDepth - MaxDepth;
+ DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+
+ // The TBB value is pulled into the critical path.
+ unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(*PI.PHI), PI.TCycles);
+ if (TDepth > MaxDepth) {
+ unsigned Extra = TDepth - MaxDepth;
+ DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+
+ // The FBB value is pulled into the critical path.
+ unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(*PI.PHI), PI.FCycles);
+ if (FDepth > MaxDepth) {
+ unsigned Extra = FDepth - MaxDepth;
+ DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n");
+ if (Extra > CritLimit) {
+ DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n');
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+/// Attempt repeated if-conversion on MBB, return true if successful.
+///
+bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ while (IfConv.canConvertIf(MBB) && shouldConvertIf()) {
+ // If-convert MBB and update analyses.
+ invalidateTraces();
+ SmallVector<MachineBasicBlock*, 4> RemovedBlocks;
+ IfConv.convertIf(RemovedBlocks);
+ Changed = true;
+ updateDomTree(RemovedBlocks);
+ updateLoops(RemovedBlocks);
+ }
+ return Changed;
+}
+
+bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+  // Only run if-conversion if the target wants it.
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ if (!STI.enableEarlyIfConversion())
+ return false;
+
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ SchedModel = STI.getSchedModel();
+ MRI = &MF.getRegInfo();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+
+ bool Changed = false;
+ IfConv.runOnMachineFunction(MF);
+
+ // Visit blocks in dominator tree post-order. The post-order enables nested
+ // if-conversion in a single pass. The tryConvertIf() function may erase
+ // blocks, but only blocks dominated by the head block. This makes it safe to
+ // update the dominator tree while the post-order iterator is still active.
+ for (auto DomNode : post_order(DomTree))
+ if (tryConvertIf(DomNode->getBlock()))
+ Changed = true;
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/EdgeBundles.cpp b/contrib/llvm/lib/CodeGen/EdgeBundles.cpp
new file mode 100644
index 000000000000..aea7c31ba316
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -0,0 +1,97 @@
+//===-------- EdgeBundles.cpp - Bundles of CFG edges ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the implementation of the EdgeBundles analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/GraphWriter.h"
+
+using namespace llvm;
+
+static cl::opt<bool>
+ViewEdgeBundles("view-edge-bundles", cl::Hidden,
+ cl::desc("Pop up a window to show edge bundle graphs"));
+
+char EdgeBundles::ID = 0;
+
+INITIALIZE_PASS(EdgeBundles, "edge-bundles", "Bundle Machine CFG Edges",
+                /* cfg = */ true, /* analysis = */ true)
+
+char &llvm::EdgeBundlesID = EdgeBundles::ID;
+
+void EdgeBundles::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool EdgeBundles::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ EC.clear();
+ EC.grow(2 * MF->getNumBlockIDs());
+
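+  // Each block N owns two equivalence classes: 2*N for its ingoing edges and
+  // 2*N+1 for its outgoing edges. Joining them across CFG edges forms the
+  // bundles.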
+ for (const auto &MBB : *MF) {
+ unsigned OutE = 2 * MBB.getNumber() + 1;
+ // Join the outgoing bundle with the ingoing bundles of all successors.
+ for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end(); SI != SE; ++SI)
+ EC.join(OutE, 2 * (*SI)->getNumber());
+ }
+ EC.compress();
+ if (ViewEdgeBundles)
+ view();
+
+ // Compute the reverse mapping.
+ Blocks.clear();
+ Blocks.resize(getNumBundles());
+
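+  // Each block is listed under every bundle it touches, once per distinct
+  // bundle.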
+ for (unsigned i = 0, e = MF->getNumBlockIDs(); i != e; ++i) {
+ unsigned b0 = getBundle(i, 0);
+ unsigned b1 = getBundle(i, 1);
+ Blocks[b0].push_back(i);
+ if (b1 != b0)
+ Blocks[b1].push_back(i);
+ }
+
+ return false;
+}
+
+/// Specialize WriteGraph, the standard implementation won't work.
+namespace llvm {
+template<>
+raw_ostream &WriteGraph<>(raw_ostream &O, const EdgeBundles &G,
+ bool ShortNames,
+ const Twine &Title) {
+ const MachineFunction *MF = G.getMachineFunction();
+
+ O << "digraph {\n";
+ for (const auto &MBB : *MF) {
+ unsigned BB = MBB.getNumber();
+ O << "\t\"BB#" << BB << "\" [ shape=box ]\n"
+ << '\t' << G.getBundle(BB, false) << " -> \"BB#" << BB << "\"\n"
+ << "\t\"BB#" << BB << "\" -> " << G.getBundle(BB, true) << '\n';
+ for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(),
+ SE = MBB.succ_end(); SI != SE; ++SI)
+ O << "\t\"BB#" << BB << "\" -> \"BB#" << (*SI)->getNumber()
+ << "\" [ color=lightgray ]\n";
+ }
+ O << "}\n";
+ return O;
+}
+}
+
+/// view - Visualize the annotated bipartite CFG with Graphviz.
+void EdgeBundles::view() const {
+ ViewGraph(*this, "EdgeBundles");
+}
diff --git a/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
new file mode 100644
index 000000000000..566b8d507b2b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExecutionDepsFix.cpp
@@ -0,0 +1,811 @@
+//===- ExecutionDepsFix.cpp - Fix execution dependency issues ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the execution dependency fix pass.
+//
+// Some X86 SSE instructions like mov, and, or, xor are available in different
+// variants for different operand types. These variant instructions are
+// equivalent, but on Nehalem and newer cpus there is extra latency
+// transferring data between integer and floating point domains. ARM cores
+// have similar issues when they are configured with both VFP and NEON
+// pipelines.
+//
+// This pass changes the variant instructions to minimize domain crossings.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "execution-fix"
+
+/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
+/// of execution domains.
+///
+/// An open DomainValue represents a set of instructions that can still switch
+/// execution domain. Multiple registers may refer to the same open
+/// DomainValue - they will eventually be collapsed to the same execution
+/// domain.
+///
+/// A collapsed DomainValue represents a single register that has been forced
+/// into one or more execution domains. There is a separate collapsed
+/// DomainValue for each register, but it may contain multiple execution
+/// domains. A register value is initially created in a single execution
+/// domain, but if we were forced to pay the penalty of a domain crossing, we
+/// keep track of the fact that the register is now available in multiple
+/// domains.
+namespace {
+struct DomainValue {
+ // Basic reference counting.
+ unsigned Refs;
+
+ // Bitmask of available domains. For an open DomainValue, it is the still
+ // possible domains for collapsing. For a collapsed DomainValue it is the
+ // domains where the register is available for free.
+ unsigned AvailableDomains;
+
+ // Pointer to the next DomainValue in a chain. When two DomainValues are
+ // merged, Victim.Next is set to point to Victor, so old DomainValue
+ // references can be updated by following the chain.
+ DomainValue *Next;
+
+ // Twiddleable instructions using or defining these registers.
+ SmallVector<MachineInstr*, 8> Instrs;
+
+ // A collapsed DomainValue has no instructions to twiddle - it simply keeps
+ // track of the domains where the registers are already available.
+ bool isCollapsed() const { return Instrs.empty(); }
+
+ // Is domain available?
+ bool hasDomain(unsigned domain) const {
+ assert(domain <
+ static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+ "undefined behavior");
+ return AvailableDomains & (1u << domain);
+ }
+
+ // Mark domain as available.
+ void addDomain(unsigned domain) {
+ AvailableDomains |= 1u << domain;
+ }
+
+ // Restrict to a single domain available.
+ void setSingleDomain(unsigned domain) {
+ AvailableDomains = 1u << domain;
+ }
+
+ // Return bitmask of domains that are available and in mask.
+ unsigned getCommonDomains(unsigned mask) const {
+ return AvailableDomains & mask;
+ }
+
+ // First domain available.
+ unsigned getFirstDomain() const {
+ return countTrailingZeros(AvailableDomains);
+ }
+
+ DomainValue() : Refs(0) { clear(); }
+
+  // Clear this DomainValue; after a merge, Next is re-pointed at the value
+  // that now holds its data.
+ void clear() {
+ AvailableDomains = 0;
+ Next = nullptr;
+ Instrs.clear();
+ }
+};
+}
+
+namespace {
+/// Information about a live register.
+struct LiveReg {
+ /// Value currently in this register, or NULL when no value is being tracked.
+ /// This counts as a DomainValue reference.
+ DomainValue *Value;
+
+ /// Instruction that defined this register, relative to the beginning of the
+ /// current basic block. When a LiveReg is used to represent a live-out
+ /// register, this value is relative to the end of the basic block, so it
+ /// will be a negative number.
+ int Def;
+};
+} // anonymous namespace
+
+namespace {
+class ExeDepsFix : public MachineFunctionPass {
+ static char ID;
+ SpecificBumpPtrAllocator<DomainValue> Allocator;
+ SmallVector<DomainValue*,16> Avail;
+
+ const TargetRegisterClass *const RC;
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ std::vector<SmallVector<int, 1>> AliasMap;
+ const unsigned NumRegs;
+ LiveReg *LiveRegs;
+ typedef DenseMap<MachineBasicBlock*, LiveReg*> LiveOutMap;
+ LiveOutMap LiveOuts;
+
+ /// List of undefined register reads in this block in forward order.
+ std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+
+ /// Storage for register unit liveness.
+ LivePhysRegs LiveRegSet;
+
+ /// Current instruction number.
+ /// The first instruction in each basic block is 0.
+ int CurInstr;
+
+ /// True when the current block has a predecessor that hasn't been visited
+ /// yet.
+ bool SeenUnknownBackEdge;
+
+public:
+ ExeDepsFix(const TargetRegisterClass *rc)
+ : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ const char *getPassName() const override {
+ return "Execution dependency fix";
+ }
+
+private:
+ iterator_range<SmallVectorImpl<int>::const_iterator>
+ regIndices(unsigned Reg) const;
+
+ // DomainValue allocation.
+ DomainValue *alloc(int domain = -1);
+ DomainValue *retain(DomainValue *DV) {
+ if (DV) ++DV->Refs;
+ return DV;
+ }
+ void release(DomainValue*);
+ DomainValue *resolve(DomainValue*&);
+
+ // LiveRegs manipulations.
+ void setLiveReg(int rx, DomainValue *DV);
+ void kill(int rx);
+ void force(int rx, unsigned domain);
+ void collapse(DomainValue *dv, unsigned domain);
+ bool merge(DomainValue *A, DomainValue *B);
+
+ void enterBasicBlock(MachineBasicBlock*);
+ void leaveBasicBlock(MachineBasicBlock*);
+ void visitInstr(MachineInstr*);
+ void processDefs(MachineInstr*, bool Kill);
+ void visitSoftInstr(MachineInstr*, unsigned mask);
+ void visitHardInstr(MachineInstr*, unsigned domain);
+ bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
+ void processUndefReads(MachineBasicBlock*);
+};
+}
+
+char ExeDepsFix::ID = 0;
+
+/// Translate TRI register number to a list of indices into our smaller tables
+/// of interesting registers.
+iterator_range<SmallVectorImpl<int>::const_iterator>
+ExeDepsFix::regIndices(unsigned Reg) const {
+ assert(Reg < AliasMap.size() && "Invalid register");
+ const auto &Entry = AliasMap[Reg];
+ return make_range(Entry.begin(), Entry.end());
+}
+
+DomainValue *ExeDepsFix::alloc(int domain) {
+ DomainValue *dv = Avail.empty() ?
+ new(Allocator.Allocate()) DomainValue :
+ Avail.pop_back_val();
+ if (domain >= 0)
+ dv->addDomain(domain);
+ assert(dv->Refs == 0 && "Reference count wasn't cleared");
+ assert(!dv->Next && "Chained DomainValue shouldn't have been recycled");
+ return dv;
+}
+
+/// Release a reference to DV. When the last reference is released,
+/// collapse if needed.
+void ExeDepsFix::release(DomainValue *DV) {
+ while (DV) {
+ assert(DV->Refs && "Bad DomainValue");
+ if (--DV->Refs)
+ return;
+
+ // There are no more DV references. Collapse any contained instructions.
+ if (DV->AvailableDomains && !DV->isCollapsed())
+ collapse(DV, DV->getFirstDomain());
+
+ DomainValue *Next = DV->Next;
+ DV->clear();
+ Avail.push_back(DV);
+ // Also release the next DomainValue in the chain.
+ DV = Next;
+ }
+}
+
+/// Follow the chain of dead DomainValues until a live DomainValue is reached.
+/// Update the referenced pointer when necessary.
+DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
+ DomainValue *DV = DVRef;
+ if (!DV || !DV->Next)
+ return DV;
+
+ // DV has a chain. Find the end.
+ do DV = DV->Next;
+ while (DV->Next);
+
+ // Update DVRef to point to DV.
+ retain(DV);
+ release(DVRef);
+ DVRef = DV;
+ return DV;
+}
+
+/// Set LiveRegs[rx] = dv, updating reference counts.
+void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(LiveRegs && "Must enter basic block first.");
+
+ if (LiveRegs[rx].Value == dv)
+ return;
+ if (LiveRegs[rx].Value)
+ release(LiveRegs[rx].Value);
+ LiveRegs[rx].Value = retain(dv);
+}
+
+// Kill register rx, recycle or collapse any DomainValue.
+void ExeDepsFix::kill(int rx) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(LiveRegs && "Must enter basic block first.");
+ if (!LiveRegs[rx].Value)
+ return;
+
+ release(LiveRegs[rx].Value);
+ LiveRegs[rx].Value = nullptr;
+}
+
+/// Force register rx into domain.
+void ExeDepsFix::force(int rx, unsigned domain) {
+ assert(unsigned(rx) < NumRegs && "Invalid index");
+ assert(LiveRegs && "Must enter basic block first.");
+ if (DomainValue *dv = LiveRegs[rx].Value) {
+ if (dv->isCollapsed())
+ dv->addDomain(domain);
+ else if (dv->hasDomain(domain))
+ collapse(dv, domain);
+ else {
+ // This is an incompatible open DomainValue. Collapse it to whatever and
+ // force the new value into domain. This costs a domain crossing.
+ collapse(dv, dv->getFirstDomain());
+ assert(LiveRegs[rx].Value && "Not live after collapse?");
+ LiveRegs[rx].Value->addDomain(domain);
+ }
+ } else {
+ // Set up basic collapsed DomainValue.
+ setLiveReg(rx, alloc(domain));
+ }
+}
+
+/// Collapse open DomainValue into given domain. If there are multiple
+/// registers using dv, they each get a unique collapsed DomainValue.
+void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
+ assert(dv->hasDomain(domain) && "Cannot collapse");
+
+ // Collapse all the instructions.
+ while (!dv->Instrs.empty())
+ TII->setExecutionDomain(*dv->Instrs.pop_back_val(), domain);
+ dv->setSingleDomain(domain);
+
+ // If there are multiple users, give them new, unique DomainValues.
+ if (LiveRegs && dv->Refs > 1)
+ for (unsigned rx = 0; rx != NumRegs; ++rx)
+ if (LiveRegs[rx].Value == dv)
+ setLiveReg(rx, alloc(domain));
+}
+
+/// All instructions and registers in B are moved to A, and B is released.
+bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
+ assert(!A->isCollapsed() && "Cannot merge into collapsed");
+ assert(!B->isCollapsed() && "Cannot merge from collapsed");
+ if (A == B)
+ return true;
+ // Restrict to the domains that A and B have in common.
+ unsigned common = A->getCommonDomains(B->AvailableDomains);
+ if (!common)
+ return false;
+ A->AvailableDomains = common;
+ A->Instrs.append(B->Instrs.begin(), B->Instrs.end());
+
+ // Clear the old DomainValue so we won't try to swizzle instructions twice.
+ B->clear();
+  // Redirect all existing references to B to A via the Next chain.
+ B->Next = retain(A);
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ assert(LiveRegs && "no space allocated for live registers");
+ if (LiveRegs[rx].Value == B)
+ setLiveReg(rx, A);
+ }
+ return true;
+}
+
+/// Set up LiveRegs by merging predecessor live-out values.
+void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
+ // Detect back-edges from predecessors we haven't processed yet.
+ SeenUnknownBackEdge = false;
+
+ // Reset instruction counter in each basic block.
+ CurInstr = 0;
+
+ // Set up UndefReads to track undefined register reads.
+ UndefReads.clear();
+ LiveRegSet.clear();
+
+ // Set up LiveRegs to represent registers entering MBB.
+ if (!LiveRegs)
+ LiveRegs = new LiveReg[NumRegs];
+
+ // Default values are 'nothing happened a long time ago'.
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ LiveRegs[rx].Value = nullptr;
+ LiveRegs[rx].Def = -(1 << 20);
+ }
+
+ // This is the entry block.
+ if (MBB->pred_empty()) {
+ for (const auto &LI : MBB->liveins()) {
+ for (int rx : regIndices(LI.PhysReg)) {
+ // Treat function live-ins as if they were defined just before the first
+ // instruction. Usually, function arguments are set up immediately
+ // before the call.
+ LiveRegs[rx].Def = -1;
+ }
+ }
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << ": entry\n");
+ return;
+ }
+
+ // Try to coalesce live-out registers from predecessors.
+ for (MachineBasicBlock::const_pred_iterator pi = MBB->pred_begin(),
+ pe = MBB->pred_end(); pi != pe; ++pi) {
+ LiveOutMap::const_iterator fi = LiveOuts.find(*pi);
+ if (fi == LiveOuts.end()) {
+ SeenUnknownBackEdge = true;
+ continue;
+ }
+ assert(fi->second && "Can't have NULL entries");
+
+ for (unsigned rx = 0; rx != NumRegs; ++rx) {
+ // Use the most recent predecessor def for each register.
+ LiveRegs[rx].Def = std::max(LiveRegs[rx].Def, fi->second[rx].Def);
+
+ DomainValue *pdv = resolve(fi->second[rx].Value);
+ if (!pdv)
+ continue;
+ if (!LiveRegs[rx].Value) {
+ setLiveReg(rx, pdv);
+ continue;
+ }
+
+ // We have a live DomainValue from more than one predecessor.
+ if (LiveRegs[rx].Value->isCollapsed()) {
+ // We are already collapsed, but predecessor is not. Force it.
+ unsigned Domain = LiveRegs[rx].Value->getFirstDomain();
+ if (!pdv->isCollapsed() && pdv->hasDomain(Domain))
+ collapse(pdv, Domain);
+ continue;
+ }
+
+ // Currently open, merge in predecessor.
+ if (!pdv->isCollapsed())
+ merge(LiveRegs[rx].Value, pdv);
+ else
+ force(rx, pdv->getFirstDomain());
+ }
+ }
+ DEBUG(dbgs() << "BB#" << MBB->getNumber()
+ << (SeenUnknownBackEdge ? ": incomplete\n" : ": all preds known\n"));
+}
+
+void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
+ assert(LiveRegs && "Must enter basic block first.");
+ // Save live registers at end of MBB - used by enterBasicBlock().
+ // Also use LiveOuts as a visited set to detect back-edges.
+ bool First = LiveOuts.insert(std::make_pair(MBB, LiveRegs)).second;
+
+ if (First) {
+ // LiveRegs was inserted in LiveOuts. Adjust all defs to be relative to
+ // the end of this block instead of the beginning.
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ LiveRegs[i].Def -= CurInstr;
+ } else {
+ // Insertion failed, this must be the second pass.
+ // Release all the DomainValues instead of keeping them.
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ release(LiveRegs[i].Value);
+ delete[] LiveRegs;
+ }
+ LiveRegs = nullptr;
+}
+
+void ExeDepsFix::visitInstr(MachineInstr *MI) {
+ if (MI->isDebugValue())
+ return;
+
+ // Update instructions with explicit execution domains.
+ std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
+ if (DomP.first) {
+ if (DomP.second)
+ visitSoftInstr(MI, DomP.second);
+ else
+ visitHardInstr(MI, DomP.first);
+ }
+
+ // Process defs to track register ages, and kill values clobbered by generic
+ // instructions.
+ processDefs(MI, !DomP.first);
+}
+
+/// \brief Return true if it makes sense to break dependence on a partial def
+/// or undef use.
+bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+ unsigned Pref) {
+ unsigned reg = MI->getOperand(OpIdx).getReg();
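+  // Breaking only pays off if every alias index lacks the preferred
+  // clearance; one alias with enough clearance, or an unresolved back-edge
+  // def, keeps the dependence.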
+ for (int rx : regIndices(reg)) {
+ unsigned Clearance = CurInstr - LiveRegs[rx].Def;
+ DEBUG(dbgs() << "Clearance: " << Clearance << ", want " << Pref);
+
+ if (Pref > Clearance) {
+ DEBUG(dbgs() << ": Break dependency.\n");
+ continue;
+ }
+ // The current clearance seems OK, but we may be ignoring a def from a
+ // back-edge.
+ if (!SeenUnknownBackEdge || Pref <= unsigned(CurInstr)) {
+      DEBUG(dbgs() << ": OK.\n");
+ return false;
+ }
+ // A def from an unprocessed back-edge may make us break this dependency.
+ DEBUG(dbgs() << ": Wait for back-edge to resolve.\n");
+ return false;
+ }
+ return true;
+}
+
+// Update def-ages for registers defined by MI.
+// If Kill is set, also kill off DomainValues clobbered by the defs.
+//
+// Also break dependencies on partial defs and undef uses.
+void ExeDepsFix::processDefs(MachineInstr *MI, bool Kill) {
+ assert(!MI->isDebugValue() && "Won't process debug values");
+
+ // Break dependence on undef uses. Do this before updating LiveRegs below.
+ unsigned OpNum;
+ unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
+ if (Pref) {
+ if (shouldBreakDependence(MI, OpNum, Pref))
+ UndefReads.push_back(std::make_pair(MI, OpNum));
+ }
+ const MCInstrDesc &MCID = MI->getDesc();
+ for (unsigned i = 0,
+ e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
+ i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg())
+ continue;
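+    // Implicit operands follow the explicit ones, so the first implicit
+    // operand ends the scan.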
+ if (MO.isImplicit())
+ break;
+ if (MO.isUse())
+ continue;
+ for (int rx : regIndices(MO.getReg())) {
+ // This instruction explicitly defines rx.
+ DEBUG(dbgs() << TRI->getName(RC->getRegister(rx)) << ":\t" << CurInstr
+ << '\t' << *MI);
+
+ // Check clearance before partial register updates.
+ // Call breakDependence before setting LiveRegs[rx].Def.
+ unsigned Pref = TII->getPartialRegUpdateClearance(*MI, i, TRI);
+ if (Pref && shouldBreakDependence(MI, i, Pref))
+ TII->breakPartialRegDependency(*MI, i, TRI);
+
+ // How many instructions since rx was last written?
+ LiveRegs[rx].Def = CurInstr;
+
+ // Kill off domains redefined by generic instructions.
+ if (Kill)
+ kill(rx);
+ }
+ }
+ ++CurInstr;
+}
+
+/// \brief Break false dependencies on undefined register reads.
+///
+/// Walk the block backward computing precise liveness. This is expensive, so we
+/// only do it on demand. Note that the occurrence of undefined register reads
+/// that should be broken is very rare, but when they occur we may have many in
+/// a single block.
+void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+ if (UndefReads.empty())
+ return;
+
+ // Collect this block's live out register units.
+ LiveRegSet.init(TRI);
+ // We do not need to care about pristine registers as they are just preserved
+ // but not actually used in the function.
+ LiveRegSet.addLiveOutsNoPristines(*MBB);
+
+ MachineInstr *UndefMI = UndefReads.back().first;
+ unsigned OpIdx = UndefReads.back().second;
+
+ for (MachineInstr &I : make_range(MBB->rbegin(), MBB->rend())) {
+ // Update liveness, including the current instruction's defs.
+ LiveRegSet.stepBackward(I);
+
+ if (UndefMI == &I) {
+ if (!LiveRegSet.contains(UndefMI->getOperand(OpIdx).getReg()))
+ TII->breakPartialRegDependency(*UndefMI, OpIdx, TRI);
+
+ UndefReads.pop_back();
+ if (UndefReads.empty())
+ return;
+
+ UndefMI = UndefReads.back().first;
+ OpIdx = UndefReads.back().second;
+ }
+ }
+}
+
+// A hard instruction only works in one domain. All input registers will be
+// forced into that domain.
+void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+ // Collapse all uses.
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ for (int rx : regIndices(mo.getReg())) {
+ force(rx, domain);
+ }
+ }
+
+ // Kill all defs and force them.
+ for (unsigned i = 0, e = mi->getDesc().getNumDefs(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ for (int rx : regIndices(mo.getReg())) {
+ kill(rx);
+ force(rx, domain);
+ }
+ }
+}
+
+// A soft instruction can be changed to work in other domains given by mask.
+void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+ // Bitmask of available domains for this instruction after taking collapsed
+ // operands into account.
+ unsigned available = mask;
+
+ // Scan the explicit use operands for incoming domains.
+ SmallVector<int, 4> used;
+ if (LiveRegs)
+ for (unsigned i = mi->getDesc().getNumDefs(),
+ e = mi->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &mo = mi->getOperand(i);
+ if (!mo.isReg()) continue;
+ for (int rx : regIndices(mo.getReg())) {
+ DomainValue *dv = LiveRegs[rx].Value;
+ if (dv == nullptr)
+ continue;
+ // Bitmask of domains that dv and available have in common.
+ unsigned common = dv->getCommonDomains(available);
+ // Is it possible to use this collapsed register for free?
+ if (dv->isCollapsed()) {
+ // Restrict available domains to the ones in common with the operand.
+ // If there are no common domains, we must pay the cross-domain
+ // penalty for this operand.
+ if (common) available = common;
+ } else if (common)
+ // Open DomainValue is compatible, save it for merging.
+ used.push_back(rx);
+ else
+ // Open DomainValue is not compatible with instruction. It is useless
+ // now.
+ kill(rx);
+ }
+ }
+
+ // If the collapsed operands force a single domain, propagate the collapse.
+ if (isPowerOf2_32(available)) {
+ unsigned domain = countTrailingZeros(available);
+ TII->setExecutionDomain(*mi, domain);
+ visitHardInstr(mi, domain);
+ return;
+ }
+
+ // Kill off any remaining uses that don't match available, and build a list of
+ // incoming DomainValues that we want to merge.
+ SmallVector<LiveReg, 4> Regs;
+ for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
+ int rx = *i;
+ assert(LiveRegs && "no space allocated for live registers");
+ const LiveReg &LR = LiveRegs[rx];
+ // This useless DomainValue could have been missed above.
+ if (!LR.Value->getCommonDomains(available)) {
+ kill(rx);
+ continue;
+ }
+ // Sorted insertion.
+ bool Inserted = false;
+ for (SmallVectorImpl<LiveReg>::iterator i = Regs.begin(), e = Regs.end();
+ i != e && !Inserted; ++i) {
+ if (LR.Def < i->Def) {
+ Inserted = true;
+ Regs.insert(i, LR);
+ }
+ }
+ if (!Inserted)
+ Regs.push_back(LR);
+ }
+
+  // Regs is now sorted by def order, earliest first. Try to merge them all,
+  // giving priority to the latest ones.
+ DomainValue *dv = nullptr;
+ while (!Regs.empty()) {
+ if (!dv) {
+ dv = Regs.pop_back_val().Value;
+ // Force the first dv to match the current instruction.
+ dv->AvailableDomains = dv->getCommonDomains(available);
+ assert(dv->AvailableDomains && "Domain should have been filtered");
+ continue;
+ }
+
+ DomainValue *Latest = Regs.pop_back_val().Value;
+ // Skip already merged values.
+ if (Latest == dv || Latest->Next)
+ continue;
+ if (merge(dv, Latest))
+ continue;
+
+ // If latest didn't merge, it is useless now. Kill all registers using it.
+ for (int i : used) {
+ assert(LiveRegs && "no space allocated for live registers");
+ if (LiveRegs[i].Value == Latest)
+ kill(i);
+ }
+ }
+
+ // dv is the DomainValue we are going to use for this instruction.
+ if (!dv) {
+ dv = alloc();
+ dv->AvailableDomains = available;
+ }
+ dv->Instrs.push_back(mi);
+
+ // Finally set all defs and non-collapsed uses to dv. We must iterate through
+  // all the operands, including imp-def ones.
+ for (MachineInstr::mop_iterator ii = mi->operands_begin(),
+ ee = mi->operands_end();
+ ii != ee; ++ii) {
+ MachineOperand &mo = *ii;
+ if (!mo.isReg()) continue;
+ for (int rx : regIndices(mo.getReg())) {
+ if (!LiveRegs[rx].Value || (mo.isDef() && LiveRegs[rx].Value != dv)) {
+ kill(rx);
+ setLiveReg(rx, dv);
+ }
+ }
+ }
+}
+
+bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(*mf.getFunction()))
+ return false;
+ MF = &mf;
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ LiveRegs = nullptr;
+ assert(NumRegs == RC->getNumRegs() && "Bad regclass");
+
+ DEBUG(dbgs() << "********** FIX EXECUTION DEPENDENCIES: "
+ << TRI->getRegClassName(RC) << " **********\n");
+
+ // If no relevant registers are used in the function, we can skip it
+ // completely.
+ bool anyregs = false;
+ const MachineRegisterInfo &MRI = mf.getRegInfo();
+ for (unsigned Reg : *RC) {
+ if (MRI.isPhysRegUsed(Reg)) {
+ anyregs = true;
+ break;
+ }
+ }
+ if (!anyregs) return false;
+
+ // Initialize the AliasMap on the first use.
+ if (AliasMap.empty()) {
+ // Given a PhysReg, AliasMap[PhysReg] returns a list of indices into RC and
+ // therefore the LiveRegs array.
+ AliasMap.resize(TRI->getNumRegs());
+ for (unsigned i = 0, e = RC->getNumRegs(); i != e; ++i)
+ for (MCRegAliasIterator AI(RC->getRegister(i), TRI, true);
+ AI.isValid(); ++AI)
+ AliasMap[*AI].push_back(i);
+ }
+
+ MachineBasicBlock *Entry = &*MF->begin();
+ ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
+ SmallVector<MachineBasicBlock*, 16> Loops;
+ for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
+ MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
+ MachineBasicBlock *MBB = *MBBI;
+ enterBasicBlock(MBB);
+ if (SeenUnknownBackEdge)
+ Loops.push_back(MBB);
+ for (MachineInstr &MI : *MBB)
+ visitInstr(&MI);
+ processUndefReads(MBB);
+ leaveBasicBlock(MBB);
+ }
+
+ // Visit all the loop blocks again in order to merge DomainValues from
+ // back-edges.
+ for (MachineBasicBlock *MBB : Loops) {
+ enterBasicBlock(MBB);
+ for (MachineInstr &MI : *MBB)
+ if (!MI.isDebugValue())
+ processDefs(&MI, false);
+ processUndefReads(MBB);
+ leaveBasicBlock(MBB);
+ }
+
+ // Clear the LiveOuts vectors and collapse any remaining DomainValues.
+ for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
+ MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
+ LiveOutMap::const_iterator FI = LiveOuts.find(*MBBI);
+ if (FI == LiveOuts.end() || !FI->second)
+ continue;
+ for (unsigned i = 0, e = NumRegs; i != e; ++i)
+ if (FI->second[i].Value)
+ release(FI->second[i].Value);
+ delete[] FI->second;
+ }
+ LiveOuts.clear();
+ UndefReads.clear();
+ Avail.clear();
+ Allocator.DestroyAll();
+
+ return false;
+}
+
+FunctionPass *
+llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
+ return new ExeDepsFix(RC);
+}
diff --git a/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
new file mode 100644
index 000000000000..0ec79c2e69f9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExpandISelPseudos.cpp
@@ -0,0 +1,74 @@
+//===-- llvm/CodeGen/ExpandISelPseudos.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Expand Pseudo-instructions produced by ISel. These are usually to allow
+// the expansion to contain control flow, such as a conditional move
+// implemented with a conditional branch and a phi, or an atomic operation
+// implemented with a loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "expand-isel-pseudos"
+
+namespace {
+ class ExpandISelPseudos : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ ExpandISelPseudos() : MachineFunctionPass(ID) {}
+
+ private:
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+ };
+} // end anonymous namespace
+
+char ExpandISelPseudos::ID = 0;
+char &llvm::ExpandISelPseudosID = ExpandISelPseudos::ID;
+INITIALIZE_PASS(ExpandISelPseudos, "expand-isel-pseudos",
+ "Expand ISel Pseudo-instructions", false, false)
+
+bool ExpandISelPseudos::runOnMachineFunction(MachineFunction &MF) {
+ bool Changed = false;
+ const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+
+ // Iterate through each instruction in the function, looking for pseudos.
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+ MBBI != MBBE; ) {
+ MachineInstr &MI = *MBBI++;
+
+ // If MI is a pseudo, expand it.
+ if (MI.usesCustomInsertionHook()) {
+ Changed = true;
+ MachineBasicBlock *NewMBB = TLI->EmitInstrWithCustomInserter(MI, MBB);
+ // The expansion may involve new basic blocks.
+ if (NewMBB != MBB) {
+ MBB = NewMBB;
+ I = NewMBB->getIterator();
+ MBBI = NewMBB->begin();
+ MBBE = NewMBB->end();
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
new file mode 100644
index 000000000000..ab2382e2db6d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ExpandPostRAPseudos.cpp
@@ -0,0 +1,223 @@
+//===-- ExpandPostRAPseudos.cpp - Pseudo instruction expansion pass -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a pass that expands COPY and SUBREG_TO_REG pseudo
+// instructions after register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "postrapseudos"
+
+namespace {
+struct ExpandPostRA : public MachineFunctionPass {
+private:
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ ExpandPostRA() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// runOnMachineFunction - pass entry point
+ bool runOnMachineFunction(MachineFunction&) override;
+
+private:
+ bool LowerSubregToReg(MachineInstr *MI);
+ bool LowerCopy(MachineInstr *MI);
+
+ void TransferImplicitOperands(MachineInstr *MI);
+};
+} // end anonymous namespace
+
+char ExpandPostRA::ID = 0;
+char &llvm::ExpandPostRAPseudosID = ExpandPostRA::ID;
+
+INITIALIZE_PASS(ExpandPostRA, "postrapseudos",
+ "Post-RA pseudo instruction expansion pass", false, false)
+
+/// TransferImplicitOperands - MI is a pseudo-instruction, and the lowered
+/// replacement instructions immediately precede it. Copy any implicit
+/// operands from MI to the replacement instruction.
+void ExpandPostRA::TransferImplicitOperands(MachineInstr *MI) {
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+
+ for (const MachineOperand &MO : MI->implicit_operands())
+ if (MO.isReg())
+ CopyMI->addOperand(MO);
+}
+
+bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) {
+ MachineBasicBlock *MBB = MI->getParent();
+ assert((MI->getOperand(0).isReg() && MI->getOperand(0).isDef()) &&
+ MI->getOperand(1).isImm() &&
+ (MI->getOperand(2).isReg() && MI->getOperand(2).isUse()) &&
+ MI->getOperand(3).isImm() && "Invalid subreg_to_reg");
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned InsReg = MI->getOperand(2).getReg();
+ assert(!MI->getOperand(2).getSubReg() && "SubIdx on physreg?");
+ unsigned SubIdx = MI->getOperand(3).getImm();
+
+ assert(SubIdx != 0 && "Invalid index for insert_subreg");
+ unsigned DstSubReg = TRI->getSubReg(DstReg, SubIdx);
+
+ assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ "Insert destination must be in a physical register");
+ assert(TargetRegisterInfo::isPhysicalRegister(InsReg) &&
+ "Inserted value must be in a physical register");
+
+ DEBUG(dbgs() << "subreg: CONVERTING: " << *MI);
+
+ if (MI->allDefsAreDead()) {
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ DEBUG(dbgs() << "subreg: replaced by: " << *MI);
+ return true;
+ }
+
+ if (DstSubReg == InsReg) {
+ // No need to insert an identity copy instruction.
+    // Watch out for a case like this:
+ // %RAX<def> = SUBREG_TO_REG 0, %EAX<kill>, 3
+ // We must leave %RAX live.
+ if (DstReg != InsReg) {
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ MI->RemoveOperand(3); // SubIdx
+ MI->RemoveOperand(1); // Imm
+      DEBUG(dbgs() << "subreg: replaced by: " << *MI);
+ return true;
+ }
+ DEBUG(dbgs() << "subreg: eliminated!");
+ } else {
+ TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg,
+ MI->getOperand(2).isKill());
+
+ // Implicitly define DstReg for subsequent uses.
+ MachineBasicBlock::iterator CopyMI = MI;
+ --CopyMI;
+ CopyMI->addRegisterDefined(DstReg);
+ DEBUG(dbgs() << "subreg: " << *CopyMI);
+ }
+
+ DEBUG(dbgs() << '\n');
+ MBB->erase(MI);
+ return true;
+}
+
+bool ExpandPostRA::LowerCopy(MachineInstr *MI) {
+
+ if (MI->allDefsAreDead()) {
+ DEBUG(dbgs() << "dead copy: " << *MI);
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ DEBUG(dbgs() << "replaced by: " << *MI);
+ return true;
+ }
+
+ MachineOperand &DstMO = MI->getOperand(0);
+ MachineOperand &SrcMO = MI->getOperand(1);
+
+ if (SrcMO.getReg() == DstMO.getReg()) {
+ DEBUG(dbgs() << "identity copy: " << *MI);
+ // No need to insert an identity copy instruction, but replace with a KILL
+ // if liveness is changed.
+ if (SrcMO.isUndef() || MI->getNumOperands() > 2) {
+ // We must make sure the super-register gets killed. Replace the
+ // instruction with KILL.
+ MI->setDesc(TII->get(TargetOpcode::KILL));
+ DEBUG(dbgs() << "replaced by: " << *MI);
+ return true;
+ }
+ // Vanilla identity copy.
+ MI->eraseFromParent();
+ return true;
+ }
+
+ DEBUG(dbgs() << "real copy: " << *MI);
+ TII->copyPhysReg(*MI->getParent(), MI, MI->getDebugLoc(),
+ DstMO.getReg(), SrcMO.getReg(), SrcMO.isKill());
+
+ if (MI->getNumOperands() > 2)
+ TransferImplicitOperands(MI);
+ DEBUG({
+ MachineBasicBlock::iterator dMI = MI;
+ dbgs() << "replaced by: " << *(--dMI);
+ });
+ MI->eraseFromParent();
+ return true;
+}
+
+/// runOnMachineFunction - Reduce subregister inserts and extracts to register
+/// copies.
+///
+bool ExpandPostRA::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "Machine Function\n"
+ << "********** EXPANDING POST-RA PSEUDO INSTRS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ bool MadeChange = false;
+
+ for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end();
+ mbbi != mbbe; ++mbbi) {
+ for (MachineBasicBlock::iterator mi = mbbi->begin(), me = mbbi->end();
+ mi != me;) {
+ MachineInstr &MI = *mi;
+ // Advance iterator here because MI may be erased.
+ ++mi;
+
+ // Only expand pseudos.
+ if (!MI.isPseudo())
+ continue;
+
+ // Give targets a chance to expand even standard pseudos.
+ if (TII->expandPostRAPseudo(MI)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Expand standard pseudos.
+ switch (MI.getOpcode()) {
+ case TargetOpcode::SUBREG_TO_REG:
+ MadeChange |= LowerSubregToReg(&MI);
+ break;
+ case TargetOpcode::COPY:
+ MadeChange |= LowerCopy(&MI);
+ break;
+ case TargetOpcode::DBG_VALUE:
+ continue;
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::EXTRACT_SUBREG:
+ llvm_unreachable("Sub-register pseudos should have been eliminated.");
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/contrib/llvm/lib/CodeGen/FaultMaps.cpp b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
new file mode 100644
index 000000000000..2acafafdb9fc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/FaultMaps.cpp
@@ -0,0 +1,150 @@
+//===---------------------------- FaultMaps.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/FaultMaps.h"
+
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "faultmaps"
+
+static const int FaultMapVersion = 1;
+const char *FaultMaps::WFMP = "Fault Maps: ";
+
+FaultMaps::FaultMaps(AsmPrinter &AP) : AP(AP) {}
+
+void FaultMaps::recordFaultingOp(FaultKind FaultTy,
+ const MCSymbol *HandlerLabel) {
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCSymbol *FaultingLabel = OutContext.createTempSymbol();
+
+ AP.OutStreamer->EmitLabel(FaultingLabel);
+
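+  // Both table entries are recorded as offsets from the enclosing
+  // function's symbol.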
+ const MCExpr *FaultingOffset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(FaultingLabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ const MCExpr *HandlerOffset = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(HandlerLabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ FunctionInfos[AP.CurrentFnSym].emplace_back(FaultTy, FaultingOffset,
+ HandlerOffset);
+}
+
+void FaultMaps::serializeToFaultMapSection() {
+ if (FunctionInfos.empty())
+ return;
+
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCStreamer &OS = *AP.OutStreamer;
+
+ // Create the section.
+ MCSection *FaultMapSection =
+ OutContext.getObjectFileInfo()->getFaultMapSection();
+ OS.SwitchSection(FaultMapSection);
+
+ // Emit a dummy symbol to force section inclusion.
+ OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
+
+ DEBUG(dbgs() << "********** Fault Map Output **********\n");
+
+ // Header
+ OS.EmitIntValue(FaultMapVersion, 1); // Version.
+ OS.EmitIntValue(0, 1); // Reserved.
+ OS.EmitIntValue(0, 2); // Reserved.
+
+ DEBUG(dbgs() << WFMP << "#functions = " << FunctionInfos.size() << "\n");
+ OS.EmitIntValue(FunctionInfos.size(), 4);
+
+ DEBUG(dbgs() << WFMP << "functions:\n");
+
+ for (const auto &FFI : FunctionInfos)
+ emitFunctionInfo(FFI.first, FFI.second);
+}
+
+void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
+ const FunctionFaultInfos &FFI) {
+ MCStreamer &OS = *AP.OutStreamer;
+
+ DEBUG(dbgs() << WFMP << " function addr: " << *FnLabel << "\n");
+ OS.EmitSymbolValue(FnLabel, 8);
+
+ DEBUG(dbgs() << WFMP << " #faulting PCs: " << FFI.size() << "\n");
+ OS.EmitIntValue(FFI.size(), 4);
+
+ OS.EmitIntValue(0, 4); // Reserved
+
+ for (auto &Fault : FFI) {
+ DEBUG(dbgs() << WFMP << " fault type: "
+ << faultTypeToString(Fault.Kind) << "\n");
+ OS.EmitIntValue(Fault.Kind, 4);
+
+ DEBUG(dbgs() << WFMP << " faulting PC offset: "
+ << *Fault.FaultingOffsetExpr << "\n");
+ OS.EmitValue(Fault.FaultingOffsetExpr, 4);
+
+ DEBUG(dbgs() << WFMP << " fault handler PC offset: "
+ << *Fault.HandlerOffsetExpr << "\n");
+ OS.EmitValue(Fault.HandlerOffsetExpr, 4);
+ }
+}
+
+
+const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {
+ switch (FT) {
+ default:
+ llvm_unreachable("unhandled fault type!");
+
+ case FaultMaps::FaultingLoad:
+ return "FaultingLoad";
+ }
+}
+
+raw_ostream &llvm::
+operator<<(raw_ostream &OS,
+ const FaultMapParser::FunctionFaultInfoAccessor &FFI) {
+ OS << "Fault kind: "
+ << FaultMaps::faultTypeToString((FaultMaps::FaultKind)FFI.getFaultKind())
+ << ", faulting PC offset: " << FFI.getFaultingPCOffset()
+ << ", handling PC offset: " << FFI.getHandlerPCOffset();
+ return OS;
+}
+
+raw_ostream &llvm::
+operator<<(raw_ostream &OS, const FaultMapParser::FunctionInfoAccessor &FI) {
+ OS << "FunctionAddress: " << format_hex(FI.getFunctionAddr(), 8)
+ << ", NumFaultingPCs: " << FI.getNumFaultingPCs() << "\n";
+ for (unsigned i = 0, e = FI.getNumFaultingPCs(); i != e; ++i)
+ OS << FI.getFunctionFaultInfoAt(i) << "\n";
+ return OS;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const FaultMapParser &FMP) {
+ OS << "Version: " << format_hex(FMP.getFaultMapVersion(), 2) << "\n";
+ OS << "NumFunctions: " << FMP.getNumFunctions() << "\n";
+
+ if (FMP.getNumFunctions() == 0)
+ return OS;
+
+ FaultMapParser::FunctionInfoAccessor FI;
+
+ for (unsigned i = 0, e = FMP.getNumFunctions(); i != e; ++i) {
+ FI = (i == 0) ? FMP.getFirstFunctionInfo() : FI.getNextFunctionInfo();
+ OS << FI;
+ }
+
+ return OS;
+}
diff --git a/contrib/llvm/lib/CodeGen/FuncletLayout.cpp b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
new file mode 100644
index 000000000000..b16f81c728d0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/FuncletLayout.cpp
@@ -0,0 +1,59 @@
+//===-- FuncletLayout.cpp - Contiguously lay out funclets -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements basic block placement transformations which result in
+// funclets being contiguous.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "funclet-layout"
+
+namespace {
+class FuncletLayout : public MachineFunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid
+ FuncletLayout() : MachineFunctionPass(ID) {
+ initializeFuncletLayoutPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+};
+}
+
+char FuncletLayout::ID = 0;
+char &llvm::FuncletLayoutID = FuncletLayout::ID;
+INITIALIZE_PASS(FuncletLayout, "funclet-layout",
+ "Contiguously Lay Out Funclets", false, false)
+
+bool FuncletLayout::runOnMachineFunction(MachineFunction &F) {
+ DenseMap<const MachineBasicBlock *, int> FuncletMembership =
+ getFuncletMembership(F);
+ if (FuncletMembership.empty())
+ return false;
+
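+ // Sorting the blocks by funclet number groups all blocks belonging to the
+ // same funclet next to each other, which is what makes each funclet
+ // contiguous in the final layout.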
+ F.sort([&](MachineBasicBlock &X, MachineBasicBlock &Y) {
+ auto FuncletX = FuncletMembership.find(&X);
+ auto FuncletY = FuncletMembership.find(&Y);
+ assert(FuncletX != FuncletMembership.end());
+ assert(FuncletY != FuncletMembership.end());
+ return FuncletX->second < FuncletY->second;
+ });
+
+ // Conservatively assume we changed something.
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/GCMetadata.cpp b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
new file mode 100644
index 000000000000..c8116a453d2d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCMetadata.cpp
@@ -0,0 +1,177 @@
+//===-- GCMetadata.cpp - Garbage collector metadata -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the GCFunctionInfo class and GCModuleInfo pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+namespace {
+
+class Printer : public FunctionPass {
+ static char ID;
+ raw_ostream &OS;
+
+public:
+ explicit Printer(raw_ostream &OS) : FunctionPass(ID), OS(OS) {}
+
+ const char *getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+};
+}
+
+INITIALIZE_PASS(GCModuleInfo, "collector-metadata",
+ "Create Garbage Collector Module Metadata", false, false)
+
+// -----------------------------------------------------------------------------
+
+GCFunctionInfo::GCFunctionInfo(const Function &F, GCStrategy &S)
+ : F(F), S(S), FrameSize(~0LL) {}
+
+GCFunctionInfo::~GCFunctionInfo() {}
+
+// -----------------------------------------------------------------------------
+
+char GCModuleInfo::ID = 0;
+
+GCModuleInfo::GCModuleInfo() : ImmutablePass(ID) {
+ initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+GCFunctionInfo &GCModuleInfo::getFunctionInfo(const Function &F) {
+ assert(!F.isDeclaration() && "Can only get GCFunctionInfo for a definition!");
+ assert(F.hasGC());
+
+ finfo_map_type::iterator I = FInfoMap.find(&F);
+ if (I != FInfoMap.end())
+ return *I->second;
+
+ GCStrategy *S = getGCStrategy(F.getGC());
+ Functions.push_back(make_unique<GCFunctionInfo>(F, *S));
+ GCFunctionInfo *GFI = Functions.back().get();
+ FInfoMap[&F] = GFI;
+ return *GFI;
+}
+
+void GCModuleInfo::clear() {
+ Functions.clear();
+ FInfoMap.clear();
+ GCStrategyList.clear();
+}
+
+// -----------------------------------------------------------------------------
+
+char Printer::ID = 0;
+
+FunctionPass *llvm::createGCInfoPrinter(raw_ostream &OS) {
+ return new Printer(OS);
+}
+
+const char *Printer::getPassName() const {
+ return "Print Garbage Collector Information";
+}
+
+void Printer::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<GCModuleInfo>();
+}
+
+static const char *DescKind(GC::PointKind Kind) {
+ switch (Kind) {
+ case GC::PreCall:
+ return "pre-call";
+ case GC::PostCall:
+ return "post-call";
+ }
+ llvm_unreachable("Invalid point kind");
+}
+
+bool Printer::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use GC.
+ if (!F.hasGC())
+ return false;
+
+ GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+
+ OS << "GC roots for " << FD->getFunction().getName() << ":\n";
+ for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(),
+ RE = FD->roots_end();
+ RI != RE; ++RI)
+ OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n";
+
+ OS << "GC safe points for " << FD->getFunction().getName() << ":\n";
+ for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE;
+ ++PI) {
+
+ OS << "\t" << PI->Label->getName() << ": " << DescKind(PI->Kind)
+ << ", live = {";
+
+ for (GCFunctionInfo::live_iterator RI = FD->live_begin(PI),
+ RE = FD->live_end(PI);
+ RI != RE;) {
+ OS << " " << RI->Num;
+ if (++RI == RE)
+ break;
+ OS << ",";
+ }
+
+ OS << " }\n";
+ }
+
+ return false;
+}
+
+bool Printer::doFinalization(Module &M) {
+ GCModuleInfo *GMI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(GMI && "Printer didn't require GCModuleInfo?!");
+ GMI->clear();
+ return false;
+}
+
+GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
+ // TODO: Arguably, just doing a linear search would be faster for small N
+ auto NMI = GCStrategyMap.find(Name);
+ if (NMI != GCStrategyMap.end())
+ return NMI->getValue();
+
+ for (auto& Entry : GCRegistry::entries()) {
+ if (Name == Entry.getName()) {
+ std::unique_ptr<GCStrategy> S = Entry.instantiate();
+ S->Name = Name;
+ GCStrategyMap[Name] = S.get();
+ GCStrategyList.push_back(std::move(S));
+ return GCStrategyList.back().get();
+ }
+ }
+
+ if (GCRegistry::begin() == GCRegistry::end()) {
+ // In normal operation, the registry should not be empty. There should
+ // be the builtin GCs if nothing else. The most likely scenario here is
+ // that we got here without running the initializers used by the Registry
+ // itself and its registration mechanism.
+ const std::string error = ("unsupported GC: " + Name).str() +
+ " (did you remember to link and initialize the CodeGen library?)";
+ report_fatal_error(error);
+ } else
+ report_fatal_error(std::string("unsupported GC: ") + Name);
+}
diff --git a/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
new file mode 100644
index 000000000000..bb8cfa1cc809
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCMetadataPrinter.cpp
@@ -0,0 +1,19 @@
+//===-- GCMetadataPrinter.cpp - Garbage collection infrastructure ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the abstract base class GCMetadataPrinter.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadataPrinter.h"
+using namespace llvm;
+
+GCMetadataPrinter::GCMetadataPrinter() {}
+
+GCMetadataPrinter::~GCMetadataPrinter() {}
diff --git a/contrib/llvm/lib/CodeGen/GCRootLowering.cpp b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp
new file mode 100644
index 000000000000..326adab2ba64
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -0,0 +1,355 @@
+//===-- GCRootLowering.cpp - Garbage collection infrastructure ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lowering for the gc.root mechanism.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// LowerIntrinsics - This pass rewrites calls to the llvm.gcread or
+/// llvm.gcwrite intrinsics, replacing them with simple loads and stores as
+/// directed by the GCStrategy. It also performs automatic root initialization
+/// and custom intrinsic lowering.
+class LowerIntrinsics : public FunctionPass {
+ bool PerformDefaultLowering(Function &F, GCStrategy &Coll);
+
+public:
+ static char ID;
+
+ LowerIntrinsics();
+ const char *getPassName() const override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+};
+
+/// GCMachineCodeAnalysis - This is a target-independent pass over the machine
+/// function representation to identify safe points for the garbage collector
+/// in the machine code. It inserts labels at safe points and populates a
+/// GCMetadata record for each function.
+class GCMachineCodeAnalysis : public MachineFunctionPass {
+ GCFunctionInfo *FI;
+ MachineModuleInfo *MMI;
+ const TargetInstrInfo *TII;
+
+ void FindSafePoints(MachineFunction &MF);
+ void VisitCallPoint(MachineBasicBlock::iterator MI);
+ MCSymbol *InsertLabel(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) const;
+
+ void FindStackOffsets(MachineFunction &MF);
+
+public:
+ static char ID;
+
+ GCMachineCodeAnalysis();
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+// -----------------------------------------------------------------------------
+
+INITIALIZE_PASS_BEGIN(LowerIntrinsics, "gc-lowering", "GC Lowering", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(LowerIntrinsics, "gc-lowering", "GC Lowering", false, false)
+
+FunctionPass *llvm::createGCLoweringPass() { return new LowerIntrinsics(); }
+
+char LowerIntrinsics::ID = 0;
+
+LowerIntrinsics::LowerIntrinsics() : FunctionPass(ID) {
+ initializeLowerIntrinsicsPass(*PassRegistry::getPassRegistry());
+}
+
+const char *LowerIntrinsics::getPassName() const {
+ return "Lower Garbage Collection Instructions";
+}
+
+void LowerIntrinsics::getAnalysisUsage(AnalysisUsage &AU) const {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<GCModuleInfo>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+static bool NeedsDefaultLoweringPass(const GCStrategy &C) {
+ // Default lowering is necessary only if read or write barriers have a
+ // default action, or if roots need default initialization. There is no
+ // default action for custom root lowering.
+ return !C.customWriteBarrier() || !C.customReadBarrier() ||
+ C.initializeRoots();
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now.
+bool LowerIntrinsics::doInitialization(Module &M) {
+ GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
+ assert(MI && "LowerIntrinsics didn't require GCModuleInfo!?");
+ for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I)
+ if (!I->isDeclaration() && I->hasGC())
+ MI->getFunctionInfo(*I); // Instantiate the GC strategy.
+
+ return false;
+}
+
+/// CouldBecomeSafePoint - Predicate to conservatively determine whether the
+/// instruction could introduce a safe point.
+static bool CouldBecomeSafePoint(Instruction *I) {
+ // The natural definition of instructions which could introduce safe points
+ // are:
+ //
+ // - call, invoke (AfterCall, BeforeCall)
+ // - phis (Loops)
+ // - invoke, ret, unwind (Exit)
+ //
+ // However, instructions as seemingly innocuous as arithmetic can become
+ // libcalls upon lowering (e.g., div i64 on a 32-bit platform), so instead
+ // it is necessary to take a conservative approach.
+
+ if (isa<AllocaInst>(I) || isa<GetElementPtrInst>(I) || isa<StoreInst>(I) ||
+ isa<LoadInst>(I))
+ return false;
+
+ // llvm.gcroot is safe because it doesn't do anything at runtime.
+ if (CallInst *CI = dyn_cast<CallInst>(I))
+ if (Function *F = CI->getCalledFunction())
+ if (Intrinsic::ID IID = F->getIntrinsicID())
+ if (IID == Intrinsic::gcroot)
+ return false;
+
+ return true;
+}
+
+static bool InsertRootInitializers(Function &F, AllocaInst **Roots,
+ unsigned Count) {
+ // Scroll past alloca instructions.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ while (isa<AllocaInst>(IP))
+ ++IP;
+
+ // Search for initializers in the initial BB.
+ SmallPtrSet<AllocaInst *, 16> InitedRoots;
+ for (; !CouldBecomeSafePoint(&*IP); ++IP)
+ if (StoreInst *SI = dyn_cast<StoreInst>(IP))
+ if (AllocaInst *AI =
+ dyn_cast<AllocaInst>(SI->getOperand(1)->stripPointerCasts()))
+ InitedRoots.insert(AI);
+
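+ // Each root that is not already initialized gets a null store placed right
+ // after its alloca. Illustrative IR only (the identifiers are invented):
+ //   %root = alloca i8*           ; the gcroot slot
+ //   store i8* null, i8** %root   ; inserted by the loop below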
+ // Add root initializers.
+ bool MadeChange = false;
+
+ for (AllocaInst **I = Roots, **E = Roots + Count; I != E; ++I)
+ if (!InitedRoots.count(*I)) {
+ StoreInst *SI = new StoreInst(
+ ConstantPointerNull::get(cast<PointerType>((*I)->getAllocatedType())),
+ *I);
+ SI->insertAfter(*I);
+ MadeChange = true;
+ }
+
+ return MadeChange;
+}
+
+/// runOnFunction - Replace gcread/gcwrite intrinsics with loads and stores.
+/// Leave gcroot intrinsics; the code generator needs to see those.
+bool LowerIntrinsics::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use GC.
+ if (!F.hasGC())
+ return false;
+
+ GCFunctionInfo &FI = getAnalysis<GCModuleInfo>().getFunctionInfo(F);
+ GCStrategy &S = FI.getStrategy();
+
+ bool MadeChange = false;
+
+ if (NeedsDefaultLoweringPass(S))
+ MadeChange |= PerformDefaultLowering(F, S);
+
+ return MadeChange;
+}
+
+bool LowerIntrinsics::PerformDefaultLowering(Function &F, GCStrategy &S) {
+ bool LowerWr = !S.customWriteBarrier();
+ bool LowerRd = !S.customReadBarrier();
+ bool InitRoots = S.initializeRoots();
+
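+ // When the default lowering applies, the barrier intrinsics collapse into
+ // plain memory operations, roughly as follows (an informal sketch of the
+ // rewrites done in the switch below; operand names are descriptive, not the
+ // actual parameter names):
+ //   llvm.gcwrite(value, obj, slot) -> store value, slot
+ //   llvm.gcread(obj, slot)         -> load slot
+ // The object operand is simply dropped by the default lowering.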
+ SmallVector<AllocaInst *, 32> Roots;
+
+ bool MadeChange = false;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;) {
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++)) {
+ Function *F = CI->getCalledFunction();
+ switch (F->getIntrinsicID()) {
+ case Intrinsic::gcwrite:
+ if (LowerWr) {
+ // Replace a write barrier with a simple store.
+ Value *St =
+ new StoreInst(CI->getArgOperand(0), CI->getArgOperand(2), CI);
+ CI->replaceAllUsesWith(St);
+ CI->eraseFromParent();
+ }
+ break;
+ case Intrinsic::gcread:
+ if (LowerRd) {
+ // Replace a read barrier with a simple load.
+ Value *Ld = new LoadInst(CI->getArgOperand(1), "", CI);
+ Ld->takeName(CI);
+ CI->replaceAllUsesWith(Ld);
+ CI->eraseFromParent();
+ }
+ break;
+ case Intrinsic::gcroot:
+ if (InitRoots) {
+ // Initialize the GC root, but do not delete the intrinsic. The
+ // backend needs the intrinsic to flag the stack slot.
+ Roots.push_back(
+ cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+ }
+ break;
+ default:
+ continue;
+ }
+
+ MadeChange = true;
+ }
+ }
+ }
+
+ if (Roots.size())
+ MadeChange |= InsertRootInitializers(F, Roots.begin(), Roots.size());
+
+ return MadeChange;
+}
+
+// -----------------------------------------------------------------------------
+
+char GCMachineCodeAnalysis::ID = 0;
+char &llvm::GCMachineCodeAnalysisID = GCMachineCodeAnalysis::ID;
+
+INITIALIZE_PASS(GCMachineCodeAnalysis, "gc-analysis",
+ "Analyze Machine Code For Garbage Collection", false, false)
+
+GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
+
+void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+ AU.addRequired<GCModuleInfo>();
+}
+
+MCSymbol *GCMachineCodeAnalysis::InsertLabel(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const DebugLoc &DL) const {
+ MCSymbol *Label = MBB.getParent()->getContext().createTempSymbol();
+ BuildMI(MBB, MI, DL, TII->get(TargetOpcode::GC_LABEL)).addSym(Label);
+ return Label;
+}
+
+void GCMachineCodeAnalysis::VisitCallPoint(MachineBasicBlock::iterator CI) {
+ // Find the return address (next instruction), too, so as to bracket the call
+ // instruction.
+ MachineBasicBlock::iterator RAI = CI;
+ ++RAI;
+
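+ // When both kinds of safe points are requested, the call ends up bracketed
+ // by GC_LABELs, roughly like this (informal sketch of the emitted MI):
+ //   GC_LABEL <pre-call symbol>
+ //   <the call instruction>
+ //   GC_LABEL <post-call symbol>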
+ if (FI->getStrategy().needsSafePoint(GC::PreCall)) {
+ MCSymbol *Label = InsertLabel(*CI->getParent(), CI, CI->getDebugLoc());
+ FI->addSafePoint(GC::PreCall, Label, CI->getDebugLoc());
+ }
+
+ if (FI->getStrategy().needsSafePoint(GC::PostCall)) {
+ MCSymbol *Label = InsertLabel(*CI->getParent(), RAI, CI->getDebugLoc());
+ FI->addSafePoint(GC::PostCall, Label, CI->getDebugLoc());
+ }
+}
+
+void GCMachineCodeAnalysis::FindSafePoints(MachineFunction &MF) {
+ for (MachineFunction::iterator BBI = MF.begin(), BBE = MF.end(); BBI != BBE;
+ ++BBI)
+ for (MachineBasicBlock::iterator MI = BBI->begin(), ME = BBI->end();
+ MI != ME; ++MI)
+ if (MI->isCall()) {
+ // Do not treat tail or sibling call sites as safe points. This is
+ // legal since any arguments passed to the callee which live in the
+ // remnants of the caller's frame will be owned and updated by the
+ // callee if required.
+ if (MI->isTerminator())
+ continue;
+ VisitCallPoint(MI);
+ }
+}
+
+void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ assert(TFI && "TargetRegisterInfo not available!");
+
+ for (GCFunctionInfo::roots_iterator RI = FI->roots_begin();
+ RI != FI->roots_end();) {
+ // If the root references a dead object, no need to keep it.
+ if (MF.getFrameInfo()->isDeadObjectIndex(RI->Num)) {
+ RI = FI->removeStackRoot(RI);
+ } else {
+ unsigned FrameReg; // FIXME: surely GCRoot ought to store the
+ // register that the offset is from?
+ RI->StackOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg);
+ ++RI;
+ }
+ }
+}
+
+bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
+ // Quick exit for functions that do not use GC.
+ if (!MF.getFunction()->hasGC())
+ return false;
+
+ FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(*MF.getFunction());
+ MMI = &getAnalysis<MachineModuleInfo>();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Find the size of the stack frame. There may be no correct static frame
+ // size; if so, we use UINT64_MAX to represent this.
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ const bool DynamicFrameSize = MFI->hasVarSizedObjects() ||
+ RegInfo->needsStackRealignment(MF);
+ FI->setFrameSize(DynamicFrameSize ? UINT64_MAX : MFI->getStackSize());
+
+ // Find all safe points.
+ if (FI->getStrategy().needsSafePoints())
+ FindSafePoints(MF);
+
+ // Find the concrete stack offsets for all roots (stack slots)
+ FindStackOffsets(MF);
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/GCStrategy.cpp b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
new file mode 100644
index 000000000000..554d326942e9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GCStrategy.cpp
@@ -0,0 +1,22 @@
+//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the policy object GCStrategy which describes the
+// behavior of a given garbage collector.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GCStrategy.h"
+
+using namespace llvm;
+
+GCStrategy::GCStrategy()
+ : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
+ CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
+ UsesMetadata(false) {}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
new file mode 100644
index 000000000000..231e5ac82bec
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/GlobalISel.cpp
@@ -0,0 +1,30 @@
+//===-- llvm/CodeGen/GlobalISel/GlobalISel.cpp --- GlobalISel ----*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+// This file implements the common initialization routines for the
+// GlobalISel library.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/InitializePasses.h"
+#include "llvm/PassRegistry.h"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+
+void llvm::initializeGlobalISel(PassRegistry &Registry) {
+}
+
+#else
+
+void llvm::initializeGlobalISel(PassRegistry &Registry) {
+ initializeIRTranslatorPass(Registry);
+ initializeRegBankSelectPass(Registry);
+}
+#endif // LLVM_BUILD_GLOBAL_ISEL
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
new file mode 100644
index 000000000000..b8a960cfac76
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -0,0 +1,164 @@
+//===-- llvm/CodeGen/GlobalISel/IRTranslator.cpp - IRTranslator --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the IRTranslator class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Target/TargetLowering.h"
+
+#define DEBUG_TYPE "irtranslator"
+
+using namespace llvm;
+
+char IRTranslator::ID = 0;
+INITIALIZE_PASS(IRTranslator, "irtranslator", "IRTranslator LLVM IR -> MI",
+ false, false);
+
+IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) {
+ initializeIRTranslatorPass(*PassRegistry::getPassRegistry());
+}
+
+unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
+ unsigned &ValReg = ValToVReg[&Val];
+ // Check if this is the first time we see Val.
+ if (!ValReg) {
+ // Fill ValRegsSequence with the sequence of registers
+ // we need to concat together to produce the value.
+ assert(Val.getType()->isSized() &&
+ "Don't know how to create an empty vreg");
+ assert(!Val.getType()->isAggregateType() && "Not yet implemented");
+ unsigned Size = Val.getType()->getPrimitiveSizeInBits();
+ unsigned VReg = MRI->createGenericVirtualRegister(Size);
+ ValReg = VReg;
+ assert(!isa<Constant>(Val) && "Not yet implemented");
+ }
+ return ValReg;
+}
+
+MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
+ MachineBasicBlock *&MBB = BBToMBB[&BB];
+ if (!MBB) {
+ MachineFunction &MF = MIRBuilder.getMF();
+ MBB = MF.CreateMachineBasicBlock();
+ MF.push_back(MBB);
+ }
+ return *MBB;
+}
+
+bool IRTranslator::translateBinaryOp(unsigned Opcode, const Instruction &Inst) {
+ // Get or create a virtual register for each value.
+ // Unless the value is a Constant => loadimm cst?
+ // or inline constant each time?
+ // Creation of a virtual register needs to have a size.
+ unsigned Op0 = getOrCreateVReg(*Inst.getOperand(0));
+ unsigned Op1 = getOrCreateVReg(*Inst.getOperand(1));
+ unsigned Res = getOrCreateVReg(Inst);
+ MIRBuilder.buildInstr(Opcode, Inst.getType(), Res, Op0, Op1);
+ return true;
+}
+
+bool IRTranslator::translateReturn(const Instruction &Inst) {
+ assert(isa<ReturnInst>(Inst) && "Return expected");
+ const Value *Ret = cast<ReturnInst>(Inst).getReturnValue();
+ // The target may mess up with the insertion point, but
+ // this is not important as a return is the last instruction
+ // of the block anyway.
+ return CLI->lowerReturn(MIRBuilder, Ret, !Ret ? 0 : getOrCreateVReg(*Ret));
+}
+
+bool IRTranslator::translateBr(const Instruction &Inst) {
+ assert(isa<BranchInst>(Inst) && "Branch expected");
+ const BranchInst &BrInst = *cast<BranchInst>(&Inst);
+ if (BrInst.isUnconditional()) {
+ const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getOperand(0));
+ MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt);
+ MIRBuilder.buildInstr(TargetOpcode::G_BR, BrTgt.getType(), TgtBB);
+ } else {
+ assert(0 && "Not yet implemented");
+ }
+ // Link successors.
+ MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+ for (const BasicBlock *Succ : BrInst.successors())
+ CurBB.addSuccessor(&getOrCreateBB(*Succ));
+ return true;
+}
+
+bool IRTranslator::translate(const Instruction &Inst) {
+ MIRBuilder.setDebugLoc(Inst.getDebugLoc());
+ switch(Inst.getOpcode()) {
+ case Instruction::Add:
+ return translateBinaryOp(TargetOpcode::G_ADD, Inst);
+ case Instruction::Or:
+ return translateBinaryOp(TargetOpcode::G_OR, Inst);
+ case Instruction::Br:
+ return translateBr(Inst);
+ case Instruction::Ret:
+ return translateReturn(Inst);
+
+ default:
+ llvm_unreachable("Opcode not supported");
+ }
+}
+
+
+void IRTranslator::finalize() {
+ // Release the memory used by the different maps we
+ // needed during the translation.
+ ValToVReg.clear();
+ Constants.clear();
+}
+
+bool IRTranslator::runOnMachineFunction(MachineFunction &MF) {
+ const Function &F = *MF.getFunction();
+ if (F.empty())
+ return false;
+ CLI = MF.getSubtarget().getCallLowering();
+ MIRBuilder.setMF(MF);
+ MRI = &MF.getRegInfo();
+ // Setup the arguments.
+ MachineBasicBlock &MBB = getOrCreateBB(F.front());
+ MIRBuilder.setMBB(MBB);
+ SmallVector<unsigned, 8> VRegArgs;
+ for (const Argument &Arg: F.args())
+ VRegArgs.push_back(getOrCreateVReg(Arg));
+ bool Succeeded =
+ CLI->lowerFormalArguments(MIRBuilder, F.getArgumentList(), VRegArgs);
+ if (!Succeeded)
+ report_fatal_error("Unable to lower arguments");
+
+ for (const BasicBlock &BB: F) {
+ MachineBasicBlock &MBB = getOrCreateBB(BB);
+ // Set the insertion point of all the following translations to
+ // the end of this basic block.
+ MIRBuilder.setMBB(MBB);
+ for (const Instruction &Inst: BB) {
+ bool Succeeded = translate(Inst);
+ if (!Succeeded) {
+ DEBUG(dbgs() << "Cannot translate: " << Inst << '\n');
+ report_fatal_error("Unable to translate instruction");
+ }
+ }
+ }
+
+ // Now that the MachineFrameInfo has been configured, no further changes to
+ // the reserved registers are possible.
+ MRI->freezeReservedRegs(MF);
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
new file mode 100644
index 000000000000..2f19bcf1e68b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -0,0 +1,104 @@
+//===-- llvm/CodeGen/GlobalISel/MachineIRBuilder.cpp - MIBuilder--*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the MachineIRBuilder class.
+//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+void MachineIRBuilder::setMF(MachineFunction &MF) {
+ this->MF = &MF;
+ this->MBB = nullptr;
+ this->TII = MF.getSubtarget().getInstrInfo();
+ this->DL = DebugLoc();
+ this->MI = nullptr;
+}
+
+void MachineIRBuilder::setMBB(MachineBasicBlock &MBB, bool Beginning) {
+ this->MBB = &MBB;
+ Before = Beginning;
+ assert(&getMF() == MBB.getParent() &&
+ "Basic block is in a different function");
+}
+
+void MachineIRBuilder::setInstr(MachineInstr &MI, bool Before) {
+ assert(MI.getParent() && "Instruction is not part of a basic block");
+ setMBB(*MI.getParent());
+ this->MI = &MI;
+ this->Before = Before;
+}
+
+MachineBasicBlock::iterator MachineIRBuilder::getInsertPt() {
+ if (MI) {
+ if (Before)
+ return MI;
+ if (!MI->getNextNode())
+ return getMBB().end();
+ return MI->getNextNode();
+ }
+ return Before ? getMBB().begin() : getMBB().end();
+}
+
+//------------------------------------------------------------------------------
+// Build instruction variants.
+//------------------------------------------------------------------------------
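+// A typical use of these helpers, taken from how the IRTranslator drives the
+// builder (see IRTranslator::translateBinaryOp in this same change), looks
+// roughly like:
+//   MIRBuilder.setMBB(MBB);
+//   MIRBuilder.buildInstr(TargetOpcode::G_ADD, Inst.getType(), Res, Op0, Op1);
+// i.e. pick an insertion point first, then build instructions at it.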
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty) {
+ MachineInstr *NewMI = BuildMI(getMF(), DL, getTII().get(Opcode));
+ if (Ty) {
+ assert(isPreISelGenericOpcode(Opcode) &&
+ "Only generic instruction can have a type");
+ NewMI->setType(Ty);
+ } else
+ assert(!isPreISelGenericOpcode(Opcode) &&
+ "Generic instruction must have a type");
+ getMBB().insert(getInsertPt(), NewMI);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, unsigned Res,
+ unsigned Op0, unsigned Op1) {
+ return buildInstr(Opcode, nullptr, Res, Op0, Op1);
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty,
+ unsigned Res, unsigned Op0,
+ unsigned Op1) {
+ MachineInstr *NewMI = buildInstr(Opcode, Ty);
+ MachineInstrBuilder(getMF(), NewMI)
+ .addReg(Res, RegState::Define)
+ .addReg(Op0)
+ .addReg(Op1);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, unsigned Res,
+ unsigned Op0) {
+ MachineInstr *NewMI = buildInstr(Opcode, nullptr);
+ MachineInstrBuilder(getMF(), NewMI).addReg(Res, RegState::Define).addReg(Op0);
+ return NewMI;
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode) {
+ return buildInstr(Opcode, nullptr);
+}
+
+MachineInstr *MachineIRBuilder::buildInstr(unsigned Opcode, Type *Ty,
+ MachineBasicBlock &BB) {
+ MachineInstr *NewMI = buildInstr(Opcode, Ty);
+ MachineInstrBuilder(getMF(), NewMI).addMBB(&BB);
+ return NewMI;
+}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
new file mode 100644
index 000000000000..419e270c9127
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -0,0 +1,897 @@
+//===- llvm/CodeGen/GlobalISel/RegBankSelect.cpp - RegBankSelect -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegBankSelect class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "regbankselect"
+
+using namespace llvm;
+
+static cl::opt<RegBankSelect::Mode> RegBankSelectMode(
+ cl::desc("Mode of the RegBankSelect pass"), cl::Hidden, cl::Optional,
+ cl::values(clEnumValN(RegBankSelect::Mode::Fast, "regbankselect-fast",
+ "Run the Fast mode (default mapping)"),
+ clEnumValN(RegBankSelect::Mode::Greedy, "regbankselect-greedy",
+ "Use the Greedy mode (best local mapping)"),
+ clEnumValEnd));
+
+char RegBankSelect::ID = 0;
+INITIALIZE_PASS_BEGIN(RegBankSelect, "regbankselect",
+ "Assign register bank of generic virtual registers",
+ false, false);
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(RegBankSelect, "regbankselect",
+ "Assign register bank of generic virtual registers", false,
+ false);
+
+RegBankSelect::RegBankSelect(Mode RunningMode)
+ : MachineFunctionPass(ID), RBI(nullptr), MRI(nullptr), TRI(nullptr),
+ MBFI(nullptr), MBPI(nullptr), OptMode(RunningMode) {
+ initializeRegBankSelectPass(*PassRegistry::getPassRegistry());
+ if (RegBankSelectMode.getNumOccurrences() != 0) {
+ OptMode = RegBankSelectMode;
+ if (RegBankSelectMode != RunningMode)
+ DEBUG(dbgs() << "RegBankSelect mode overrided by command line\n");
+ }
+}
+
+void RegBankSelect::init(MachineFunction &MF) {
+ RBI = MF.getSubtarget().getRegBankInfo();
+ assert(RBI && "Cannot work without RegisterBankInfo");
+ MRI = &MF.getRegInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ if (OptMode != Mode::Fast) {
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ } else {
+ MBFI = nullptr;
+ MBPI = nullptr;
+ }
+ MIRBuilder.setMF(MF);
+}
+
+void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (OptMode != Mode::Fast) {
+ // We could preserve the information from these two analyses but
+ // the APIs do not allow us to do so yet.
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegBankSelect::assignmentMatch(
+ unsigned Reg, const RegisterBankInfo::ValueMapping &ValMapping,
+ bool &OnlyAssign) const {
+ // By default we assume we will have to repair something.
+ OnlyAssign = false;
+ // Each part of a breakdown needs to end up in a different register.
+ // In other words, the assignment of Reg does not match.
+ if (ValMapping.BreakDown.size() > 1)
+ return false;
+
+ const RegisterBank *CurRegBank = RBI->getRegBank(Reg, *MRI, *TRI);
+ const RegisterBank *DesiredRegBank = ValMapping.BreakDown[0].RegBank;
+ // Reg is free of assignment, so a simple assignment will make the
+ // register bank match.
+ OnlyAssign = CurRegBank == nullptr;
+ DEBUG(dbgs() << "Does assignment already match: ";
+ if (CurRegBank) dbgs() << *CurRegBank; else dbgs() << "none";
+ dbgs() << " against ";
+ assert(DesiredRegBank && "The mapping must be valid");
+ dbgs() << *DesiredRegBank << '\n';);
+ return CurRegBank == DesiredRegBank;
+}
+
+void RegBankSelect::repairReg(
+ MachineOperand &MO, const RegisterBankInfo::ValueMapping &ValMapping,
+ RegBankSelect::RepairingPlacement &RepairPt,
+ const iterator_range<SmallVectorImpl<unsigned>::const_iterator> &NewVRegs) {
+ assert(ValMapping.BreakDown.size() == 1 && "Not yet implemented");
+ // An empty range of new registers means no repairing.
+ assert(NewVRegs.begin() != NewVRegs.end() && "We should not have to repair");
+
+ // Assume we are repairing a use and thus, the original reg will be
+ // the source of the repairing.
+ unsigned Src = MO.getReg();
+ unsigned Dst = *NewVRegs.begin();
+
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(Src, Dst);
+
+ assert((RepairPt.getNumInsertPoints() == 1 ||
+ TargetRegisterInfo::isPhysicalRegister(Dst)) &&
+ "We are about to create several defs for Dst");
+
+ // Build the instruction used to repair, then clone it at the right places.
+ MachineInstr *MI = MIRBuilder.buildInstr(TargetOpcode::COPY, Dst, Src);
+ MI->removeFromParent();
+ DEBUG(dbgs() << "Copy: " << PrintReg(Src) << " to: " << PrintReg(Dst)
+ << '\n');
+ // TODO:
+ // Check if MI is legal. if not, we need to legalize all the
+ // instructions we are going to insert.
+ std::unique_ptr<MachineInstr *[]> NewInstrs(
+ new MachineInstr *[RepairPt.getNumInsertPoints()]);
+ bool IsFirst = true;
+ unsigned Idx = 0;
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ MachineInstr *CurMI;
+ if (IsFirst)
+ CurMI = MI;
+ else
+ CurMI = MIRBuilder.getMF().CloneMachineInstr(MI);
+ InsertPt->insert(*CurMI);
+ NewInstrs[Idx++] = CurMI;
+ IsFirst = false;
+ }
+ // TODO:
+ // Legalize NewInstrs if need be.
+}
+
+uint64_t RegBankSelect::getRepairCost(
+ const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ assert(MO.isReg() && "We should only repair register operand");
+ assert(!ValMapping.BreakDown.empty() && "Nothing to map??");
+
+ bool IsSameNumOfValues = ValMapping.BreakDown.size() == 1;
+ const RegisterBank *CurRegBank = RBI->getRegBank(MO.getReg(), *MRI, *TRI);
+ // If MO does not have a register bank, we should have just been
+ // able to set one unless we have to break the value down.
+ assert((!IsSameNumOfValues || CurRegBank) && "We should not have to repair");
+ // Def: Val <- NewDefs
+ // Same number of values: copy
+ // Different number: Val = build_sequence Defs1, Defs2, ...
+ // Use: NewSources <- Val.
+ // Same number of values: copy.
+ // Different number: Src1, Src2, ... =
+ // extract_value Val, Src1Begin, Src1Len, Src2Begin, Src2Len, ...
+ // We should remember that this value is available somewhere else to
+ // coalesce the value.
+
+ if (IsSameNumOfValues) {
+ const RegisterBank *DesiredRegBank = ValMapping.BreakDown[0].RegBank;
+ // If we repair a definition, swap the source and destination for
+ // the repairing.
+ if (MO.isDef())
+ std::swap(CurRegBank, DesiredRegBank);
+ // TODO: It may be possible to actually avoid the copy.
+ // If we repair something where the source is defined by a copy
+ // and the source of that copy is on the right bank, we can reuse
+ // it for free.
+ // E.g.,
+ // RegToRepair<BankA> = copy AlternativeSrc<BankB>
+ // = op RegToRepair<BankA>
+ // We can simply propagate AlternativeSrc instead of copying RegToRepair
+ // into a new virtual register.
+ // We would also need to propagate this information in the
+ // repairing placement.
+ unsigned Cost =
+ RBI->copyCost(*DesiredRegBank, *CurRegBank,
+ RegisterBankInfo::getSizeInBits(MO.getReg(), *MRI, *TRI));
+ // TODO: use a dedicated constant for ImpossibleCost.
+ if (Cost != UINT_MAX)
+ return Cost;
+ assert(false && "Legalization not available yet");
+ // Return the legalization cost of that repairing.
+ }
+ assert(false && "Complex repairing not implemented yet");
+ return 1;
+}
+
+RegisterBankInfo::InstructionMapping &RegBankSelect::findBestMapping(
+ MachineInstr &MI, RegisterBankInfo::InstructionMappings &PossibleMappings,
+ SmallVectorImpl<RepairingPlacement> &RepairPts) {
+
+ RegisterBankInfo::InstructionMapping *BestMapping = nullptr;
+ MappingCost Cost = MappingCost::ImpossibleCost();
+ SmallVector<RepairingPlacement, 4> LocalRepairPts;
+ for (RegisterBankInfo::InstructionMapping &CurMapping : PossibleMappings) {
+ MappingCost CurCost = computeMapping(MI, CurMapping, LocalRepairPts, &Cost);
+ if (CurCost < Cost) {
+ Cost = CurCost;
+ BestMapping = &CurMapping;
+ RepairPts.clear();
+ for (RepairingPlacement &RepairPt : LocalRepairPts)
+ RepairPts.emplace_back(std::move(RepairPt));
+ }
+ }
+ assert(BestMapping && "No suitable mapping for instruction");
+ return *BestMapping;
+}
+
+void RegBankSelect::tryAvoidingSplit(
+ RegBankSelect::RepairingPlacement &RepairPt, const MachineOperand &MO,
+ const RegisterBankInfo::ValueMapping &ValMapping) const {
+ const MachineInstr &MI = *MO.getParent();
+ assert(RepairPt.hasSplit() && "We should not have to adjust for split");
+ // Splitting should only occur for PHIs or between terminators,
+ // because we only do local repairing.
+ assert((MI.isPHI() || MI.isTerminator()) && "Why do we split?");
+
+ assert(&MI.getOperand(RepairPt.getOpIdx()) == &MO &&
+ "Repairing placement does not match operand");
+
+ // If we need splitting for phis, that means it is because we
+ // could not find an insertion point before the terminators of
+ // the predecessor block for this argument. In other words,
+ // the input value is defined by one of the terminators.
+ assert((!MI.isPHI() || !MO.isDef()) && "Need split for phi def?");
+
+ // We split to repair the use of a phi or a terminator.
+ if (!MO.isDef()) {
+ if (MI.isTerminator()) {
+ assert(&MI != &(*MI.getParent()->getFirstTerminator()) &&
+ "Need to split for the first terminator?!");
+ } else {
+ // For the PHI case, the split may not be actually required.
+ // In the copy case, a phi is already a copy on the incoming edge,
+ // therefore there is no need to split.
+ if (ValMapping.BreakDown.size() == 1)
+ // This is already a copy; there is nothing to do.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Reassign);
+ }
+ return;
+ }
+
+ // At this point, we need to repair a definition of a terminator.
+
+ // Technically we need to fix the def of MI on all outgoing
+ // edges of MI to keep the repairing local. In other words, we
+ // will create several definitions of the same register. This
+ // does not work for SSA unless that definition is a physical
+ // register.
+ // However, there are other cases where we can get away with
+ // that while still keeping the repairing local.
+ assert(MI.isTerminator() && MO.isDef() &&
+ "This code is for the def of a terminator");
+
+ // Since we use RPO traversal, if we need to repair a definition
+ // this means this definition could be:
+ // 1. Used by PHIs (i.e., this VReg has been visited as part of the
+ // uses of a phi.), or
+ // 2. Part of a target specific instruction (i.e., the target applied
+ // some register class constraints when creating the instruction.)
+ // If the constraints come for #2, the target said that another mapping
+ // is supported so we may just drop them. Indeed, if we do not change
+ // the number of registers holding that value, the uses will get fixed
+ // when we get to them.
+ // Uses in PHIs may already have been processed, though.
+ // If the constraints come for #1, then, those are weak constraints and
+ // no actual uses may rely on them. However, the problem remains mainly
+ // the same as for #2. If the value stays in one register, we could
+ // just switch the register bank of the definition, but we would need to
+ // account for a repairing cost for each phi we silently change.
+ //
+ // In any case, if the value needs to be broken down into several
+ // registers, the repairing is not local anymore as we need to patch
+ // every uses to rebuild the value in just one register.
+ //
+ // To summarize:
+ // - If the value is in a physical register, we can do the split and
+ // fix locally.
+ // Otherwise if the value is in a virtual register:
+ // - If the value remains in one register, we do not have to split
+ // just switching the register bank would do, but we need to account
+ // in the repairing cost all the phi we changed.
+ // - If the value spans several registers, then we cannot do a local
+ // repairing.
+
+ // Check if this is a physical or virtual register.
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // We are going to split every outgoing edge.
+ // Check that this is possible.
+ // FIXME: The machine representation is currently broken
+ // since it also allows several terminators in one basic block.
+ // Because of that we would technically need a way to get
+ // the targets of just one terminator to know which edges
+ // we have to split.
+ // Assert that we do not hit the ill-formed representation.
+
+ // If there are other terminators before that one, some of
+ // the outgoing edges may not be dominated by this definition.
+ assert(&MI == &(*MI.getParent()->getFirstTerminator()) &&
+ "Do not know which outgoing edges are relevant");
+ const MachineInstr *Next = MI.getNextNode();
+ assert((!Next || Next->isUnconditionalBranch()) &&
+ "Do not know where each terminator ends up");
+ if (Next)
+ // If the next terminator uses Reg, this means we have
+ // to split right after MI and thus we need a way to ask
+ // which outgoing edges are affected.
+ assert(!Next->readsRegister(Reg) && "Need to split between terminators");
+ // We will split all the edges and repair there.
+ } else {
+ // This is a virtual register defined by a terminator.
+ if (ValMapping.BreakDown.size() == 1) {
+ // There is nothing to repair, but we may actually lie about
+ // the repairing cost because of the PHIs already processed,
+ // as already stated.
+ // Though the code will be correct.
+ assert(0 && "Repairing cost may not be accurate");
+ } else {
+ // We need to do non-local repairing. Basically, patch all
+ // the uses (i.e., phis) that we already processed.
+ // For now, just say this mapping is not possible.
+ RepairPt.switchTo(RepairingPlacement::RepairingKind::Impossible);
+ }
+ }
+}
+
+RegBankSelect::MappingCost RegBankSelect::computeMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RepairingPlacement> &RepairPts,
+ const RegBankSelect::MappingCost *BestCost) {
+ assert((MBFI || !BestCost) && "Costs comparison require MBFI");
+
+ // If mapped with InstrMapping, MI will have the recorded cost.
+ MappingCost Cost(MBFI ? MBFI->getBlockFreq(MI.getParent()) : 1);
+ bool Saturated = Cost.addLocalCost(InstrMapping.getCost());
+ assert(!Saturated && "Possible mapping saturated the cost");
+ DEBUG(dbgs() << "Evaluating mapping cost for: " << MI);
+ DEBUG(dbgs() << "With: " << InstrMapping << '\n');
+ RepairPts.clear();
+ if (BestCost && Cost > *BestCost)
+ return Cost;
+
+ // Moreover, to realize this mapping, the register bank of each operand must
+ // match this mapping. In other words, we may need to locally reassign the
+ // register banks. Account for that repairing cost as well.
+ // In this context, local means in the surrounding of MI.
+ for (unsigned OpIdx = 0, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ DEBUG(dbgs() << "Opd" << OpIdx);
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ // If Reg is already properly mapped, this is free.
+ bool Assign;
+ if (assignmentMatch(Reg, ValMapping, Assign)) {
+ DEBUG(dbgs() << " is free (match).\n");
+ continue;
+ }
+ if (Assign) {
+ DEBUG(dbgs() << " is free (simple assignment).\n");
+ RepairPts.emplace_back(RepairingPlacement(MI, OpIdx, *TRI, *this,
+ RepairingPlacement::Reassign));
+ continue;
+ }
+
+ // Find the insertion point for the repairing code.
+ RepairPts.emplace_back(
+ RepairingPlacement(MI, OpIdx, *TRI, *this, RepairingPlacement::Insert));
+ RepairingPlacement &RepairPt = RepairPts.back();
+
+ // If we need to split a basic block to materialize this insertion point,
+ // we may give a higher cost to this mapping.
+ // Nevertheless, we may get away with the split, so try that first.
+ if (RepairPt.hasSplit())
+ tryAvoidingSplit(RepairPt, MO, ValMapping);
+
+ // Check that the materialization of the repairing is possible.
+ if (!RepairPt.canMaterialize())
+ return MappingCost::ImpossibleCost();
+
+ // Account for the split cost and repair cost.
+ // Unless the cost is already saturated or we do not care about the cost.
+ if (!BestCost || Saturated)
+ continue;
+
+ // To get accurate information we need MBFI and MBPI.
+ // Thus, if we end up here this information should be here.
+ assert(MBFI && MBPI && "Cost computation requires MBFI and MBPI");
+
+ // FIXME: We will have to rework the repairing cost model.
+ // The repairing cost depends on the register bank that MO has.
+ // However, when we break down the value into different values,
+ // MO may not have a register bank while still needing repairing.
+ // For the fast mode, we don't compute the cost so that is fine,
+ // but still for the repairing code, we will have to make a choice.
+ // For the greedy mode, we should choose greedily what is the best
+ // choice based on the next use of MO.
+
+ // Sums up the repairing cost of MO at each insertion point.
+ uint64_t RepairCost = getRepairCost(MO, ValMapping);
+ // Bias used for splitting: 5%.
+ const uint64_t PercentageForBias = 5;
+ uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100;
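+ // Illustrative arithmetic only: a RepairCost of 1 gives a Bias of
+ // (1 * 5 + 99) / 100 = 1, and a RepairCost of 40 gives
+ // (40 * 5 + 99) / 100 = 2, i.e. 5% of the repair cost rounded up.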
+ // We should not need more than a couple of instructions to repair
+ // an assignment. In other words, the computation should not
+ // overflow because the repairing cost is free of basic block
+ // frequency.
+ assert(((RepairCost < RepairCost * PercentageForBias) &&
+ (RepairCost * PercentageForBias <
+ RepairCost * PercentageForBias + 99)) &&
+ "Repairing involves more than a billion of instructions?!");
+ for (const std::unique_ptr<InsertPoint> &InsertPt : RepairPt) {
+ assert(InsertPt->canMaterialize() && "We should not have made it here");
+ // We will apply some basic block frequency, and those frequencies use uint64_t.
+ if (!InsertPt->isSplit())
+ Saturated = Cost.addLocalCost(RepairCost);
+ else {
+ uint64_t CostForInsertPt = RepairCost;
+ // Again we shouldn't overflow here given that
+ // CostForInsertPt is frequency free at this point.
+ assert(CostForInsertPt + Bias > CostForInsertPt &&
+ "Repairing + split bias overflows");
+ CostForInsertPt += Bias;
+ uint64_t PtCost = InsertPt->frequency(*this) * CostForInsertPt;
+ // Check if we just overflowed.
+ if ((Saturated = PtCost < CostForInsertPt))
+ Cost.saturate();
+ else
+ Saturated = Cost.addNonLocalCost(PtCost);
+ }
+
+ // Stop looking into what it takes to repair, this is already
+ // too expensive.
+ if (BestCost && Cost > *BestCost)
+ return Cost;
+
+ // No need to accumulate more cost information.
+ // We need to still gather the repairing information though.
+ if (Saturated)
+ break;
+ }
+ }
+ return Cost;
+}
+
+void RegBankSelect::applyMapping(
+ MachineInstr &MI, const RegisterBankInfo::InstructionMapping &InstrMapping,
+ SmallVectorImpl<RegBankSelect::RepairingPlacement> &RepairPts) {
+ // OpdMapper will hold all the information needed for the rewriting.
+ RegisterBankInfo::OperandsMapper OpdMapper(MI, InstrMapping, *MRI);
+
+ // First, place the repairing code.
+ for (RepairingPlacement &RepairPt : RepairPts) {
+ assert(RepairPt.canMaterialize() &&
+ RepairPt.getKind() != RepairingPlacement::Impossible &&
+ "This mapping is impossible");
+ assert(RepairPt.getKind() != RepairingPlacement::None &&
+ "This should not make its way in the list");
+ unsigned OpIdx = RepairPt.getOpIdx();
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ const RegisterBankInfo::ValueMapping &ValMapping =
+ InstrMapping.getOperandMapping(OpIdx);
+ unsigned BreakDownSize = ValMapping.BreakDown.size();
+ (void)BreakDownSize;
+ unsigned Reg = MO.getReg();
+
+ switch (RepairPt.getKind()) {
+ case RepairingPlacement::Reassign:
+ assert(BreakDownSize == 1 &&
+ "Reassignment should only be for simple mapping");
+ MRI->setRegBank(Reg, *ValMapping.BreakDown[0].RegBank);
+ break;
+ case RepairingPlacement::Insert:
+ OpdMapper.createVRegs(OpIdx);
+ repairReg(MO, ValMapping, RepairPt, OpdMapper.getVRegs(OpIdx));
+ break;
+ default:
+ llvm_unreachable("Other kind should not happen");
+ }
+ }
+ // Second, rewrite the instruction.
+ DEBUG(dbgs() << "Actual mapping of the operands: " << OpdMapper << '\n');
+ RBI->applyMapping(OpdMapper);
+}
+
+void RegBankSelect::assignInstr(MachineInstr &MI) {
+ DEBUG(dbgs() << "Assign: " << MI);
+ // Remember the repairing placement for all the operands.
+ SmallVector<RepairingPlacement, 4> RepairPts;
+
+ RegisterBankInfo::InstructionMapping BestMapping;
+ if (OptMode == RegBankSelect::Mode::Fast) {
+ BestMapping = RBI->getInstrMapping(MI);
+ MappingCost DefaultCost = computeMapping(MI, BestMapping, RepairPts);
+ (void)DefaultCost;
+ assert(DefaultCost != MappingCost::ImpossibleCost() &&
+ "Default mapping is not suited");
+ } else {
+ RegisterBankInfo::InstructionMappings PossibleMappings =
+ RBI->getInstrPossibleMappings(MI);
+ assert(!PossibleMappings.empty() &&
+ "Do not know how to map this instruction");
+ BestMapping = std::move(findBestMapping(MI, PossibleMappings, RepairPts));
+ }
+ // Make sure the mapping is valid for MI.
+ assert(BestMapping.verify(MI) && "Invalid instruction mapping");
+
+ DEBUG(dbgs() << "Mapping: " << BestMapping << '\n');
+
+ // After this call, MI may not be valid anymore.
+ // Do not use it.
+ applyMapping(MI, BestMapping, RepairPts);
+}
+
+bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG(dbgs() << "Assign register banks for: " << MF.getName() << '\n');
+ const Function *F = MF.getFunction();
+ Mode SaveOptMode = OptMode;
+ if (F->hasFnAttribute(Attribute::OptimizeNone))
+ OptMode = Mode::Fast;
+ init(MF);
+ // Walk the function and assign register banks to all operands.
+ // Use a RPOT to make sure all registers are assigned before we choose
+ // the best mapping of the current instruction.
+ ReversePostOrderTraversal<MachineFunction*> RPOT(&MF);
+ for (MachineBasicBlock *MBB : RPOT) {
+ // Set a sensible insertion point so that subsequent calls to
+ // MIRBuilder insert any repairing code into the right block.
+ MIRBuilder.setMBB(*MBB);
+ for (MachineBasicBlock::iterator MII = MBB->begin(), End = MBB->end();
+ MII != End;) {
+ // MI might be invalidated by the assignment, so move the
+ // iterator before hand.
+ assignInstr(*MII++);
+ }
+ }
+ OptMode = SaveOptMode;
+ return false;
+}
+
+//------------------------------------------------------------------------------
+// Helper Classes Implementation
+//------------------------------------------------------------------------------
+RegBankSelect::RepairingPlacement::RepairingPlacement(
+ MachineInstr &MI, unsigned OpIdx, const TargetRegisterInfo &TRI, Pass &P,
+ RepairingPlacement::RepairingKind Kind)
+ // Default is, we are going to insert code to repair OpIdx.
+ : Kind(Kind),
+ OpIdx(OpIdx),
+ CanMaterialize(Kind != RepairingKind::Impossible),
+ HasSplit(false),
+ P(P) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ assert(MO.isReg() && "Trying to repair a non-reg operand");
+
+ if (Kind != RepairingKind::Insert)
+ return;
+
+ // Repairings for definitions happen after MI, uses happen before.
+ bool Before = !MO.isDef();
+
+ // Check if we are done with MI.
+ if (!MI.isPHI() && !MI.isTerminator()) {
+ addInsertPoint(MI, Before);
+ // We are done with the initialization.
+ return;
+ }
+
+ // Now, look for the special cases.
+ if (MI.isPHI()) {
+    // - PHIs must be the first instructions:
+ // * Before, we have to split the related incoming edge.
+ // * After, move the insertion point past the last phi.
+ if (!Before) {
+ MachineBasicBlock::iterator It = MI.getParent()->getFirstNonPHI();
+ if (It != MI.getParent()->end())
+ addInsertPoint(*It, /*Before*/ true);
+ else
+ addInsertPoint(*(--It), /*Before*/ false);
+ return;
+ }
+ // We repair a use of a phi, we may need to split the related edge.
+ MachineBasicBlock &Pred = *MI.getOperand(OpIdx + 1).getMBB();
+ // Check if we can move the insertion point prior to the
+ // terminators of the predecessor.
+ unsigned Reg = MO.getReg();
+ MachineBasicBlock::iterator It = Pred.getLastNonDebugInstr();
+ for (auto Begin = Pred.begin(); It != Begin && It->isTerminator(); --It)
+ if (It->modifiesRegister(Reg, &TRI)) {
+ // We cannot hoist the repairing code in the predecessor.
+ // Split the edge.
+ addInsertPoint(Pred, *MI.getParent());
+ return;
+ }
+ // At this point, we can insert in Pred.
+
+ // - If It is invalid, Pred is empty and we can insert in Pred
+ // wherever we want.
+    // - If It is valid, It points at the last non-terminator of Pred;
+    //   insert right after It.
+ if (It == Pred.end())
+ addInsertPoint(Pred, /*Beginning*/ false);
+ else
+ addInsertPoint(*It, /*Before*/ false);
+ } else {
+ // - Terminators must be the last instructions:
+ // * Before, move the insert point before the first terminator.
+    //    * After, we have to split the outgoing edges.
+ unsigned Reg = MO.getReg();
+ if (Before) {
+ // Check whether Reg is defined by any terminator.
+ MachineBasicBlock::iterator It = MI;
+ for (auto Begin = MI.getParent()->begin();
+ --It != Begin && It->isTerminator();)
+ if (It->modifiesRegister(Reg, &TRI)) {
+ // Insert the repairing code right after the definition.
+ addInsertPoint(*It, /*Before*/ false);
+ return;
+ }
+ addInsertPoint(*It, /*Before*/ true);
+ return;
+ }
+ // Make sure Reg is not redefined by other terminators, otherwise
+ // we do not know how to split.
+ for (MachineBasicBlock::iterator It = MI, End = MI.getParent()->end();
+ ++It != End;)
+ // The machine verifier should reject this kind of code.
+      assert(!It->modifiesRegister(Reg, &TRI) && "Do not know where to split");
+    // Split each outgoing edge.
+ MachineBasicBlock &Src = *MI.getParent();
+ for (auto &Succ : Src.successors())
+ addInsertPoint(Src, Succ);
+ }
+}
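+
+// A small sketch of the PHI-use case handled above (block and register
+// names are hypothetical):
+//
+//   bb.2:
+//     %2 = PHI %0, %bb.0, %1, %bb.1
+//
+// Repairing the use of %0 cannot happen right before the PHI, so the
+// insertion point is hoisted into bb.0, after its last non-terminator.
+// If a terminator of bb.0 redefined %0, the edge bb.0 -> bb.2 would be
+// split instead (see EdgeInsertPoint below).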
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineInstr &MI,
+ bool Before) {
+ addInsertPoint(*new InstrInsertPoint(MI, Before));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &MBB,
+ bool Beginning) {
+ addInsertPoint(*new MBBInsertPoint(MBB, Beginning));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(MachineBasicBlock &Src,
+ MachineBasicBlock &Dst) {
+ addInsertPoint(*new EdgeInsertPoint(Src, Dst, P));
+}
+
+void RegBankSelect::RepairingPlacement::addInsertPoint(
+ RegBankSelect::InsertPoint &Point) {
+ CanMaterialize &= Point.canMaterialize();
+ HasSplit |= Point.isSplit();
+ InsertPoints.emplace_back(&Point);
+}
+
+RegBankSelect::InstrInsertPoint::InstrInsertPoint(MachineInstr &Instr,
+ bool Before)
+ : InsertPoint(), Instr(Instr), Before(Before) {
+ // Since we do not support splitting, we do not need to update
+ // liveness and such, so do not do anything with P.
+ assert((!Before || !Instr.isPHI()) &&
+ "Splitting before phis requires more points");
+ assert((!Before || !Instr.getNextNode() || !Instr.getNextNode()->isPHI()) &&
+ "Splitting between phis does not make sense");
+}
+
+void RegBankSelect::InstrInsertPoint::materialize() {
+ if (isSplit()) {
+ // Slice and return the beginning of the new block.
+    // If we need to split between the terminators, we theoretically
+    // need to know where the first and second set of terminators end
+    // to update the successors properly.
+    // Now, in practice, we should have a maximum of 2 branch
+ // instructions; one conditional and one unconditional. Therefore
+ // we know how to update the successor by looking at the target of
+ // the unconditional branch.
+    // If we end up splitting at some point, then we should update
+ // the liveness information and such. I.e., we would need to
+ // access P here.
+ // The machine verifier should actually make sure such cases
+ // cannot happen.
+ llvm_unreachable("Not yet implemented");
+ }
+ // Otherwise the insertion point is just the current or next
+ // instruction depending on Before. I.e., there is nothing to do
+ // here.
+}
+
+bool RegBankSelect::InstrInsertPoint::isSplit() const {
+ // If the insertion point is after a terminator, we need to split.
+ if (!Before)
+ return Instr.isTerminator();
+ // If we insert before an instruction that is after a terminator,
+ // we are still after a terminator.
+ return Instr.getPrevNode() && Instr.getPrevNode()->isTerminator();
+}
+
+uint64_t RegBankSelect::InstrInsertPoint::frequency(const Pass &P) const {
+ // Even if we need to split, because we insert between terminators,
+ // this split has actually the same frequency as the instruction.
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(Instr.getParent()).getFrequency();
+}
+
+uint64_t RegBankSelect::MBBInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ return MBFI->getBlockFreq(&MBB).getFrequency();
+}
+
+void RegBankSelect::EdgeInsertPoint::materialize() {
+ // If we end up repairing twice at the same place before materializing the
+ // insertion point, we may think we have to split an edge twice.
+ // We should have a factory for the insert point such that identical points
+ // are the same instance.
+ assert(Src.isSuccessor(DstOrSplit) && DstOrSplit->isPredecessor(&Src) &&
+ "This point has already been split");
+ MachineBasicBlock *NewBB = Src.SplitCriticalEdge(DstOrSplit, P);
+ assert(NewBB && "Invalid call to materialize");
+ // We reuse the destination block to hold the information of the new block.
+ DstOrSplit = NewBB;
+}
+
+uint64_t RegBankSelect::EdgeInsertPoint::frequency(const Pass &P) const {
+ const MachineBlockFrequencyInfo *MBFI =
+ P.getAnalysisIfAvailable<MachineBlockFrequencyInfo>();
+ if (!MBFI)
+ return 1;
+ if (WasMaterialized)
+ return MBFI->getBlockFreq(DstOrSplit).getFrequency();
+
+ const MachineBranchProbabilityInfo *MBPI =
+ P.getAnalysisIfAvailable<MachineBranchProbabilityInfo>();
+ if (!MBPI)
+ return 1;
+ // The basic block will be on the edge.
+ return (MBFI->getBlockFreq(&Src) * MBPI->getEdgeProbability(&Src, DstOrSplit))
+ .getFrequency();
+}
+
+bool RegBankSelect::EdgeInsertPoint::canMaterialize() const {
+ // If this is not a critical edge, we should not have used this insert
+  // point. Indeed, the repairing code should have been inserted in the
+  // successor or the predecessor instead.
+ assert(Src.succ_size() > 1 && DstOrSplit->pred_size() > 1 &&
+ "Edge is not critical");
+ return Src.canSplitCriticalEdge(DstOrSplit);
+}
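+
+// For illustration, an edge Src -> Dst is critical when Src has several
+// successors and Dst has several predecessors, e.g. (hypothetical CFG):
+//
+//   bb.0 -> bb.2, bb.3
+//   bb.1 -> bb.2
+//
+// The edge bb.0 -> bb.2 is critical: repairing code cannot go at the end
+// of bb.0 (it would also execute on the path to bb.3) nor at the start of
+// bb.2 (it would also execute on the path from bb.1), hence the split.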
+
+RegBankSelect::MappingCost::MappingCost(const BlockFrequency &LocalFreq)
+ : LocalCost(0), NonLocalCost(0), LocalFreq(LocalFreq.getFrequency()) {}
+
+bool RegBankSelect::MappingCost::addLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (LocalCost + Cost < LocalCost) {
+ saturate();
+ return true;
+ }
+ LocalCost += Cost;
+ return isSaturated();
+}
+
+bool RegBankSelect::MappingCost::addNonLocalCost(uint64_t Cost) {
+ // Check if this overflows.
+ if (NonLocalCost + Cost < NonLocalCost) {
+ saturate();
+ return true;
+ }
+ NonLocalCost += Cost;
+ return isSaturated();
+}
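+
+// The overflow checks above rely on unsigned wraparound: with 64-bit
+// costs, e.g. LocalCost == UINT64_MAX - 5 and Cost == 10, the sum wraps
+// around to 4, which is smaller than the original LocalCost, so the
+// mapping cost saturates instead of silently becoming cheap.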
+
+bool RegBankSelect::MappingCost::isSaturated() const {
+ return LocalCost == UINT64_MAX - 1 && NonLocalCost == UINT64_MAX &&
+ LocalFreq == UINT64_MAX;
+}
+
+void RegBankSelect::MappingCost::saturate() {
+ *this = ImpossibleCost();
+ --LocalCost;
+}
+
+RegBankSelect::MappingCost RegBankSelect::MappingCost::ImpossibleCost() {
+ return MappingCost(UINT64_MAX, UINT64_MAX, UINT64_MAX);
+}
+
+bool RegBankSelect::MappingCost::operator<(const MappingCost &Cost) const {
+ // Sort out the easy cases.
+ if (*this == Cost)
+ return false;
+ // If one is impossible to realize the other is cheaper unless it is
+ // impossible as well.
+ if ((*this == ImpossibleCost()) || (Cost == ImpossibleCost()))
+ return (*this == ImpossibleCost()) < (Cost == ImpossibleCost());
+ // If one is saturated the other is cheaper, unless it is saturated
+ // as well.
+ if (isSaturated() || Cost.isSaturated())
+ return isSaturated() < Cost.isSaturated();
+ // At this point we know both costs hold sensible values.
+
+  // If the two values have different base frequencies, there is not
+  // much we can do but scale everything.
+  // However, if they have the same base frequency, we can avoid the
+  // more complicated computation.
+ uint64_t ThisLocalAdjust;
+ uint64_t OtherLocalAdjust;
+ if (LLVM_LIKELY(LocalFreq == Cost.LocalFreq)) {
+
+ // At this point, we know the local costs are comparable.
+    // Handle the case that does not involve potential overflow first.
+ if (NonLocalCost == Cost.NonLocalCost)
+ // Since the non-local costs do not discriminate on the result,
+ // just compare the local costs.
+ return LocalCost < Cost.LocalCost;
+
+ // The base costs are comparable so we may only keep the relative
+ // value to increase our chances of avoiding overflows.
+ ThisLocalAdjust = 0;
+ OtherLocalAdjust = 0;
+ if (LocalCost < Cost.LocalCost)
+ OtherLocalAdjust = Cost.LocalCost - LocalCost;
+ else
+ ThisLocalAdjust = LocalCost - Cost.LocalCost;
+
+ } else {
+ ThisLocalAdjust = LocalCost;
+ OtherLocalAdjust = Cost.LocalCost;
+ }
+
+ // The non-local costs are comparable, just keep the relative value.
+ uint64_t ThisNonLocalAdjust = 0;
+ uint64_t OtherNonLocalAdjust = 0;
+ if (NonLocalCost < Cost.NonLocalCost)
+ OtherNonLocalAdjust = Cost.NonLocalCost - NonLocalCost;
+ else
+ ThisNonLocalAdjust = NonLocalCost - Cost.NonLocalCost;
+ // Scale everything to make them comparable.
+ uint64_t ThisScaledCost = ThisLocalAdjust * LocalFreq;
+ // Check for overflow on that operation.
+ bool ThisOverflows = ThisLocalAdjust && (ThisScaledCost < ThisLocalAdjust ||
+ ThisScaledCost < LocalFreq);
+ uint64_t OtherScaledCost = OtherLocalAdjust * Cost.LocalFreq;
+ // Check for overflow on the last operation.
+ bool OtherOverflows =
+ OtherLocalAdjust &&
+ (OtherScaledCost < OtherLocalAdjust || OtherScaledCost < Cost.LocalFreq);
+ // Add the non-local costs.
+ ThisOverflows |= ThisNonLocalAdjust &&
+ ThisScaledCost + ThisNonLocalAdjust < ThisNonLocalAdjust;
+ ThisScaledCost += ThisNonLocalAdjust;
+ OtherOverflows |= OtherNonLocalAdjust &&
+ OtherScaledCost + OtherNonLocalAdjust < OtherNonLocalAdjust;
+ OtherScaledCost += OtherNonLocalAdjust;
+ // If both overflows, we cannot compare without additional
+ // precision, e.g., APInt. Just give up on that case.
+ if (ThisOverflows && OtherOverflows)
+ return false;
+ // If one overflows but not the other, we can still compare.
+ if (ThisOverflows || OtherOverflows)
+ return ThisOverflows < OtherOverflows;
+ // Otherwise, just compare the values.
+ return ThisScaledCost < OtherScaledCost;
+}
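+
+// A worked example of the comparison above with equal base frequencies
+// (all numbers hypothetical): this = {LocalCost: 3, NonLocalCost: 10,
+// LocalFreq: 100} versus other = {LocalCost: 5, NonLocalCost: 2,
+// LocalFreq: 100}. The local adjusts are 0 and 2 and the non-local
+// adjusts are 8 and 0, so the scaled costs are 0 * 100 + 8 = 8 and
+// 2 * 100 + 0 = 200, and this mapping is the cheaper one: a smaller
+// local cost in a hot block outweighs a larger non-local cost.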
+
+bool RegBankSelect::MappingCost::operator==(const MappingCost &Cost) const {
+ return LocalCost == Cost.LocalCost && NonLocalCost == Cost.NonLocalCost &&
+ LocalFreq == Cost.LocalFreq;
+}
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
new file mode 100644
index 000000000000..a911225b5af5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBank.cpp
@@ -0,0 +1,107 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBank.cpp - Register Bank --*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBank class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define DEBUG_TYPE "registerbank"
+
+using namespace llvm;
+
+const unsigned RegisterBank::InvalidID = UINT_MAX;
+
+RegisterBank::RegisterBank() : ID(InvalidID), Name(nullptr), Size(0) {}
+
+bool RegisterBank::verify(const TargetRegisterInfo &TRI) const {
+ assert(isValid() && "Invalid register bank");
+ assert(ContainedRegClasses.size() == TRI.getNumRegClasses() &&
+ "TRI does not match the initialization process?");
+ for (unsigned RCId = 0, End = TRI.getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI.getRegClass(RCId);
+
+ if (!covers(RC))
+ continue;
+ // Verify that the register bank covers all the sub classes of the
+ // classes it covers.
+
+ // Use a different (slow in that case) method than
+ // RegisterBankInfo to find the subclasses of RC, to make sure
+ // both agree on the covers.
+ for (unsigned SubRCId = 0; SubRCId != End; ++SubRCId) {
+      const TargetRegisterClass &SubRC = *TRI.getRegClass(SubRCId);
+
+ if (!RC.hasSubClassEq(&SubRC))
+ continue;
+
+ // Verify that the Size of the register bank is big enough to cover
+ // all the register classes it covers.
+ assert((getSize() >= SubRC.getSize() * 8) &&
+ "Size is not big enough for all the subclasses!");
+ assert(covers(SubRC) && "Not all subclasses are covered");
+ }
+ }
+ return true;
+}
+
+bool RegisterBank::covers(const TargetRegisterClass &RC) const {
+ assert(isValid() && "RB hasn't been initialized yet");
+ return ContainedRegClasses.test(RC.getID());
+}
+
+bool RegisterBank::isValid() const {
+ return ID != InvalidID && Name != nullptr && Size != 0 &&
+ // A register bank that does not cover anything is useless.
+ !ContainedRegClasses.empty();
+}
+
+bool RegisterBank::operator==(const RegisterBank &OtherRB) const {
+ // There must be only one instance of a given register bank alive
+ // for the whole compilation.
+ // The RegisterBankInfo is supposed to enforce that.
+ assert((OtherRB.getID() != getID() || &OtherRB == this) &&
+ "ID does not uniquely identify a RegisterBank");
+ return &OtherRB == this;
+}
+
+void RegisterBank::dump(const TargetRegisterInfo *TRI) const {
+ print(dbgs(), /* IsForDebug */ true, TRI);
+}
+
+void RegisterBank::print(raw_ostream &OS, bool IsForDebug,
+ const TargetRegisterInfo *TRI) const {
+ OS << getName();
+ if (!IsForDebug)
+ return;
+ OS << "(ID:" << getID() << ", Size:" << getSize() << ")\n"
+ << "isValid:" << isValid() << '\n'
+ << "Number of Covered register classes: " << ContainedRegClasses.count()
+ << '\n';
+ // Print all the subclasses if we can.
+  // The register classes may not be properly initialized yet.
+ if (!TRI || ContainedRegClasses.empty())
+ return;
+ assert(ContainedRegClasses.size() == TRI->getNumRegClasses() &&
+ "TRI does not match the initialization process?");
+ bool IsFirst = true;
+ OS << "Covered register classes:\n";
+ for (unsigned RCId = 0, End = TRI->getNumRegClasses(); RCId != End; ++RCId) {
+ const TargetRegisterClass &RC = *TRI->getRegClass(RCId);
+
+ if (!covers(RC))
+ continue;
+
+ if (!IsFirst)
+ OS << ", ";
+ OS << TRI->getRegClassName(&RC);
+ IsFirst = false;
+ }
+}
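+
+// For reference, dumping a hypothetical bank named "GPR" (ID 0, 32 bits
+// wide, covering three classes) would produce output of the form:
+//
+//   GPR(ID:0, Size:32)
+//   isValid:1
+//   Number of Covered register classes: 3
+//   Covered register classes:
+//   GPR32, GPR32sp, tGPR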
diff --git a/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
new file mode 100644
index 000000000000..ef8e4f6d6851
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -0,0 +1,663 @@
+//===- llvm/CodeGen/GlobalISel/RegisterBankInfo.cpp --------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the RegisterBankInfo class.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#include <algorithm> // For std::max.
+
+#define DEBUG_TYPE "registerbankinfo"
+
+using namespace llvm;
+
+const unsigned RegisterBankInfo::DefaultMappingID = UINT_MAX;
+const unsigned RegisterBankInfo::InvalidMappingID = UINT_MAX - 1;
+
+//------------------------------------------------------------------------------
+// RegisterBankInfo implementation.
+//------------------------------------------------------------------------------
+RegisterBankInfo::RegisterBankInfo(unsigned NumRegBanks)
+ : NumRegBanks(NumRegBanks) {
+ RegBanks.reset(new RegisterBank[NumRegBanks]);
+}
+
+bool RegisterBankInfo::verify(const TargetRegisterInfo &TRI) const {
+ DEBUG(for (unsigned Idx = 0, End = getNumRegBanks(); Idx != End; ++Idx) {
+ const RegisterBank &RegBank = getRegBank(Idx);
+ assert(Idx == RegBank.getID() &&
+ "ID does not match the index in the array");
+ dbgs() << "Verify " << RegBank << '\n';
+ assert(RegBank.verify(TRI) && "RegBank is invalid");
+ });
+ return true;
+}
+
+void RegisterBankInfo::createRegisterBank(unsigned ID, const char *Name) {
+ DEBUG(dbgs() << "Create register bank: " << ID << " with name \"" << Name
+ << "\"\n");
+ RegisterBank &RegBank = getRegBank(ID);
+ assert(RegBank.getID() == RegisterBank::InvalidID &&
+ "A register bank should be created only once");
+ RegBank.ID = ID;
+ RegBank.Name = Name;
+}
+
+void RegisterBankInfo::addRegBankCoverage(unsigned ID, unsigned RCId,
+ const TargetRegisterInfo &TRI,
+ bool AddTypeMapping) {
+ RegisterBank &RB = getRegBank(ID);
+ unsigned NbOfRegClasses = TRI.getNumRegClasses();
+
+ DEBUG(dbgs() << "Add coverage for: " << RB << '\n');
+
+  // Check if RB is under construction.
+ if (!RB.isValid())
+ RB.ContainedRegClasses.resize(NbOfRegClasses);
+ else if (RB.covers(*TRI.getRegClass(RCId)))
+ // If RB already covers this register class, there is nothing
+ // to do.
+ return;
+
+ BitVector &Covered = RB.ContainedRegClasses;
+ SmallVector<unsigned, 8> WorkList;
+
+ WorkList.push_back(RCId);
+ Covered.set(RCId);
+
+ unsigned &MaxSize = RB.Size;
+ do {
+ unsigned RCId = WorkList.pop_back_val();
+
+ const TargetRegisterClass &CurRC = *TRI.getRegClass(RCId);
+
+ DEBUG(dbgs() << "Examine: " << TRI.getRegClassName(&CurRC)
+ << "(Size*8: " << (CurRC.getSize() * 8) << ")\n");
+
+ // Remember the biggest size in bits.
+ MaxSize = std::max(MaxSize, CurRC.getSize() * 8);
+
+ // If we have been asked to record the type supported by this
+ // register bank, do it now.
+ if (AddTypeMapping)
+ for (MVT::SimpleValueType SVT :
+ make_range(CurRC.vt_begin(), CurRC.vt_end()))
+ recordRegBankForType(getRegBank(ID), SVT);
+
+ // Walk through all sub register classes and push them into the worklist.
+ bool First = true;
+ for (BitMaskClassIterator It(CurRC.getSubClassMask(), TRI); It.isValid();
+ ++It) {
+ unsigned SubRCId = It.getID();
+ if (!Covered.test(SubRCId)) {
+ if (First)
+ DEBUG(dbgs() << " Enqueue sub-class: ");
+ DEBUG(dbgs() << TRI.getRegClassName(TRI.getRegClass(SubRCId)) << ", ");
+ WorkList.push_back(SubRCId);
+ // Remember that we saw the sub class.
+ Covered.set(SubRCId);
+ First = false;
+ }
+ }
+ if (!First)
+ DEBUG(dbgs() << '\n');
+
+    // Also push all the register classes that can be accessed via a
+    // subreg index, i.e., its subreg-classes (which are different from
+    // its subclasses).
+ //
+ // Note: It would probably be faster to go the other way around
+ // and have this method add only super classes, since this
+ // information is available in a more efficient way. However, it
+  // feels less natural for the clients of this API, plus we will
+ // TableGen the whole bitset at some point, so compile time for
+ // the initialization is not very important.
+ First = true;
+ for (unsigned SubRCId = 0; SubRCId < NbOfRegClasses; ++SubRCId) {
+ if (Covered.test(SubRCId))
+ continue;
+ bool Pushed = false;
+ const TargetRegisterClass *SubRC = TRI.getRegClass(SubRCId);
+ for (SuperRegClassIterator SuperRCIt(SubRC, &TRI); SuperRCIt.isValid();
+ ++SuperRCIt) {
+ if (Pushed)
+ break;
+ for (BitMaskClassIterator It(SuperRCIt.getMask(), TRI); It.isValid();
+ ++It) {
+ unsigned SuperRCId = It.getID();
+ if (SuperRCId == RCId) {
+ if (First)
+ DEBUG(dbgs() << " Enqueue subreg-class: ");
+ DEBUG(dbgs() << TRI.getRegClassName(SubRC) << ", ");
+ WorkList.push_back(SubRCId);
+ // Remember that we saw the sub class.
+ Covered.set(SubRCId);
+ Pushed = true;
+ First = false;
+ break;
+ }
+ }
+ }
+ }
+ if (!First)
+ DEBUG(dbgs() << '\n');
+ } while (!WorkList.empty());
+}
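+
+// A minimal sketch of how a target's RegisterBankInfo subclass would use
+// the two hooks above (the target name, bank IDs, and register class IDs
+// are hypothetical):
+//
+//   FooRegisterBankInfo::FooRegisterBankInfo(const TargetRegisterInfo &TRI)
+//       : RegisterBankInfo(/*NumRegBanks*/ 2) {
+//     createRegisterBank(/*ID*/ 0, "GPR");
+//     addRegBankCoverage(/*ID*/ 0, Foo::GPR32RegClassID, TRI,
+//                        /*AddTypeMapping*/ true);
+//     createRegisterBank(/*ID*/ 1, "FPR");
+//     addRegBankCoverage(/*ID*/ 1, Foo::FPR64RegClassID, TRI,
+//                        /*AddTypeMapping*/ true);
+//   }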
+
+const RegisterBank *
+RegisterBankInfo::getRegBank(unsigned Reg, const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) const {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return &getRegBankFromRegClass(*TRI.getMinimalPhysRegClass(Reg));
+
+ assert(Reg && "NoRegister does not have a register bank");
+ const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
+ if (RegClassOrBank.is<const RegisterBank *>())
+ return RegClassOrBank.get<const RegisterBank *>();
+ const TargetRegisterClass *RC =
+ RegClassOrBank.get<const TargetRegisterClass *>();
+ if (RC)
+ return &getRegBankFromRegClass(*RC);
+ return nullptr;
+}
+
+const RegisterBank *RegisterBankInfo::getRegBankFromConstraints(
+ const MachineInstr &MI, unsigned OpIdx, const TargetInstrInfo &TII,
+ const TargetRegisterInfo &TRI) const {
+ // The mapping of the registers may be available via the
+ // register class constraints.
+ const TargetRegisterClass *RC = MI.getRegClassConstraint(OpIdx, &TII, &TRI);
+
+ if (!RC)
+ return nullptr;
+
+ const RegisterBank &RegBank = getRegBankFromRegClass(*RC);
+ // Sanity check that the target properly implemented getRegBankFromRegClass.
+ assert(RegBank.covers(*RC) &&
+ "The mapping of the register bank does not make sense");
+ return &RegBank;
+}
+
+RegisterBankInfo::InstructionMapping
+RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
+ RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
+ MI.getNumOperands());
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ // We may need to query the instruction encoding to guess the mapping.
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+ // Before doing anything complicated check if the mapping is not
+ // directly available.
+ bool CompleteMapping = true;
+ // For copies we want to walk over the operands and try to find one
+ // that has a register bank.
+ bool isCopyLike = MI.isCopy() || MI.isPHI();
+ // Remember the register bank for reuse for copy-like instructions.
+ const RegisterBank *RegBank = nullptr;
+ // Remember the size of the register for reuse for copy-like instructions.
+ unsigned RegSize = 0;
+ for (unsigned OpIdx = 0, End = MI.getNumOperands(); OpIdx != End; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ // The register bank of Reg is just a side effect of the current
+    // execution and, in particular, there is no reason to believe this
+    // is the best default mapping for the current instruction. Keep
+    // it as an alternative register bank if we cannot figure out
+    // anything better.
+ const RegisterBank *AltRegBank = getRegBank(Reg, MRI, TRI);
+ // For copy-like instruction, we want to reuse the register bank
+ // that is already set on Reg, if any, since those instructions do
+ // not have any constraints.
+ const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+ if (!CurRegBank) {
+ // If this is a target specific instruction, we can deduce
+ // the register bank from the encoding constraints.
+ CurRegBank = getRegBankFromConstraints(MI, OpIdx, TII, TRI);
+ if (!CurRegBank) {
+ // Check if we can deduce the register bank from the type of
+ // the instruction.
+ Type *MITy = MI.getType();
+ if (MITy)
+ CurRegBank = getRegBankForType(
+ MVT::getVT(MITy, /*HandleUnknown*/ true).SimpleTy);
+ if (!CurRegBank)
+ // Use the current assigned register bank.
+ // That may not make much sense though.
+ CurRegBank = AltRegBank;
+ if (!CurRegBank) {
+ // All our attempts failed, give up.
+ CompleteMapping = false;
+
+ if (!isCopyLike)
+ // MI does not carry enough information to guess the mapping.
+ return InstructionMapping();
+
+          // For copies, we want to keep iterating to find a register
+ // bank for the other operands if we did not find one yet.
+ if (RegBank)
+ break;
+ continue;
+ }
+ }
+ }
+ RegBank = CurRegBank;
+ RegSize = getSizeInBits(Reg, MRI, TRI);
+ Mapping.setOperandMapping(OpIdx, RegSize, *CurRegBank);
+ }
+
+ if (CompleteMapping)
+ return Mapping;
+
+ assert(isCopyLike && "We should have bailed on non-copies at this point");
+  // For copy-like instructions, if none of the operands has a register
+  // bank available, there is nothing we can propagate.
+ if (!RegBank)
+ return InstructionMapping();
+
+ // This is a copy-like instruction.
+ // Propagate RegBank to all operands that do not have a
+ // mapping yet.
+ for (unsigned OpIdx = 0, End = MI.getNumOperands(); OpIdx != End; ++OpIdx) {
+ const MachineOperand &MO = MI.getOperand(OpIdx);
+ // Don't assign a mapping for non-reg operands.
+ if (!MO.isReg())
+ continue;
+
+ // If a mapping already exists, do not touch it.
+ if (!static_cast<const InstructionMapping *>(&Mapping)
+ ->getOperandMapping(OpIdx)
+ .BreakDown.empty())
+ continue;
+
+ Mapping.setOperandMapping(OpIdx, RegSize, *RegBank);
+ }
+ return Mapping;
+}
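+
+// For instance (register names hypothetical), given "%1 = COPY %0" where
+// %0 already lives on the GPR bank and %1 has no bank yet, the loop above
+// records GPR and the register size for %0, and the propagation loop then
+// assigns the same bank and size to %1, yielding a complete default
+// mapping for the copy.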
+
+RegisterBankInfo::InstructionMapping
+RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+ RegisterBankInfo::InstructionMapping Mapping = getInstrMappingImpl(MI);
+ if (Mapping.isValid())
+ return Mapping;
+ llvm_unreachable("The target must implement this");
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrPossibleMappings(const MachineInstr &MI) const {
+ InstructionMappings PossibleMappings;
+ // Put the default mapping first.
+ PossibleMappings.push_back(getInstrMapping(MI));
+ // Then the alternative mapping, if any.
+ InstructionMappings AltMappings = getInstrAlternativeMappings(MI);
+ for (InstructionMapping &AltMapping : AltMappings)
+ PossibleMappings.emplace_back(std::move(AltMapping));
+#ifndef NDEBUG
+ for (const InstructionMapping &Mapping : PossibleMappings)
+ assert(Mapping.verify(MI) && "Mapping is invalid");
+#endif
+ return PossibleMappings;
+}
+
+RegisterBankInfo::InstructionMappings
+RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+ // No alternative for MI.
+ return InstructionMappings();
+}
+
+void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
+ MachineInstr &MI = OpdMapper.getMI();
+ DEBUG(dbgs() << "Applying default-like mapping\n");
+ for (unsigned OpIdx = 0, EndIdx = MI.getNumOperands(); OpIdx != EndIdx;
+ ++OpIdx) {
+ DEBUG(dbgs() << "OpIdx " << OpIdx);
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg()) {
+ DEBUG(dbgs() << " is not a register, nothing to be done\n");
+ continue;
+ }
+ assert(
+ OpdMapper.getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() ==
+ 1 &&
+ "This mapping is too complex for this function");
+ iterator_range<SmallVectorImpl<unsigned>::const_iterator> NewRegs =
+ OpdMapper.getVRegs(OpIdx);
+ if (NewRegs.begin() == NewRegs.end()) {
+ DEBUG(dbgs() << " has not been repaired, nothing to be done\n");
+ continue;
+ }
+ DEBUG(dbgs() << " changed, replace " << MO.getReg());
+ MO.setReg(*NewRegs.begin());
+ DEBUG(dbgs() << " with " << MO.getReg());
+ }
+}
+
+unsigned RegisterBankInfo::getSizeInBits(unsigned Reg,
+ const MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI) {
+ const TargetRegisterClass *RC = nullptr;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // The size is not directly available for physical registers.
+ // Instead, we need to access a register class that contains Reg and
+ // get the size of that register class.
+ RC = TRI.getMinimalPhysRegClass(Reg);
+ } else {
+ unsigned RegSize = MRI.getSize(Reg);
+ // If Reg is not a generic register, query the register class to
+ // get its size.
+ if (RegSize)
+ return RegSize;
+ // Since Reg is not a generic register, it must have a register class.
+ RC = MRI.getRegClass(Reg);
+ }
+ assert(RC && "Unable to deduce the register class");
+ return RC->getSize() * 8;
+}
+
+//------------------------------------------------------------------------------
+// Helper classes implementation.
+//------------------------------------------------------------------------------
+void RegisterBankInfo::PartialMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+bool RegisterBankInfo::PartialMapping::verify() const {
+ assert(RegBank && "Register bank not set");
+ assert(Length && "Empty mapping");
+ assert((StartIdx < getHighBitIdx()) && "Overflow, switch to APInt?");
+ // Check if the minimum width fits into RegBank.
+ assert(RegBank->getSize() >= Length && "Register bank too small for Mask");
+ return true;
+}
+
+void RegisterBankInfo::PartialMapping::print(raw_ostream &OS) const {
+ OS << "[" << StartIdx << ", " << getHighBitIdx() << "], RegBank = ";
+ if (RegBank)
+ OS << *RegBank;
+ else
+ OS << "nullptr";
+}
+
+bool RegisterBankInfo::ValueMapping::verify(unsigned ExpectedBitWidth) const {
+ assert(!BreakDown.empty() && "Value mapped nowhere?!");
+ unsigned OrigValueBitWidth = 0;
+ for (const RegisterBankInfo::PartialMapping &PartMap : BreakDown) {
+ // Check that each register bank is big enough to hold the partial value:
+ // this check is done by PartialMapping::verify
+ assert(PartMap.verify() && "Partial mapping is invalid");
+ // The original value should completely be mapped.
+ // Thus the maximum accessed index + 1 is the size of the original value.
+ OrigValueBitWidth =
+ std::max(OrigValueBitWidth, PartMap.getHighBitIdx() + 1);
+ }
+ assert(OrigValueBitWidth == ExpectedBitWidth && "BitWidth does not match");
+ APInt ValueMask(OrigValueBitWidth, 0);
+ for (const RegisterBankInfo::PartialMapping &PartMap : BreakDown) {
+ // Check that the union of the partial mappings covers the whole value,
+ // without overlaps.
+ // The high bit is exclusive in the APInt API, thus getHighBitIdx + 1.
+ APInt PartMapMask = APInt::getBitsSet(OrigValueBitWidth, PartMap.StartIdx,
+ PartMap.getHighBitIdx() + 1);
+ ValueMask ^= PartMapMask;
+ assert((ValueMask & PartMapMask) == PartMapMask &&
+ "Some partial mappings overlap");
+ }
+ assert(ValueMask.isAllOnesValue() && "Value is not fully mapped");
+ return true;
+}
+
+void RegisterBankInfo::ValueMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::ValueMapping::print(raw_ostream &OS) const {
+ OS << "#BreakDown: " << BreakDown.size() << " ";
+ bool IsFirst = true;
+ for (const PartialMapping &PartMap : BreakDown) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '[' << PartMap << ']';
+ IsFirst = false;
+ }
+}
+
+void RegisterBankInfo::InstructionMapping::setOperandMapping(
+ unsigned OpIdx, unsigned MaskSize, const RegisterBank &RegBank) {
+ // Build the value mapping.
+ assert(MaskSize <= RegBank.getSize() && "Register bank is too small");
+
+ // Create the mapping object.
+ getOperandMapping(OpIdx).BreakDown.push_back(
+ PartialMapping(0, MaskSize, RegBank));
+}
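+
+// As an illustration, mapping a 32-bit operand onto a 32-bit wide bank
+// produces a single PartialMapping covering bits [0, 31]. A value split
+// across several banks would instead carry several partial mappings,
+// e.g. [0, 31] and [32, 63] for a 64-bit value, which is exactly the
+// full, non-overlapping coverage that ValueMapping::verify checks for.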
+
+bool RegisterBankInfo::InstructionMapping::verify(
+ const MachineInstr &MI) const {
+ // Check that all the register operands are properly mapped.
+ // Check the constructor invariant.
+ assert(NumOperands == MI.getNumOperands() &&
+ "NumOperands must match, see constructor");
+ assert(MI.getParent() && MI.getParent()->getParent() &&
+ "MI must be connected to a MachineFunction");
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ (void)MF;
+
+ for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+ const MachineOperand &MO = MI.getOperand(Idx);
+ const RegisterBankInfo::ValueMapping &MOMapping = getOperandMapping(Idx);
+ (void)MOMapping;
+ if (!MO.isReg()) {
+ assert(MOMapping.BreakDown.empty() &&
+ "We should not care about non-reg mapping");
+ continue;
+ }
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ // Register size in bits.
+ // This size must match what the mapping expects.
+ assert(MOMapping.verify(getSizeInBits(
+ Reg, MF.getRegInfo(), *MF.getSubtarget().getRegisterInfo())) &&
+ "Value mapping is invalid");
+ }
+ return true;
+}
+
+void RegisterBankInfo::InstructionMapping::dump() const {
+ print(dbgs());
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::InstructionMapping::print(raw_ostream &OS) const {
+ OS << "ID: " << getID() << " Cost: " << getCost() << " Mapping: ";
+
+ for (unsigned OpIdx = 0; OpIdx != NumOperands; ++OpIdx) {
+ const ValueMapping &ValMapping = getOperandMapping(OpIdx);
+ if (OpIdx)
+ OS << ", ";
+ OS << "{ Idx: " << OpIdx << " Map: " << ValMapping << '}';
+ }
+}
+
+const int RegisterBankInfo::OperandsMapper::DontKnowIdx = -1;
+
+RegisterBankInfo::OperandsMapper::OperandsMapper(
+ MachineInstr &MI, const InstructionMapping &InstrMapping,
+ MachineRegisterInfo &MRI)
+ : MRI(MRI), MI(MI), InstrMapping(InstrMapping) {
+ unsigned NumOpds = MI.getNumOperands();
+ OpToNewVRegIdx.reset(new int[NumOpds]);
+ std::fill(&OpToNewVRegIdx[0], &OpToNewVRegIdx[NumOpds],
+ OperandsMapper::DontKnowIdx);
+ assert(InstrMapping.verify(MI) && "Invalid mapping for MI");
+}
+
+iterator_range<SmallVectorImpl<unsigned>::iterator>
+RegisterBankInfo::OperandsMapper::getVRegsMem(unsigned OpIdx) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ unsigned NumPartialVal =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown.size();
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx) {
+ // This is the first time we try to access OpIdx.
+ // Create the cells that will hold all the partial values at the
+ // end of the list of NewVReg.
+ StartIdx = NewVRegs.size();
+ OpToNewVRegIdx[OpIdx] = StartIdx;
+ for (unsigned i = 0; i < NumPartialVal; ++i)
+ NewVRegs.push_back(0);
+ }
+ SmallVectorImpl<unsigned>::iterator End =
+ getNewVRegsEnd(StartIdx, NumPartialVal);
+
+ return make_range(&NewVRegs[StartIdx], End);
+}
+
+SmallVectorImpl<unsigned>::const_iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) const {
+ return const_cast<OperandsMapper *>(this)->getNewVRegsEnd(StartIdx, NumVal);
+}
+SmallVectorImpl<unsigned>::iterator
+RegisterBankInfo::OperandsMapper::getNewVRegsEnd(unsigned StartIdx,
+ unsigned NumVal) {
+ assert((NewVRegs.size() == StartIdx + NumVal ||
+ NewVRegs.size() > StartIdx + NumVal) &&
+ "NewVRegs too small to contain all the partial mapping");
+ return NewVRegs.size() <= StartIdx + NumVal ? NewVRegs.end()
+ : &NewVRegs[StartIdx + NumVal];
+}
+
+void RegisterBankInfo::OperandsMapper::createVRegs(unsigned OpIdx) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ iterator_range<SmallVectorImpl<unsigned>::iterator> NewVRegsForOpIdx =
+ getVRegsMem(OpIdx);
+ const SmallVectorImpl<PartialMapping> &PartMapList =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown;
+ SmallVectorImpl<PartialMapping>::const_iterator PartMap = PartMapList.begin();
+ for (unsigned &NewVReg : NewVRegsForOpIdx) {
+ assert(PartMap != PartMapList.end() && "Out-of-bound access");
+ assert(NewVReg == 0 && "Register has already been created");
+ NewVReg = MRI.createGenericVirtualRegister(PartMap->Length);
+ MRI.setRegBank(NewVReg, *PartMap->RegBank);
+ ++PartMap;
+ }
+}
+
+void RegisterBankInfo::OperandsMapper::setVRegs(unsigned OpIdx,
+ unsigned PartialMapIdx,
+ unsigned NewVReg) {
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ assert(getInstrMapping().getOperandMapping(OpIdx).BreakDown.size() >
+ PartialMapIdx &&
+ "Out-of-bound access for partial mapping");
+ // Make sure the memory is initialized for that operand.
+ (void)getVRegsMem(OpIdx);
+ assert(NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] == 0 &&
+ "This value is already set");
+ NewVRegs[OpToNewVRegIdx[OpIdx] + PartialMapIdx] = NewVReg;
+}
+
+iterator_range<SmallVectorImpl<unsigned>::const_iterator>
+RegisterBankInfo::OperandsMapper::getVRegs(unsigned OpIdx,
+ bool ForDebug) const {
+ (void)ForDebug;
+ assert(OpIdx < getMI().getNumOperands() && "Out-of-bound access");
+ int StartIdx = OpToNewVRegIdx[OpIdx];
+
+ if (StartIdx == OperandsMapper::DontKnowIdx)
+ return make_range(NewVRegs.end(), NewVRegs.end());
+
+ unsigned PartMapSize =
+ getInstrMapping().getOperandMapping(OpIdx).BreakDown.size();
+ SmallVectorImpl<unsigned>::const_iterator End =
+ getNewVRegsEnd(StartIdx, PartMapSize);
+ iterator_range<SmallVectorImpl<unsigned>::const_iterator> Res =
+ make_range(&NewVRegs[StartIdx], End);
+#ifndef NDEBUG
+ for (unsigned VReg : Res)
+ assert((VReg || ForDebug) && "Some registers are uninitialized");
+#endif
+ return Res;
+}
+
+void RegisterBankInfo::OperandsMapper::dump() const {
+ print(dbgs(), true);
+ dbgs() << '\n';
+}
+
+void RegisterBankInfo::OperandsMapper::print(raw_ostream &OS,
+ bool ForDebug) const {
+ unsigned NumOpds = getMI().getNumOperands();
+ if (ForDebug) {
+ OS << "Mapping for " << getMI() << "\nwith " << getInstrMapping() << '\n';
+ // Print out the internal state of the index table.
+ OS << "Populated indices (CellNumber, IndexInNewVRegs): ";
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] != DontKnowIdx) {
+ if (!IsFirst)
+ OS << ", ";
+ OS << '(' << Idx << ", " << OpToNewVRegIdx[Idx] << ')';
+ IsFirst = false;
+ }
+ }
+ OS << '\n';
+ } else
+ OS << "Mapping ID: " << getInstrMapping().getID() << ' ';
+
+ OS << "Operand Mapping: ";
+ // If we have a function, we can pretty print the name of the registers.
+ // Otherwise we will print the raw numbers.
+ const TargetRegisterInfo *TRI =
+ getMI().getParent() && getMI().getParent()->getParent()
+ ? getMI().getParent()->getParent()->getSubtarget().getRegisterInfo()
+ : nullptr;
+ bool IsFirst = true;
+ for (unsigned Idx = 0; Idx != NumOpds; ++Idx) {
+ if (OpToNewVRegIdx[Idx] == DontKnowIdx)
+ continue;
+ if (!IsFirst)
+ OS << ", ";
+ IsFirst = false;
+ OS << '(' << PrintReg(getMI().getOperand(Idx).getReg(), TRI) << ", [";
+ bool IsFirstNewVReg = true;
+ for (unsigned VReg : getVRegs(Idx)) {
+ if (!IsFirstNewVReg)
+ OS << ", ";
+ IsFirstNewVReg = false;
+ OS << PrintReg(VReg, TRI);
+ }
+ OS << "])";
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/GlobalMerge.cpp b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
new file mode 100644
index 000000000000..8c760b724d13
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -0,0 +1,605 @@
+//===-- GlobalMerge.cpp - Internal globals merging -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass merges globals with internal linkage into one. This way all the
+// globals that were merged into the bigger one can be addressed using offsets
+// from the same base pointer (no need for a separate base pointer for each
+// global). Such a transformation can significantly reduce the register pressure
+// when many globals are involved.
+//
+// For example, consider the code which touches several global variables at
+// once:
+//
+// static int foo[N], bar[N], baz[N];
+//
+// for (i = 0; i < N; ++i) {
+// foo[i] = bar[i] * baz[i];
+// }
+//
+// On ARM the addresses of the 3 arrays must be kept in registers, thus
+// this code has quite high register pressure (loop body):
+//
+// ldr r1, [r5], #4
+// ldr r2, [r6], #4
+// mul r1, r2, r1
+// str r1, [r0], #4
+//
+// The pass converts the code to something like:
+//
+// static struct {
+// int foo[N];
+// int bar[N];
+// int baz[N];
+// } merged;
+//
+// for (i = 0; i < N; ++i) {
+// merged.foo[i] = merged.bar[i] * merged.baz[i];
+// }
+//
+// and in ARM code this becomes:
+//
+// ldr r0, [r5, #40]
+// ldr r1, [r5, #80]
+// mul r0, r1, r0
+// str r0, [r5], #4
+//
+// Note that we saved 2 registers here almost "for free".
+//
+// However, merging globals can have tradeoffs:
+// - it confuses debuggers, tools, and users
+// - it makes linker optimizations less useful (order files, LOHs, ...)
+// - it forces usage of indexed addressing (which isn't necessarily "free")
+// - it can increase register pressure when the uses are disparate enough.
+//
+// We use heuristics to discover the best global grouping we can (cf cl::opts).
+// ===---------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "global-merge"
+
+// FIXME: This is only useful as a last-resort way to disable the pass.
+static cl::opt<bool>
+EnableGlobalMerge("enable-global-merge", cl::Hidden,
+ cl::desc("Enable the global merge pass"),
+ cl::init(true));
+
+static cl::opt<unsigned>
+GlobalMergeMaxOffset("global-merge-max-offset", cl::Hidden,
+ cl::desc("Set maximum offset for global merge pass"),
+ cl::init(0));
+
+static cl::opt<bool> GlobalMergeGroupByUse(
+ "global-merge-group-by-use", cl::Hidden,
+ cl::desc("Improve global merge pass to look at uses"), cl::init(true));
+
+static cl::opt<bool> GlobalMergeIgnoreSingleUse(
+ "global-merge-ignore-single-use", cl::Hidden,
+ cl::desc("Improve global merge pass to ignore globals only used alone"),
+ cl::init(true));
+
+static cl::opt<bool>
+EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden,
+ cl::desc("Enable global merge pass on constants"),
+ cl::init(false));
+
+// FIXME: this could be a transitional option, and we probably need to remove
+// it once we are sure this optimization always benefits all targets.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalMergeOnExternal("global-merge-on-external", cl::Hidden,
+ cl::desc("Enable global merge pass on external linkage"));
+
+STATISTIC(NumMerged, "Number of globals merged");
+namespace {
+ class GlobalMerge : public FunctionPass {
+ const TargetMachine *TM;
+ // FIXME: Infer the maximum possible offset depending on the actual users
+ // (these max offsets are different for the users inside Thumb or ARM
+ // functions), see the code that passes in the offset in the ARM backend
+ // for more information.
+ unsigned MaxOffset;
+
+ /// Whether we should try to optimize for size only.
+ /// Currently, this applies a dead simple heuristic: only consider globals
+ /// used in minsize functions for merging.
+ /// FIXME: This could learn about optsize, and be used in the cost model.
+ bool OnlyOptimizeForSize;
+
+ /// Whether we should merge global variables that have external linkage.
+ bool MergeExternalGlobals;
+
+ bool IsMachO;
+
+ bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+ Module &M, bool isConst, unsigned AddrSpace) const;
+ /// \brief Merge everything in \p Globals for which the corresponding bit
+ /// in \p GlobalSet is set.
+ bool doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
+ const BitVector &GlobalSet, Module &M, bool isConst,
+ unsigned AddrSpace) const;
+
+ /// \brief Check if the given variable has been identified as must keep
+ /// \pre setMustKeepGlobalVariables must have been called on the Module that
+ /// contains GV
+ bool isMustKeepGlobalVariable(const GlobalVariable *GV) const {
+ return MustKeepGlobalVariables.count(GV);
+ }
+
+ /// Collect every variables marked as "used" or used in a landing pad
+ /// instruction for this Module.
+ void setMustKeepGlobalVariables(Module &M);
+
+    /// Collect every variable marked as "used".
+ void collectUsedGlobalVariables(Module &M);
+
+    /// Keep track of the GlobalVariables that must not be merged away.
+ SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid.
+ explicit GlobalMerge()
+ : FunctionPass(ID), TM(nullptr), MaxOffset(GlobalMergeMaxOffset),
+ OnlyOptimizeForSize(false), MergeExternalGlobals(false) {
+ initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit GlobalMerge(const TargetMachine *TM, unsigned MaximalOffset,
+ bool OnlyOptimizeForSize, bool MergeExternalGlobals)
+ : FunctionPass(ID), TM(TM), MaxOffset(MaximalOffset),
+ OnlyOptimizeForSize(OnlyOptimizeForSize),
+ MergeExternalGlobals(MergeExternalGlobals) {
+ initializeGlobalMergePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+ bool doFinalization(Module &M) override;
+
+ const char *getPassName() const override {
+ return "Merge internal globals";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+ };
+} // end anonymous namespace
+
+char GlobalMerge::ID = 0;
+INITIALIZE_PASS_BEGIN(GlobalMerge, "global-merge", "Merge global variables",
+ false, false)
+INITIALIZE_PASS_END(GlobalMerge, "global-merge", "Merge global variables",
+ false, false)
+
+bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals,
+ Module &M, bool isConst, unsigned AddrSpace) const {
+ auto &DL = M.getDataLayout();
+ // FIXME: Find better heuristics
+ std::stable_sort(Globals.begin(), Globals.end(),
+ [&DL](const GlobalVariable *GV1, const GlobalVariable *GV2) {
+ return DL.getTypeAllocSize(GV1->getValueType()) <
+ DL.getTypeAllocSize(GV2->getValueType());
+ });
+
+ // If we want to just blindly group all globals together, do so.
+ if (!GlobalMergeGroupByUse) {
+ BitVector AllGlobals(Globals.size());
+ AllGlobals.set();
+ return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
+ }
+
+ // If we want to be smarter, look at all uses of each global, to try to
+ // discover all sets of globals used together, and how many times each of
+ // these sets occurred.
+ //
+ // Keep this reasonably efficient, by having an append-only list of all sets
+ // discovered so far (UsedGlobalSet), and mapping each "together-ness" unit of
+ // code (currently, a Function) to the set of globals seen so far that are
+ // used together in that unit (GlobalUsesByFunction).
+ //
+  // When we look at the Nth global, we know that any new set is either:
+ // - the singleton set {N}, containing this global only, or
+ // - the union of {N} and a previously-discovered set, containing some
+ // combination of the previous N-1 globals.
+ // Using that knowledge, when looking at the Nth global, we can keep:
+ // - a reference to the singleton set {N} (CurGVOnlySetIdx)
+ // - a list mapping each previous set to its union with {N} (EncounteredUGS),
+ // if it actually occurs.
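+
+  // A worked example (globals and functions hypothetical): with globals A,
+  // B, and C, where f() uses A and B while g() uses only A, scanning A
+  // maps both f and g to the set {A}; scanning B moves f onto a new set
+  // {A, B} and leaves g on {A}; C is used nowhere, so the discovered sets
+  // end up being {A} (for g) and {A, B} (for f).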
+
+ // We keep track of the sets of globals used together "close enough".
+ struct UsedGlobalSet {
+ UsedGlobalSet(size_t Size) : Globals(Size), UsageCount(1) {}
+ BitVector Globals;
+ unsigned UsageCount;
+ };
+
+ // Each set is unique in UsedGlobalSets.
+ std::vector<UsedGlobalSet> UsedGlobalSets;
+
+ // Avoid repeating the create-global-set pattern.
+ auto CreateGlobalSet = [&]() -> UsedGlobalSet & {
+ UsedGlobalSets.emplace_back(Globals.size());
+ return UsedGlobalSets.back();
+ };
+
+ // The first set is the empty set.
+ CreateGlobalSet().UsageCount = 0;
+
+ // We define "close enough" to be "in the same function".
+ // FIXME: Grouping uses by function is way too aggressive, so we should have
+ // a better metric for distance between uses.
+ // The obvious alternative would be to group by BasicBlock, but that's in
+  // turn too conservative.
+ // Anything in between wouldn't be trivial to compute, so just stick with
+ // per-function grouping.
+
+ // The value type is an index into UsedGlobalSets.
+ // The default (0) conveniently points to the empty set.
+ DenseMap<Function *, size_t /*UsedGlobalSetIdx*/> GlobalUsesByFunction;
+
+ // Now, look at each merge-eligible global in turn.
+
+ // Keep track of the sets we already encountered to which we added the
+ // current global.
+ // Each element matches the same-index element in UsedGlobalSets.
+ // This lets us efficiently tell whether a set has already been expanded to
+ // include the current global.
+ std::vector<size_t> EncounteredUGS;
+
+ for (size_t GI = 0, GE = Globals.size(); GI != GE; ++GI) {
+ GlobalVariable *GV = Globals[GI];
+
+ // Reset the encountered sets for this global...
+ std::fill(EncounteredUGS.begin(), EncounteredUGS.end(), 0);
+ // ...and grow it in case we created new sets for the previous global.
+ EncounteredUGS.resize(UsedGlobalSets.size());
+
+ // We might need to create a set that only consists of the current global.
+ // Keep track of its index into UsedGlobalSets.
+ size_t CurGVOnlySetIdx = 0;
+
+ // For each global, look at all its Uses.
+ for (auto &U : GV->uses()) {
+ // This Use might be a ConstantExpr. We're interested in Instruction
+ // users, so look through ConstantExpr...
+ Use *UI, *UE;
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
+ if (CE->use_empty())
+ continue;
+ UI = &*CE->use_begin();
+ UE = nullptr;
+ } else if (isa<Instruction>(U.getUser())) {
+ UI = &U;
+ UE = UI->getNext();
+ } else {
+ continue;
+ }
+
+ // ...to iterate on all the instruction users of the global.
+ // Note that we iterate on Uses and not on Users to be able to getNext().
+ for (; UI != UE; UI = UI->getNext()) {
+ Instruction *I = dyn_cast<Instruction>(UI->getUser());
+ if (!I)
+ continue;
+
+ Function *ParentFn = I->getParent()->getParent();
+
+ // If we're only optimizing for size, ignore non-minsize functions.
+ if (OnlyOptimizeForSize && !ParentFn->optForMinSize())
+ continue;
+
+ size_t UGSIdx = GlobalUsesByFunction[ParentFn];
+
+        // If this is the first global this function uses, map it to the set
+ // consisting of this global only.
+ if (!UGSIdx) {
+ // If that set doesn't exist yet, create it.
+ if (!CurGVOnlySetIdx) {
+ CurGVOnlySetIdx = UsedGlobalSets.size();
+ CreateGlobalSet().Globals.set(GI);
+ } else {
+ ++UsedGlobalSets[CurGVOnlySetIdx].UsageCount;
+ }
+
+ GlobalUsesByFunction[ParentFn] = CurGVOnlySetIdx;
+ continue;
+ }
+
+        // If this function's current set already contains this global, just
+        // increment the counter.
+ if (UsedGlobalSets[UGSIdx].Globals.test(GI)) {
+ ++UsedGlobalSets[UGSIdx].UsageCount;
+ continue;
+ }
+
+ // If not, the previous set wasn't actually used in this function.
+ --UsedGlobalSets[UGSIdx].UsageCount;
+
+ // If we already expanded the previous set to include this global, just
+ // reuse that expanded set.
+ if (size_t ExpandedIdx = EncounteredUGS[UGSIdx]) {
+ ++UsedGlobalSets[ExpandedIdx].UsageCount;
+ GlobalUsesByFunction[ParentFn] = ExpandedIdx;
+ continue;
+ }
+
+ // If not, create a new set consisting of the union of the previous set
+ // and this global. Mark it as encountered, so we can reuse it later.
+ GlobalUsesByFunction[ParentFn] = EncounteredUGS[UGSIdx] =
+ UsedGlobalSets.size();
+
+ UsedGlobalSet &NewUGS = CreateGlobalSet();
+ NewUGS.Globals.set(GI);
+ NewUGS.Globals |= UsedGlobalSets[UGSIdx].Globals;
+ }
+ }
+ }
+
+ // Now we found a bunch of sets of globals used together. We accumulated
+  // the number of times we encountered the sets (i.e., the number of
+  // functions that use that exact set of globals).
+ //
+ // Multiply that by the size of the set to give us a crude profitability
+ // metric.
+ std::sort(UsedGlobalSets.begin(), UsedGlobalSets.end(),
+ [](const UsedGlobalSet &UGS1, const UsedGlobalSet &UGS2) {
+ return UGS1.Globals.count() * UGS1.UsageCount <
+ UGS2.Globals.count() * UGS2.UsageCount;
+ });
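+
+  // E.g. (numbers hypothetical): a set of 2 globals seen in 10 functions
+  // scores 2 * 10 = 20, while a set of 4 globals seen in 3 functions
+  // scores 4 * 3 = 12, so the pair-heavy set is considered more profitable
+  // and, since the sort is ascending, ends up towards the back of
+  // UsedGlobalSets, where the loops below start picking.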
+
+ // We can choose to merge all globals together, but ignore globals never used
+ // with another global. This catches the obviously non-profitable cases of
+ // having a single global, but is aggressive enough for any other case.
+ if (GlobalMergeIgnoreSingleUse) {
+ BitVector AllGlobals(Globals.size());
+ for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
+ const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
+ if (UGS.UsageCount == 0)
+ continue;
+ if (UGS.Globals.count() > 1)
+ AllGlobals |= UGS.Globals;
+ }
+ return doMerge(Globals, AllGlobals, M, isConst, AddrSpace);
+ }
+
+ // Starting from the sets with the best (=biggest) profitability, find a
+ // good combination.
+ // The ideal (and expensive) solution can only be found by trying all
+ // combinations, looking for the one with the best profitability.
+ // Don't be smart about it, and just pick the first compatible combination,
+ // starting with the sets with the best profitability.
+ BitVector PickedGlobals(Globals.size());
+ bool Changed = false;
+
+ for (size_t i = 0, e = UsedGlobalSets.size(); i != e; ++i) {
+ const UsedGlobalSet &UGS = UsedGlobalSets[e - i - 1];
+ if (UGS.UsageCount == 0)
+ continue;
+ if (PickedGlobals.anyCommon(UGS.Globals))
+ continue;
+ PickedGlobals |= UGS.Globals;
+ // If the set only contains one global, there's no point in merging.
+ // Ignore the global for inclusion in other sets though, so keep it in
+ // PickedGlobals.
+ if (UGS.Globals.count() < 2)
+ continue;
+ Changed |= doMerge(Globals, UGS.Globals, M, isConst, AddrSpace);
+ }
+
+ return Changed;
+}
+
+bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
+ const BitVector &GlobalSet, Module &M, bool isConst,
+ unsigned AddrSpace) const {
+ assert(Globals.size() > 1);
+
+ Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ auto &DL = M.getDataLayout();
+
+ DEBUG(dbgs() << " Trying to merge set, starts with #"
+ << GlobalSet.find_first() << "\n");
+
+ ssize_t i = GlobalSet.find_first();
+ while (i != -1) {
+ ssize_t j = 0;
+ uint64_t MergedSize = 0;
+ std::vector<Type*> Tys;
+ std::vector<Constant*> Inits;
+
+ for (j = i; j != -1; j = GlobalSet.find_next(j)) {
+ Type *Ty = Globals[j]->getValueType();
+ MergedSize += DL.getTypeAllocSize(Ty);
+ if (MergedSize > MaxOffset) {
+ break;
+ }
+ Tys.push_back(Ty);
+ Inits.push_back(Globals[j]->getInitializer());
+ }
+
+ StructType *MergedTy = StructType::get(M.getContext(), Tys);
+ Constant *MergedInit = ConstantStruct::get(MergedTy, Inits);
+
+ GlobalVariable *MergedGV = new GlobalVariable(
+ M, MergedTy, isConst, GlobalValue::PrivateLinkage, MergedInit,
+ "_MergedGlobals", nullptr, GlobalVariable::NotThreadLocal, AddrSpace);
+
+ for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
+ GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
+ std::string Name = Globals[k]->getName();
+
+ Constant *Idx[2] = {
+ ConstantInt::get(Int32Ty, 0),
+ ConstantInt::get(Int32Ty, idx),
+ };
+ Constant *GEP =
+ ConstantExpr::getInBoundsGetElementPtr(MergedTy, MergedGV, Idx);
+ Globals[k]->replaceAllUsesWith(GEP);
+ Globals[k]->eraseFromParent();
+
+ // When the linkage is not internal we must emit an alias for the original
+ // variable name as it may be accessed from another object. On non-Mach-O
+ // we can also emit an alias for internal linkage as it's safe to do so.
+ // It's not safe on Mach-O as the alias (and thus the portion of the
+ // MergedGlobals variable) may be dead stripped at link time.
+ if (Linkage != GlobalValue::InternalLinkage || !IsMachO) {
+ GlobalAlias::create(Tys[idx], AddrSpace, Linkage, Name, GEP, &M);
+ }
+
+ NumMerged++;
+ }
+ i = j;
+ }
+
+ return true;
+}
+
+void GlobalMerge::collectUsedGlobalVariables(Module &M) {
+ // Extract global variables from llvm.used array
+ const GlobalVariable *GV = M.getGlobalVariable("llvm.used");
+ if (!GV || !GV->hasInitializer()) return;
+
+ // Should be an array of 'i8*'.
+ const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer());
+
+ for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i)
+ if (const GlobalVariable *G =
+ dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(G);
+}
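+
+// For reference, a hypothetical llvm.used entry that the loop above would
+// pick up looks like:
+//   @llvm.used = appending global [1 x i8*]
+//                [i8* bitcast (i32* @keep_me to i8*)], section "llvm.metadata"
+// stripPointerCasts() peels off the bitcast so @keep_me lands in
+// MustKeepGlobalVariables.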
+
+void GlobalMerge::setMustKeepGlobalVariables(Module &M) {
+ collectUsedGlobalVariables(M);
+
+ for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn;
+ ++IFn) {
+ for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end();
+ IBB != IEndBB; ++IBB) {
+ // Follow the invoke link to find the landing pad instruction
+ const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator());
+ if (!II) continue;
+
+ const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst();
+ // Look for globals in the clauses of the landing pad instruction
+ for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses();
+ Idx != NumClauses; ++Idx)
+ if (const GlobalVariable *GV =
+ dyn_cast<GlobalVariable>(LPInst->getClause(Idx)
+ ->stripPointerCasts()))
+ MustKeepGlobalVariables.insert(GV);
+ }
+ }
+}
+
+bool GlobalMerge::doInitialization(Module &M) {
+ if (!EnableGlobalMerge)
+ return false;
+
+ IsMachO = Triple(M.getTargetTriple()).isOSBinFormatMachO();
+
+ auto &DL = M.getDataLayout();
+ DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals,
+ BSSGlobals;
+ bool Changed = false;
+ setMustKeepGlobalVariables(M);
+
+ // Collect all mergeable globals, bucketed by address space.
+ for (auto &GV : M.globals()) {
+ // Merge is safe for "normal" internal or external globals only
+ if (GV.isDeclaration() || GV.isThreadLocal() || GV.hasSection())
+ continue;
+
+ if (!(MergeExternalGlobals && GV.hasExternalLinkage()) &&
+ !GV.hasInternalLinkage())
+ continue;
+
+ PointerType *PT = dyn_cast<PointerType>(GV.getType());
+ assert(PT && "Global variable is not a pointer!");
+
+ unsigned AddressSpace = PT->getAddressSpace();
+
+ // Ignore fancy-aligned globals for now.
+ unsigned Alignment = DL.getPreferredAlignment(&GV);
+ Type *Ty = GV.getValueType();
+ if (Alignment > DL.getABITypeAlignment(Ty))
+ continue;
+
+ // Ignore all 'special' globals.
+ if (GV.getName().startswith("llvm.") ||
+ GV.getName().startswith(".llvm."))
+ continue;
+
+ // Ignore all "required" globals:
+ if (isMustKeepGlobalVariable(&GV))
+ continue;
+
+ if (DL.getTypeAllocSize(Ty) < MaxOffset) {
+ if (TM &&
+ TargetLoweringObjectFile::getKindForGlobal(&GV, *TM).isBSSLocal())
+ BSSGlobals[AddressSpace].push_back(&GV);
+ else if (GV.isConstant())
+ ConstGlobals[AddressSpace].push_back(&GV);
+ else
+ Globals[AddressSpace].push_back(&GV);
+ }
+ }
+
+ for (auto &P : Globals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, false, P.first);
+
+ for (auto &P : BSSGlobals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, false, P.first);
+
+ if (EnableGlobalMergeOnConst)
+ for (auto &P : ConstGlobals)
+ if (P.second.size() > 1)
+ Changed |= doMerge(P.second, M, true, P.first);
+
+ return Changed;
+}
+
+bool GlobalMerge::runOnFunction(Function &F) {
+ return false;
+}
+
+bool GlobalMerge::doFinalization(Module &M) {
+ MustKeepGlobalVariables.clear();
+ return false;
+}
+
+Pass *llvm::createGlobalMergePass(const TargetMachine *TM, unsigned Offset,
+ bool OnlyOptimizeForSize,
+ bool MergeExternalByDefault) {
+ bool MergeExternal = (EnableGlobalMergeOnExternal == cl::BOU_UNSET) ?
+ MergeExternalByDefault : (EnableGlobalMergeOnExternal == cl::BOU_TRUE);
+ return new GlobalMerge(TM, Offset, OnlyOptimizeForSize, MergeExternal);
+}
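+
+// A target would typically schedule this from its TargetPassConfig, e.g.
+// (with hypothetical values):
+//   addPass(createGlobalMergePass(TM, 4095, /*OnlyOptimizeForSize=*/false,
+//                                 /*MergeExternalByDefault=*/true));
+// where the offset is the largest immediate reachable by the target's
+// addressing mode.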
diff --git a/contrib/llvm/lib/CodeGen/IfConversion.cpp b/contrib/llvm/lib/CodeGen/IfConversion.cpp
new file mode 100644
index 000000000000..d225162860c2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/IfConversion.cpp
@@ -0,0 +1,1844 @@
+//===-- IfConversion.cpp - Machine code if conversion pass. ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the machine instruction level if-conversion pass, which
+// tries to convert conditional branches into predicated instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "BranchFolding.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ifcvt"
+
+// Hidden options for help debugging.
+static cl::opt<int> IfCvtFnStart("ifcvt-fn-start", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtFnStop("ifcvt-fn-stop", cl::init(-1), cl::Hidden);
+static cl::opt<int> IfCvtLimit("ifcvt-limit", cl::init(-1), cl::Hidden);
+static cl::opt<bool> DisableSimple("disable-ifcvt-simple",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableSimpleF("disable-ifcvt-simple-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangle("disable-ifcvt-triangle",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleR("disable-ifcvt-triangle-rev",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleF("disable-ifcvt-triangle-false",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableTriangleFR("disable-ifcvt-triangle-false-rev",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> DisableDiamond("disable-ifcvt-diamond",
+ cl::init(false), cl::Hidden);
+static cl::opt<bool> IfCvtBranchFold("ifcvt-branch-fold",
+ cl::init(true), cl::Hidden);
+
+STATISTIC(NumSimple, "Number of simple if-conversions performed");
+STATISTIC(NumSimpleFalse, "Number of simple (F) if-conversions performed");
+STATISTIC(NumTriangle, "Number of triangle if-conversions performed");
+STATISTIC(NumTriangleRev, "Number of triangle (R) if-conversions performed");
+STATISTIC(NumTriangleFalse,"Number of triangle (F) if-conversions performed");
+STATISTIC(NumTriangleFRev, "Number of triangle (F/R) if-conversions performed");
+STATISTIC(NumDiamonds, "Number of diamond if-conversions performed");
+STATISTIC(NumIfConvBBs, "Number of if-converted blocks");
+STATISTIC(NumDupBBs, "Number of duplicated blocks");
+STATISTIC(NumUnpred, "Number of true blocks of diamonds unpredicated");
+
+namespace {
+ class IfConverter : public MachineFunctionPass {
+ enum IfcvtKind {
+ ICNotClassfied, // BB data valid, but not classified.
+ ICSimpleFalse, // Same as ICSimple, but on the false path.
+ ICSimple, // BB is entry of a one-split, no-rejoin sub-CFG.
+ ICTriangleFRev, // Same as ICTriangleFalse, but false path rev condition.
+ ICTriangleRev, // Same as ICTriangle, but true path rev condition.
+ ICTriangleFalse, // Same as ICTriangle, but on the false path.
+ ICTriangle, // BB is entry of a triangle sub-CFG.
+ ICDiamond // BB is entry of a diamond sub-CFG.
+ };
+
+ /// BBInfo - One per MachineBasicBlock, this is used to cache the results
+ /// of the if-conversion feasibility analysis. This includes results from
+ /// TargetInstrInfo::analyzeBranch() (i.e. TBB, FBB, and Cond), the block's
+ /// classification, the common tail block of its successors (if it's a
+ /// diamond shape), its size, whether it's predicable, and whether any
+ /// instruction can clobber the 'would-be' predicate.
+ ///
+ /// IsDone - True if BB is not to be considered for ifcvt.
+ /// IsBeingAnalyzed - True if BB is currently being analyzed.
+ /// IsAnalyzed - True if BB has been analyzed (info is still valid).
+ /// IsEnqueued - True if BB has been enqueued to be ifcvt'ed.
+ /// IsBrAnalyzable - True if analyzeBranch() returns false, i.e. the
+ /// block's terminators could be analyzed.
+ /// HasFallThrough - True if BB may fall through to the following BB.
+ /// IsUnpredicable - True if BB is known to be unpredicable.
+ /// ClobbersPred - True if BB could modify predicates (e.g. has
+ /// cmp, call, etc.)
+ /// NonPredSize - Number of non-predicated instructions.
+ /// ExtraCost - Extra cost for multi-cycle instructions.
+ /// ExtraCost2 - Extra cost for instructions that are slower when predicated.
+ /// BB - Corresponding MachineBasicBlock.
+ /// TrueBB / FalseBB- See analyzeBranch().
+ /// BrCond - Conditions for end of block conditional branches.
+ /// Predicate - Predicate used in the BB.
+ struct BBInfo {
+ bool IsDone : 1;
+ bool IsBeingAnalyzed : 1;
+ bool IsAnalyzed : 1;
+ bool IsEnqueued : 1;
+ bool IsBrAnalyzable : 1;
+ bool HasFallThrough : 1;
+ bool IsUnpredicable : 1;
+ bool CannotBeCopied : 1;
+ bool ClobbersPred : 1;
+ unsigned NonPredSize;
+ unsigned ExtraCost;
+ unsigned ExtraCost2;
+ MachineBasicBlock *BB;
+ MachineBasicBlock *TrueBB;
+ MachineBasicBlock *FalseBB;
+ SmallVector<MachineOperand, 4> BrCond;
+ SmallVector<MachineOperand, 4> Predicate;
+ BBInfo() : IsDone(false), IsBeingAnalyzed(false),
+ IsAnalyzed(false), IsEnqueued(false), IsBrAnalyzable(false),
+ HasFallThrough(false), IsUnpredicable(false),
+ CannotBeCopied(false), ClobbersPred(false), NonPredSize(0),
+ ExtraCost(0), ExtraCost2(0), BB(nullptr), TrueBB(nullptr),
+ FalseBB(nullptr) {}
+ };
+
+ /// IfcvtToken - Record information about pending if-conversions to attempt:
+ /// BBI - Corresponding BBInfo.
+ /// Kind - Type of block. See IfcvtKind.
+ /// NeedSubsumption - True if the to-be-predicated BB has already been
+ /// predicated.
+ /// NumDups - Number of instructions that would be duplicated due
+ /// to this if-conversion. (For diamonds, the number of
+ /// identical instructions at the beginnings of both
+ /// paths).
+ /// NumDups2 - For diamonds, the number of identical instructions
+ /// at the ends of both paths.
+ struct IfcvtToken {
+ BBInfo &BBI;
+ IfcvtKind Kind;
+ bool NeedSubsumption;
+ unsigned NumDups;
+ unsigned NumDups2;
+ IfcvtToken(BBInfo &b, IfcvtKind k, bool s, unsigned d, unsigned d2 = 0)
+ : BBI(b), Kind(k), NeedSubsumption(s), NumDups(d), NumDups2(d2) {}
+ };
+
+ /// BBAnalysis - Results of if-conversion feasibility analysis indexed by
+ /// basic block number.
+ std::vector<BBInfo> BBAnalysis;
+ TargetSchedModel SchedModel;
+
+ const TargetLoweringBase *TLI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MachineBranchProbabilityInfo *MBPI;
+ MachineRegisterInfo *MRI;
+
+ LivePhysRegs Redefs;
+ LivePhysRegs DontKill;
+
+ bool PreRegAlloc;
+ bool MadeChange;
+ int FnNum;
+ std::function<bool(const Function &)> PredicateFtor;
+
+ public:
+ static char ID;
+ IfConverter(std::function<bool(const Function &)> Ftor = nullptr)
+ : MachineFunctionPass(ID), FnNum(-1), PredicateFtor(std::move(Ftor)) {
+ initializeIfConverterPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ private:
+ bool ReverseBranchCondition(BBInfo &BBI);
+ bool ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+ BranchProbability Prediction) const;
+ bool ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups,
+ BranchProbability Prediction) const;
+ bool ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2) const;
+ void ScanInstructions(BBInfo &BBI);
+ void AnalyzeBlock(MachineBasicBlock *MBB,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
+ bool FeasibilityAnalysis(BBInfo &BBI, SmallVectorImpl<MachineOperand> &Cond,
+ bool isTriangle = false, bool RevBranch = false);
+ void AnalyzeBlocks(MachineFunction &MF,
+ std::vector<std::unique_ptr<IfcvtToken>> &Tokens);
+ void InvalidatePreds(MachineBasicBlock *BB);
+ void RemoveExtraEdges(BBInfo &BBI);
+ bool IfConvertSimple(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind);
+ bool IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2);
+ void PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<unsigned, 4> *LaterRedefs = nullptr);
+ void CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr = false);
+ void MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges = true);
+
+ bool MeetIfcvtSizeLimit(MachineBasicBlock &BB,
+ unsigned Cycle, unsigned Extra,
+ BranchProbability Prediction) const {
+ return Cycle > 0 && TII->isProfitableToIfCvt(BB, Cycle, Extra,
+ Prediction);
+ }
+
+ bool MeetIfcvtSizeLimit(MachineBasicBlock &TBB,
+ unsigned TCycle, unsigned TExtra,
+ MachineBasicBlock &FBB,
+ unsigned FCycle, unsigned FExtra,
+ BranchProbability Prediction) const {
+ return TCycle > 0 && FCycle > 0 &&
+ TII->isProfitableToIfCvt(TBB, TCycle, TExtra, FBB, FCycle, FExtra,
+ Prediction);
+ }
+
+ // blockAlwaysFallThrough - Block ends without a terminator.
+ bool blockAlwaysFallThrough(BBInfo &BBI) const {
+ return BBI.IsBrAnalyzable && BBI.TrueBB == nullptr;
+ }
+
+ // IfcvtTokenCmp - Used to sort if-conversion candidates.
+ static bool IfcvtTokenCmp(const std::unique_ptr<IfcvtToken> &C1,
+ const std::unique_ptr<IfcvtToken> &C2) {
+ int Incr1 = (C1->Kind == ICDiamond)
+ ? -(int)(C1->NumDups + C1->NumDups2) : (int)C1->NumDups;
+ int Incr2 = (C2->Kind == ICDiamond)
+ ? -(int)(C2->NumDups + C2->NumDups2) : (int)C2->NumDups;
+ if (Incr1 > Incr2)
+ return true;
+ else if (Incr1 == Incr2) {
+ // Favors subsumption.
+ if (!C1->NeedSubsumption && C2->NeedSubsumption)
+ return true;
+ else if (C1->NeedSubsumption == C2->NeedSubsumption) {
+ // Favors diamond over triangle, etc.
+ if ((unsigned)C1->Kind < (unsigned)C2->Kind)
+ return true;
+ else if (C1->Kind == C2->Kind)
+ return C1->BBI.BB->getNumber() < C2->BBI.BB->getNumber();
+ }
+ }
+ return false;
+ }
+ };
+
+ char IfConverter::ID = 0;
+}
+
+char &llvm::IfConverterID = IfConverter::ID;
+
+INITIALIZE_PASS_BEGIN(IfConverter, "if-converter", "If Converter", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_END(IfConverter, "if-converter", "If Converter", false, false)
+
+bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()) ||
+ (PredicateFtor && !PredicateFtor(*MF.getFunction())))
+ return false;
+
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TLI = ST.getTargetLowering();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
+
+ if (!TII) return false;
+
+ PreRegAlloc = MRI->isSSA();
+
+ bool BFChange = false;
+ if (!PreRegAlloc) {
+ // Tail merging tends to expose more if-conversion opportunities.
+ BranchFolder BF(true, false, MBFI, *MBPI);
+ BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>());
+ }
+
+ DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
+ << MF.getName() << "\'");
+
+ if (FnNum < IfCvtFnStart || (IfCvtFnStop != -1 && FnNum > IfCvtFnStop)) {
+ DEBUG(dbgs() << " skipped\n");
+ return false;
+ }
+ DEBUG(dbgs() << "\n");
+
+ MF.RenumberBlocks();
+ BBAnalysis.resize(MF.getNumBlockIDs());
+
+ std::vector<std::unique_ptr<IfcvtToken>> Tokens;
+ MadeChange = false;
+ unsigned NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle +
+ NumTriangleRev + NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ while (IfCvtLimit == -1 || (int)NumIfCvts < IfCvtLimit) {
+ // Do an initial analysis for each basic block and find all the potential
+ // candidates to perform if-conversion.
+ bool Change = false;
+ AnalyzeBlocks(MF, Tokens);
+ while (!Tokens.empty()) {
+ std::unique_ptr<IfcvtToken> Token = std::move(Tokens.back());
+ Tokens.pop_back();
+ BBInfo &BBI = Token->BBI;
+ IfcvtKind Kind = Token->Kind;
+ unsigned NumDups = Token->NumDups;
+ unsigned NumDups2 = Token->NumDups2;
+
+ // If the block has been evicted out of the queue or it has already been
+ // marked dead (due to it being predicated), then skip it.
+ if (BBI.IsDone)
+ BBI.IsEnqueued = false;
+ if (!BBI.IsEnqueued)
+ continue;
+
+ BBI.IsEnqueued = false;
+
+ bool RetVal = false;
+ switch (Kind) {
+ default: llvm_unreachable("Unexpected!");
+ case ICSimple:
+ case ICSimpleFalse: {
+ bool isFalse = Kind == ICSimpleFalse;
+ if ((isFalse && DisableSimpleF) || (!isFalse && DisableSimple)) break;
+ DEBUG(dbgs() << "Ifcvt (Simple" << (Kind == ICSimpleFalse ?
+ " false" : "")
+ << "): BB#" << BBI.BB->getNumber() << " ("
+ << ((Kind == ICSimpleFalse)
+ ? BBI.FalseBB->getNumber()
+ : BBI.TrueBB->getNumber()) << ") ");
+ RetVal = IfConvertSimple(BBI, Kind);
+ DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) {
+ if (isFalse) ++NumSimpleFalse;
+ else ++NumSimple;
+ }
+ break;
+ }
+ case ICTriangle:
+ case ICTriangleRev:
+ case ICTriangleFalse:
+ case ICTriangleFRev: {
+ bool isFalse = Kind == ICTriangleFalse;
+ bool isRev = (Kind == ICTriangleRev || Kind == ICTriangleFRev);
+ if (DisableTriangle && !isFalse && !isRev) break;
+ if (DisableTriangleR && !isFalse && isRev) break;
+ if (DisableTriangleF && isFalse && !isRev) break;
+ if (DisableTriangleFR && isFalse && isRev) break;
+ DEBUG(dbgs() << "Ifcvt (Triangle");
+ if (isFalse)
+ DEBUG(dbgs() << " false");
+ if (isRev)
+ DEBUG(dbgs() << " rev");
+ DEBUG(dbgs() << "): BB#" << BBI.BB->getNumber() << " (T:"
+ << BBI.TrueBB->getNumber() << ",F:"
+ << BBI.FalseBB->getNumber() << ") ");
+ RetVal = IfConvertTriangle(BBI, Kind);
+ DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) {
+ if (isFalse) {
+ if (isRev) ++NumTriangleFRev;
+ else ++NumTriangleFalse;
+ } else {
+ if (isRev) ++NumTriangleRev;
+ else ++NumTriangle;
+ }
+ }
+ break;
+ }
+ case ICDiamond: {
+ if (DisableDiamond) break;
+ DEBUG(dbgs() << "Ifcvt (Diamond): BB#" << BBI.BB->getNumber() << " (T:"
+ << BBI.TrueBB->getNumber() << ",F:"
+ << BBI.FalseBB->getNumber() << ") ");
+ RetVal = IfConvertDiamond(BBI, Kind, NumDups, NumDups2);
+ DEBUG(dbgs() << (RetVal ? "succeeded!" : "failed!") << "\n");
+ if (RetVal) ++NumDiamonds;
+ break;
+ }
+ }
+
+ Change |= RetVal;
+
+ NumIfCvts = NumSimple + NumSimpleFalse + NumTriangle + NumTriangleRev +
+ NumTriangleFalse + NumTriangleFRev + NumDiamonds;
+ if (IfCvtLimit != -1 && (int)NumIfCvts >= IfCvtLimit)
+ break;
+ }
+
+ if (!Change)
+ break;
+ MadeChange |= Change;
+ }
+
+ Tokens.clear();
+ BBAnalysis.clear();
+
+ if (MadeChange && IfCvtBranchFold) {
+ BranchFolder BF(false, false, MBFI, *MBPI);
+ BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>());
+ }
+
+ MadeChange |= BFChange;
+ return MadeChange;
+}
+
+/// findFalseBlock - BB has a fallthrough. Find its 'false' successor given
+/// its 'true' successor.
+static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *TrueBB) {
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ E = BB->succ_end(); SI != E; ++SI) {
+ MachineBasicBlock *SuccBB = *SI;
+ if (SuccBB != TrueBB)
+ return SuccBB;
+ }
+ return nullptr;
+}
+
+/// ReverseBranchCondition - Reverse the condition of the end of the block
+/// branch. Swap block's 'true' and 'false' successors.
+bool IfConverter::ReverseBranchCondition(BBInfo &BBI) {
+ DebugLoc dl; // FIXME: this is nowhere
+ if (!TII->ReverseBranchCondition(BBI.BrCond)) {
+ TII->RemoveBranch(*BBI.BB);
+ TII->InsertBranch(*BBI.BB, BBI.FalseBB, BBI.TrueBB, BBI.BrCond, dl);
+ std::swap(BBI.TrueBB, BBI.FalseBB);
+ return true;
+ }
+ return false;
+}
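+
+// As a hypothetical example on an ARM-like target, a block ending in
+//   BEQ %bb.true;  B %bb.false
+// becomes
+//   BNE %bb.false; B %bb.true
+// with TrueBB and FalseBB swapped in the BBInfo so later queries still see a
+// consistent picture.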
+
+/// getNextBlock - Returns the next block in the function blocks ordering. If
+/// it is the end, returns NULL.
+static inline MachineBasicBlock *getNextBlock(MachineBasicBlock *BB) {
+ MachineFunction::iterator I = BB->getIterator();
+ MachineFunction::iterator E = BB->getParent()->end();
+ if (++I == E)
+ return nullptr;
+ return &*I;
+}
+
+/// ValidSimple - Returns true if the 'true' block (along with its
+/// predecessor) forms a valid simple shape for ifcvt. It also returns, in
+/// 'Dups', the number of instructions that the ifcvt would need to duplicate
+/// if performed.
+bool IfConverter::ValidSimple(BBInfo &TrueBBI, unsigned &Dups,
+ BranchProbability Prediction) const {
+ Dups = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.IsBrAnalyzable)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied ||
+ !TII->isProfitableToDupForIfCvt(*TrueBBI.BB, TrueBBI.NonPredSize,
+ Prediction))
+ return false;
+ Dups = TrueBBI.NonPredSize;
+ }
+
+ return true;
+}
+
+/// ValidTriangle - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) form a valid triangle shape for ifcvt.
+/// If 'FalseBranch' is true, it checks whether the 'true' block's false branch
+/// branches to the 'false' block rather than the other way around. It also
+/// returns, in 'Dups', the number of instructions that the ifcvt would need
+/// to duplicate if performed.
+bool IfConverter::ValidTriangle(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ bool FalseBranch, unsigned &Dups,
+ BranchProbability Prediction) const {
+ Dups = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone)
+ return false;
+
+ if (TrueBBI.BB->pred_size() > 1) {
+ if (TrueBBI.CannotBeCopied)
+ return false;
+
+ unsigned Size = TrueBBI.NonPredSize;
+ if (TrueBBI.IsBrAnalyzable) {
+ if (TrueBBI.TrueBB && TrueBBI.BrCond.empty())
+ // Ends with an unconditional branch. It will be removed.
+ --Size;
+ else {
+ MachineBasicBlock *FExit = FalseBranch
+ ? TrueBBI.TrueBB : TrueBBI.FalseBB;
+ if (FExit)
+ // Require a conditional branch
+ ++Size;
+ }
+ }
+ if (!TII->isProfitableToDupForIfCvt(*TrueBBI.BB, Size, Prediction))
+ return false;
+ Dups = Size;
+ }
+
+ MachineBasicBlock *TExit = FalseBranch ? TrueBBI.FalseBB : TrueBBI.TrueBB;
+ if (!TExit && blockAlwaysFallThrough(TrueBBI)) {
+ MachineFunction::iterator I = TrueBBI.BB->getIterator();
+ if (++I == TrueBBI.BB->getParent()->end())
+ return false;
+ TExit = &*I;
+ }
+ return TExit && TExit == FalseBBI.BB;
+}
+
+/// ValidDiamond - Returns true if the 'true' and 'false' blocks (along
+/// with their common predecessor) form a valid diamond shape for ifcvt.
+bool IfConverter::ValidDiamond(BBInfo &TrueBBI, BBInfo &FalseBBI,
+ unsigned &Dups1, unsigned &Dups2) const {
+ Dups1 = Dups2 = 0;
+ if (TrueBBI.IsBeingAnalyzed || TrueBBI.IsDone ||
+ FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
+ return false;
+
+ MachineBasicBlock *TT = TrueBBI.TrueBB;
+ MachineBasicBlock *FT = FalseBBI.TrueBB;
+
+ if (!TT && blockAlwaysFallThrough(TrueBBI))
+ TT = getNextBlock(TrueBBI.BB);
+ if (!FT && blockAlwaysFallThrough(FalseBBI))
+ FT = getNextBlock(FalseBBI.BB);
+ if (TT != FT)
+ return false;
+ if (!TT && (TrueBBI.IsBrAnalyzable || FalseBBI.IsBrAnalyzable))
+ return false;
+ if (TrueBBI.BB->pred_size() > 1 || FalseBBI.BB->pred_size() > 1)
+ return false;
+
+ // FIXME: Allow true block to have an early exit?
+ if (TrueBBI.FalseBB || FalseBBI.FalseBB ||
+ (TrueBBI.ClobbersPred && FalseBBI.ClobbersPred))
+ return false;
+
+ // Count duplicate instructions at the beginning of the true and false blocks.
+ MachineBasicBlock::iterator TIB = TrueBBI.BB->begin();
+ MachineBasicBlock::iterator FIB = FalseBBI.BB->begin();
+ MachineBasicBlock::iterator TIE = TrueBBI.BB->end();
+ MachineBasicBlock::iterator FIE = FalseBBI.BB->end();
+ while (TIB != TIE && FIB != FIE) {
+ // Skip dbg_value instructions. These do not count.
+ if (TIB->isDebugValue()) {
+ while (TIB != TIE && TIB->isDebugValue())
+ ++TIB;
+ if (TIB == TIE)
+ break;
+ }
+ if (FIB->isDebugValue()) {
+ while (FIB != FIE && FIB->isDebugValue())
+ ++FIB;
+ if (FIB == FIE)
+ break;
+ }
+ if (!TIB->isIdenticalTo(*FIB))
+ break;
+ ++Dups1;
+ ++TIB;
+ ++FIB;
+ }
+
+ // Now, in preparation for counting duplicate instructions at the ends of the
+ // blocks, move the end iterators up past any branch instructions.
+ // If both blocks are returning, don't skip the branches, since they will
+ // likely both be identical return instructions. In such cases the return
+ // can be left unpredicated.
+ // Check for already containing all of the block.
+ if (TIB == TIE || FIB == FIE)
+ return true;
+ --TIE;
+ --FIE;
+ if (!TrueBBI.BB->succ_empty() || !FalseBBI.BB->succ_empty()) {
+ while (TIE != TIB && TIE->isBranch())
+ --TIE;
+ while (FIE != FIB && FIE->isBranch())
+ --FIE;
+ }
+
+ // If Dups1 includes all of a block, then don't count duplicate
+ // instructions at the end of the blocks.
+ if (TIB == TIE || FIB == FIE)
+ return true;
+
+ // Count duplicate instructions at the ends of the blocks.
+ while (TIE != TIB && FIE != FIB) {
+ // Skip dbg_value instructions. These do not count.
+ if (TIE->isDebugValue()) {
+ while (TIE != TIB && TIE->isDebugValue())
+ --TIE;
+ if (TIE == TIB)
+ break;
+ }
+ if (FIE->isDebugValue()) {
+ while (FIE != FIB && FIE->isDebugValue())
+ --FIE;
+ if (FIE == FIB)
+ break;
+ }
+ if (!TIE->isIdenticalTo(*FIE))
+ break;
+ ++Dups2;
+ --TIE;
+ --FIE;
+ }
+
+ return true;
+}
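+
+// Sketch of the counting above for a hypothetical diamond whose arms are
+//   TBB: a; b; x; c; br TailBB      FBB: a; b; y; c; br TailBB
+// (a, b, c identical on both sides): Dups1 becomes 2 for the shared prefix
+// "a; b", the trailing branches are skipped, and Dups2 becomes 1 for the
+// shared "c". dbg_value instructions are never counted on either side.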
+
+/// ScanInstructions - Scan all the instructions in the block to determine if
+/// the block is predicable. In most cases, that means all the instructions
+/// in the block are isPredicable(). Also checks if the block contains any
+/// instruction which can clobber a predicate (e.g. condition code register).
+/// If so, the block is not predicable unless that instruction is the last one
+/// in the block.
+void IfConverter::ScanInstructions(BBInfo &BBI) {
+ if (BBI.IsDone)
+ return;
+
+ bool AlreadyPredicated = !BBI.Predicate.empty();
+ // First analyze the end of BB branches.
+ BBI.TrueBB = BBI.FalseBB = nullptr;
+ BBI.BrCond.clear();
+ BBI.IsBrAnalyzable =
+ !TII->analyzeBranch(*BBI.BB, BBI.TrueBB, BBI.FalseBB, BBI.BrCond);
+ BBI.HasFallThrough = BBI.IsBrAnalyzable && BBI.FalseBB == nullptr;
+
+ if (BBI.BrCond.size()) {
+ // No false branch. This BB must end with a conditional branch and a
+ // fallthrough.
+ if (!BBI.FalseBB)
+ BBI.FalseBB = findFalseBlock(BBI.BB, BBI.TrueBB);
+ if (!BBI.FalseBB) {
+ // Malformed bcc? True and false blocks are the same?
+ BBI.IsUnpredicable = true;
+ return;
+ }
+ }
+
+ // Then scan all the instructions.
+ BBI.NonPredSize = 0;
+ BBI.ExtraCost = 0;
+ BBI.ExtraCost2 = 0;
+ BBI.ClobbersPred = false;
+ for (auto &MI : *BBI.BB) {
+ if (MI.isDebugValue())
+ continue;
+
+ // It's unsafe to duplicate convergent instructions in this context, so set
+ // BBI.CannotBeCopied to true if MI is convergent. To see why, consider the
+ // following CFG, which is subject to our "simple" transformation.
+ //
+ // BB0 // if (c1) goto BB1; else goto BB2;
+ // / \
+ // BB1 |
+ // | BB2 // if (c2) goto TBB; else goto FBB;
+ // | / |
+ // | / |
+ // TBB |
+ // | |
+ // | FBB
+ // |
+ // exit
+ //
+ // Suppose we want to move TBB's contents up into BB1 and BB2 (in BB1 they'd
+ // be unconditional, and in BB2, they'd be predicated upon c2), and suppose
+ // TBB contains a convergent instruction. This is safe iff doing so does
+ // not add a control-flow dependency to the convergent instruction -- i.e.,
+ // it's safe iff the set of control flows that leads us to the convergent
+ // instruction does not get smaller after the transformation.
+ //
+ // Originally we executed TBB if c1 || c2. After the transformation, there
+ // are two copies of TBB's instructions. We get to the first if c1, and we
+ // get to the second if !c1 && c2.
+ //
+ // There are clearly fewer ways to satisfy the condition "c1" than
+ // "c1 || c2". Since we've shrunk the set of control flows which lead to
+ // our convergent instruction, the transformation is unsafe.
+ if (MI.isNotDuplicable() || MI.isConvergent())
+ BBI.CannotBeCopied = true;
+
+ bool isPredicated = TII->isPredicated(MI);
+ bool isCondBr = BBI.IsBrAnalyzable && MI.isConditionalBranch();
+
+ // A conditional branch is not predicable, but it may be eliminated.
+ if (isCondBr)
+ continue;
+
+ if (!isPredicated) {
+ BBI.NonPredSize++;
+ unsigned ExtraPredCost = TII->getPredicationCost(MI);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&MI, false);
+ if (NumCycles > 1)
+ BBI.ExtraCost += NumCycles-1;
+ BBI.ExtraCost2 += ExtraPredCost;
+ } else if (!AlreadyPredicated) {
+ // FIXME: This instruction is already predicated before the
+ // if-conversion pass. It's probably something like a conditional move.
+ // Mark this block unpredicable for now.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+ if (BBI.ClobbersPred && !isPredicated) {
+ // A predicate-modifying instruction should end the block (except for
+ // already predicated instructions and end of block branches).
+ // The predicate may have been modified, so the subsequent (currently)
+ // unpredicated instructions cannot be correctly predicated.
+ BBI.IsUnpredicable = true;
+ return;
+ }
+
+ // FIXME: Make use of PredDefs? e.g. ADDC, SUBC sets predicates but are
+ // still potentially predicable.
+ std::vector<MachineOperand> PredDefs;
+ if (TII->DefinesPredicate(MI, PredDefs))
+ BBI.ClobbersPred = true;
+
+ if (!TII->isPredicable(MI)) {
+ BBI.IsUnpredicable = true;
+ return;
+ }
+ }
+}
+
+/// FeasibilityAnalysis - Determine if the block is a suitable candidate to be
+/// predicated by the specified predicate.
+bool IfConverter::FeasibilityAnalysis(BBInfo &BBI,
+ SmallVectorImpl<MachineOperand> &Pred,
+ bool isTriangle, bool RevBranch) {
+ // If the block is dead or unpredicable, then it cannot be predicated.
+ if (BBI.IsDone || BBI.IsUnpredicable)
+ return false;
+
+ // If it is already predicated but we couldn't analyze its terminator, the
+ // latter might fall through, but we can't determine where to.
+ // Conservatively avoid if-converting again.
+ if (BBI.Predicate.size() && !BBI.IsBrAnalyzable)
+ return false;
+
+ // If it is already predicated, check if the new predicate subsumes
+ // its predicate.
+ if (BBI.Predicate.size() && !TII->SubsumesPredicate(Pred, BBI.Predicate))
+ return false;
+
+ if (BBI.BrCond.size()) {
+ if (!isTriangle)
+ return false;
+
+ // Test predicate subsumption.
+ SmallVector<MachineOperand, 4> RevPred(Pred.begin(), Pred.end());
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (RevBranch) {
+ if (TII->ReverseBranchCondition(Cond))
+ return false;
+ }
+ if (TII->ReverseBranchCondition(RevPred) ||
+ !TII->SubsumesPredicate(Cond, RevPred))
+ return false;
+ }
+
+ return true;
+}
+
+/// AnalyzeBlock - Analyze the structure of the sub-CFG starting from
+/// the specified block. Record its successors and whether it looks like an
+/// if-conversion candidate.
+void IfConverter::AnalyzeBlock(
+ MachineBasicBlock *MBB, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
+ struct BBState {
+ BBState(MachineBasicBlock *BB) : MBB(BB), SuccsAnalyzed(false) {}
+ MachineBasicBlock *MBB;
+
+ /// This flag is true if MBB's successors have been analyzed.
+ bool SuccsAnalyzed;
+ };
+
+ // Push MBB to the stack.
+ SmallVector<BBState, 16> BBStack(1, MBB);
+
+ while (!BBStack.empty()) {
+ BBState &State = BBStack.back();
+ MachineBasicBlock *BB = State.MBB;
+ BBInfo &BBI = BBAnalysis[BB->getNumber()];
+
+ if (!State.SuccsAnalyzed) {
+ if (BBI.IsAnalyzed || BBI.IsBeingAnalyzed) {
+ BBStack.pop_back();
+ continue;
+ }
+
+ BBI.BB = BB;
+ BBI.IsBeingAnalyzed = true;
+
+ ScanInstructions(BBI);
+
+ // Unanalyzable, or ends with a fallthrough or an unconditional branch, or
+ // is not considered for ifcvt anymore.
+ if (!BBI.IsBrAnalyzable || BBI.BrCond.empty() || BBI.IsDone) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Do not ifcvt if either path is a back edge to the entry block.
+ if (BBI.TrueBB == BB || BBI.FalseBB == BB) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Do not ifcvt if true and false fallthrough blocks are the same.
+ if (!BBI.FalseBB) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ // Push the False and True blocks to the stack.
+ State.SuccsAnalyzed = true;
+ BBStack.push_back(BBI.FalseBB);
+ BBStack.push_back(BBI.TrueBB);
+ continue;
+ }
+
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+
+ if (TrueBBI.IsDone && FalseBBI.IsDone) {
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ continue;
+ }
+
+ SmallVector<MachineOperand, 4>
+ RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ bool CanRevCond = !TII->ReverseBranchCondition(RevCond);
+
+ unsigned Dups = 0;
+ unsigned Dups2 = 0;
+ bool TNeedSub = !TrueBBI.Predicate.empty();
+ bool FNeedSub = !FalseBBI.Predicate.empty();
+ bool Enqueued = false;
+
+ BranchProbability Prediction = MBPI->getEdgeProbability(BB, TrueBBI.BB);
+
+ if (CanRevCond && ValidDiamond(TrueBBI, FalseBBI, Dups, Dups2) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, (TrueBBI.NonPredSize - (Dups + Dups2) +
+ TrueBBI.ExtraCost), TrueBBI.ExtraCost2,
+ *FalseBBI.BB, (FalseBBI.NonPredSize - (Dups + Dups2) +
+ FalseBBI.ExtraCost),FalseBBI.ExtraCost2,
+ Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond) &&
+ FeasibilityAnalysis(FalseBBI, RevCond)) {
+ // Diamond:
+ // EBB
+ // / \_
+ // | |
+ // TBB FBB
+ // \ /
+ // TailBB
+ // Note TailBB can be empty.
+ Tokens.push_back(llvm::make_unique<IfcvtToken>(
+ BBI, ICDiamond, TNeedSub | FNeedSub, Dups, Dups2));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, false, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true)) {
+ // Triangle:
+ // EBB
+ // | \_
+ // | |
+ // | TBB
+ // | /
+ // FBB
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangle, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(TrueBBI, FalseBBI, true, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond, true, true)) {
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangleRev, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(TrueBBI, Dups, Prediction) &&
+ MeetIfcvtSizeLimit(*TrueBBI.BB, TrueBBI.NonPredSize + TrueBBI.ExtraCost,
+ TrueBBI.ExtraCost2, Prediction) &&
+ FeasibilityAnalysis(TrueBBI, BBI.BrCond)) {
+ // Simple (split, no rejoin):
+ // EBB
+ // | \_
+ // | |
+ // | TBB---> exit
+ // |
+ // FBB
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICSimple, TNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (CanRevCond) {
+ // Try the other path...
+ if (ValidTriangle(FalseBBI, TrueBBI, false, Dups,
+ Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true)) {
+ Tokens.push_back(llvm::make_unique<IfcvtToken>(BBI, ICTriangleFalse,
+ FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidTriangle(FalseBBI, TrueBBI, true, Dups,
+ Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond, true, true)) {
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICTriangleFRev, FNeedSub, Dups));
+ Enqueued = true;
+ }
+
+ if (ValidSimple(FalseBBI, Dups, Prediction.getCompl()) &&
+ MeetIfcvtSizeLimit(*FalseBBI.BB,
+ FalseBBI.NonPredSize + FalseBBI.ExtraCost,
+ FalseBBI.ExtraCost2, Prediction.getCompl()) &&
+ FeasibilityAnalysis(FalseBBI, RevCond)) {
+ Tokens.push_back(
+ llvm::make_unique<IfcvtToken>(BBI, ICSimpleFalse, FNeedSub, Dups));
+ Enqueued = true;
+ }
+ }
+
+ BBI.IsEnqueued = Enqueued;
+ BBI.IsBeingAnalyzed = false;
+ BBI.IsAnalyzed = true;
+ BBStack.pop_back();
+ }
+}
+
+/// AnalyzeBlocks - Analyze all blocks and find entries for all if-conversion
+/// candidates.
+void IfConverter::AnalyzeBlocks(
+ MachineFunction &MF, std::vector<std::unique_ptr<IfcvtToken>> &Tokens) {
+ for (auto &BB : MF)
+ AnalyzeBlock(&BB, Tokens);
+
+ // Sort to favor more complex ifcvt scheme.
+ std::stable_sort(Tokens.begin(), Tokens.end(), IfcvtTokenCmp);
+}
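+
+// Since candidates are consumed from the back of the vector, this ordering
+// roughly means diamonds are attempted before triangles and simples; among
+// simples and triangles, candidates that would duplicate fewer instructions
+// are tried first, with ties broken by subsumption, kind, and block number.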
+
+/// canFallThroughTo - Returns true if ToBB is the next block after BB, or if
+/// all the intervening blocks are empty (given that BB can fall through to
+/// its next block).
+static bool canFallThroughTo(MachineBasicBlock *BB, MachineBasicBlock *ToBB) {
+ MachineFunction::iterator PI = BB->getIterator();
+ MachineFunction::iterator I = std::next(PI);
+ MachineFunction::iterator TI = ToBB->getIterator();
+ MachineFunction::iterator E = BB->getParent()->end();
+ while (I != TI) {
+ // Check isSuccessor to avoid case where the next block is empty, but
+ // it's not a successor.
+ if (I == E || !I->empty() || !PI->isSuccessor(&*I))
+ return false;
+ PI = I++;
+ }
+ return true;
+}
+
+/// InvalidatePreds - Invalidate predecessor BB info so it will be re-analyzed
+/// to determine if it can be if-converted. If a predecessor is already
+/// enqueued, dequeue it!
+void IfConverter::InvalidatePreds(MachineBasicBlock *BB) {
+ for (const auto &Predecessor : BB->predecessors()) {
+ BBInfo &PBBI = BBAnalysis[Predecessor->getNumber()];
+ if (PBBI.IsDone || PBBI.BB == BB)
+ continue;
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+}
+
+/// InsertUncondBranch - Inserts an unconditional branch from BB to ToBB.
+///
+static void InsertUncondBranch(MachineBasicBlock *BB, MachineBasicBlock *ToBB,
+ const TargetInstrInfo *TII) {
+ DebugLoc dl; // FIXME: this is nowhere
+ SmallVector<MachineOperand, 0> NoCond;
+ TII->InsertBranch(*BB, ToBB, nullptr, NoCond, dl);
+}
+
+/// RemoveExtraEdges - Remove true / false edges if either / both are no longer
+/// successors.
+void IfConverter::RemoveExtraEdges(BBInfo &BBI) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*BBI.BB, TBB, FBB, Cond))
+ BBI.BB->CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
+}
+
+/// Behaves like LiveRegUnits::StepForward() but also adds implicit uses to all
+/// values defined in MI which are not live/used by MI.
+static void UpdatePredRedefs(MachineInstr &MI, LivePhysRegs &Redefs) {
+ SmallVector<std::pair<unsigned, const MachineOperand*>, 4> Clobbers;
+ Redefs.stepForward(MI, Clobbers);
+
+ // Now add the implicit uses for each of the clobbered values.
+ for (auto Reg : Clobbers) {
+ // FIXME: Const cast here is nasty, but better than making StepForward
+ // take a mutable instruction instead of const.
+ MachineOperand &Op = const_cast<MachineOperand&>(*Reg.second);
+ MachineInstr *OpMI = Op.getParent();
+ MachineInstrBuilder MIB(*OpMI->getParent()->getParent(), OpMI);
+ if (Op.isRegMask()) {
+ // First handle regmasks. They clobber any entries in the mask which
+ // means that we need a def for those registers.
+ MIB.addReg(Reg.first, RegState::Implicit | RegState::Undef);
+
+ // We also need to add an implicit def of this register for the later
+ // use to read from.
+ // For the register allocator to have allocated a register clobbered
+ // by the call which is used later, it must be the case that
+ // the call doesn't return.
+ MIB.addReg(Reg.first, RegState::Implicit | RegState::Define);
+ continue;
+ }
+ assert(Op.isReg() && "Register operand required");
+ if (Op.isDead()) {
+ // If we found a dead def, but it needs to be live, then remove the dead
+ // flag.
+ if (Redefs.contains(Op.getReg()))
+ Op.setIsDead(false);
+ }
+ MIB.addReg(Reg.first, RegState::Implicit | RegState::Undef);
+ }
+}
+
+/**
+ * Remove kill flags from operands with registers in the @p DontKill set.
+ */
+static void RemoveKills(MachineInstr &MI, const LivePhysRegs &DontKill) {
+ for (MIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->isKill())
+ continue;
+ if (DontKill.contains(O->getReg()))
+ O->setIsKill(false);
+ }
+}
+
+/**
+ * Walks a range of machine instructions and removes kill flags for registers
+ * in the @p DontKill set.
+ */
+static void RemoveKills(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E,
+ const LivePhysRegs &DontKill,
+ const MCRegisterInfo &MCRI) {
+ for ( ; I != E; ++I)
+ RemoveKills(*I, DontKill);
+}
+
+/// IfConvertSimple - If convert a simple (split, no rejoin) sub-CFG.
+///
+bool IfConverter::IfConvertSimple(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICSimpleFalse)
+ std::swap(CvtBBI, NextBBI);
+
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (CvtBBI->BB->hasAddressTaken())
+ // Conservatively abort if-conversion if BB's address is taken.
+ return false;
+
+ if (Kind == ICSimpleFalse)
+ if (TII->ReverseBranchCondition(Cond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ Redefs.init(TRI);
+ Redefs.addLiveIns(*CvtBBI->BB);
+ Redefs.addLiveIns(*NextBBI->BB);
+
+ // Compute a set of registers which must not be killed by instructions in
+ // BB1: This is everything live-in to BB2.
+ DontKill.init(TRI);
+ DontKill.addLiveIns(*NextBBI->BB);
+
+ if (CvtBBI->BB->pred_size() > 1) {
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond);
+
+ // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
+ // explicitly remove CvtBBI as a successor.
+ BBI.BB->removeSuccessor(CvtBBI->BB, true);
+ } else {
+ RemoveKills(CvtBBI->BB->begin(), CvtBBI->BB->end(), DontKill, *TRI);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+
+ // Merge converted block into entry block.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ MergeBlocks(BBI, *CvtBBI);
+ }
+
+ bool IterIfcvt = true;
+ if (!canFallThroughTo(BBI.BB, NextBBI->BB)) {
+ InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+ BBI.HasFallThrough = false;
+ // Now ifcvt'd block will look like this:
+ // BB:
+ // ...
+ // t, f = cmp
+ // if t op
+ // b BBf
+ //
+ // We cannot further ifcvt this block because the unconditional branch
+ // will have to be predicated on the new condition, which will not be
+ // available if the cmp executes.
+ IterIfcvt = false;
+ }
+
+ RemoveExtraEdges(BBI);
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+ CvtBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// IfConvertTriangle - If convert a triangle sub-CFG.
+///
+bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ BBInfo *CvtBBI = &TrueBBI;
+ BBInfo *NextBBI = &FalseBBI;
+ DebugLoc dl; // FIXME: this is nowhere
+
+ SmallVector<MachineOperand, 4> Cond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ std::swap(CvtBBI, NextBBI);
+
+ if (CvtBBI->IsDone ||
+ (CvtBBI->CannotBeCopied && CvtBBI->BB->pred_size() > 1)) {
+ // Something has changed. It's no longer safe to predicate this block.
+ BBI.IsAnalyzed = false;
+ CvtBBI->IsAnalyzed = false;
+ return false;
+ }
+
+ if (CvtBBI->BB->hasAddressTaken())
+ // Conservatively abort if-conversion if BB's address is taken.
+ return false;
+
+ if (Kind == ICTriangleFalse || Kind == ICTriangleFRev)
+ if (TII->ReverseBranchCondition(Cond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ if (Kind == ICTriangleRev || Kind == ICTriangleFRev) {
+ if (ReverseBranchCondition(*CvtBBI)) {
+ // BB has been changed, modify its predecessors (except for this
+ // one) so they don't get ifcvt'ed based on bad intel.
+ for (MachineBasicBlock::pred_iterator PI = CvtBBI->BB->pred_begin(),
+ E = CvtBBI->BB->pred_end(); PI != E; ++PI) {
+ MachineBasicBlock *PBB = *PI;
+ if (PBB == BBI.BB)
+ continue;
+ BBInfo &PBBI = BBAnalysis[PBB->getNumber()];
+ if (PBBI.IsEnqueued) {
+ PBBI.IsAnalyzed = false;
+ PBBI.IsEnqueued = false;
+ }
+ }
+ }
+ }
+
+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ Redefs.init(TRI);
+ Redefs.addLiveIns(*CvtBBI->BB);
+ Redefs.addLiveIns(*NextBBI->BB);
+
+ DontKill.clear();
+
+ bool HasEarlyExit = CvtBBI->FalseBB != nullptr;
+ BranchProbability CvtNext, CvtFalse, BBNext, BBCvt;
+
+ if (HasEarlyExit) {
+ // Get probabilities before modifying CvtBBI->BB and BBI.BB.
+ CvtNext = MBPI->getEdgeProbability(CvtBBI->BB, NextBBI->BB);
+ CvtFalse = MBPI->getEdgeProbability(CvtBBI->BB, CvtBBI->FalseBB);
+ BBNext = MBPI->getEdgeProbability(BBI.BB, NextBBI->BB);
+ BBCvt = MBPI->getEdgeProbability(BBI.BB, CvtBBI->BB);
+ }
+
+ if (CvtBBI->BB->pred_size() > 1) {
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ // Copy instructions in the true block, predicate them, and add them to
+ // the entry block.
+ CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true);
+
+ // RemoveExtraEdges won't work if the block has an unanalyzable branch, so
+ // explicitly remove CvtBBI as a successor.
+ BBI.BB->removeSuccessor(CvtBBI->BB, true);
+ } else {
+ // Predicate the 'true' block after removing its branch.
+ CvtBBI->NonPredSize -= TII->RemoveBranch(*CvtBBI->BB);
+ PredicateBlock(*CvtBBI, CvtBBI->BB->end(), Cond);
+
+ // Now merge the entry of the triangle with the true block.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+ MergeBlocks(BBI, *CvtBBI, false);
+ }
+
+ // If 'true' block has a 'false' successor, add an exit branch to it.
+ if (HasEarlyExit) {
+ SmallVector<MachineOperand, 4> RevCond(CvtBBI->BrCond.begin(),
+ CvtBBI->BrCond.end());
+ if (TII->ReverseBranchCondition(RevCond))
+ llvm_unreachable("Unable to reverse branch condition!");
+
+ // Update the edge probability for both CvtBBI->FalseBB and NextBBI.
+ // NewNext = New_Prob(BBI.BB, NextBBI->BB) =
+ // Prob(BBI.BB, NextBBI->BB) +
+ // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, NextBBI->BB)
+ // NewFalse = New_Prob(BBI.BB, CvtBBI->FalseBB) =
+ // Prob(BBI.BB, CvtBBI->BB) * Prob(CvtBBI->BB, CvtBBI->FalseBB)
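+ // For instance, with hypothetical probabilities Prob(BBI.BB, NextBBI->BB)
+ // = 0.3, Prob(BBI.BB, CvtBBI->BB) = 0.7, Prob(CvtBBI->BB, NextBBI->BB) =
+ // 0.6 and Prob(CvtBBI->BB, CvtBBI->FalseBB) = 0.4, this gives
+ // NewNext = 0.3 + 0.7 * 0.6 = 0.72 and NewFalse = 0.7 * 0.4 = 0.28.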
+ auto NewTrueBB = getNextBlock(BBI.BB);
+ auto NewNext = BBNext + BBCvt * CvtNext;
+ auto NewTrueBBIter =
+ std::find(BBI.BB->succ_begin(), BBI.BB->succ_end(), NewTrueBB);
+ if (NewTrueBBIter != BBI.BB->succ_end())
+ BBI.BB->setSuccProbability(NewTrueBBIter, NewNext);
+
+ auto NewFalse = BBCvt * CvtFalse;
+ TII->InsertBranch(*BBI.BB, CvtBBI->FalseBB, nullptr, RevCond, dl);
+ BBI.BB->addSuccessor(CvtBBI->FalseBB, NewFalse);
+ }
+
+ // Merge in the 'false' block if the 'false' block has no other
+ // predecessors. Otherwise, add an unconditional branch to 'false'.
+ bool FalseBBDead = false;
+ bool IterIfcvt = true;
+ bool isFallThrough = canFallThroughTo(BBI.BB, NextBBI->BB);
+ if (!isFallThrough) {
+ // Only merge them if the true block does not fallthrough to the false
+ // block. By not merging them, we make it possible to iteratively
+ // ifcvt the blocks.
+ if (!HasEarlyExit &&
+ NextBBI->BB->pred_size() == 1 && !NextBBI->HasFallThrough &&
+ !NextBBI->BB->hasAddressTaken()) {
+ MergeBlocks(BBI, *NextBBI);
+ FalseBBDead = true;
+ } else {
+ InsertUncondBranch(BBI.BB, NextBBI->BB, TII);
+ BBI.HasFallThrough = false;
+ }
+ // Mixed predicated and unpredicated code. This cannot be iteratively
+ // predicated.
+ IterIfcvt = false;
+ }
+
+ RemoveExtraEdges(BBI);
+
+ // Update block info. BB can be iteratively if-converted.
+ if (!IterIfcvt)
+ BBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+ CvtBBI->IsDone = true;
+ if (FalseBBDead)
+ NextBBI->IsDone = true;
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
+/// IfConvertDiamond - If convert a diamond sub-CFG.
+///
+bool IfConverter::IfConvertDiamond(BBInfo &BBI, IfcvtKind Kind,
+ unsigned NumDups1, unsigned NumDups2) {
+ BBInfo &TrueBBI = BBAnalysis[BBI.TrueBB->getNumber()];
+ BBInfo &FalseBBI = BBAnalysis[BBI.FalseBB->getNumber()];
+ MachineBasicBlock *TailBB = TrueBBI.TrueBB;
+ // True block must fall through or end with an unanalyzable terminator.
+ if (!TailBB) {
+ if (blockAlwaysFallThrough(TrueBBI))
+ TailBB = FalseBBI.TrueBB;
+ assert((TailBB || !TrueBBI.IsBrAnalyzable) && "Unexpected!");
+ }
+
+ if (TrueBBI.IsDone || FalseBBI.IsDone ||
+ TrueBBI.BB->pred_size() > 1 ||
+ FalseBBI.BB->pred_size() > 1) {
+ // Something has changed. It's no longer safe to predicate these blocks.
+ BBI.IsAnalyzed = false;
+ TrueBBI.IsAnalyzed = false;
+ FalseBBI.IsAnalyzed = false;
+ return false;
+ }
+
+ if (TrueBBI.BB->hasAddressTaken() || FalseBBI.BB->hasAddressTaken())
+ // Conservatively abort if-conversion if either BB has its address taken.
+ return false;
+
+ // Put the predicated instructions from the 'true' block before the
+ // instructions from the 'false' block, unless the true block would clobber
+ // the predicate, in which case, do the opposite.
+ BBInfo *BBI1 = &TrueBBI;
+ BBInfo *BBI2 = &FalseBBI;
+ SmallVector<MachineOperand, 4> RevCond(BBI.BrCond.begin(), BBI.BrCond.end());
+ if (TII->ReverseBranchCondition(RevCond))
+ llvm_unreachable("Unable to reverse branch condition!");
+ SmallVector<MachineOperand, 4> *Cond1 = &BBI.BrCond;
+ SmallVector<MachineOperand, 4> *Cond2 = &RevCond;
+
+ // Figure out the more profitable ordering.
+ bool DoSwap = false;
+ if (TrueBBI.ClobbersPred && !FalseBBI.ClobbersPred)
+ DoSwap = true;
+ else if (TrueBBI.ClobbersPred == FalseBBI.ClobbersPred) {
+ if (TrueBBI.NonPredSize > FalseBBI.NonPredSize)
+ DoSwap = true;
+ }
+ if (DoSwap) {
+ std::swap(BBI1, BBI2);
+ std::swap(Cond1, Cond2);
+ }
+
+ // Remove the conditional branch from entry to the blocks.
+ BBI.NonPredSize -= TII->RemoveBranch(*BBI.BB);
+
+ // Initialize liveins to the first BB. These are potentially redefined by
+ // predicated instructions.
+ Redefs.init(TRI);
+ Redefs.addLiveIns(*BBI1->BB);
+
+ // Remove the duplicated instructions at the beginnings of both paths.
+ // Skip dbg_value instructions
+ MachineBasicBlock::iterator DI1 = BBI1->BB->getFirstNonDebugInstr();
+ MachineBasicBlock::iterator DI2 = BBI2->BB->getFirstNonDebugInstr();
+ BBI1->NonPredSize -= NumDups1;
+ BBI2->NonPredSize -= NumDups1;
+
+ // Skip past the dups on each side separately since there may be
+ // differing dbg_value entries.
+ for (unsigned i = 0; i < NumDups1; ++DI1) {
+ if (!DI1->isDebugValue())
+ ++i;
+ }
+ while (NumDups1 != 0) {
+ ++DI2;
+ if (!DI2->isDebugValue())
+ --NumDups1;
+ }
+
+ // Compute a set of registers which must not be killed by instructions in BB1:
+ // This is everything used+live in BB2 after the duplicated instructions. We
+ // can compute this set by simulating liveness backwards from the end of BB2.
+ DontKill.init(TRI);
+ for (MachineBasicBlock::reverse_iterator I = BBI2->BB->rbegin(),
+ E = MachineBasicBlock::reverse_iterator(DI2); I != E; ++I) {
+ DontKill.stepBackward(*I);
+ }
+
+ for (MachineBasicBlock::const_iterator I = BBI1->BB->begin(), E = DI1; I != E;
+ ++I) {
+ SmallVector<std::pair<unsigned, const MachineOperand*>, 4> IgnoredClobbers;
+ Redefs.stepForward(*I, IgnoredClobbers);
+ }
+ BBI.BB->splice(BBI.BB->end(), BBI1->BB, BBI1->BB->begin(), DI1);
+ BBI2->BB->erase(BBI2->BB->begin(), DI2);
+
+ // Remove branch from the 'true' block, unless it was not analyzable.
+ // Non-analyzable branches need to be preserved, since in such cases,
+ // the CFG structure is not an actual diamond (the join block may not
+ // be present).
+ if (BBI1->IsBrAnalyzable)
+ BBI1->NonPredSize -= TII->RemoveBranch(*BBI1->BB);
+ // Remove duplicated instructions.
+ DI1 = BBI1->BB->end();
+ for (unsigned i = 0; i != NumDups2; ) {
+ // NumDups2 only counted non-dbg_value instructions, so this won't
+ // run off the head of the list.
+ assert (DI1 != BBI1->BB->begin());
+ --DI1;
+ // skip dbg_value instructions
+ if (!DI1->isDebugValue())
+ ++i;
+ }
+ BBI1->BB->erase(DI1, BBI1->BB->end());
+
+ // Kill flags in the true block for registers living into the false block
+ // must be removed.
+ RemoveKills(BBI1->BB->begin(), BBI1->BB->end(), DontKill, *TRI);
+
+ // Remove 'false' block branch (unless it was not analyzable), and find
+ // the last instruction to predicate.
+ if (BBI2->IsBrAnalyzable)
+ BBI2->NonPredSize -= TII->RemoveBranch(*BBI2->BB);
+ DI2 = BBI2->BB->end();
+ while (NumDups2 != 0) {
+ // NumDups2 only counted non-dbg_value instructions, so this won't
+ // run off the head of the list.
+ assert (DI2 != BBI2->BB->begin());
+ --DI2;
+ // skip dbg_value instructions
+ if (!DI2->isDebugValue())
+ --NumDups2;
+ }
+
+ // Remember which registers would later be defined by the false block.
+ // This allows us not to predicate instructions in the true block that would
+ // later be re-defined. That is, rather than
+ // subeq r0, r1, #1
+ // addne r0, r1, #1
+ // generate:
+ // sub r0, r1, #1
+ // addne r0, r1, #1
+ SmallSet<unsigned, 4> RedefsByFalse;
+ SmallSet<unsigned, 4> ExtUses;
+ if (TII->isProfitableToUnpredicate(*BBI1->BB, *BBI2->BB)) {
+ for (MachineBasicBlock::iterator FI = BBI2->BB->begin(); FI != DI2; ++FI) {
+ if (FI->isDebugValue())
+ continue;
+ SmallVector<unsigned, 4> Defs;
+ for (unsigned i = 0, e = FI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = FI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef()) {
+ Defs.push_back(Reg);
+ } else if (!RedefsByFalse.count(Reg)) {
+ // These are defined before control flow reaches the 'false' instructions.
+ // They cannot be modified by the 'true' instructions.
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ ExtUses.insert(*SubRegs);
+ }
+ }
+
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ unsigned Reg = Defs[i];
+ if (!ExtUses.count(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ RedefsByFalse.insert(*SubRegs);
+ }
+ }
+ }
+ }
+
+ // Predicate the 'true' block.
+ PredicateBlock(*BBI1, BBI1->BB->end(), *Cond1, &RedefsByFalse);
+
+ // After predicating BBI1, if there is a predicated terminator in BBI1 and
+ // a non-predicated in BBI2, then we don't want to predicate the one from
+ // BBI2. The reason is that if we merged these blocks, we would end up with
+ // two predicated terminators in the same block.
+ if (!BBI2->BB->empty() && (DI2 == BBI2->BB->end())) {
+ MachineBasicBlock::iterator BBI1T = BBI1->BB->getFirstTerminator();
+ MachineBasicBlock::iterator BBI2T = BBI2->BB->getFirstTerminator();
+ if (BBI1T != BBI1->BB->end() && TII->isPredicated(*BBI1T) &&
+ BBI2T != BBI2->BB->end() && !TII->isPredicated(*BBI2T))
+ --DI2;
+ }
+
+ // Predicate the 'false' block.
+ PredicateBlock(*BBI2, DI2, *Cond2);
+
+ // Merge the true block into the entry of the diamond.
+ MergeBlocks(BBI, *BBI1, TailBB == nullptr);
+ MergeBlocks(BBI, *BBI2, TailBB == nullptr);
+
+ // If the if-converted block falls through or unconditionally branches into
+ // the tail block, and the tail block does not have other predecessors, then
+ // fold the tail block in as well. Otherwise, unless it falls through to the
+ // tail, add an unconditional branch to it.
+ if (TailBB) {
+ BBInfo &TailBBI = BBAnalysis[TailBB->getNumber()];
+ bool CanMergeTail = !TailBBI.HasFallThrough &&
+ !TailBBI.BB->hasAddressTaken();
+ // The if-converted block can still have a predicated terminator
+ // (e.g. a predicated return). If that is the case, we cannot merge
+ // it with the tail block.
+ MachineBasicBlock::const_iterator TI = BBI.BB->getFirstTerminator();
+ if (TI != BBI.BB->end() && TII->isPredicated(*TI))
+ CanMergeTail = false;
+ // There may still be a fall-through edge from BBI1 or BBI2 to TailBB;
+ // check if there are any other predecessors besides those.
+ unsigned NumPreds = TailBB->pred_size();
+ if (NumPreds > 1)
+ CanMergeTail = false;
+ else if (NumPreds == 1 && CanMergeTail) {
+ MachineBasicBlock::pred_iterator PI = TailBB->pred_begin();
+ if (*PI != BBI1->BB && *PI != BBI2->BB)
+ CanMergeTail = false;
+ }
+ if (CanMergeTail) {
+ MergeBlocks(BBI, TailBBI);
+ TailBBI.IsDone = true;
+ } else {
+ BBI.BB->addSuccessor(TailBB, BranchProbability::getOne());
+ InsertUncondBranch(BBI.BB, TailBB, TII);
+ BBI.HasFallThrough = false;
+ }
+ }
+
+ // RemoveExtraEdges won't work if the block has an unanalyzable branch,
+ // which can happen here if TailBB is unanalyzable and is merged, so
+ // explicitly remove BBI1 and BBI2 as successors.
+ BBI.BB->removeSuccessor(BBI1->BB);
+ BBI.BB->removeSuccessor(BBI2->BB, true);
+ RemoveExtraEdges(BBI);
+
+ // Update block info.
+ BBI.IsDone = TrueBBI.IsDone = FalseBBI.IsDone = true;
+ InvalidatePreds(BBI.BB);
+
+ // FIXME: Must maintain LiveIns.
+ return true;
+}
+
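+/// MaySpeculate - Return true if MI is safe to leave unpredicated: it must be
+/// safe to move, and every register it defines must appear in LaterRedefs,
+/// i.e. the 'false' block redefines it anyway.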
+static bool MaySpeculate(const MachineInstr &MI,
+ SmallSet<unsigned, 4> &LaterRedefs) {
+ bool SawStore = true;
+ if (!MI.isSafeToMove(nullptr, SawStore))
+ return false;
+
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (MO.isDef() && !LaterRedefs.count(Reg))
+ return false;
+ }
+
+ return true;
+}
+
+/// PredicateBlock - Predicate instructions from the start of the block to the
+/// specified end with the specified condition.
+void IfConverter::PredicateBlock(BBInfo &BBI,
+ MachineBasicBlock::iterator E,
+ SmallVectorImpl<MachineOperand> &Cond,
+ SmallSet<unsigned, 4> *LaterRedefs) {
+ bool AnyUnpred = false;
+ bool MaySpec = LaterRedefs != nullptr;
+ for (MachineInstr &I : llvm::make_range(BBI.BB->begin(), E)) {
+ if (I.isDebugValue() || TII->isPredicated(I))
+ continue;
+ // It may be possible not to predicate an instruction if it's the 'true'
+ // side of a diamond and the 'false' side may re-define the instruction's
+ // defs.
+ if (MaySpec && MaySpeculate(I, *LaterRedefs)) {
+ AnyUnpred = true;
+ continue;
+ }
+ // If any instruction is predicated, then every instruction after it must
+ // be predicated.
+ MaySpec = false;
+ if (!TII->PredicateInstruction(I, Cond)) {
+#ifndef NDEBUG
+ dbgs() << "Unable to predicate " << I << "!\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+
+ // If the predicated instruction now redefines a register as the result of
+ // if-conversion, add an implicit kill.
+ UpdatePredRedefs(I, Redefs);
+ }
+
+ BBI.Predicate.append(Cond.begin(), Cond.end());
+
+ BBI.IsAnalyzed = false;
+ BBI.NonPredSize = 0;
+
+ ++NumIfConvBBs;
+ if (AnyUnpred)
+ ++NumUnpred;
+}
+
+/// CopyAndPredicateBlock - Copy and predicate instructions from source BB to
+/// the destination block. Skip end of block branches if IgnoreBr is true.
+void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool IgnoreBr) {
+ MachineFunction &MF = *ToBBI.BB->getParent();
+
+ for (auto &I : *FromBBI.BB) {
+ // Do not copy the end of the block branches.
+ if (IgnoreBr && I.isBranch())
+ break;
+
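+    // Clone the instruction into the destination block and account for its
+    // size and predication cost in ToBBI's estimates.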
+ MachineInstr *MI = MF.CloneMachineInstr(&I);
+ ToBBI.BB->insert(ToBBI.BB->end(), MI);
+ ToBBI.NonPredSize++;
+ unsigned ExtraPredCost = TII->getPredicationCost(I);
+ unsigned NumCycles = SchedModel.computeInstrLatency(&I, false);
+ if (NumCycles > 1)
+ ToBBI.ExtraCost += NumCycles-1;
+ ToBBI.ExtraCost2 += ExtraPredCost;
+
+ if (!TII->isPredicated(I) && !MI->isDebugValue()) {
+ if (!TII->PredicateInstruction(*MI, Cond)) {
+#ifndef NDEBUG
+ dbgs() << "Unable to predicate " << I << "!\n";
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ // If the predicated instruction now redefines a register as the result of
+ // if-conversion, add an implicit kill.
+ UpdatePredRedefs(*MI, Redefs);
+
+ // Some kill flags may not be correct anymore.
+ if (!DontKill.empty())
+ RemoveKills(*MI, DontKill);
+ }
+
+ if (!IgnoreBr) {
+ std::vector<MachineBasicBlock *> Succs(FromBBI.BB->succ_begin(),
+ FromBBI.BB->succ_end());
+ MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
+
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+ MachineBasicBlock *Succ = Succs[i];
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough)
+ continue;
+ ToBBI.BB->addSuccessor(Succ);
+ }
+ }
+
+ ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
+ ToBBI.Predicate.append(Cond.begin(), Cond.end());
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.IsAnalyzed = false;
+
+ ++NumDupBBs;
+}
+
+/// MergeBlocks - Move all instructions from FromBB to the end of ToBB.
+/// This will leave FromBB as an empty block, so remove all of its
+/// successor edges except for the fall-through edge. If AddEdges is true,
+/// i.e., when FromBBI's branch is being moved, add those successor edges to
+/// ToBBI.
+void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
+ assert(!FromBBI.BB->hasAddressTaken() &&
+ "Removing a BB whose address is taken!");
+
+ // In case FromBBI.BB contains terminators (e.g. return instruction),
+ // first move the non-terminator instructions, then the terminators.
+ MachineBasicBlock::iterator FromTI = FromBBI.BB->getFirstTerminator();
+ MachineBasicBlock::iterator ToTI = ToBBI.BB->getFirstTerminator();
+ ToBBI.BB->splice(ToTI, FromBBI.BB, FromBBI.BB->begin(), FromTI);
+
+  // If FromBB has a non-predicated terminator, we should copy it at the end.
+ if (FromTI != FromBBI.BB->end() && !TII->isPredicated(*FromTI))
+ ToTI = ToBBI.BB->end();
+ ToBBI.BB->splice(ToTI, FromBBI.BB, FromTI, FromBBI.BB->end());
+
+ // Force normalizing the successors' probabilities of ToBBI.BB to convert all
+ // unknown probabilities into known ones.
+ // FIXME: This usage is too tricky and in the future we would like to
+ // eliminate all unknown probabilities in MBB.
+ ToBBI.BB->normalizeSuccProbs();
+
+ SmallVector<MachineBasicBlock *, 4> FromSuccs(FromBBI.BB->succ_begin(),
+ FromBBI.BB->succ_end());
+ MachineBasicBlock *NBB = getNextBlock(FromBBI.BB);
+ MachineBasicBlock *FallThrough = FromBBI.HasFallThrough ? NBB : nullptr;
+ // The edge probability from ToBBI.BB to FromBBI.BB, which is only needed when
+ // AddEdges is true and FromBBI.BB is a successor of ToBBI.BB.
+ auto To2FromProb = BranchProbability::getZero();
+ if (AddEdges && ToBBI.BB->isSuccessor(FromBBI.BB)) {
+ To2FromProb = MBPI->getEdgeProbability(ToBBI.BB, FromBBI.BB);
+ // Set the edge probability from ToBBI.BB to FromBBI.BB to zero to avoid the
+ // edge probability being merged to other edges when this edge is removed
+ // later.
+ ToBBI.BB->setSuccProbability(
+ std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), FromBBI.BB),
+ BranchProbability::getZero());
+ }
+
+ for (unsigned i = 0, e = FromSuccs.size(); i != e; ++i) {
+ MachineBasicBlock *Succ = FromSuccs[i];
+ // Fallthrough edge can't be transferred.
+ if (Succ == FallThrough)
+ continue;
+
+ auto NewProb = BranchProbability::getZero();
+ if (AddEdges) {
+ // Calculate the edge probability for the edge from ToBBI.BB to Succ,
+ // which is a portion of the edge probability from FromBBI.BB to Succ. The
+      // portion ratio is the edge probability from ToBBI.BB to FromBBI.BB (if
+      // FromBBI.BB is a successor of ToBBI.BB; see the comment below for the
+      // exception).
+ NewProb = MBPI->getEdgeProbability(FromBBI.BB, Succ);
+
+ // To2FromProb is 0 when FromBBI.BB is not a successor of ToBBI.BB. This
+ // only happens when if-converting a diamond CFG and FromBBI.BB is the
+ // tail BB. In this case FromBBI.BB post-dominates ToBBI.BB and hence we
+ // could just use the probabilities on FromBBI.BB's out-edges when adding
+ // new successors.
+ if (!To2FromProb.isZero())
+ NewProb *= To2FromProb;
+ }
+
+ FromBBI.BB->removeSuccessor(Succ);
+
+ if (AddEdges) {
+ // If the edge from ToBBI.BB to Succ already exists, update the
+ // probability of this edge by adding NewProb to it. An example is shown
+ // below, in which A is ToBBI.BB and B is FromBBI.BB. In this case we
+ // don't have to set C as A's successor as it already is. We only need to
+ // update the edge probability on A->C. Note that B will not be
+ // immediately removed from A's successors. It is possible that B->D is
+ // not removed either if D is a fallthrough of B. Later the edge A->D
+ // (generated here) and B->D will be combined into one edge. To maintain
+ // correct edge probability of this combined edge, we need to set the edge
+ // probability of A->B to zero, which is already done above. The edge
+ // probability on A->D is calculated by scaling the original probability
+ // on A->B by the probability of B->D.
+ //
+ // Before ifcvt: After ifcvt (assume B->D is kept):
+ //
+ // A A
+ // /| /|\
+ // / B / B|
+ // | /| | ||
+ // |/ | | |/
+ // C D C D
+ //
+ if (ToBBI.BB->isSuccessor(Succ))
+ ToBBI.BB->setSuccProbability(
+ std::find(ToBBI.BB->succ_begin(), ToBBI.BB->succ_end(), Succ),
+ MBPI->getEdgeProbability(ToBBI.BB, Succ) + NewProb);
+ else
+ ToBBI.BB->addSuccessor(Succ, NewProb);
+ }
+ }
+
+ // Now FromBBI always falls through to the next block!
+ if (NBB && !FromBBI.BB->isSuccessor(NBB))
+ FromBBI.BB->addSuccessor(NBB);
+
+ // Normalize the probabilities of ToBBI.BB's successors with all adjustment
+ // we've done above.
+ ToBBI.BB->normalizeSuccProbs();
+
+ ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
+ FromBBI.Predicate.clear();
+
+ ToBBI.NonPredSize += FromBBI.NonPredSize;
+ ToBBI.ExtraCost += FromBBI.ExtraCost;
+ ToBBI.ExtraCost2 += FromBBI.ExtraCost2;
+ FromBBI.NonPredSize = 0;
+ FromBBI.ExtraCost = 0;
+ FromBBI.ExtraCost2 = 0;
+
+ ToBBI.ClobbersPred |= FromBBI.ClobbersPred;
+ ToBBI.HasFallThrough = FromBBI.HasFallThrough;
+ ToBBI.IsAnalyzed = false;
+ FromBBI.IsAnalyzed = false;
+}
+
+FunctionPass *
+llvm::createIfConverter(std::function<bool(const Function &)> Ftor) {
+ return new IfConverter(std::move(Ftor));
+}
diff --git a/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
new file mode 100644
index 000000000000..31d6bd0b6dc6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -0,0 +1,576 @@
+//===-- ImplicitNullChecks.cpp - Fold null checks into memory accesses ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass turns explicit null checks of the form
+//
+// test %r10, %r10
+// je throw_npe
+// movl (%r10), %esi
+// ...
+//
+// to
+//
+// faulting_load_op("movl (%r10), %esi", throw_npe)
+// ...
+//
+// With the help of a runtime that understands the .fault_maps section,
+// faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs
+// a page fault.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+static cl::opt<int> PageSize("imp-null-check-page-size",
+ cl::desc("The page size of the target in bytes"),
+ cl::init(4096));
+
+#define DEBUG_TYPE "implicit-null-checks"
+
+STATISTIC(NumImplicitNullChecks,
+ "Number of explicit null checks made implicit");
+
+namespace {
+
+class ImplicitNullChecks : public MachineFunctionPass {
+ /// Represents one null check that can be made implicit.
+ class NullCheck {
+ // The memory operation the null check can be folded into.
+ MachineInstr *MemOperation;
+
+ // The instruction actually doing the null check (Ptr != 0).
+ MachineInstr *CheckOperation;
+
+ // The block the check resides in.
+ MachineBasicBlock *CheckBlock;
+
+ // The block branched to if the pointer is non-null.
+ MachineBasicBlock *NotNullSucc;
+
+ // The block branched to if the pointer is null.
+ MachineBasicBlock *NullSucc;
+
+    // If this is non-null, then MemOperation has a dependency on this
+ // instruction; and it needs to be hoisted to execute before MemOperation.
+ MachineInstr *OnlyDependency;
+
+ public:
+ explicit NullCheck(MachineInstr *memOperation, MachineInstr *checkOperation,
+ MachineBasicBlock *checkBlock,
+ MachineBasicBlock *notNullSucc,
+ MachineBasicBlock *nullSucc,
+ MachineInstr *onlyDependency)
+ : MemOperation(memOperation), CheckOperation(checkOperation),
+ CheckBlock(checkBlock), NotNullSucc(notNullSucc), NullSucc(nullSucc),
+ OnlyDependency(onlyDependency) {}
+
+ MachineInstr *getMemOperation() const { return MemOperation; }
+
+ MachineInstr *getCheckOperation() const { return CheckOperation; }
+
+ MachineBasicBlock *getCheckBlock() const { return CheckBlock; }
+
+ MachineBasicBlock *getNotNullSucc() const { return NotNullSucc; }
+
+ MachineBasicBlock *getNullSucc() const { return NullSucc; }
+
+ MachineInstr *getOnlyDependency() const { return OnlyDependency; }
+ };
+
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ AliasAnalysis *AA = nullptr;
+ MachineModuleInfo *MMI = nullptr;
+
+ bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
+ SmallVectorImpl<NullCheck> &NullCheckList);
+ MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB);
+ void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
+
+public:
+ static char ID;
+
+ ImplicitNullChecks() : MachineFunctionPass(ID) {
+ initializeImplicitNullChecksPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+};
+
+/// \brief Detect re-ordering hazards and dependencies.
+///
+/// This class keeps track of defs and uses, and can be queried if a given
+/// machine instruction can be re-ordered from after the machine instructions
+/// seen so far to before them.
+class HazardDetector {
+ static MachineInstr *getUnknownMI() {
+ return DenseMapInfo<MachineInstr *>::getTombstoneKey();
+ }
+
+ // Maps physical registers to the instruction defining them. If there has
+  // been more than one def of a specific register, that register is mapped to
+ // getUnknownMI().
+ DenseMap<unsigned, MachineInstr *> RegDefs;
+ DenseSet<unsigned> RegUses;
+ const TargetRegisterInfo &TRI;
+ bool hasSeenClobber;
+ AliasAnalysis &AA;
+
+public:
+ explicit HazardDetector(const TargetRegisterInfo &TRI, AliasAnalysis &AA)
+ : TRI(TRI), hasSeenClobber(false), AA(AA) {}
+
+ /// \brief Make a note of \p MI for later queries to isSafeToHoist.
+ ///
+ /// May clobber this HazardDetector instance. \see isClobbered.
+ void rememberInstruction(MachineInstr *MI);
+
+ /// \brief Return true if it is safe to hoist \p MI from after all the
+ /// instructions seen so far (via rememberInstruction) to before it. If \p MI
+ /// has one and only one transitive dependency, set \p Dependency to that
+ /// instruction. If there are more dependencies, return false.
+ bool isSafeToHoist(MachineInstr *MI, MachineInstr *&Dependency);
+
+ /// \brief Return true if this instance of HazardDetector has been clobbered
+ /// (i.e. has no more useful information).
+ ///
+  /// A HazardDetector is clobbered when it sees a construct it cannot
+ /// understand, and it would have to return a conservative answer for all
+ /// future queries. Having a separate clobbered state lets the client code
+ /// bail early, without making queries about all of the future instructions
+ /// (which would have returned the most conservative answer anyway).
+ ///
+ /// Calling rememberInstruction or isSafeToHoist on a clobbered HazardDetector
+ /// is an error.
+ bool isClobbered() { return hasSeenClobber; }
+};
+}
+
+
+void HazardDetector::rememberInstruction(MachineInstr *MI) {
+ assert(!isClobbered() &&
+ "Don't add instructions to a clobbered hazard detector");
+
+ if (MI->mayStore() || MI->hasUnmodeledSideEffects()) {
+ hasSeenClobber = true;
+ return;
+ }
+
+ for (auto *MMO : MI->memoperands()) {
+ // Right now we don't want to worry about LLVM's memory model.
+ if (!MMO->isUnordered()) {
+ hasSeenClobber = true;
+ return;
+ }
+ }
+
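+  // Record this instruction's register defs and uses so that later
+  // isSafeToHoist queries can detect dependency hazards against them.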
+ for (auto &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+
+ if (MO.isDef()) {
+ auto It = RegDefs.find(MO.getReg());
+ if (It == RegDefs.end())
+ RegDefs.insert({MO.getReg(), MI});
+ else {
+ assert(It->second && "Found null MI?");
+ It->second = getUnknownMI();
+ }
+ } else
+ RegUses.insert(MO.getReg());
+ }
+}
+
+bool HazardDetector::isSafeToHoist(MachineInstr *MI,
+ MachineInstr *&Dependency) {
+ assert(!isClobbered() && "isSafeToHoist cannot do anything useful!");
+ Dependency = nullptr;
+
+ // Right now we don't want to worry about LLVM's memory model. This can be
+ // made more precise later.
+ for (auto *MMO : MI->memoperands())
+ if (!MMO->isUnordered())
+ return false;
+
+ for (auto &MO : MI->operands()) {
+ if (MO.isReg() && MO.getReg()) {
+ for (auto &RegDef : RegDefs) {
+ unsigned Reg = RegDef.first;
+ MachineInstr *MI = RegDef.second;
+ if (!TRI.regsOverlap(Reg, MO.getReg()))
+ continue;
+
+ // We found a write-after-write or read-after-write, see if the
+ // instruction causing this dependency can be hoisted too.
+
+ if (MI == getUnknownMI())
+ // We don't have precise dependency information.
+ return false;
+
+ if (Dependency) {
+ if (Dependency == MI)
+ continue;
+ // We already have one dependency, and we can track only one.
+ return false;
+ }
+
+ // Now check if MI is actually a dependency that can be hoisted.
+
+ // We don't want to track transitive dependencies. We already know that
+ // MI is the only instruction that defines Reg, but we need to be sure
+ // that it does not use any registers that have been defined (trivially
+ // checked below by ensuring that there are no register uses), and that
+ // it is the only def for every register it defines (otherwise we could
+ // violate a write after write hazard).
+ auto IsMIOperandSafe = [&](MachineOperand &MO) {
+ if (!MO.isReg() || !MO.getReg())
+ return true;
+ if (MO.isUse())
+ return false;
+ assert((!MO.isDef() || RegDefs.count(MO.getReg())) &&
+ "All defs must be tracked in RegDefs by now!");
+ return !MO.isDef() || RegDefs.find(MO.getReg())->second == MI;
+ };
+
+ if (!all_of(MI->operands(), IsMIOperandSafe))
+ return false;
+
+ // Now check for speculation safety:
+ bool SawStore = true;
+ if (!MI->isSafeToMove(&AA, SawStore) || MI->mayLoad())
+ return false;
+
+ Dependency = MI;
+ }
+
+ if (MO.isDef())
+ for (unsigned Reg : RegUses)
+ if (TRI.regsOverlap(Reg, MO.getReg()))
+ return false; // We found a write-after-read
+ }
+ }
+
+ return true;
+}
+
+bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getRegInfo().getTargetRegisterInfo();
+ MMI = &MF.getMMI();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ SmallVector<NullCheck, 16> NullCheckList;
+
+ for (auto &MBB : MF)
+ analyzeBlockForNullChecks(MBB, NullCheckList);
+
+ if (!NullCheckList.empty())
+ rewriteNullChecks(NullCheckList);
+
+ return !NullCheckList.empty();
+}
+
+// Return true if any register aliasing \p Reg is live-in into \p MBB.
+static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
+ MachineBasicBlock *MBB, unsigned Reg) {
+ for (MCRegAliasIterator AR(Reg, TRI, /*IncludeSelf*/ true); AR.isValid();
+ ++AR)
+ if (MBB->isLiveIn(*AR))
+ return true;
+ return false;
+}
+
+/// Analyze MBB to check if its terminating branch can be turned into an
+/// implicit null check. If yes, append a description of the said null check to
+/// NullCheckList and return true, else return false.
+bool ImplicitNullChecks::analyzeBlockForNullChecks(
+ MachineBasicBlock &MBB, SmallVectorImpl<NullCheck> &NullCheckList) {
+ typedef TargetInstrInfo::MachineBranchPredicate MachineBranchPredicate;
+
+ MDNode *BranchMD = nullptr;
+ if (auto *BB = MBB.getBasicBlock())
+ BranchMD = BB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit);
+
+ if (!BranchMD)
+ return false;
+
+ MachineBranchPredicate MBP;
+
+ if (TII->analyzeBranchPredicate(MBB, MBP, true))
+ return false;
+
+ // Is the predicate comparing an integer to zero?
+ if (!(MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
+ (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
+ MBP.Predicate == MachineBranchPredicate::PRED_EQ)))
+ return false;
+
+ // If we cannot erase the test instruction itself, then making the null check
+ // implicit does not buy us much.
+ if (!MBP.SingleUseCondition)
+ return false;
+
+ MachineBasicBlock *NotNullSucc, *NullSucc;
+
+ if (MBP.Predicate == MachineBranchPredicate::PRED_NE) {
+ NotNullSucc = MBP.TrueDest;
+ NullSucc = MBP.FalseDest;
+ } else {
+ NotNullSucc = MBP.FalseDest;
+ NullSucc = MBP.TrueDest;
+ }
+
+ // We handle the simplest case for now. We can potentially do better by using
+ // the machine dominator tree.
+ if (NotNullSucc->pred_size() != 1)
+ return false;
+
+ // Starting with a code fragment like:
+ //
+ // test %RAX, %RAX
+ // jne LblNotNull
+ //
+ // LblNull:
+ // callq throw_NullPointerException
+ //
+ // LblNotNull:
+ // Inst0
+ // Inst1
+ // ...
+ // Def = Load (%RAX + <offset>)
+ // ...
+ //
+ //
+ // we want to end up with
+ //
+ // Def = FaultingLoad (%RAX + <offset>), LblNull
+ // jmp LblNotNull ;; explicit or fallthrough
+ //
+ // LblNotNull:
+ // Inst0
+ // Inst1
+ // ...
+ //
+ // LblNull:
+ // callq throw_NullPointerException
+ //
+ //
+ // To see why this is legal, consider the two possibilities:
+ //
+ // 1. %RAX is null: since we constrain <offset> to be less than PageSize, the
+ // load instruction dereferences the null page, causing a segmentation
+ // fault.
+ //
+ // 2. %RAX is not null: in this case we know that the load cannot fault, as
+ // otherwise the load would've faulted in the original program too and the
+ // original program would've been undefined.
+ //
+ // This reasoning cannot be extended to justify hoisting through arbitrary
+ // control flow. For instance, in the example below (in pseudo-C)
+ //
+ // if (ptr == null) { throw_npe(); unreachable; }
+ // if (some_cond) { return 42; }
+ // v = ptr->field; // LD
+ // ...
+ //
+ // we cannot (without code duplication) use the load marked "LD" to null check
+ // ptr -- clause (2) above does not apply in this case. In the above program
+ // the safety of ptr->field can be dependent on some_cond; and, for instance,
+ // ptr could be some non-null invalid reference that never gets loaded from
+ // because some_cond is always true.
+
+ unsigned PointerReg = MBP.LHS.getReg();
+
+ HazardDetector HD(*TRI, *AA);
+
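+  // Scan forward through NotNullSucc for a load based on PointerReg with an
+  // offset within the first page that can be hoisted up to the null check.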
+ for (auto MII = NotNullSucc->begin(), MIE = NotNullSucc->end(); MII != MIE;
+ ++MII) {
+ MachineInstr &MI = *MII;
+ unsigned BaseReg;
+ int64_t Offset;
+ MachineInstr *Dependency = nullptr;
+ if (TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+ if (MI.mayLoad() && !MI.isPredicable() && BaseReg == PointerReg &&
+ Offset < PageSize && MI.getDesc().getNumDefs() <= 1 &&
+ HD.isSafeToHoist(&MI, Dependency)) {
+
+ auto DependencyOperandIsOk = [&](MachineOperand &MO) {
+ assert(!(MO.isReg() && MO.isUse()) &&
+                 "No transitive dependencies please!");
+ if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+ return true;
+
+ // Make sure that we won't clobber any live ins to the sibling block
+ // by hoisting Dependency. For instance, we can't hoist INST to
+ // before the null check (even if it safe, and does not violate any
+ // dependencies in the non_null_block) if %rdx is live in to
+ // _null_block.
+ //
+ // test %rcx, %rcx
+ // je _null_block
+ // _non_null_block:
+ // %rdx<def> = INST
+ // ...
+ if (AnyAliasLiveIn(TRI, NullSucc, MO.getReg()))
+ return false;
+
+ // Make sure Dependency isn't re-defining the base register. Then we
+ // won't get the memory operation on the address we want.
+ if (TRI->regsOverlap(MO.getReg(), BaseReg))
+ return false;
+
+ return true;
+ };
+
+ bool DependencyOperandsAreOk =
+ !Dependency ||
+ all_of(Dependency->operands(), DependencyOperandIsOk);
+
+ if (DependencyOperandsAreOk) {
+ NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
+ NullSucc, Dependency);
+ return true;
+ }
+ }
+
+ HD.rememberInstruction(&MI);
+ if (HD.isClobbered())
+ return false;
+ }
+
+ return false;
+}
+
+/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
+/// instruction. The FAULTING_LOAD_OP instruction does the same load as LoadMI
+/// (defining the same register), and branches to HandlerMBB if the load
+/// faults. The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
+MachineInstr *
+ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *HandlerMBB) {
+ const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
+ // all targets.
+
+ DebugLoc DL;
+ unsigned NumDefs = LoadMI->getDesc().getNumDefs();
+ assert(NumDefs <= 1 && "other cases unhandled!");
+
+ unsigned DefReg = NoRegister;
+ if (NumDefs != 0) {
+ DefReg = LoadMI->defs().begin()->getReg();
+ assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 &&
+ "expected exactly one def!");
+ }
+
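+  // FAULTING_LOAD_OP takes the handler block and the original load's opcode
+  // as operands, followed by the original load's use operands.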
+ auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
+ .addMBB(HandlerMBB)
+ .addImm(LoadMI->getOpcode());
+
+ for (auto &MO : LoadMI->uses())
+ MIB.addOperand(MO);
+
+ MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end());
+
+ return MIB;
+}
+
+/// Rewrite the null checks in NullCheckList into implicit null checks.
+void ImplicitNullChecks::rewriteNullChecks(
+ ArrayRef<ImplicitNullChecks::NullCheck> NullCheckList) {
+ DebugLoc DL;
+
+ for (auto &NC : NullCheckList) {
+ // Remove the conditional branch dependent on the null check.
+ unsigned BranchesRemoved = TII->RemoveBranch(*NC.getCheckBlock());
+ (void)BranchesRemoved;
+ assert(BranchesRemoved > 0 && "expected at least one branch!");
+
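+    // If the memory operation has a single dependency, hoist that instruction
+    // into the check block so it executes before the faulting load.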
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ DepMI->removeFromParent();
+ NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
+ }
+
+    // Insert a faulting load where the conditional branch was originally. The
+    // check we did earlier ensures that this bit of code motion is legal. We
+    // do not touch the successors list for any basic block since we haven't
+    // changed control flow; we've just made it implicit.
+ MachineInstr *FaultingLoad = insertFaultingLoad(
+ NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
+ // Now the values defined by MemOperation, if any, are live-in of
+ // the block of MemOperation.
+ // The original load operation may define implicit-defs alongside
+ // the loaded value.
+ MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
+ for (const MachineOperand &MO : FaultingLoad->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || MBB->isLiveIn(Reg))
+ continue;
+ MBB->addLiveIn(Reg);
+ }
+
+ if (auto *DepMI = NC.getOnlyDependency()) {
+ for (auto &MO : DepMI->operands()) {
+ if (!MO.isReg() || !MO.getReg() || !MO.isDef())
+ continue;
+ if (!NC.getNotNullSucc()->isLiveIn(MO.getReg()))
+ NC.getNotNullSucc()->addLiveIn(MO.getReg());
+ }
+ }
+
+ NC.getMemOperation()->eraseFromParent();
+ NC.getCheckOperation()->eraseFromParent();
+
+ // Insert an *unconditional* branch to not-null successor.
+ TII->InsertBranch(*NC.getCheckBlock(), NC.getNotNullSucc(), nullptr,
+ /*Cond=*/None, DL);
+
+ NumImplicitNullChecks++;
+ }
+}
+
+char ImplicitNullChecks::ID = 0;
+char &llvm::ImplicitNullChecksID = ImplicitNullChecks::ID;
+INITIALIZE_PASS_BEGIN(ImplicitNullChecks, "implicit-null-checks",
+ "Implicit null checks", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(ImplicitNullChecks, "implicit-null-checks",
+ "Implicit null checks", false, false)
diff --git a/contrib/llvm/lib/CodeGen/InlineSpiller.cpp b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
new file mode 100644
index 000000000000..197db777dd2c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -0,0 +1,1456 @@
+//===-------- InlineSpiller.cpp - Insert spills and restores inline -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The inline spiller modifies the machine function directly instead of
+// inserting spills and restores in VirtRegMap.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Spiller.h"
+#include "SplitKit.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumSpilledRanges, "Number of spilled live ranges");
+STATISTIC(NumSnippets, "Number of spilled snippets");
+STATISTIC(NumSpills, "Number of spills inserted");
+STATISTIC(NumSpillsRemoved, "Number of spills removed");
+STATISTIC(NumReloads, "Number of reloads inserted");
+STATISTIC(NumReloadsRemoved, "Number of reloads removed");
+STATISTIC(NumFolded, "Number of folded stack accesses");
+STATISTIC(NumFoldedLoads, "Number of folded loads");
+STATISTIC(NumRemats, "Number of rematerialized defs for spilling");
+
+static cl::opt<bool> DisableHoisting("disable-spill-hoist", cl::Hidden,
+ cl::desc("Disable inline spill hoisting"));
+
+namespace {
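+/// HoistSpillHelper records spills of equal values to the same stack slot
+/// (mergeable spills) and, after spilling, tries to hoist them to a common
+/// dominating location; see hoistAllSpills().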
+class HoistSpillHelper : private LiveRangeEdit::Delegate {
+ MachineFunction &MF;
+ LiveIntervals &LIS;
+ LiveStacks &LSS;
+ AliasAnalysis *AA;
+ MachineDominatorTree &MDT;
+ MachineLoopInfo &Loops;
+ VirtRegMap &VRM;
+ MachineFrameInfo &MFI;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+ InsertPointAnalysis IPA;
+
+ // Map from StackSlot to its original register.
+ DenseMap<int, unsigned> StackSlotToReg;
+ // Map from pair of (StackSlot and Original VNI) to a set of spills which
+ // have the same stackslot and have equal values defined by Original VNI.
+  // These spills are mergeable and are hoist candidates.
+ typedef MapVector<std::pair<int, VNInfo *>, SmallPtrSet<MachineInstr *, 16>>
+ MergeableSpillsMap;
+ MergeableSpillsMap MergeableSpills;
+
+ /// This is the map from original register to a set containing all its
+ /// siblings. To hoist a spill to another BB, we need to find out a live
+ /// sibling there and use it as the source of the new spill.
+ DenseMap<unsigned, SmallSetVector<unsigned, 16>> Virt2SiblingsMap;
+
+ bool isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI, MachineBasicBlock &BB,
+ unsigned &LiveReg);
+
+ void rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill);
+
+ void runHoistSpills(unsigned OrigReg, VNInfo &OrigVNI,
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns);
+
+public:
+ HoistSpillHelper(MachineFunctionPass &pass, MachineFunction &mf,
+ VirtRegMap &vrm)
+ : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+ LSS(pass.getAnalysis<LiveStacks>()),
+ AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+ MDT(pass.getAnalysis<MachineDominatorTree>()),
+ Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+ MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+ TII(*mf.getSubtarget().getInstrInfo()),
+ TRI(*mf.getSubtarget().getRegisterInfo()),
+ MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ IPA(LIS, mf.getNumBlockIDs()) {}
+
+ void addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original);
+ bool rmFromMergeableSpills(MachineInstr &Spill, int StackSlot);
+ void hoistAllSpills();
+ void LRE_DidCloneVirtReg(unsigned, unsigned) override;
+};
+
+class InlineSpiller : public Spiller {
+ MachineFunction &MF;
+ LiveIntervals &LIS;
+ LiveStacks &LSS;
+ AliasAnalysis *AA;
+ MachineDominatorTree &MDT;
+ MachineLoopInfo &Loops;
+ VirtRegMap &VRM;
+ MachineFrameInfo &MFI;
+ MachineRegisterInfo &MRI;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+ // Variables that are valid during spill(), but used by multiple methods.
+ LiveRangeEdit *Edit;
+ LiveInterval *StackInt;
+ int StackSlot;
+ unsigned Original;
+
+ // All registers to spill to StackSlot, including the main register.
+ SmallVector<unsigned, 8> RegsToSpill;
+
+ // All COPY instructions to/from snippets.
+ // They are ignored since both operands refer to the same stack slot.
+ SmallPtrSet<MachineInstr*, 8> SnippetCopies;
+
+ // Values that failed to remat at some point.
+ SmallPtrSet<VNInfo*, 8> UsedValues;
+
+ // Dead defs generated during spilling.
+ SmallVector<MachineInstr*, 8> DeadDefs;
+
+ // Object records spills information and does the hoisting.
+ HoistSpillHelper HSpiller;
+
+ ~InlineSpiller() override {}
+
+public:
+ InlineSpiller(MachineFunctionPass &pass, MachineFunction &mf, VirtRegMap &vrm)
+ : MF(mf), LIS(pass.getAnalysis<LiveIntervals>()),
+ LSS(pass.getAnalysis<LiveStacks>()),
+ AA(&pass.getAnalysis<AAResultsWrapperPass>().getAAResults()),
+ MDT(pass.getAnalysis<MachineDominatorTree>()),
+ Loops(pass.getAnalysis<MachineLoopInfo>()), VRM(vrm),
+ MFI(*mf.getFrameInfo()), MRI(mf.getRegInfo()),
+ TII(*mf.getSubtarget().getInstrInfo()),
+ TRI(*mf.getSubtarget().getRegisterInfo()),
+ MBFI(pass.getAnalysis<MachineBlockFrequencyInfo>()),
+ HSpiller(pass, mf, vrm) {}
+
+ void spill(LiveRangeEdit &) override;
+ void postOptimization() override;
+
+private:
+ bool isSnippet(const LiveInterval &SnipLI);
+ void collectRegsToSpill();
+
+ bool isRegToSpill(unsigned Reg) {
+ return std::find(RegsToSpill.begin(),
+ RegsToSpill.end(), Reg) != RegsToSpill.end();
+ }
+
+ bool isSibling(unsigned Reg);
+ bool hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstr &CopyMI);
+ void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
+
+ void markValueUsed(LiveInterval*, VNInfo*);
+ bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
+ void reMaterializeAll();
+
+ bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
+ bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> >,
+ MachineInstr *LoadMI = nullptr);
+ void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
+ void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
+
+ void spillAroundUses(unsigned Reg);
+ void spillAll();
+};
+}
+
+namespace llvm {
+
+Spiller::~Spiller() { }
+void Spiller::anchor() { }
+
+Spiller *createInlineSpiller(MachineFunctionPass &pass,
+ MachineFunction &mf,
+ VirtRegMap &vrm) {
+ return new InlineSpiller(pass, mf, vrm);
+}
+
+}
+
+//===----------------------------------------------------------------------===//
+// Snippets
+//===----------------------------------------------------------------------===//
+
+// When spilling a virtual register, we also spill any snippets it is connected
+// to. The snippets are small live ranges that only have a single real use,
+// leftovers from live range splitting. Spilling them enables memory operand
+// folding or tightens the live range around the single use.
+//
+// This minimizes register pressure and maximizes the store-to-load distance for
+// spill slots which can be important in tight loops.
+
+/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
+/// otherwise return 0.
+static unsigned isFullCopyOf(const MachineInstr &MI, unsigned Reg) {
+ if (!MI.isFullCopy())
+ return 0;
+ if (MI.getOperand(0).getReg() == Reg)
+ return MI.getOperand(1).getReg();
+ if (MI.getOperand(1).getReg() == Reg)
+ return MI.getOperand(0).getReg();
+ return 0;
+}
+
+/// isSnippet - Identify if a live interval is a snippet that should be spilled.
+/// It is assumed that SnipLI is a virtual register with the same original as
+/// Edit->getReg().
+bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
+ unsigned Reg = Edit->getReg();
+
+ // A snippet is a tiny live range with only a single instruction using it
+ // besides copies to/from Reg or spills/fills. We accept:
+ //
+ // %snip = COPY %Reg / FILL fi#
+ // %snip = USE %snip
+ // %Reg = COPY %snip / SPILL %snip, fi#
+ //
+ if (SnipLI.getNumValNums() > 2 || !LIS.intervalIsInOneMBB(SnipLI))
+ return false;
+
+ MachineInstr *UseMI = nullptr;
+
+ // Check that all uses satisfy our criteria.
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ RI = MRI.reg_instr_nodbg_begin(SnipLI.reg),
+ E = MRI.reg_instr_nodbg_end(); RI != E; ) {
+ MachineInstr &MI = *RI++;
+
+ // Allow copies to/from Reg.
+ if (isFullCopyOf(MI, Reg))
+ continue;
+
+ // Allow stack slot loads.
+ int FI;
+ if (SnipLI.reg == TII.isLoadFromStackSlot(MI, FI) && FI == StackSlot)
+ continue;
+
+ // Allow stack slot stores.
+ if (SnipLI.reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot)
+ continue;
+
+ // Allow a single additional instruction.
+ if (UseMI && &MI != UseMI)
+ return false;
+ UseMI = &MI;
+ }
+ return true;
+}
+
+/// collectRegsToSpill - Collect live range snippets that only have a single
+/// real use.
+void InlineSpiller::collectRegsToSpill() {
+ unsigned Reg = Edit->getReg();
+
+ // Main register always spills.
+ RegsToSpill.assign(1, Reg);
+ SnippetCopies.clear();
+
+ // Snippets all have the same original, so there can't be any for an original
+ // register.
+ if (Original == Reg)
+ return;
+
+ for (MachineRegisterInfo::reg_instr_iterator
+ RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) {
+ MachineInstr &MI = *RI++;
+ unsigned SnipReg = isFullCopyOf(MI, Reg);
+ if (!isSibling(SnipReg))
+ continue;
+ LiveInterval &SnipLI = LIS.getInterval(SnipReg);
+ if (!isSnippet(SnipLI))
+ continue;
+ SnippetCopies.insert(&MI);
+ if (isRegToSpill(SnipReg))
+ continue;
+ RegsToSpill.push_back(SnipReg);
+ DEBUG(dbgs() << "\talso spill snippet " << SnipLI << '\n');
+ ++NumSnippets;
+ }
+}
+
+bool InlineSpiller::isSibling(unsigned Reg) {
+ return TargetRegisterInfo::isVirtualRegister(Reg) &&
+ VRM.getOriginal(Reg) == Original;
+}
+
+/// It is beneficial to spill to an earlier place in the same BB in a case
+/// like the following:
+/// There is an alternative def earlier in the same MBB.
+/// Hoist the spill as far as possible in SpillMBB. This can ease
+/// register pressure:
+///
+/// x = def
+/// y = use x
+/// s = copy x
+///
+/// Hoisting the spill of s to immediately after the def removes the
+/// interference between x and y:
+///
+/// x = def
+/// spill x
+/// y = use x<kill>
+///
+/// This hoist only helps when the copy kills its source.
+///
+bool InlineSpiller::hoistSpillInsideBB(LiveInterval &SpillLI,
+ MachineInstr &CopyMI) {
+ SlotIndex Idx = LIS.getInstructionIndex(CopyMI);
+#ifndef NDEBUG
+ VNInfo *VNI = SpillLI.getVNInfoAt(Idx.getRegSlot());
+ assert(VNI && VNI->def == Idx.getRegSlot() && "Not defined by copy");
+#endif
+
+ unsigned SrcReg = CopyMI.getOperand(1).getReg();
+ LiveInterval &SrcLI = LIS.getInterval(SrcReg);
+ VNInfo *SrcVNI = SrcLI.getVNInfoAt(Idx);
+ LiveQueryResult SrcQ = SrcLI.Query(Idx);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(SrcVNI->def);
+ if (DefMBB != CopyMI.getParent() || !SrcQ.isKill())
+ return false;
+
+ // Conservatively extend the stack slot range to the range of the original
+ // value. We may be able to do better with stack slot coloring by being more
+ // careful here.
+ assert(StackInt && "No stack slot assigned yet.");
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+ StackInt->MergeValueInAsValue(OrigLI, OrigVNI, StackInt->getValNumInfo(0));
+ DEBUG(dbgs() << "\tmerged orig valno " << OrigVNI->id << ": "
+ << *StackInt << '\n');
+
+ // We are going to spill SrcVNI immediately after its def, so clear out
+ // any later spills of the same value.
+ eliminateRedundantSpills(SrcLI, SrcVNI);
+
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(SrcVNI->def);
+ MachineBasicBlock::iterator MII;
+ if (SrcVNI->isPHIDef())
+ MII = MBB->SkipPHIsAndLabels(MBB->begin());
+ else {
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(SrcVNI->def);
+ assert(DefMI && "Defining instruction disappeared");
+ MII = DefMI;
+ ++MII;
+ }
+ // Insert spill without kill flag immediately after def.
+ TII.storeRegToStackSlot(*MBB, MII, SrcReg, false, StackSlot,
+ MRI.getRegClass(SrcReg), &TRI);
+ --MII; // Point to store instruction.
+ LIS.InsertMachineInstrInMaps(*MII);
+ DEBUG(dbgs() << "\thoisted: " << SrcVNI->def << '\t' << *MII);
+
+ HSpiller.addToMergeableSpills(*MII, StackSlot, Original);
+ ++NumSpills;
+ return true;
+}
+
+/// eliminateRedundantSpills - SLI:VNI is known to be on the stack. Remove any
+/// redundant spills of this value in SLI.reg and sibling copies.
+void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
+ assert(VNI && "Missing value");
+ SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+ WorkList.push_back(std::make_pair(&SLI, VNI));
+ assert(StackInt && "No stack slot assigned yet.");
+
+ do {
+ LiveInterval *LI;
+ std::tie(LI, VNI) = WorkList.pop_back_val();
+ unsigned Reg = LI->reg;
+ DEBUG(dbgs() << "Checking redundant spills for "
+ << VNI->id << '@' << VNI->def << " in " << *LI << '\n');
+
+ // Regs to spill are taken care of.
+ if (isRegToSpill(Reg))
+ continue;
+
+ // Add all of VNI's live range to StackInt.
+ StackInt->MergeValueInAsValue(*LI, VNI, StackInt->getValNumInfo(0));
+ DEBUG(dbgs() << "Merged to stack int: " << *StackInt << '\n');
+
+ // Find all spills and copies of VNI.
+ for (MachineRegisterInfo::use_instr_nodbg_iterator
+ UI = MRI.use_instr_nodbg_begin(Reg), E = MRI.use_instr_nodbg_end();
+ UI != E; ) {
+ MachineInstr &MI = *UI++;
+ if (!MI.isCopy() && !MI.mayStore())
+ continue;
+ SlotIndex Idx = LIS.getInstructionIndex(MI);
+ if (LI->getVNInfoAt(Idx) != VNI)
+ continue;
+
+ // Follow sibling copies down the dominator tree.
+ if (unsigned DstReg = isFullCopyOf(MI, Reg)) {
+ if (isSibling(DstReg)) {
+ LiveInterval &DstLI = LIS.getInterval(DstReg);
+ VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
+ assert(DstVNI && "Missing defined value");
+ assert(DstVNI->def == Idx.getRegSlot() && "Wrong copy def slot");
+ WorkList.push_back(std::make_pair(&DstLI, DstVNI));
+ }
+ continue;
+ }
+
+ // Erase spills.
+ int FI;
+ if (Reg == TII.isStoreToStackSlot(MI, FI) && FI == StackSlot) {
+ DEBUG(dbgs() << "Redundant spill " << Idx << '\t' << MI);
+ // eliminateDeadDefs won't normally remove stores, so switch opcode.
+ MI.setDesc(TII.get(TargetOpcode::KILL));
+ DeadDefs.push_back(&MI);
+ ++NumSpillsRemoved;
+ if (HSpiller.rmFromMergeableSpills(MI, StackSlot))
+ --NumSpills;
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Rematerialization
+//===----------------------------------------------------------------------===//
+
+/// markValueUsed - Remember that VNI failed to rematerialize, so its defining
+/// instruction cannot be eliminated. See through snippet copies.
+void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
+ SmallVector<std::pair<LiveInterval*, VNInfo*>, 8> WorkList;
+ WorkList.push_back(std::make_pair(LI, VNI));
+ do {
+ std::tie(LI, VNI) = WorkList.pop_back_val();
+ if (!UsedValues.insert(VNI).second)
+ continue;
+
+ if (VNI->isPHIDef()) {
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+ for (MachineBasicBlock *P : MBB->predecessors()) {
+ VNInfo *PVNI = LI->getVNInfoBefore(LIS.getMBBEndIdx(P));
+ if (PVNI)
+ WorkList.push_back(std::make_pair(LI, PVNI));
+ }
+ continue;
+ }
+
+ // Follow snippet copies.
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ if (!SnippetCopies.count(MI))
+ continue;
+ LiveInterval &SnipLI = LIS.getInterval(MI->getOperand(1).getReg());
+ assert(isRegToSpill(SnipLI.reg) && "Unexpected register in copy");
+ VNInfo *SnipVNI = SnipLI.getVNInfoAt(VNI->def.getRegSlot(true));
+ assert(SnipVNI && "Snippet undefined before copy");
+ WorkList.push_back(std::make_pair(&SnipLI, SnipVNI));
+ } while (!WorkList.empty());
+}
+
+/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
+bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
+
+ // Analyze instruction
+ SmallVector<std::pair<MachineInstr *, unsigned>, 8> Ops;
+ MIBundleOperands::VirtRegInfo RI =
+ MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
+
+ if (!RI.Reads)
+ return false;
+
+ SlotIndex UseIdx = LIS.getInstructionIndex(MI).getRegSlot(true);
+ VNInfo *ParentVNI = VirtReg.getVNInfoAt(UseIdx.getBaseIndex());
+
+ if (!ParentVNI) {
+ DEBUG(dbgs() << "\tadding <undef> flags: ");
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg)
+ MO.setIsUndef();
+ }
+ DEBUG(dbgs() << UseIdx << '\t' << MI);
+ return true;
+ }
+
+ if (SnippetCopies.count(&MI))
+ return false;
+
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+ LiveRangeEdit::Remat RM(ParentVNI);
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+ if (!Edit->canRematerializeAt(RM, OrigVNI, UseIdx, false)) {
+ markValueUsed(&VirtReg, ParentVNI);
+ DEBUG(dbgs() << "\tcannot remat for " << UseIdx << '\t' << MI);
+ return false;
+ }
+
+ // If the instruction also writes VirtReg.reg, it had better not require the
+ // same register for uses and defs.
+ if (RI.Tied) {
+ markValueUsed(&VirtReg, ParentVNI);
+ DEBUG(dbgs() << "\tcannot remat tied reg: " << UseIdx << '\t' << MI);
+ return false;
+ }
+
+ // Before rematerializing into a register for a single instruction, try to
+ // fold a load into the instruction. That avoids allocating a new register.
+ if (RM.OrigMI->canFoldAsLoad() &&
+ foldMemoryOperand(Ops, RM.OrigMI)) {
+ Edit->markRematerialized(RM.ParentVNI);
+ ++NumFoldedLoads;
+ return true;
+ }
+
+  // Allocate a new register for the remat.
+ unsigned NewVReg = Edit->createFrom(Original);
+
+ // Finally we can rematerialize OrigMI before MI.
+ SlotIndex DefIdx =
+ Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
+ (void)DefIdx;
+ DEBUG(dbgs() << "\tremat: " << DefIdx << '\t'
+ << *LIS.getInstructionFromIndex(DefIdx));
+
+ // Replace operands
+ for (const auto &OpPair : Ops) {
+ MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
+ if (MO.isReg() && MO.isUse() && MO.getReg() == VirtReg.reg) {
+ MO.setReg(NewVReg);
+ MO.setIsKill();
+ }
+ }
+ DEBUG(dbgs() << "\t " << UseIdx << '\t' << MI << '\n');
+
+ ++NumRemats;
+ return true;
+}
+
+/// reMaterializeAll - Try to rematerialize as many uses as possible,
+/// and trim the live ranges after.
+void InlineSpiller::reMaterializeAll() {
+ if (!Edit->anyRematerializable(AA))
+ return;
+
+ UsedValues.clear();
+
+ // Try to remat before all uses of snippets.
+ bool anyRemat = false;
+ for (unsigned Reg : RegsToSpill) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ for (MachineRegisterInfo::reg_bundle_iterator
+ RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
+ RegI != E; ) {
+ MachineInstr &MI = *RegI++;
+
+ // Debug values are not allowed to affect codegen.
+ if (MI.isDebugValue())
+ continue;
+
+ anyRemat |= reMaterializeFor(LI, MI);
+ }
+ }
+ if (!anyRemat)
+ return;
+
+ // Remove any values that were completely rematted.
+ for (unsigned Reg : RegsToSpill) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end();
+ I != E; ++I) {
+ VNInfo *VNI = *I;
+ if (VNI->isUnused() || VNI->isPHIDef() || UsedValues.count(VNI))
+ continue;
+ MachineInstr *MI = LIS.getInstructionFromIndex(VNI->def);
+ MI->addRegisterDead(Reg, &TRI);
+ if (!MI->allDefsAreDead())
+ continue;
+ DEBUG(dbgs() << "All defs dead: " << *MI);
+ DeadDefs.push_back(MI);
+ }
+ }
+
+ // Eliminate dead code after remat. Note that some snippet copies may be
+ // deleted here.
+ if (DeadDefs.empty())
+ return;
+ DEBUG(dbgs() << "Remat created " << DeadDefs.size() << " dead defs.\n");
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+
+  // LiveRangeEdit::eliminateDeadDef is used to remove dead defining
+  // instructions after rematerialization. To remove a VNI for a vreg from its
+  // LiveInterval, LiveIntervals::removeVRegDefAt is used. However, after
+  // non-PHI VNIs are all removed, PHI VNIs are still left in the LiveInterval.
+  // So to get rid of an unused reg, we need to check whether it has any
+  // non-dbg reference rather than whether it has a non-empty interval.
+ unsigned ResultPos = 0;
+ for (unsigned Reg : RegsToSpill) {
+ if (MRI.reg_nodbg_empty(Reg)) {
+ Edit->eraseVirtReg(Reg);
+ continue;
+ }
+ assert((LIS.hasInterval(Reg) && !LIS.getInterval(Reg).empty()) &&
+ "Reg with empty interval has reference");
+ RegsToSpill[ResultPos++] = Reg;
+ }
+ RegsToSpill.erase(RegsToSpill.begin() + ResultPos, RegsToSpill.end());
+ DEBUG(dbgs() << RegsToSpill.size() << " registers to spill after remat.\n");
+}
+
+
+//===----------------------------------------------------------------------===//
+// Spilling
+//===----------------------------------------------------------------------===//
+
+/// If MI is a load or store of StackSlot, it can be removed.
+bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
+ int FI = 0;
+ unsigned InstrReg = TII.isLoadFromStackSlot(*MI, FI);
+ bool IsLoad = InstrReg;
+ if (!IsLoad)
+ InstrReg = TII.isStoreToStackSlot(*MI, FI);
+
+ // We have a stack access. Is it the right register and slot?
+ if (InstrReg != Reg || FI != StackSlot)
+ return false;
+
+ if (!IsLoad)
+ HSpiller.rmFromMergeableSpills(*MI, StackSlot);
+
+ DEBUG(dbgs() << "Coalescing stack access: " << *MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+
+ if (IsLoad) {
+ ++NumReloadsRemoved;
+ --NumReloads;
+ } else {
+ ++NumSpillsRemoved;
+ --NumSpills;
+ }
+
+ return true;
+}
+
+#if !defined(NDEBUG)
+// Dump the range of instructions from B to E with their slot indexes.
+static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
+ MachineBasicBlock::iterator E,
+ LiveIntervals const &LIS,
+ const char *const header,
+ unsigned VReg =0) {
+ char NextLine = '\n';
+ char SlotIndent = '\t';
+
+ if (std::next(B) == E) {
+ NextLine = ' ';
+ SlotIndent = ' ';
+ }
+
+ dbgs() << '\t' << header << ": " << NextLine;
+
+ for (MachineBasicBlock::iterator I = B; I != E; ++I) {
+ SlotIndex Idx = LIS.getInstructionIndex(*I).getRegSlot();
+
+ // If a register was passed in and this instruction has it as a
+ // destination that is marked as an early clobber, print the
+ // early-clobber slot index.
+ if (VReg) {
+ MachineOperand *MO = I->findRegisterDefOperand(VReg);
+ if (MO && MO->isEarlyClobber())
+ Idx = Idx.getRegSlot(true);
+ }
+
+ dbgs() << SlotIndent << Idx << '\t' << *I;
+ }
+}
+#endif
+
+/// foldMemoryOperand - Try folding stack slot references in Ops into their
+/// instructions.
+///
+/// @param Ops Operand indices from analyzeVirtReg().
+/// @param LoadMI Load instruction to use instead of stack slot when non-null.
+/// @return True on success.
+bool InlineSpiller::
+foldMemoryOperand(ArrayRef<std::pair<MachineInstr*, unsigned> > Ops,
+ MachineInstr *LoadMI) {
+ if (Ops.empty())
+ return false;
+ // Don't attempt folding in bundles.
+ MachineInstr *MI = Ops.front().first;
+ if (Ops.back().first != MI || MI->isBundled())
+ return false;
+
+ bool WasCopy = MI->isCopy();
+ unsigned ImpReg = 0;
+
+ bool SpillSubRegs = (MI->getOpcode() == TargetOpcode::STATEPOINT ||
+ MI->getOpcode() == TargetOpcode::PATCHPOINT ||
+ MI->getOpcode() == TargetOpcode::STACKMAP);
+
+ // TargetInstrInfo::foldMemoryOperand only expects explicit, non-tied
+ // operands.
+ SmallVector<unsigned, 8> FoldOps;
+ for (const auto &OpPair : Ops) {
+ unsigned Idx = OpPair.second;
+ assert(MI == OpPair.first && "Instruction conflict during operand folding");
+ MachineOperand &MO = MI->getOperand(Idx);
+ if (MO.isImplicit()) {
+ ImpReg = MO.getReg();
+ continue;
+ }
+ // FIXME: Teach targets to deal with subregs.
+ if (!SpillSubRegs && MO.getSubReg())
+ return false;
+ // We cannot fold a load instruction into a def.
+ if (LoadMI && MO.isDef())
+ return false;
+ // Tied use operands should not be passed to foldMemoryOperand.
+ if (!MI->isRegTiedToDefOperand(Idx))
+ FoldOps.push_back(Idx);
+ }
+
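+  // Remember the span of instructions around MI so that any instructions the
+  // fold creates can be added to the slot index maps below.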
+ MachineInstrSpan MIS(MI);
+
+ MachineInstr *FoldMI =
+ LoadMI ? TII.foldMemoryOperand(*MI, FoldOps, *LoadMI, &LIS)
+ : TII.foldMemoryOperand(*MI, FoldOps, StackSlot, &LIS);
+ if (!FoldMI)
+ return false;
+
+ // Remove LIS for any dead defs in the original MI not in FoldMI.
+ for (MIBundleOperands MO(*MI); MO.isValid(); ++MO) {
+ if (!MO->isReg())
+ continue;
+ unsigned Reg = MO->getReg();
+ if (!Reg || TargetRegisterInfo::isVirtualRegister(Reg) ||
+ MRI.isReserved(Reg)) {
+ continue;
+ }
+ // Skip non-Defs, including undef uses and internal reads.
+ if (MO->isUse())
+ continue;
+ MIBundleOperands::PhysRegInfo RI =
+ MIBundleOperands(*FoldMI).analyzePhysReg(Reg, &TRI);
+ if (RI.FullyDefined)
+ continue;
+ // FoldMI does not define this physreg. Remove the LI segment.
+ assert(MO->isDead() && "Cannot fold physreg def");
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+ LIS.removePhysRegDefAt(Reg, Idx);
+ }
+
+ int FI;
+ if (TII.isStoreToStackSlot(*MI, FI) &&
+ HSpiller.rmFromMergeableSpills(*MI, FI))
+ --NumSpills;
+ LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
+ MI->eraseFromParent();
+
+ // Insert any new instructions other than FoldMI into the LIS maps.
+ assert(!MIS.empty() && "Unexpected empty span of instructions!");
+ for (MachineInstr &MI : MIS)
+ if (&MI != FoldMI)
+ LIS.InsertMachineInstrInMaps(MI);
+
+ // TII.foldMemoryOperand may have left some implicit operands on the
+ // instruction. Strip them.
+ if (ImpReg)
+ for (unsigned i = FoldMI->getNumOperands(); i; --i) {
+ MachineOperand &MO = FoldMI->getOperand(i - 1);
+ if (!MO.isReg() || !MO.isImplicit())
+ break;
+ if (MO.getReg() == ImpReg)
+ FoldMI->RemoveOperand(i - 1);
+ }
+
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MIS.end(), LIS,
+ "folded"));
+
+ if (!WasCopy)
+ ++NumFolded;
+ else if (Ops.front().second == 0) {
+ ++NumSpills;
+ HSpiller.addToMergeableSpills(*FoldMI, StackSlot, Original);
+ } else
+ ++NumReloads;
+ return true;
+}
+
+void InlineSpiller::insertReload(unsigned NewVReg,
+ SlotIndex Idx,
+ MachineBasicBlock::iterator MI) {
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ MachineInstrSpan MIS(MI);
+ TII.loadRegFromStackSlot(MBB, MI, NewVReg, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
+
+ LIS.InsertMachineInstrRangeInMaps(MIS.begin(), MI);
+
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(MIS.begin(), MI, LIS, "reload",
+ NewVReg));
+ ++NumReloads;
+}
+
+/// insertSpill - Insert a spill of NewVReg after MI.
+void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
+ MachineBasicBlock::iterator MI) {
+ MachineBasicBlock &MBB = *MI->getParent();
+
+ MachineInstrSpan MIS(MI);
+ TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
+
+ LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end());
+
+ DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
+ "spill"));
+ ++NumSpills;
+ HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
+}
+
+/// spillAroundUses - insert spill code around each use of Reg.
+void InlineSpiller::spillAroundUses(unsigned Reg) {
+ DEBUG(dbgs() << "spillAroundUses " << PrintReg(Reg) << '\n');
+ LiveInterval &OldLI = LIS.getInterval(Reg);
+
+ // Iterate over instructions using Reg.
+ for (MachineRegisterInfo::reg_bundle_iterator
+ RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
+ RegI != E; ) {
+ MachineInstr *MI = &*(RegI++);
+
+ // Debug values are not allowed to affect codegen.
+ if (MI->isDebugValue()) {
+ // Modify DBG_VALUE now that the value is in a spill slot.
+ bool IsIndirect = MI->isIndirectDebugValue();
+ uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
+ const MDNode *Var = MI->getDebugVariable();
+ const MDNode *Expr = MI->getDebugExpression();
+ DebugLoc DL = MI->getDebugLoc();
+ DEBUG(dbgs() << "Modifying debug info due to spill:" << "\t" << *MI);
+ MachineBasicBlock *MBB = MI->getParent();
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ BuildMI(*MBB, MBB->erase(MI), DL, TII.get(TargetOpcode::DBG_VALUE))
+ .addFrameIndex(StackSlot)
+ .addImm(Offset)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ continue;
+ }
+
+ // Ignore copies to/from snippets. We'll delete them.
+ if (SnippetCopies.count(MI))
+ continue;
+
+ // Stack slot accesses may coalesce away.
+ if (coalesceStackAccess(MI, Reg))
+ continue;
+
+ // Analyze instruction.
+ SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
+ MIBundleOperands::VirtRegInfo RI =
+ MIBundleOperands(*MI).analyzeVirtReg(Reg, &Ops);
+
+ // Find the slot index where this instruction reads and writes OldLI.
+ // This is usually the def slot, except for tied early clobbers.
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+ if (VNInfo *VNI = OldLI.getVNInfoAt(Idx.getRegSlot(true)))
+ if (SlotIndex::isSameInstr(Idx, VNI->def))
+ Idx = VNI->def;
+
+ // Check for a sibling copy.
+ unsigned SibReg = isFullCopyOf(*MI, Reg);
+ if (SibReg && isSibling(SibReg)) {
+ // This may actually be a copy between snippets.
+ if (isRegToSpill(SibReg)) {
+ DEBUG(dbgs() << "Found new snippet copy: " << *MI);
+ SnippetCopies.insert(MI);
+ continue;
+ }
+ if (RI.Writes) {
+ if (hoistSpillInsideBB(OldLI, *MI)) {
+ // This COPY is now dead, the value is already in the stack slot.
+ MI->getOperand(0).setIsDead();
+ DeadDefs.push_back(MI);
+ continue;
+ }
+ } else {
+ // This is a reload for a sib-reg copy. Drop spills downstream.
+ LiveInterval &SibLI = LIS.getInterval(SibReg);
+ eliminateRedundantSpills(SibLI, SibLI.getVNInfoAt(Idx));
+ // The COPY will fold to a reload below.
+ }
+ }
+
+ // Attempt to fold memory ops.
+ if (foldMemoryOperand(Ops))
+ continue;
+
+ // Create a new virtual register for spill/fill.
+ // FIXME: Infer regclass from instruction alone.
+ unsigned NewVReg = Edit->createFrom(Reg);
+
+ if (RI.Reads)
+ insertReload(NewVReg, Idx, MI);
+
+ // Rewrite instruction operands.
+ bool hasLiveDef = false;
+ for (const auto &OpPair : Ops) {
+ MachineOperand &MO = OpPair.first->getOperand(OpPair.second);
+ MO.setReg(NewVReg);
+ if (MO.isUse()) {
+ if (!OpPair.first->isRegTiedToDefOperand(OpPair.second))
+ MO.setIsKill();
+ } else {
+ if (!MO.isDead())
+ hasLiveDef = true;
+ }
+ }
+ DEBUG(dbgs() << "\trewrite: " << Idx << '\t' << *MI << '\n');
+
+ // FIXME: Use a second vreg if instruction has no tied ops.
+ if (RI.Writes)
+ if (hasLiveDef)
+ insertSpill(NewVReg, true, MI);
+ }
+}
+
+/// spillAll - Spill all registers remaining after rematerialization.
+void InlineSpiller::spillAll() {
+ // Update LiveStacks now that we are committed to spilling.
+ if (StackSlot == VirtRegMap::NO_STACK_SLOT) {
+ StackSlot = VRM.assignVirt2StackSlot(Original);
+ StackInt = &LSS.getOrCreateInterval(StackSlot, MRI.getRegClass(Original));
+ StackInt->getNextValue(SlotIndex(), LSS.getVNInfoAllocator());
+ } else
+ StackInt = &LSS.getInterval(StackSlot);
+
+ if (Original != Edit->getReg())
+ VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
+
+ assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
+ for (unsigned Reg : RegsToSpill)
+ StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
+ StackInt->getValNumInfo(0));
+ DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
+
+ // Spill around uses of all RegsToSpill.
+ for (unsigned Reg : RegsToSpill)
+ spillAroundUses(Reg);
+
+ // Hoisted spills may cause dead code.
+ if (!DeadDefs.empty()) {
+ DEBUG(dbgs() << "Eliminating " << DeadDefs.size() << " dead defs\n");
+ Edit->eliminateDeadDefs(DeadDefs, RegsToSpill, AA);
+ }
+
+ // Finally delete the SnippetCopies.
+ for (unsigned Reg : RegsToSpill) {
+ for (MachineRegisterInfo::reg_instr_iterator
+ RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end();
+ RI != E; ) {
+ MachineInstr &MI = *(RI++);
+ assert(SnippetCopies.count(&MI) && "Remaining use wasn't a snippet copy");
+ // FIXME: Do this with a LiveRangeEdit callback.
+ LIS.RemoveMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ }
+ }
+
+ // Delete all spilled registers.
+ for (unsigned Reg : RegsToSpill)
+ Edit->eraseVirtReg(Reg);
+}
+
+void InlineSpiller::spill(LiveRangeEdit &edit) {
+ ++NumSpilledRanges;
+ Edit = &edit;
+ assert(!TargetRegisterInfo::isStackSlot(edit.getReg())
+ && "Trying to spill a stack slot.");
+ // Share a stack slot among all descendants of Original.
+ Original = VRM.getOriginal(edit.getReg());
+ StackSlot = VRM.getStackSlot(Original);
+ StackInt = nullptr;
+
+ DEBUG(dbgs() << "Inline spilling "
+ << TRI.getRegClassName(MRI.getRegClass(edit.getReg()))
+ << ':' << edit.getParent()
+ << "\nFrom original " << PrintReg(Original) << '\n');
+ assert(edit.getParent().isSpillable() &&
+ "Attempting to spill already spilled value.");
+ assert(DeadDefs.empty() && "Previous spill didn't remove dead defs");
+
+ collectRegsToSpill();
+ reMaterializeAll();
+
+ // Remat may handle everything.
+ if (!RegsToSpill.empty())
+ spillAll();
+
+ Edit->calculateRegClassAndHint(MF, Loops, MBFI);
+}
+
+/// Optimizations after all the reg selections and spills are done.
+///
+void InlineSpiller::postOptimization() { HSpiller.hoistAllSpills(); }
+
+/// When a spill is inserted, add the spill to MergeableSpills map.
+///
+void HoistSpillHelper::addToMergeableSpills(MachineInstr &Spill, int StackSlot,
+ unsigned Original) {
+ StackSlotToReg[StackSlot] = Original;
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ MergeableSpills[MIdx].insert(&Spill);
+}
+
+/// When a spill is removed, remove the spill from MergeableSpills map.
+/// Return true if the spill is removed successfully.
+///
+bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr &Spill,
+ int StackSlot) {
+ int Original = StackSlotToReg[StackSlot];
+ if (!Original)
+ return false;
+ SlotIndex Idx = LIS.getInstructionIndex(Spill);
+ VNInfo *OrigVNI = LIS.getInterval(Original).getVNInfoAt(Idx.getRegSlot());
+ std::pair<int, VNInfo *> MIdx = std::make_pair(StackSlot, OrigVNI);
+ return MergeableSpills[MIdx].erase(&Spill);
+}
+
+/// Check BB to see if it is a possible target BB to place a hoisted spill,
+/// i.e., there should be a live sibling of OrigReg at the insert point.
+///
+bool HoistSpillHelper::isSpillCandBB(unsigned OrigReg, VNInfo &OrigVNI,
+ MachineBasicBlock &BB, unsigned &LiveReg) {
+ SlotIndex Idx;
+ LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+ MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, BB);
+ if (MI != BB.end())
+ Idx = LIS.getInstructionIndex(*MI);
+ else
+ Idx = LIS.getMBBEndIdx(&BB).getPrevSlot();
+ SmallSetVector<unsigned, 16> &Siblings = Virt2SiblingsMap[OrigReg];
+ assert((LIS.getInterval(OrigReg)).getVNInfoAt(Idx) == &OrigVNI &&
+ "Unexpected VNI");
+
+ for (auto const SibReg : Siblings) {
+ LiveInterval &LI = LIS.getInterval(SibReg);
+ VNInfo *VNI = LI.getVNInfoAt(Idx);
+ if (VNI) {
+ LiveReg = SibReg;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Remove redundant spills in the same BB. Save those redundant spills in
+/// SpillsToRm, and save the spill to keep and its BB in SpillBBToSpill map.
+///
+void HoistSpillHelper::rmRedundantSpills(
+ SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+ // For each spill seen, check SpillBBToSpill[] to see if its BB already has
+ // another spill inside. If a BB contains more than one spill, only keep the
+ // earlier spill with the smaller SlotIndex.
+ for (const auto CurrentSpill : Spills) {
+ MachineBasicBlock *Block = CurrentSpill->getParent();
+ MachineDomTreeNode *Node = MDT.DT->getNode(Block);
+ MachineInstr *PrevSpill = SpillBBToSpill[Node];
+ if (PrevSpill) {
+ SlotIndex PIdx = LIS.getInstructionIndex(*PrevSpill);
+ SlotIndex CIdx = LIS.getInstructionIndex(*CurrentSpill);
+ MachineInstr *SpillToRm = (CIdx > PIdx) ? CurrentSpill : PrevSpill;
+ MachineInstr *SpillToKeep = (CIdx > PIdx) ? PrevSpill : CurrentSpill;
+ SpillsToRm.push_back(SpillToRm);
+ SpillBBToSpill[MDT.DT->getNode(Block)] = SpillToKeep;
+ } else {
+ SpillBBToSpill[MDT.DT->getNode(Block)] = CurrentSpill;
+ }
+ }
+ for (const auto SpillToRm : SpillsToRm)
+ Spills.erase(SpillToRm);
+}
+
+/// Starting from \p Root find a top-down traversal order of the dominator
+/// tree to visit all basic blocks containing the elements of \p Spills.
+/// Redundant spills will be found and put into \p SpillsToRm at the same
+/// time. \p SpillBBToSpill will be populated as part of the process and
+/// maps a basic block to the first store occurring in the basic block.
+/// \post SpillsToRm.union(Spills\@post) == Spills\@pre
+///
+void HoistSpillHelper::getVisitOrders(
+ MachineBasicBlock *Root, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineDomTreeNode *> &Orders,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineDomTreeNode *, unsigned> &SpillsToKeep,
+ DenseMap<MachineDomTreeNode *, MachineInstr *> &SpillBBToSpill) {
+ // The set contains all the possible BB nodes to which we may hoist
+ // original spills.
+ SmallPtrSet<MachineDomTreeNode *, 8> WorkSet;
+ // Save the BB nodes on the path from the first BB node containing a
+ // non-redundant spill to the Root node.
+ SmallPtrSet<MachineDomTreeNode *, 8> NodesOnPath;
+ // All the spills to be hoisted must originate from a single def instruction
+ // to the OrigReg. It means the def instruction should dominate all the spills
+ // to be hoisted. We choose the BB where the def instruction is located as
+ // the Root.
+ MachineDomTreeNode *RootIDomNode = MDT[Root]->getIDom();
+ // For every node on the dominator tree with a spill, walk up the dominator
+ // tree towards the Root node until it is reached. If another node containing
+ // a spill is found on the path, the spill in the dominated block is redundant
+ // and is recorded for removal. Otherwise, all the nodes on the path from the
+ // first node with a non-redundant spill to the Root node are added to the
+ // WorkSet, which will contain all the possible locations where spills may be
+ // hoisted to after the loop below is done.
+ for (const auto Spill : Spills) {
+ MachineBasicBlock *Block = Spill->getParent();
+ MachineDomTreeNode *Node = MDT[Block];
+ MachineInstr *SpillToRm = nullptr;
+ while (Node != RootIDomNode) {
+ // If Node dominates Block, and it already contains a spill, the spill in
+ // Block will be redundant.
+ if (Node != MDT[Block] && SpillBBToSpill[Node]) {
+ SpillToRm = SpillBBToSpill[MDT[Block]];
+ break;
+ // If we see the Node already in WorkSet, the path from the Node to
+ // the Root node must already have been traversed by another spill.
+ // There is then no need to repeat the walk.
+ } else if (WorkSet.count(Node)) {
+ break;
+ } else {
+ NodesOnPath.insert(Node);
+ }
+ Node = Node->getIDom();
+ }
+ if (SpillToRm) {
+ SpillsToRm.push_back(SpillToRm);
+ } else {
+ // Add a BB containing the original spills to SpillsToKeep -- i.e.,
+ // set the initial status before hoisting starts. The value for BBs
+ // containing original spills is set to 0, to distinguish them from
+ // BBs containing hoisted spills, which will be inserted into
+ // SpillsToKeep later during hoisting.
+ SpillsToKeep[MDT[Block]] = 0;
+ WorkSet.insert(NodesOnPath.begin(), NodesOnPath.end());
+ }
+ NodesOnPath.clear();
+ }
+
+ // Sort the nodes in WorkSet in top-down order and save the nodes
+ // in Orders. Orders will be used for hoisting in runHoistSpills.
+ unsigned idx = 0;
+ Orders.push_back(MDT.DT->getNode(Root));
+ do {
+ MachineDomTreeNode *Node = Orders[idx++];
+ const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
+ unsigned NumChildren = Children.size();
+ for (unsigned i = 0; i != NumChildren; ++i) {
+ MachineDomTreeNode *Child = Children[i];
+ if (WorkSet.count(Child))
+ Orders.push_back(Child);
+ }
+ } while (idx != Orders.size());
+ assert(Orders.size() == WorkSet.size() &&
+ "Orders and WorkSet have different sizes");
+
+#ifndef NDEBUG
+ DEBUG(dbgs() << "Orders size is " << Orders.size() << "\n");
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++)
+ DEBUG(dbgs() << "BB" << (*RIt)->getBlock()->getNumber() << ",");
+ DEBUG(dbgs() << "\n");
+#endif
+}
+
+/// Try to hoist spills according to BB hotness. The spills to be removed will
+/// be saved in \p SpillsToRm. The spills to be inserted will be saved in
+/// \p SpillsToIns.
+///
+void HoistSpillHelper::runHoistSpills(
+ unsigned OrigReg, VNInfo &OrigVNI, SmallPtrSet<MachineInstr *, 16> &Spills,
+ SmallVectorImpl<MachineInstr *> &SpillsToRm,
+ DenseMap<MachineBasicBlock *, unsigned> &SpillsToIns) {
+ // Visit order of dominator tree nodes.
+ SmallVector<MachineDomTreeNode *, 32> Orders;
+ // SpillsToKeep contains all the nodes where spills are to be inserted
+ // during hoisting. If the spill to be inserted is an original spill
+ // (not a hoisted one), the value of the map entry is 0. If the spill
+ // is a hoisted spill, the value of the map entry is the VReg to be used
+ // as the source of the spill.
+ DenseMap<MachineDomTreeNode *, unsigned> SpillsToKeep;
+ // Map from BB to the first spill inside of it.
+ DenseMap<MachineDomTreeNode *, MachineInstr *> SpillBBToSpill;
+
+ rmRedundantSpills(Spills, SpillsToRm, SpillBBToSpill);
+
+ MachineBasicBlock *Root = LIS.getMBBFromIndex(OrigVNI.def);
+ getVisitOrders(Root, Spills, Orders, SpillsToRm, SpillsToKeep,
+ SpillBBToSpill);
+
+ // SpillsInSubTreeMap maps a dom tree node to a pair of a node set and the
+ // cost of all the spills inside those nodes.
+ // The node set contains the locations where spills are to be inserted
+ // in the subtree of the current node.
+ typedef std::pair<SmallPtrSet<MachineDomTreeNode *, 16>, BlockFrequency>
+ NodesCostPair;
+ DenseMap<MachineDomTreeNode *, NodesCostPair> SpillsInSubTreeMap;
+ // Iterate Orders set in reverse order, which will be a bottom-up order
+ // in the dominator tree. Once we visit a dom tree node, we know its
+ // children have already been visited and the spill locations in the
+ // subtrees of all the children have been determined.
+ SmallVector<MachineDomTreeNode *, 32>::reverse_iterator RIt = Orders.rbegin();
+ for (; RIt != Orders.rend(); RIt++) {
+ MachineBasicBlock *Block = (*RIt)->getBlock();
+
+ // If Block contains an original spill, simply continue.
+ if (SpillsToKeep.find(*RIt) != SpillsToKeep.end() && !SpillsToKeep[*RIt]) {
+ SpillsInSubTreeMap[*RIt].first.insert(*RIt);
+ // SpillsInSubTreeMap[*RIt].second contains the cost of spill.
+ SpillsInSubTreeMap[*RIt].second = MBFI.getBlockFreq(Block);
+ continue;
+ }
+
+ // Collect spills in subtree of current node (*RIt) to
+ // SpillsInSubTreeMap[*RIt].first.
+ const std::vector<MachineDomTreeNode *> &Children = (*RIt)->getChildren();
+ unsigned NumChildren = Children.size();
+ for (unsigned i = 0; i != NumChildren; ++i) {
+ MachineDomTreeNode *Child = Children[i];
+ if (SpillsInSubTreeMap.find(Child) == SpillsInSubTreeMap.end())
+ continue;
+ // The stmt "SpillsInSubTree = SpillsInSubTreeMap[*RIt].first" below
+ // should be placed before getting the begin and end iterators of
+ // SpillsInSubTreeMap[Child].first, or else the iterators may be
+ // invalidated when SpillsInSubTreeMap[*RIt] is seen the first time
+ // and the map grows and then the original buckets in the map are moved.
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ SubTreeCost += SpillsInSubTreeMap[Child].second;
+ auto BI = SpillsInSubTreeMap[Child].first.begin();
+ auto EI = SpillsInSubTreeMap[Child].first.end();
+ SpillsInSubTree.insert(BI, EI);
+ SpillsInSubTreeMap.erase(Child);
+ }
+
+ SmallPtrSet<MachineDomTreeNode *, 16> &SpillsInSubTree =
+ SpillsInSubTreeMap[*RIt].first;
+ BlockFrequency &SubTreeCost = SpillsInSubTreeMap[*RIt].second;
+ // No spills in subtree, simply continue.
+ if (SpillsInSubTree.empty())
+ continue;
+
+ // Check whether Block is a possible candidate to insert spill.
+ unsigned LiveReg = 0;
+ if (!isSpillCandBB(OrigReg, OrigVNI, *Block, LiveReg))
+ continue;
+
+ // If there are multiple spills that could be merged, bias a little
+ // toward hoisting the spill.
+ BranchProbability MarginProb = (SpillsInSubTree.size() > 1)
+ ? BranchProbability(9, 10)
+ : BranchProbability(1, 1);
+ if (SubTreeCost > MBFI.getBlockFreq(Block) * MarginProb) {
+ // Hoist: Move spills to current Block.
+ for (const auto SpillBB : SpillsInSubTree) {
+ // When SpillBB is a BB that contains an original spill, add that
+ // spill to SpillsToRm.
+ if (SpillsToKeep.find(SpillBB) != SpillsToKeep.end() &&
+ !SpillsToKeep[SpillBB]) {
+ MachineInstr *SpillToRm = SpillBBToSpill[SpillBB];
+ SpillsToRm.push_back(SpillToRm);
+ }
+ // SpillBB will no longer contain a spill, so remove it from SpillsToKeep.
+ SpillsToKeep.erase(SpillBB);
+ }
+ // Current Block is the BB containing the new hoisted spill. Add it to
+ // SpillsToKeep. LiveReg is the source of the new spill.
+ SpillsToKeep[*RIt] = LiveReg;
+ DEBUG({
+ dbgs() << "spills in BB: ";
+ for (const auto Rspill : SpillsInSubTree)
+ dbgs() << Rspill->getBlock()->getNumber() << " ";
+ dbgs() << "were promoted to BB" << (*RIt)->getBlock()->getNumber()
+ << "\n";
+ });
+ SpillsInSubTree.clear();
+ SpillsInSubTree.insert(*RIt);
+ SubTreeCost = MBFI.getBlockFreq(Block);
+ }
+ }
+ // For spills in SpillsToKeep with LiveReg set (i.e., not original spill),
+ // save them to SpillsToIns.
+ for (const auto Ent : SpillsToKeep) {
+ if (Ent.second)
+ SpillsToIns[Ent.first->getBlock()] = Ent.second;
+ }
+}
+
+/// For spills with equal values, remove redundant spills and hoist the
+/// remaining ones to colder spots.
+///
+/// Spills with equal values are collected into the same set in
+/// MergeableSpills when each spill is inserted. These equal spills originate
+/// from the same defining instruction and are dominated by it.
+/// Before hoisting all the equal spills, redundant spills inside the same
+/// BB are first marked to be deleted. Then, starting from the remaining
+/// spills, walk up the dominator tree towards the Root node where the
+/// defining instruction is located, mark the dominated spills to be deleted
+/// along the way, and collect the BB nodes on the path from non-dominated
+/// spills to the defining instruction into a WorkSet. The nodes in WorkSet
+/// are the candidate places where we consider hoisting the spills. We iterate
+/// the WorkSet in bottom-up order and, for each node, decide whether to hoist
+/// the spills inside its subtree to that node. In this way, we can still get
+/// a local benefit even if hoisting all the equal spills to one cold place is
+/// impossible.
+///
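+/// For example, if BB1 and BB2 each contain a spill of the same value and are
+/// both dominated by a colder block BB0, the two spills are replaced by a
+/// single spill in BB0 when the combined block frequency of BB1 and BB2
+/// outweighs that of BB0 (with a small bias toward merging) and a live
+/// sibling register is available at the insert point in BB0.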
+void HoistSpillHelper::hoistAllSpills() {
+ SmallVector<unsigned, 4> NewVRegs;
+ LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this);
+
+ // Save the mapping between stackslot and its original reg.
+ DenseMap<int, unsigned> SlotToOrigReg;
+ for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ int Slot = VRM.getStackSlot(Reg);
+ if (Slot != VirtRegMap::NO_STACK_SLOT)
+ SlotToOrigReg[Slot] = VRM.getOriginal(Reg);
+ unsigned Original = VRM.getPreSplitReg(Reg);
+ if (!MRI.def_empty(Reg))
+ Virt2SiblingsMap[Original].insert(Reg);
+ }
+
+ // Each entry in MergeableSpills contains a spill set with equal values.
+ for (auto &Ent : MergeableSpills) {
+ int Slot = Ent.first.first;
+ unsigned OrigReg = SlotToOrigReg[Slot];
+ LiveInterval &OrigLI = LIS.getInterval(OrigReg);
+ VNInfo *OrigVNI = Ent.first.second;
+ SmallPtrSet<MachineInstr *, 16> &EqValSpills = Ent.second;
+ if (Ent.second.empty())
+ continue;
+
+ DEBUG({
+ dbgs() << "\nFor Slot" << Slot << " and VN" << OrigVNI->id << ":\n"
+ << "Equal spills in BB: ";
+ for (const auto spill : EqValSpills)
+ dbgs() << spill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // SpillsToRm is the spill set to be removed from EqValSpills.
+ SmallVector<MachineInstr *, 16> SpillsToRm;
+ // SpillsToIns is the spill set to be newly inserted after hoisting.
+ DenseMap<MachineBasicBlock *, unsigned> SpillsToIns;
+
+ runHoistSpills(OrigReg, *OrigVNI, EqValSpills, SpillsToRm, SpillsToIns);
+
+ DEBUG({
+ dbgs() << "Finally inserted spills in BB: ";
+ for (const auto Ispill : SpillsToIns)
+ dbgs() << Ispill.first->getNumber() << " ";
+ dbgs() << "\nFinally removed spills in BB: ";
+ for (const auto Rspill : SpillsToRm)
+ dbgs() << Rspill->getParent()->getNumber() << " ";
+ dbgs() << "\n";
+ });
+
+ // Stack live range update.
+ LiveInterval &StackIntvl = LSS.getInterval(Slot);
+ if (!SpillsToIns.empty() || !SpillsToRm.empty())
+ StackIntvl.MergeValueInAsValue(OrigLI, OrigVNI,
+ StackIntvl.getValNumInfo(0));
+
+ // Insert hoisted spills.
+ for (auto const Insert : SpillsToIns) {
+ MachineBasicBlock *BB = Insert.first;
+ unsigned LiveReg = Insert.second;
+ MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, *BB);
+ TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
+ MRI.getRegClass(LiveReg), &TRI);
+ LIS.InsertMachineInstrRangeInMaps(std::prev(MI), MI);
+ ++NumSpills;
+ }
+
+ // Remove redundant spills or change them to dead instructions.
+ NumSpills -= SpillsToRm.size();
+ for (auto const RMEnt : SpillsToRm) {
+ RMEnt->setDesc(TII.get(TargetOpcode::KILL));
+ for (unsigned i = RMEnt->getNumOperands(); i; --i) {
+ MachineOperand &MO = RMEnt->getOperand(i - 1);
+ if (MO.isReg() && MO.isImplicit() && MO.isDef() && !MO.isDead())
+ RMEnt->RemoveOperand(i - 1);
+ }
+ }
+ Edit.eliminateDeadDefs(SpillsToRm, None, AA);
+ }
+}
+
+/// For VirtReg clone, the \p New register should have the same physreg or
+/// stackslot as the \p old register.
+void HoistSpillHelper::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
+ if (VRM.hasPhys(Old))
+ VRM.assignVirt2Phys(New, VRM.getPhys(Old));
+ else if (VRM.getStackSlot(Old) != VirtRegMap::NO_STACK_SLOT)
+ VRM.assignVirt2StackSlot(New, VRM.getStackSlot(Old));
+ else
+ llvm_unreachable("VReg should be assigned either physreg or stackslot");
+}
diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.cpp b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
new file mode 100644
index 000000000000..f8cc24724580
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InterferenceCache.cpp
@@ -0,0 +1,250 @@
+//===-- InterferenceCache.cpp - Caching per-block interference ---------*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference in LiveIntervalUnions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InterferenceCache.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Static member used for null interference cursors.
+const InterferenceCache::BlockInterference
+ InterferenceCache::Cursor::NoInterference;
+
+// Initializes PhysRegEntries (instead of a SmallVector, PhysRegEntries is a
+// buffer of size NumPhysRegs to speed up alloc/clear for targets with large
+// reg files). Calloced memory is used for good form, and it quiets tools like
+// Valgrind too, but zero-initialized memory is not required by the algorithm:
+// this is because PhysRegEntries works like a SparseSet and its entries are
+// only valid when there is a corresponding CacheEntries assignment. There is
+// also support for pass managers being reused with targets that have different
+// numbers of PhysRegs: in that case PhysRegEntries is freed and reinitialized.
+void InterferenceCache::reinitPhysRegEntries() {
+ if (PhysRegEntriesCount == TRI->getNumRegs()) return;
+ free(PhysRegEntries);
+ PhysRegEntriesCount = TRI->getNumRegs();
+ PhysRegEntries = (unsigned char*)
+ calloc(PhysRegEntriesCount, sizeof(unsigned char));
+}
+
+void InterferenceCache::init(MachineFunction *mf,
+ LiveIntervalUnion *liuarray,
+ SlotIndexes *indexes,
+ LiveIntervals *lis,
+ const TargetRegisterInfo *tri) {
+ MF = mf;
+ LIUArray = liuarray;
+ TRI = tri;
+ reinitPhysRegEntries();
+ for (unsigned i = 0; i != CacheEntries; ++i)
+ Entries[i].clear(mf, indexes, lis);
+}
+
+InterferenceCache::Entry *InterferenceCache::get(unsigned PhysReg) {
+ unsigned E = PhysRegEntries[PhysReg];
+ if (E < CacheEntries && Entries[E].getPhysReg() == PhysReg) {
+ if (!Entries[E].valid(LIUArray, TRI))
+ Entries[E].revalidate(LIUArray, TRI);
+ return &Entries[E];
+ }
+ // No valid entry exists, pick the next round-robin entry.
+ E = RoundRobin;
+ if (++RoundRobin == CacheEntries)
+ RoundRobin = 0;
+ for (unsigned i = 0; i != CacheEntries; ++i) {
+ // Skip entries that are in use.
+ if (Entries[E].hasRefs()) {
+ if (++E == CacheEntries)
+ E = 0;
+ continue;
+ }
+ Entries[E].reset(PhysReg, LIUArray, TRI, MF);
+ PhysRegEntries[PhysReg] = E;
+ return &Entries[E];
+ }
+ llvm_unreachable("Ran out of interference cache entries.");
+}
+
+/// revalidate - LIU contents have changed, update tags.
+void InterferenceCache::Entry::revalidate(LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI) {
+ // Invalidate all block entries.
+ ++Tag;
+ // Invalidate all iterators.
+ PrevPos = SlotIndex();
+ unsigned i = 0;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i)
+ RegUnits[i].VirtTag = LIUArray[*Units].getTag();
+}
+
+void InterferenceCache::Entry::reset(unsigned physReg,
+ LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction *MF) {
+ assert(!hasRefs() && "Cannot reset cache entry with references");
+ // LIU's changed, invalidate cache.
+ ++Tag;
+ PhysReg = physReg;
+ Blocks.resize(MF->getNumBlockIDs());
+
+ // Reset iterators.
+ PrevPos = SlotIndex();
+ RegUnits.clear();
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ RegUnits.push_back(LIUArray[*Units]);
+ RegUnits.back().Fixed = &LIS->getRegUnit(*Units);
+ }
+}
+
+bool InterferenceCache::Entry::valid(LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI) {
+ unsigned i = 0, e = RegUnits.size();
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units, ++i) {
+ if (i == e)
+ return false;
+ if (LIUArray[*Units].changedSince(RegUnits[i].VirtTag))
+ return false;
+ }
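+ // All tags matched; the entry is only valid if the number of register units
+ // is also unchanged.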
+ return i == e;
+}
+
+void InterferenceCache::Entry::update(unsigned MBBNum) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+
+ // Move the iterators to Start; use advanceTo only when moving forward.
+ if (PrevPos != Start) {
+ if (!PrevPos.isValid() || Start < PrevPos) {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.find(Start);
+ RUI.FixedI = RUI.Fixed->find(Start);
+ }
+ } else {
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ RegUnitInfo &RUI = RegUnits[i];
+ RUI.VirtI.advanceTo(Start);
+ if (RUI.FixedI != RUI.Fixed->end())
+ RUI.FixedI = RUI.Fixed->advanceTo(RUI.FixedI, Start);
+ }
+ }
+ PrevPos = Start;
+ }
+
+ MachineFunction::const_iterator MFI =
+ MF->getBlockNumbered(MBBNum)->getIterator();
+ BlockInterference *BI = &Blocks[MBBNum];
+ ArrayRef<SlotIndex> RegMaskSlots;
+ ArrayRef<const uint32_t*> RegMaskBits;
+ for (;;) {
+ BI->Tag = Tag;
+ BI->First = BI->Last = SlotIndex();
+
+ // Check for first interference from virtregs.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+ if (!I.valid())
+ continue;
+ SlotIndex StartI = I.start();
+ if (StartI >= Stop)
+ continue;
+ if (!BI->First.isValid() || StartI < BI->First)
+ BI->First = StartI;
+ }
+
+ // Same thing for fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::const_iterator I = RegUnits[i].FixedI;
+ LiveInterval::const_iterator E = RegUnits[i].Fixed->end();
+ if (I == E)
+ continue;
+ SlotIndex StartI = I->start;
+ if (StartI >= Stop)
+ continue;
+ if (!BI->First.isValid() || StartI < BI->First)
+ BI->First = StartI;
+ }
+
+ // Also check for register mask interference.
+ RegMaskSlots = LIS->getRegMaskSlotsInBlock(MBBNum);
+ RegMaskBits = LIS->getRegMaskBitsInBlock(MBBNum);
+ SlotIndex Limit = BI->First.isValid() ? BI->First : Stop;
+ for (unsigned i = 0, e = RegMaskSlots.size();
+ i != e && RegMaskSlots[i] < Limit; ++i)
+ if (MachineOperand::clobbersPhysReg(RegMaskBits[i], PhysReg)) {
+ // Register mask i clobbers PhysReg before the LIU interference.
+ BI->First = RegMaskSlots[i];
+ break;
+ }
+
+ PrevPos = Stop;
+ if (BI->First.isValid())
+ break;
+
+ // No interference in this block? Go ahead and precompute the next block.
+ if (++MFI == MF->end())
+ return;
+ MBBNum = MFI->getNumber();
+ BI = &Blocks[MBBNum];
+ if (BI->Tag == Tag)
+ return;
+ std::tie(Start, Stop) = Indexes->getMBBRange(MBBNum);
+ }
+
+ // Check for last interference in block.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveIntervalUnion::SegmentIter &I = RegUnits[i].VirtI;
+ if (!I.valid() || I.start() >= Stop)
+ continue;
+ I.advanceTo(Stop);
+ bool Backup = !I.valid() || I.start() >= Stop;
+ if (Backup)
+ --I;
+ SlotIndex StopI = I.stop();
+ if (!BI->Last.isValid() || StopI > BI->Last)
+ BI->Last = StopI;
+ if (Backup)
+ ++I;
+ }
+
+ // Fixed interference.
+ for (unsigned i = 0, e = RegUnits.size(); i != e; ++i) {
+ LiveInterval::iterator &I = RegUnits[i].FixedI;
+ LiveRange *LR = RegUnits[i].Fixed;
+ if (I == LR->end() || I->start >= Stop)
+ continue;
+ I = LR->advanceTo(I, Stop);
+ bool Backup = I == LR->end() || I->start >= Stop;
+ if (Backup)
+ --I;
+ SlotIndex StopI = I->end;
+ if (!BI->Last.isValid() || StopI > BI->Last)
+ BI->Last = StopI;
+ if (Backup)
+ ++I;
+ }
+
+ // Also check for register mask interference.
+ SlotIndex Limit = BI->Last.isValid() ? BI->Last : Start;
+ for (unsigned i = RegMaskSlots.size();
+ i && RegMaskSlots[i-1].getDeadSlot() > Limit; --i)
+ if (MachineOperand::clobbersPhysReg(RegMaskBits[i-1], PhysReg)) {
+ // Register mask i-1 clobbers PhysReg after the LIU interference.
+ // Model the regmask clobber as a dead def.
+ BI->Last = RegMaskSlots[i-1].getDeadSlot();
+ break;
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/InterferenceCache.h b/contrib/llvm/lib/CodeGen/InterferenceCache.h
new file mode 100644
index 000000000000..18aa5c7c5ad6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InterferenceCache.h
@@ -0,0 +1,238 @@
+//===-- InterferenceCache.h - Caching per-block interference ---*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InterferenceCache remembers per-block interference from LiveIntervalUnions,
+// fixed RegUnit interference, and register masks.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
+#define LLVM_LIB_CODEGEN_INTERFERENCECACHE_H
+
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+
+namespace llvm {
+
+class LiveIntervals;
+
+class LLVM_LIBRARY_VISIBILITY InterferenceCache {
+ const TargetRegisterInfo *TRI;
+ LiveIntervalUnion *LIUArray;
+ MachineFunction *MF;
+
+ /// BlockInterference - information about the interference in a single basic
+ /// block.
+ struct BlockInterference {
+ BlockInterference() : Tag(0) {}
+ unsigned Tag;
+ SlotIndex First;
+ SlotIndex Last;
+ };
+
+ /// Entry - A cache entry containing interference information for all aliases
+ /// of PhysReg in all basic blocks.
+ class Entry {
+ /// PhysReg - The register currently represented.
+ unsigned PhysReg;
+
+ /// Tag - Cache tag is changed when any of the underlying LiveIntervalUnions
+ /// change.
+ unsigned Tag;
+
+ /// RefCount - The total number of Cursor instances referring to this Entry.
+ unsigned RefCount;
+
+ /// MF - The current function.
+ MachineFunction *MF;
+
+ /// Indexes - Mapping block numbers to SlotIndex ranges.
+ SlotIndexes *Indexes;
+
+ /// LIS - Used for accessing register mask interference maps.
+ LiveIntervals *LIS;
+
+ /// PrevPos - The previous position the iterators were moved to.
+ SlotIndex PrevPos;
+
+ /// RegUnitInfo - Information tracked about each RegUnit in PhysReg.
+ /// When PrevPos is set, the iterators are valid as if advanceTo(PrevPos)
+ /// had just been called.
+ struct RegUnitInfo {
+ /// Iterator pointing into the LiveIntervalUnion containing virtual
+ /// register interference.
+ LiveIntervalUnion::SegmentIter VirtI;
+
+ /// Tag of the LIU last time we looked.
+ unsigned VirtTag;
+
+ /// Fixed interference in RegUnit.
+ LiveRange *Fixed;
+
+ /// Iterator pointing into the fixed RegUnit interference.
+ LiveInterval::iterator FixedI;
+
+ RegUnitInfo(LiveIntervalUnion &LIU)
+ : VirtTag(LIU.getTag()), Fixed(nullptr) {
+ VirtI.setMap(LIU.getMap());
+ }
+ };
+
+ /// Info for each RegUnit in PhysReg. It is very rare for a PhysReg to have
+ /// more than 4 RegUnits.
+ SmallVector<RegUnitInfo, 4> RegUnits;
+
+ /// Blocks - Interference for each block in the function.
+ SmallVector<BlockInterference, 8> Blocks;
+
+ /// update - Recompute Blocks[MBBNum]
+ void update(unsigned MBBNum);
+
+ public:
+ Entry() : PhysReg(0), Tag(0), RefCount(0), Indexes(nullptr), LIS(nullptr) {}
+
+ void clear(MachineFunction *mf, SlotIndexes *indexes, LiveIntervals *lis) {
+ assert(!hasRefs() && "Cannot clear cache entry with references");
+ PhysReg = 0;
+ MF = mf;
+ Indexes = indexes;
+ LIS = lis;
+ }
+
+ unsigned getPhysReg() const { return PhysReg; }
+
+ void addRef(int Delta) { RefCount += Delta; }
+
+ bool hasRefs() const { return RefCount > 0; }
+
+ void revalidate(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+ /// valid - Return true if this is a valid entry for physReg.
+ bool valid(LiveIntervalUnion *LIUArray, const TargetRegisterInfo *TRI);
+
+ /// reset - Initialize entry to represent physReg's aliases.
+ void reset(unsigned physReg,
+ LiveIntervalUnion *LIUArray,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction *MF);
+
+ /// get - Return an up to date BlockInterference.
+ BlockInterference *get(unsigned MBBNum) {
+ if (Blocks[MBBNum].Tag != Tag)
+ update(MBBNum);
+ return &Blocks[MBBNum];
+ }
+ };
+
+ // We don't keep a cache entry for every physical register; that would use too
+ // much memory. Instead, a fixed number of cache entries are used in a round-
+ // robin manner.
+ enum { CacheEntries = 32 };
+
+ // Point to an entry for each physreg. The entry pointed to may not be up to
+ // date, and it may have been reused for a different physreg.
+ unsigned char* PhysRegEntries;
+ size_t PhysRegEntriesCount;
+
+ // Next round-robin entry to be picked.
+ unsigned RoundRobin;
+
+ // The actual cache entries.
+ Entry Entries[CacheEntries];
+
+ // get - Get a valid entry for PhysReg.
+ Entry *get(unsigned PhysReg);
+
+public:
+ InterferenceCache()
+ : TRI(nullptr), LIUArray(nullptr), MF(nullptr), PhysRegEntries(nullptr),
+ PhysRegEntriesCount(0), RoundRobin(0) {}
+
+ ~InterferenceCache() {
+ free(PhysRegEntries);
+ }
+
+ void reinitPhysRegEntries();
+
+ /// init - Prepare cache for a new function.
+ void init(MachineFunction*, LiveIntervalUnion*, SlotIndexes*, LiveIntervals*,
+ const TargetRegisterInfo *);
+
+ /// getMaxCursors - Return the maximum number of concurrent cursors that can
+ /// be supported.
+ unsigned getMaxCursors() const { return CacheEntries; }
+
+ /// Cursor - The primary query interface for the block interference cache.
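+ ///
+ /// A typical query (Cache is an InterferenceCache, PhysReg a physical
+ /// register, MBBNum a block number):
+ ///   Cursor Cur;
+ ///   Cur.setPhysReg(Cache, PhysReg);
+ ///   Cur.moveToBlock(MBBNum);
+ ///   if (Cur.hasInterference())
+ ///     // interference covers [Cur.first(), Cur.last()] in this block.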
+ class Cursor {
+ Entry *CacheEntry;
+ const BlockInterference *Current;
+ static const BlockInterference NoInterference;
+
+ void setEntry(Entry *E) {
+ Current = nullptr;
+ // Update reference counts. Nothing happens when RefCount reaches 0, so
+ // we don't have to check for E == CacheEntry etc.
+ if (CacheEntry)
+ CacheEntry->addRef(-1);
+ CacheEntry = E;
+ if (CacheEntry)
+ CacheEntry->addRef(+1);
+ }
+
+ public:
+ /// Cursor - Create a dangling cursor.
+ Cursor() : CacheEntry(nullptr), Current(nullptr) {}
+ ~Cursor() { setEntry(nullptr); }
+
+ Cursor(const Cursor &O) : CacheEntry(nullptr), Current(nullptr) {
+ setEntry(O.CacheEntry);
+ }
+
+ Cursor &operator=(const Cursor &O) {
+ setEntry(O.CacheEntry);
+ return *this;
+ }
+
+ /// setPhysReg - Point this cursor to PhysReg's interference.
+ void setPhysReg(InterferenceCache &Cache, unsigned PhysReg) {
+ // Release reference before getting a new one. That guarantees we can
+ // actually have CacheEntries live cursors.
+ setEntry(nullptr);
+ if (PhysReg)
+ setEntry(Cache.get(PhysReg));
+ }
+
+ /// moveToBlock - Move cursor to basic block MBBNum.
+ void moveToBlock(unsigned MBBNum) {
+ Current = CacheEntry ? CacheEntry->get(MBBNum) : &NoInterference;
+ }
+
+ /// hasInterference - Return true if the current block has any interference.
+ bool hasInterference() {
+ return Current->First.isValid();
+ }
+
+ /// first - Return the starting index of the first interfering range in the
+ /// current block.
+ SlotIndex first() {
+ return Current->First;
+ }
+
+ /// last - Return the ending index of the last interfering range in the
+ /// current block.
+ SlotIndex last() {
+ return Current->Last;
+ }
+ };
+
+ friend class Cursor;
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
new file mode 100644
index 000000000000..3f1111976852
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -0,0 +1,386 @@
+//===--------------------- InterleavedAccessPass.cpp ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Interleaved Access pass, which identifies
+// interleaved memory accesses and transforms them into target specific
+// intrinsics.
+//
+// An interleaved load reads data from memory into several vectors,
+// DE-interleaving the data on a factor. An interleaved store RE-interleaves
+// the data on a factor and writes several vectors to memory.
+//
+// As interleaved accesses are difficult to identify in CodeGen (mainly
+// because the VECTOR_SHUFFLE DAG node is quite different from the shufflevector
+// IR), we identify and transform them to intrinsics in this pass so the
+// intrinsics can be easily matched into target specific instructions later in
+// CodeGen.
+//
+// E.g. An interleaved load (Factor = 2):
+// %wide.vec = load <8 x i32>, <8 x i32>* %ptr
+// %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
+// %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
+//
+// It could be transformed into a ld2 intrinsic in AArch64 backend or a vld2
+// intrinsic in ARM backend.
+//
+// E.g. An interleaved store (Factor = 3):
+// %i.vec = shuffle <8 x i32> %v0, <8 x i32> %v1,
+// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11>
+// store <12 x i32> %i.vec, <12 x i32>* %ptr
+//
+// It could be transformed into a st3 intrinsic in AArch64 backend or a vst3
+// intrinsic in ARM backend.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "interleaved-access"
+
+static cl::opt<bool> LowerInterleavedAccesses(
+ "lower-interleaved-accesses",
+ cl::desc("Enable lowering interleaved accesses to intrinsics"),
+ cl::init(true), cl::Hidden);
+
+static unsigned MaxFactor; // The maximum supported interleave factor.
+
+namespace {
+
+class InterleavedAccess : public FunctionPass {
+
+public:
+ static char ID;
+ InterleavedAccess(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), DT(nullptr), TM(TM), TLI(nullptr) {
+ initializeInterleavedAccessPass(*PassRegistry::getPassRegistry());
+ }
+
+ const char *getPassName() const override { return "Interleaved Access Pass"; }
+
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+private:
+ DominatorTree *DT;
+ const TargetMachine *TM;
+ const TargetLowering *TLI;
+
+ /// \brief Transform an interleaved load into target specific intrinsics.
+ bool lowerInterleavedLoad(LoadInst *LI,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// \brief Transform an interleaved store into target specific intrinsics.
+ bool lowerInterleavedStore(StoreInst *SI,
+ SmallVector<Instruction *, 32> &DeadInsts);
+
+ /// \brief Returns true if the uses of an interleaved load by the
+ /// extractelement instructions in \p Extracts can be replaced by uses of the
+ /// shufflevector instructions in \p Shuffles instead. If so, the necessary
+ /// replacements are also performed.
+ bool tryReplaceExtracts(ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles);
+};
+} // end anonymous namespace.
+
+char InterleavedAccess::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_TM_PASS_END(
+ InterleavedAccess, "interleaved-access",
+ "Lower interleaved memory accesses to target specific intrinsics", false,
+ false)
+
+FunctionPass *llvm::createInterleavedAccessPass(const TargetMachine *TM) {
+ return new InterleavedAccess(TM);
+}
+
+/// \brief Check if the mask is a DE-interleave mask of the given factor
+/// \p Factor like:
+/// <Index, Index+Factor, ..., Index+(NumElts-1)*Factor>
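+/// E.g. <1, 4, 7, 10> is a DE-interleave mask of Factor = 3 and Index = 1.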
+static bool isDeInterleaveMaskOfFactor(ArrayRef<int> Mask, unsigned Factor,
+ unsigned &Index) {
+ // Check all potential start indices from 0 to (Factor - 1).
+ for (Index = 0; Index < Factor; Index++) {
+ unsigned i = 0;
+
+ // Check that elements are in ascending order by Factor. Ignore undef
+ // elements.
+ for (; i < Mask.size(); i++)
+ if (Mask[i] >= 0 && static_cast<unsigned>(Mask[i]) != Index + i * Factor)
+ break;
+
+ if (i == Mask.size())
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Check if the mask is a DE-interleave mask for an interleaved load.
+///
+/// E.g. DE-interleave masks (Factor = 2) could be:
+/// <0, 2, 4, 6> (mask of index 0 to extract even elements)
+/// <1, 3, 5, 7> (mask of index 1 to extract odd elements)
+static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
+ unsigned &Index) {
+ if (Mask.size() < 2)
+ return false;
+
+ // Check potential Factors.
+ for (Factor = 2; Factor <= MaxFactor; Factor++)
+ if (isDeInterleaveMaskOfFactor(Mask, Factor, Index))
+ return true;
+
+ return false;
+}
+
+/// \brief Check if the mask is a RE-interleave mask for an interleaved store.
+///
+/// I.e. <0, NumSubElts, ... , NumSubElts*(Factor - 1), 1, NumSubElts + 1, ...>
+///
+/// E.g. The RE-interleave mask (Factor = 2) could be:
+/// <0, 4, 1, 5, 2, 6, 3, 7>
+static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor) {
+ unsigned NumElts = Mask.size();
+ if (NumElts < 4)
+ return false;
+
+ // Check potential Factors.
+ for (Factor = 2; Factor <= MaxFactor; Factor++) {
+ if (NumElts % Factor)
+ continue;
+
+ unsigned NumSubElts = NumElts / Factor;
+ if (!isPowerOf2_32(NumSubElts))
+ continue;
+
+ // Check whether each element matches the RE-interleave rule. Ignore undef
+ // elements.
+ unsigned i = 0;
+ for (; i < NumElts; i++)
+ if (Mask[i] >= 0 &&
+ static_cast<unsigned>(Mask[i]) !=
+ (i % Factor) * NumSubElts + i / Factor)
+ break;
+
+ // Found a RE-interleave mask of the current factor.
+ if (i == NumElts)
+ return true;
+ }
+
+ return false;
+}
+
+bool InterleavedAccess::lowerInterleavedLoad(
+ LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!LI->isSimple())
+ return false;
+
+ SmallVector<ShuffleVectorInst *, 4> Shuffles;
+ SmallVector<ExtractElementInst *, 4> Extracts;
+
+ // Check if all users of this load are shufflevectors. If we encounter any
+ // users that are extractelement instructions, we save them to later check if
+ // they can be modified to extract from one of the shufflevectors instead of
+ // the load.
+ for (auto UI = LI->user_begin(), E = LI->user_end(); UI != E; UI++) {
+ auto *Extract = dyn_cast<ExtractElementInst>(*UI);
+ if (Extract && isa<ConstantInt>(Extract->getIndexOperand())) {
+ Extracts.push_back(Extract);
+ continue;
+ }
+ ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(*UI);
+ if (!SVI || !isa<UndefValue>(SVI->getOperand(1)))
+ return false;
+
+ Shuffles.push_back(SVI);
+ }
+
+ if (Shuffles.empty())
+ return false;
+
+ unsigned Factor, Index;
+
+ // Check if the first shufflevector is a DE-interleave shuffle.
+ if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index))
+ return false;
+
+ // Holds the corresponding index for each DE-interleave shuffle.
+ SmallVector<unsigned, 4> Indices;
+ Indices.push_back(Index);
+
+ Type *VecTy = Shuffles[0]->getType();
+
+ // Check if the other shufflevectors are also DE-interleave shuffles of the
+ // same type and factor as the first shufflevector.
+ for (unsigned i = 1; i < Shuffles.size(); i++) {
+ if (Shuffles[i]->getType() != VecTy)
+ return false;
+
+ if (!isDeInterleaveMaskOfFactor(Shuffles[i]->getShuffleMask(), Factor,
+ Index))
+ return false;
+
+ Indices.push_back(Index);
+ }
+
+ // Try and modify users of the load that are extractelement instructions to
+ // use the shufflevector instructions instead of the load.
+ if (!tryReplaceExtracts(Extracts, Shuffles))
+ return false;
+
+ DEBUG(dbgs() << "IA: Found an interleaved load: " << *LI << "\n");
+
+ // Try to create target specific intrinsics to replace the load and shuffles.
+ if (!TLI->lowerInterleavedLoad(LI, Shuffles, Indices, Factor))
+ return false;
+
+ for (auto SVI : Shuffles)
+ DeadInsts.push_back(SVI);
+
+ DeadInsts.push_back(LI);
+ return true;
+}
+
+bool InterleavedAccess::tryReplaceExtracts(
+ ArrayRef<ExtractElementInst *> Extracts,
+ ArrayRef<ShuffleVectorInst *> Shuffles) {
+
+ // If there aren't any extractelement instructions to modify, there's nothing
+ // to do.
+ if (Extracts.empty())
+ return true;
+
+ // Maps extractelement instructions to vector-index pairs. The extractelement
+ // instructions will be modified to use the new vector and index operands.
+ DenseMap<ExtractElementInst *, std::pair<Value *, int>> ReplacementMap;
+
+ for (auto *Extract : Extracts) {
+
+ // The vector index that is extracted.
+ auto *IndexOperand = cast<ConstantInt>(Extract->getIndexOperand());
+ auto Index = IndexOperand->getSExtValue();
+
+ // Look for a suitable shufflevector instruction. The goal is to modify the
+ // extractelement instruction (which uses an interleaved load) to use one
+ // of the shufflevector instructions instead of the load.
+ for (auto *Shuffle : Shuffles) {
+
+ // If the shufflevector instruction doesn't dominate the extract, we
+ // can't create a use of it.
+ if (!DT->dominates(Shuffle, Extract))
+ continue;
+
+ // Inspect the indices of the shufflevector instruction. If the shuffle
+ // selects the same index that is extracted, we can modify the
+ // extractelement instruction.
+ SmallVector<int, 4> Indices;
+ Shuffle->getShuffleMask(Indices);
+ for (unsigned I = 0; I < Indices.size(); ++I)
+ if (Indices[I] == Index) {
+ assert(Extract->getOperand(0) == Shuffle->getOperand(0) &&
+ "Vector operations do not match");
+ ReplacementMap[Extract] = std::make_pair(Shuffle, I);
+ break;
+ }
+
+ // If we found a suitable shufflevector instruction, stop looking.
+ if (ReplacementMap.count(Extract))
+ break;
+ }
+
+ // If we did not find a suitable shufflevector instruction, the
+ // extractelement instruction cannot be modified, so we must give up.
+ if (!ReplacementMap.count(Extract))
+ return false;
+ }
+
+ // Finally, perform the replacements.
+ IRBuilder<> Builder(Extracts[0]->getContext());
+ for (auto &Replacement : ReplacementMap) {
+ auto *Extract = Replacement.first;
+ auto *Vector = Replacement.second.first;
+ auto Index = Replacement.second.second;
+ Builder.SetInsertPoint(Extract);
+ Extract->replaceAllUsesWith(Builder.CreateExtractElement(Vector, Index));
+ Extract->eraseFromParent();
+ }
+
+ return true;
+}
+
+bool InterleavedAccess::lowerInterleavedStore(
+ StoreInst *SI, SmallVector<Instruction *, 32> &DeadInsts) {
+ if (!SI->isSimple())
+ return false;
+
+ ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
+ if (!SVI || !SVI->hasOneUse())
+ return false;
+
+ // Check if the shufflevector is a RE-interleave shuffle.
+ unsigned Factor;
+ if (!isReInterleaveMask(SVI->getShuffleMask(), Factor))
+ return false;
+
+ DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
+
+ // Try to create target specific intrinsics to replace the store and shuffle.
+ if (!TLI->lowerInterleavedStore(SI, SVI, Factor))
+ return false;
+
+ // Already have a new target specific interleaved store. Erase the old store.
+ DeadInsts.push_back(SI);
+ DeadInsts.push_back(SVI);
+ return true;
+}
+
+bool InterleavedAccess::runOnFunction(Function &F) {
+ if (!TM || !LowerInterleavedAccesses)
+ return false;
+
+ DEBUG(dbgs() << "*** " << getPassName() << ": " << F.getName() << "\n");
+
+ DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+ MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+
+ // Holds dead instructions that will be erased later.
+ SmallVector<Instruction *, 32> DeadInsts;
+ bool Changed = false;
+
+ for (auto &I : instructions(F)) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&I))
+ Changed |= lowerInterleavedLoad(LI, DeadInsts);
+
+ if (StoreInst *SI = dyn_cast<StoreInst>(&I))
+ Changed |= lowerInterleavedStore(SI, DeadInsts);
+ }
+
+ for (auto I : DeadInsts)
+ I->eraseFromParent();
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
new file mode 100644
index 000000000000..2962f8701625
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -0,0 +1,608 @@
+//===-- IntrinsicLowering.cpp - Intrinsic Lowering default implementation -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the IntrinsicLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/IntrinsicLowering.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+template <class ArgIt>
+static void EnsureFunctionExists(Module &M, const char *Name,
+ ArgIt ArgBegin, ArgIt ArgEnd,
+ Type *RetTy) {
+ // Insert a correctly-typed definition now.
+ std::vector<Type *> ParamTys;
+ for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back(I->getType());
+ M.getOrInsertFunction(Name, FunctionType::get(RetTy, ParamTys, false));
+}
+
+static void EnsureFPIntrinsicsExist(Module &M, Function &Fn,
+ const char *FName,
+ const char *DName, const char *LDName) {
+ // Insert definitions for all the floating point types.
+ switch((int)Fn.arg_begin()->getType()->getTypeID()) {
+ case Type::FloatTyID:
+ EnsureFunctionExists(M, FName, Fn.arg_begin(), Fn.arg_end(),
+ Type::getFloatTy(M.getContext()));
+ break;
+ case Type::DoubleTyID:
+ EnsureFunctionExists(M, DName, Fn.arg_begin(), Fn.arg_end(),
+ Type::getDoubleTy(M.getContext()));
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ EnsureFunctionExists(M, LDName, Fn.arg_begin(), Fn.arg_end(),
+ Fn.arg_begin()->getType());
+ break;
+ }
+}
+
+/// ReplaceCallWith - This function is used when we want to lower an intrinsic
+/// call to a call of an external function. This handles hard cases such as
+/// when there was already a prototype for the external function and that
+/// prototype doesn't match the arguments we expect to pass in.
+template <class ArgIt>
+static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI,
+ ArgIt ArgBegin, ArgIt ArgEnd,
+ Type *RetTy) {
+ // If we haven't already looked up this function, check to see if the
+ // program already contains a function with this name.
+ Module *M = CI->getModule();
+ // Get or insert the definition now.
+ std::vector<Type *> ParamTys;
+ for (ArgIt I = ArgBegin; I != ArgEnd; ++I)
+ ParamTys.push_back((*I)->getType());
+ Constant* FCache = M->getOrInsertFunction(NewFn,
+ FunctionType::get(RetTy, ParamTys, false));
+
+ IRBuilder<> Builder(CI->getParent(), CI->getIterator());
+ SmallVector<Value *, 8> Args(ArgBegin, ArgEnd);
+ CallInst *NewCI = Builder.CreateCall(FCache, Args);
+ NewCI->setName(CI->getName());
+ if (!CI->use_empty())
+ CI->replaceAllUsesWith(NewCI);
+ return NewCI;
+}
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+void IntrinsicLowering::AddPrototypes(Module &M) {
+ LLVMContext &Context = M.getContext();
+ for (auto &F : M)
+ if (F.isDeclaration() && !F.use_empty())
+ switch (F.getIntrinsicID()) {
+ default: break;
+ case Intrinsic::setjmp:
+ EnsureFunctionExists(M, "setjmp", F.arg_begin(), F.arg_end(),
+ Type::getInt32Ty(M.getContext()));
+ break;
+ case Intrinsic::longjmp:
+ EnsureFunctionExists(M, "longjmp", F.arg_begin(), F.arg_end(),
+ Type::getVoidTy(M.getContext()));
+ break;
+ case Intrinsic::siglongjmp:
+ EnsureFunctionExists(M, "abort", F.arg_end(), F.arg_end(),
+ Type::getVoidTy(M.getContext()));
+ break;
+ case Intrinsic::memcpy:
+ M.getOrInsertFunction("memcpy",
+ Type::getInt8PtrTy(Context),
+ Type::getInt8PtrTy(Context),
+ Type::getInt8PtrTy(Context),
+ DL.getIntPtrType(Context), nullptr);
+ break;
+ case Intrinsic::memmove:
+ M.getOrInsertFunction("memmove",
+ Type::getInt8PtrTy(Context),
+ Type::getInt8PtrTy(Context),
+ Type::getInt8PtrTy(Context),
+ DL.getIntPtrType(Context), nullptr);
+ break;
+ case Intrinsic::memset:
+ M.getOrInsertFunction("memset",
+ Type::getInt8PtrTy(Context),
+ Type::getInt8PtrTy(Context),
+ Type::getInt32Ty(M.getContext()),
+ DL.getIntPtrType(Context), nullptr);
+ break;
+ case Intrinsic::sqrt:
+ EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl");
+ break;
+ case Intrinsic::sin:
+ EnsureFPIntrinsicsExist(M, F, "sinf", "sin", "sinl");
+ break;
+ case Intrinsic::cos:
+ EnsureFPIntrinsicsExist(M, F, "cosf", "cos", "cosl");
+ break;
+ case Intrinsic::pow:
+ EnsureFPIntrinsicsExist(M, F, "powf", "pow", "powl");
+ break;
+ case Intrinsic::log:
+ EnsureFPIntrinsicsExist(M, F, "logf", "log", "logl");
+ break;
+ case Intrinsic::log2:
+ EnsureFPIntrinsicsExist(M, F, "log2f", "log2", "log2l");
+ break;
+ case Intrinsic::log10:
+ EnsureFPIntrinsicsExist(M, F, "log10f", "log10", "log10l");
+ break;
+ case Intrinsic::exp:
+ EnsureFPIntrinsicsExist(M, F, "expf", "exp", "expl");
+ break;
+ case Intrinsic::exp2:
+ EnsureFPIntrinsicsExist(M, F, "exp2f", "exp2", "exp2l");
+ break;
+ }
+}
+
+/// LowerBSWAP - Emit the code to lower bswap of V before the specified
+/// instruction IP.
+static Value *LowerBSWAP(LLVMContext &Context, Value *V, Instruction *IP) {
+ assert(V->getType()->isIntegerTy() && "Can't bswap a non-integer type!");
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+
+ IRBuilder<> Builder(IP);
+
+ switch(BitSize) {
+ default: llvm_unreachable("Unhandled type size of value to byteswap!");
+ case 16: {
+ Value *Tmp1 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.1");
+ V = Builder.CreateOr(Tmp1, Tmp2, "bswap.i16");
+ break;
+ }
+ case 32: {
+ Value *Tmp4 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.4");
+ Value *Tmp3 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.3");
+ Value *Tmp2 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.2");
+ Value *Tmp1 = Builder.CreateLShr(V,ConstantInt::get(V->getType(), 24),
+ "bswap.1");
+ Tmp3 = Builder.CreateAnd(Tmp3,
+ ConstantInt::get(Type::getInt32Ty(Context), 0xFF0000),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2,
+ ConstantInt::get(Type::getInt32Ty(Context), 0xFF00),
+ "bswap.and2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or1");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or2");
+ V = Builder.CreateOr(Tmp4, Tmp2, "bswap.i32");
+ break;
+ }
+ case 64: {
+ Value *Tmp8 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 56),
+ "bswap.8");
+ Value *Tmp7 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 40),
+ "bswap.7");
+ Value *Tmp6 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 24),
+ "bswap.6");
+ Value *Tmp5 = Builder.CreateShl(V, ConstantInt::get(V->getType(), 8),
+ "bswap.5");
+ Value* Tmp4 = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 8),
+ "bswap.4");
+ Value* Tmp3 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 24),
+ "bswap.3");
+ Value* Tmp2 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 40),
+ "bswap.2");
+ Value* Tmp1 = Builder.CreateLShr(V,
+ ConstantInt::get(V->getType(), 56),
+ "bswap.1");
+ Tmp7 = Builder.CreateAnd(Tmp7,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF000000000000ULL),
+ "bswap.and7");
+ Tmp6 = Builder.CreateAnd(Tmp6,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF0000000000ULL),
+ "bswap.and6");
+ Tmp5 = Builder.CreateAnd(Tmp5,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF00000000ULL),
+ "bswap.and5");
+ Tmp4 = Builder.CreateAnd(Tmp4,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF000000ULL),
+ "bswap.and4");
+ Tmp3 = Builder.CreateAnd(Tmp3,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF0000ULL),
+ "bswap.and3");
+ Tmp2 = Builder.CreateAnd(Tmp2,
+ ConstantInt::get(Type::getInt64Ty(Context),
+ 0xFF00ULL),
+ "bswap.and2");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp7, "bswap.or1");
+ Tmp6 = Builder.CreateOr(Tmp6, Tmp5, "bswap.or2");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp3, "bswap.or3");
+ Tmp2 = Builder.CreateOr(Tmp2, Tmp1, "bswap.or4");
+ Tmp8 = Builder.CreateOr(Tmp8, Tmp6, "bswap.or5");
+ Tmp4 = Builder.CreateOr(Tmp4, Tmp2, "bswap.or6");
+ V = Builder.CreateOr(Tmp8, Tmp4, "bswap.i64");
+ break;
+ }
+ }
+ return V;
+}
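+
+// Editorial note (not part of the upstream source): for the 32-bit case the IR
+// built above is equivalent to the following C++ expression on a uint32_t x:
+//
+//   uint32_t swapped = (x << 24)                |
+//                      ((x << 8) & 0x00FF0000u) |
+//                      ((x >> 8) & 0x0000FF00u) |
+//                      (x >> 24);
+//
+// The 16- and 64-bit cases follow the same shift/mask pattern with two and
+// eight byte lanes respectively.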
+
+/// LowerCTPOP - Emit the code to lower ctpop of V before the specified
+/// instruction IP.
+static Value *LowerCTPOP(LLVMContext &Context, Value *V, Instruction *IP) {
+ assert(V->getType()->isIntegerTy() && "Can't ctpop a non-integer type!");
+
+ static const uint64_t MaskValues[6] = {
+ 0x5555555555555555ULL, 0x3333333333333333ULL,
+ 0x0F0F0F0F0F0F0F0FULL, 0x00FF00FF00FF00FFULL,
+ 0x0000FFFF0000FFFFULL, 0x00000000FFFFFFFFULL
+ };
+
+ IRBuilder<> Builder(IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ unsigned WordSize = (BitSize + 63) / 64;
+ Value *Count = ConstantInt::get(V->getType(), 0);
+
+ for (unsigned n = 0; n < WordSize; ++n) {
+ Value *PartValue = V;
+ for (unsigned i = 1, ct = 0; i < (BitSize>64 ? 64 : BitSize);
+ i <<= 1, ++ct) {
+ Value *MaskCst = ConstantInt::get(V->getType(), MaskValues[ct]);
+ Value *LHS = Builder.CreateAnd(PartValue, MaskCst, "ctpop.and1");
+ Value *VShift = Builder.CreateLShr(PartValue,
+ ConstantInt::get(V->getType(), i),
+ "ctpop.sh");
+ Value *RHS = Builder.CreateAnd(VShift, MaskCst, "ctpop.and2");
+ PartValue = Builder.CreateAdd(LHS, RHS, "ctpop.step");
+ }
+ Count = Builder.CreateAdd(PartValue, Count, "ctpop.part");
+ if (BitSize > 64) {
+ V = Builder.CreateLShr(V, ConstantInt::get(V->getType(), 64),
+ "ctpop.part.sh");
+ BitSize -= 64;
+ }
+ }
+
+ return Count;
+}
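+
+// Editorial note (not part of the upstream source): per 64-bit word this is the
+// classic SWAR population count. In C++ terms, for a uint64_t x:
+//
+//   for (unsigned i = 1, ct = 0; i < 64; i <<= 1, ++ct)
+//     x = (x & MaskValues[ct]) + ((x >> i) & MaskValues[ct]);
+//
+// Adjacent 1-bit fields are summed into 2-bit fields, those into 4-bit fields,
+// and so on until the whole word holds the count. Integers wider than 64 bits
+// are processed 64 bits at a time and the partial counts accumulate in Count.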
+
+/// LowerCTLZ - Emit the code to lower ctlz of V before the specified
+/// instruction IP.
+static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
+
+ IRBuilder<> Builder(IP);
+
+ unsigned BitSize = V->getType()->getPrimitiveSizeInBits();
+ for (unsigned i = 1; i < BitSize; i <<= 1) {
+ Value *ShVal = ConstantInt::get(V->getType(), i);
+ ShVal = Builder.CreateLShr(V, ShVal, "ctlz.sh");
+ V = Builder.CreateOr(V, ShVal, "ctlz.step");
+ }
+
+ V = Builder.CreateNot(V);
+ return LowerCTPOP(Context, V, IP);
+}
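+
+// Editorial note (not part of the upstream source): the loop above smears the
+// highest set bit into every lower position, i.e. for a 32-bit value:
+//
+//   x |= x >> 1; x |= x >> 2; x |= x >> 4; x |= x >> 8; x |= x >> 16;
+//
+// After smearing, the complement has ones only above the original leading bit,
+// so ctlz(V) == ctpop(~smear(V)), which is what the final two lines compute.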
+
+static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
+ const char *Dname,
+ const char *LDname) {
+ CallSite CS(CI);
+ switch (CI->getArgOperand(0)->getType()->getTypeID()) {
+ default: llvm_unreachable("Invalid type in intrinsic");
+ case Type::FloatTyID:
+ ReplaceCallWith(Fname, CI, CS.arg_begin(), CS.arg_end(),
+ Type::getFloatTy(CI->getContext()));
+ break;
+ case Type::DoubleTyID:
+ ReplaceCallWith(Dname, CI, CS.arg_begin(), CS.arg_end(),
+ Type::getDoubleTy(CI->getContext()));
+ break;
+ case Type::X86_FP80TyID:
+ case Type::FP128TyID:
+ case Type::PPC_FP128TyID:
+ ReplaceCallWith(LDname, CI, CS.arg_begin(), CS.arg_end(),
+ CI->getArgOperand(0)->getType());
+ break;
+ }
+}
+
+void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
+ IRBuilder<> Builder(CI);
+ LLVMContext &Context = CI->getContext();
+
+ const Function *Callee = CI->getCalledFunction();
+ assert(Callee && "Cannot lower an indirect call!");
+
+ CallSite CS(CI);
+ switch (Callee->getIntrinsicID()) {
+ case Intrinsic::not_intrinsic:
+ report_fatal_error("Cannot lower a call to a non-intrinsic function '"+
+ Callee->getName() + "'!");
+ default:
+ report_fatal_error("Code generator does not support intrinsic function '"+
+ Callee->getName()+"'!");
+
+ case Intrinsic::expect: {
+ // Just replace __builtin_expect(exp, c) with EXP.
+ Value *V = CI->getArgOperand(0);
+ CI->replaceAllUsesWith(V);
+ break;
+ }
+
+ // The setjmp/longjmp intrinsics should only exist in the code if it was
+ // never optimized (i.e., straight out of the C front end), or if it has been hacked on
+ // by the lowerinvoke pass. In both cases, the right thing to do is to
+ // convert the call to an explicit setjmp or longjmp call.
+ case Intrinsic::setjmp: {
+ Value *V = ReplaceCallWith("setjmp", CI, CS.arg_begin(), CS.arg_end(),
+ Type::getInt32Ty(Context));
+ if (!CI->getType()->isVoidTy())
+ CI->replaceAllUsesWith(V);
+ break;
+ }
+ case Intrinsic::sigsetjmp:
+ if (!CI->getType()->isVoidTy())
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+
+ case Intrinsic::longjmp: {
+ ReplaceCallWith("longjmp", CI, CS.arg_begin(), CS.arg_end(),
+ Type::getVoidTy(Context));
+ break;
+ }
+
+ case Intrinsic::siglongjmp: {
+ // Insert the call to abort
+ ReplaceCallWith("abort", CI, CS.arg_end(), CS.arg_end(),
+ Type::getVoidTy(Context));
+ break;
+ }
+ case Intrinsic::ctpop:
+ CI->replaceAllUsesWith(LowerCTPOP(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::bswap:
+ CI->replaceAllUsesWith(LowerBSWAP(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::ctlz:
+ CI->replaceAllUsesWith(LowerCTLZ(Context, CI->getArgOperand(0), CI));
+ break;
+
+ case Intrinsic::cttz: {
+ // cttz(x) -> ctpop(~X & (X-1))
+ Value *Src = CI->getArgOperand(0);
+ Value *NotSrc = Builder.CreateNot(Src);
+ NotSrc->setName(Src->getName() + ".not");
+ Value *SrcM1 = ConstantInt::get(Src->getType(), 1);
+ SrcM1 = Builder.CreateSub(Src, SrcM1);
+ Src = LowerCTPOP(Context, Builder.CreateAnd(NotSrc, SrcM1), CI);
+ CI->replaceAllUsesWith(Src);
+ break;
+ }
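+
+ // Editorial note (not part of the upstream source): worked example of the
+ // identity above: for x = 0b...01000, x - 1 = 0b...00111 and
+ // ~x & (x - 1) = 0b...00111, whose population count (3) equals the number of
+ // trailing zeros in x. For x == 0 the mask is all ones and the result is the
+ // bit width, matching cttz(0) when zero is not treated as undefined.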
+
+ case Intrinsic::stacksave:
+ case Intrinsic::stackrestore: {
+ if (!Warned)
+ errs() << "WARNING: this target does not support the llvm.stack"
+ << (Callee->getIntrinsicID() == Intrinsic::stacksave ?
+ "save" : "restore") << " intrinsic.\n";
+ Warned = true;
+ if (Callee->getIntrinsicID() == Intrinsic::stacksave)
+ CI->replaceAllUsesWith(Constant::getNullValue(CI->getType()));
+ break;
+ }
+
+ case Intrinsic::get_dynamic_area_offset:
+ errs() << "WARNING: this target does not support the custom llvm.get."
+ "dynamic.area.offset. It is being lowered to a constant 0\n";
+ // Just lower it to a constant 0 because for most targets
+ // @llvm.get.dynamic.area.offset is lowered to zero.
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 0));
+ break;
+ case Intrinsic::returnaddress:
+ case Intrinsic::frameaddress:
+ errs() << "WARNING: this target does not support the llvm."
+ << (Callee->getIntrinsicID() == Intrinsic::returnaddress ?
+ "return" : "frame") << "address intrinsic.\n";
+ CI->replaceAllUsesWith(ConstantPointerNull::get(
+ cast<PointerType>(CI->getType())));
+ break;
+
+ case Intrinsic::prefetch:
+ break; // Simply strip out prefetches on unsupported architectures
+
+ case Intrinsic::pcmarker:
+ break; // Simply strip out pcmarker on unsupported architectures
+ case Intrinsic::readcyclecounter: {
+ errs() << "WARNING: this target does not support the llvm.readcyclecoun"
+ << "ter intrinsic. It is being lowered to a constant 0\n";
+ CI->replaceAllUsesWith(ConstantInt::get(Type::getInt64Ty(Context), 0));
+ break;
+ }
+
+ case Intrinsic::dbg_declare:
+ break; // Simply strip out debugging intrinsics
+
+ case Intrinsic::eh_typeid_for:
+ // Return something different from eh_selector.
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ // Just drop the annotation, but forward the value
+ CI->replaceAllUsesWith(CI->getOperand(0));
+ break;
+
+ case Intrinsic::assume:
+ case Intrinsic::var_annotation:
+ break; // Strip out these intrinsics
+
+ case Intrinsic::memcpy: {
+ Type *IntPtr = DL.getIntPtrType(Context);
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getArgOperand(0);
+ Ops[1] = CI->getArgOperand(1);
+ Ops[2] = Size;
+ ReplaceCallWith("memcpy", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::memmove: {
+ Type *IntPtr = DL.getIntPtrType(Context);
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = CI->getArgOperand(0);
+ Ops[1] = CI->getArgOperand(1);
+ Ops[2] = Size;
+ ReplaceCallWith("memmove", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::memset: {
+ Value *Op0 = CI->getArgOperand(0);
+ Type *IntPtr = DL.getIntPtrType(Op0->getType());
+ Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr,
+ /* isSigned */ false);
+ Value *Ops[3];
+ Ops[0] = Op0;
+ // Extend the i8 fill-value operand to the i32 that the memset() libcall expects.
+ Ops[1] = Builder.CreateIntCast(CI->getArgOperand(1),
+ Type::getInt32Ty(Context),
+ /* isSigned */ false);
+ Ops[2] = Size;
+ ReplaceCallWith("memset", CI, Ops, Ops+3, CI->getArgOperand(0)->getType());
+ break;
+ }
+ case Intrinsic::sqrt: {
+ ReplaceFPIntrinsicWithCall(CI, "sqrtf", "sqrt", "sqrtl");
+ break;
+ }
+ case Intrinsic::log: {
+ ReplaceFPIntrinsicWithCall(CI, "logf", "log", "logl");
+ break;
+ }
+ case Intrinsic::log2: {
+ ReplaceFPIntrinsicWithCall(CI, "log2f", "log2", "log2l");
+ break;
+ }
+ case Intrinsic::log10: {
+ ReplaceFPIntrinsicWithCall(CI, "log10f", "log10", "log10l");
+ break;
+ }
+ case Intrinsic::exp: {
+ ReplaceFPIntrinsicWithCall(CI, "expf", "exp", "expl");
+ break;
+ }
+ case Intrinsic::exp2: {
+ ReplaceFPIntrinsicWithCall(CI, "exp2f", "exp2", "exp2l");
+ break;
+ }
+ case Intrinsic::pow: {
+ ReplaceFPIntrinsicWithCall(CI, "powf", "pow", "powl");
+ break;
+ }
+ case Intrinsic::sin: {
+ ReplaceFPIntrinsicWithCall(CI, "sinf", "sin", "sinl");
+ break;
+ }
+ case Intrinsic::cos: {
+ ReplaceFPIntrinsicWithCall(CI, "cosf", "cos", "cosl");
+ break;
+ }
+ case Intrinsic::floor: {
+ ReplaceFPIntrinsicWithCall(CI, "floorf", "floor", "floorl");
+ break;
+ }
+ case Intrinsic::ceil: {
+ ReplaceFPIntrinsicWithCall(CI, "ceilf", "ceil", "ceill");
+ break;
+ }
+ case Intrinsic::trunc: {
+ ReplaceFPIntrinsicWithCall(CI, "truncf", "trunc", "truncl");
+ break;
+ }
+ case Intrinsic::round: {
+ ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl");
+ break;
+ }
+ case Intrinsic::copysign: {
+ ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl");
+ break;
+ }
+ case Intrinsic::flt_rounds:
+ // Lower to "round to the nearest"
+ if (!CI->getType()->isVoidTy())
+ CI->replaceAllUsesWith(ConstantInt::get(CI->getType(), 1));
+ break;
+ case Intrinsic::invariant_start:
+ case Intrinsic::lifetime_start:
+ // Discard region information.
+ CI->replaceAllUsesWith(UndefValue::get(CI->getType()));
+ break;
+ case Intrinsic::invariant_end:
+ case Intrinsic::lifetime_end:
+ // Discard region information.
+ break;
+ }
+
+ assert(CI->use_empty() &&
+ "Lowering should have eliminated any uses of the intrinsic call!");
+ CI->eraseFromParent();
+}
+
+bool IntrinsicLowering::LowerToByteSwap(CallInst *CI) {
+ // Verify this is a simple bswap.
+ if (CI->getNumArgOperands() != 1 ||
+ CI->getType() != CI->getArgOperand(0)->getType() ||
+ !CI->getType()->isIntegerTy())
+ return false;
+
+ IntegerType *Ty = dyn_cast<IntegerType>(CI->getType());
+ if (!Ty)
+ return false;
+
+ // Okay, we can do this xform, do so now.
+ Module *M = CI->getModule();
+ Constant *Int = Intrinsic::getDeclaration(M, Intrinsic::bswap, Ty);
+
+ Value *Op = CI->getArgOperand(0);
+ Op = CallInst::Create(Int, Op, CI->getName(), CI);
+
+ CI->replaceAllUsesWith(Op);
+ CI->eraseFromParent();
+ return true;
+}
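+
+// Editorial note (not part of the upstream source): LowerToByteSwap works in
+// the opposite direction from LowerBSWAP above. Given a call whose result and
+// single argument share an integer type (for instance a hypothetical
+// "call i32 @byteswap32(i32 %x)" that a target chooses to treat as a byte
+// swap), it rewrites the call into the llvm.bswap intrinsic so the backend can
+// select a native byte-swap instruction.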
diff --git a/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
new file mode 100644
index 000000000000..9eb43d2bec10
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -0,0 +1,311 @@
+//===-- LLVMTargetMachine.cpp - Implement the LLVMTargetMachine class -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LLVMTargetMachine class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/Scalar.h"
+using namespace llvm;
+
+// Enable or disable FastISel. Both options are needed, because
+// FastISel is enabled by default with -fast, and we wish to be
+// able to enable or disable fast-isel independently from -O0.
+static cl::opt<cl::boolOrDefault>
+EnableFastISelOption("fast-isel", cl::Hidden,
+ cl::desc("Enable the \"fast\" instruction selector"));
+
+static cl::opt<bool>
+ EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
+ cl::desc("Enable the \"global\" instruction selector"));
+
+void LLVMTargetMachine::initAsmInfo() {
+ MRI = TheTarget.createMCRegInfo(getTargetTriple().str());
+ MII = TheTarget.createMCInstrInfo();
+ // FIXME: Having an MCSubtargetInfo on the target machine is a hack due
+ // to some backends having subtarget feature dependent module level
+ // code generation. This is similar to the hack in the AsmPrinter for
+ // module level assembly etc.
+ STI = TheTarget.createMCSubtargetInfo(getTargetTriple().str(), getTargetCPU(),
+ getTargetFeatureString());
+
+ MCAsmInfo *TmpAsmInfo =
+ TheTarget.createMCAsmInfo(*MRI, getTargetTriple().str());
+ // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0,
+ // and if the old one gets included then MCAsmInfo will be NULL and
+ // we'll crash later.
+ // Provide the user with a useful error message about what's wrong.
+ assert(TmpAsmInfo && "MCAsmInfo not initialized. "
+ "Make sure you include the correct TargetSelect.h"
+ "and that InitializeAllTargetMCs() is being invoked!");
+
+ if (Options.DisableIntegratedAS)
+ TmpAsmInfo->setUseIntegratedAssembler(false);
+
+ TmpAsmInfo->setPreserveAsmComments(Options.MCOptions.PreserveAsmComments);
+
+ if (Options.CompressDebugSections)
+ TmpAsmInfo->setCompressDebugSections(DebugCompressionType::DCT_ZlibGnu);
+
+ TmpAsmInfo->setRelaxELFRelocations(Options.RelaxELFRelocations);
+
+ if (Options.ExceptionModel != ExceptionHandling::None)
+ TmpAsmInfo->setExceptionsType(Options.ExceptionModel);
+
+ AsmInfo = TmpAsmInfo;
+}
+
+LLVMTargetMachine::LLVMTargetMachine(const Target &T,
+ StringRef DataLayoutString,
+ const Triple &TT, StringRef CPU,
+ StringRef FS, TargetOptions Options,
+ Reloc::Model RM, CodeModel::Model CM,
+ CodeGenOpt::Level OL)
+ : TargetMachine(T, DataLayoutString, TT, CPU, FS, Options) {
+ T.adjustCodeGenOpts(TT, RM, CM);
+ this->RM = RM;
+ this->CMModel = CM;
+ this->OptLevel = OL;
+}
+
+TargetIRAnalysis LLVMTargetMachine::getTargetIRAnalysis() {
+ return TargetIRAnalysis([this](const Function &F) {
+ return TargetTransformInfo(BasicTTIImpl(this, F));
+ });
+}
+
+MachineModuleInfo &
+LLVMTargetMachine::addMachineModuleInfo(PassManagerBase &PM) const {
+ MachineModuleInfo *MMI = new MachineModuleInfo(*getMCAsmInfo(),
+ *getMCRegisterInfo(),
+ getObjFileLowering());
+ PM.add(MMI);
+ return *MMI;
+}
+
+void LLVMTargetMachine::addMachineFunctionAnalysis(PassManagerBase &PM,
+ MachineFunctionInitializer *MFInitializer) const {
+ PM.add(new MachineFunctionAnalysis(*this, MFInitializer));
+}
+
+/// This addPassesTo* helper drives creation and initialization of TargetPassConfig.
+static MCContext *
+addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
+ bool DisableVerify, AnalysisID StartBefore,
+ AnalysisID StartAfter, AnalysisID StopAfter,
+ MachineFunctionInitializer *MFInitializer = nullptr) {
+
+ // When in emulated TLS mode, add the LowerEmuTLS pass.
+ if (TM->Options.EmulatedTLS)
+ PM.add(createLowerEmuTLSPass(TM));
+
+ PM.add(createPreISelIntrinsicLoweringPass());
+
+ // Add internal analysis passes from the target machine.
+ PM.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
+
+ // Targets may override createPassConfig to provide a target-specific
+ // subclass.
+ TargetPassConfig *PassConfig = TM->createPassConfig(PM);
+ PassConfig->setStartStopPasses(StartBefore, StartAfter, StopAfter);
+
+ // Set PassConfig options provided by TargetMachine.
+ PassConfig->setDisableVerify(DisableVerify);
+
+ PM.add(PassConfig);
+
+ PassConfig->addIRPasses();
+
+ PassConfig->addCodeGenPrepare();
+
+ PassConfig->addPassesToHandleExceptions();
+
+ PassConfig->addISelPrepare();
+
+ MachineModuleInfo &MMI = TM->addMachineModuleInfo(PM);
+ TM->addMachineFunctionAnalysis(PM, MFInitializer);
+
+ // Enable FastISel with -fast, but allow that to be overridden.
+ TM->setO0WantsFastISel(EnableFastISelOption != cl::BOU_FALSE);
+ if (EnableFastISelOption == cl::BOU_TRUE ||
+ (TM->getOptLevel() == CodeGenOpt::None &&
+ TM->getO0WantsFastISel()))
+ TM->setFastISel(true);
+
+ // Ask the target for an isel.
+ if (LLVM_UNLIKELY(EnableGlobalISel)) {
+ if (PassConfig->addIRTranslator())
+ return nullptr;
+
+ // Before running the register bank selector, ask the target if it
+ // wants to run some passes.
+ PassConfig->addPreRegBankSelect();
+
+ if (PassConfig->addRegBankSelect())
+ return nullptr;
+
+ } else if (PassConfig->addInstSelector())
+ return nullptr;
+
+ PassConfig->addMachinePasses();
+
+ PassConfig->setInitialized();
+
+ return &MMI.getContext();
+}
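+
+// Editorial note (not part of the upstream source): the pipeline assembled
+// above runs, in order, roughly:
+//
+//   LowerEmuTLS (emulated TLS only) -> PreISelIntrinsicLowering ->
+//   target IR passes (addIRPasses) -> CodeGenPrepare ->
+//   exception-handling preparation -> ISel preparation ->
+//   instruction selection (GlobalISel with -global-isel, otherwise the
+//   SelectionDAG/FastISel selector) -> machine passes (addMachinePasses)
+//
+// A null return tells the callers below that instruction selection could not
+// be set up for this target.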
+
+bool LLVMTargetMachine::addPassesToEmitFile(
+ PassManagerBase &PM, raw_pwrite_stream &Out, CodeGenFileType FileType,
+ bool DisableVerify, AnalysisID StartBefore, AnalysisID StartAfter,
+ AnalysisID StopAfter, MachineFunctionInitializer *MFInitializer) {
+ // Add common CodeGen passes.
+ MCContext *Context =
+ addPassesToGenerateCode(this, PM, DisableVerify, StartBefore, StartAfter,
+ StopAfter, MFInitializer);
+ if (!Context)
+ return true;
+
+ if (StopAfter) {
+ PM.add(createPrintMIRPass(Out));
+ return false;
+ }
+
+ if (Options.MCOptions.MCSaveTempLabels)
+ Context->setAllowTemporaryLabels(false);
+
+ const MCSubtargetInfo &STI = *getMCSubtargetInfo();
+ const MCAsmInfo &MAI = *getMCAsmInfo();
+ const MCRegisterInfo &MRI = *getMCRegisterInfo();
+ const MCInstrInfo &MII = *getMCInstrInfo();
+
+ std::unique_ptr<MCStreamer> AsmStreamer;
+
+ switch (FileType) {
+ case CGFT_AssemblyFile: {
+ MCInstPrinter *InstPrinter = getTarget().createMCInstPrinter(
+ getTargetTriple(), MAI.getAssemblerDialect(), MAI, MII, MRI);
+
+ // Create a code emitter if asked to show the encoding.
+ MCCodeEmitter *MCE = nullptr;
+ if (Options.MCOptions.ShowMCEncoding)
+ MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context);
+
+ MCAsmBackend *MAB =
+ getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU);
+ auto FOut = llvm::make_unique<formatted_raw_ostream>(Out);
+ MCStreamer *S = getTarget().createAsmStreamer(
+ *Context, std::move(FOut), Options.MCOptions.AsmVerbose,
+ Options.MCOptions.MCUseDwarfDirectory, InstPrinter, MCE, MAB,
+ Options.MCOptions.ShowMCInst);
+ AsmStreamer.reset(S);
+ break;
+ }
+ case CGFT_ObjectFile: {
+ // Create the code emitter for the target if it exists. If not, .o file
+ // emission fails.
+ MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(MII, MRI, *Context);
+ MCAsmBackend *MAB =
+ getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU);
+ if (!MCE || !MAB)
+ return true;
+
+ // Don't waste memory on names of temp labels.
+ Context->setUseNamesOnTempLabels(false);
+
+ Triple T(getTargetTriple().str());
+ AsmStreamer.reset(getTarget().createMCObjectStreamer(
+ T, *Context, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll,
+ Options.MCOptions.MCIncrementalLinkerCompatible,
+ /*DWARFMustBeAtTheEnd*/ true));
+ break;
+ }
+ case CGFT_Null:
+ // The Null output is intended for performance analysis and testing,
+ // not for real users.
+ AsmStreamer.reset(getTarget().createNullStreamer(*Context));
+ break;
+ }
+
+ // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+ FunctionPass *Printer =
+ getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
+ if (!Printer)
+ return true;
+
+ PM.add(Printer);
+
+ return false;
+}
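+
+// Editorial note (not part of the upstream source): a minimal sketch of how a
+// client might drive this method, assuming the declaration in TargetMachine.h
+// provides defaults for the optional start/stop-pass arguments:
+//
+//   legacy::PassManager PM;
+//   std::error_code EC;
+//   raw_fd_ostream OS("out.o", EC, sys::fs::F_None);    // hypothetical output
+//   if (TM->addPassesToEmitFile(PM, OS, TargetMachine::CGFT_ObjectFile,
+//                               /*DisableVerify=*/true))
+//     report_fatal_error("target does not support object-file emission");
+//   PM.run(M);                                     // M is the llvm::Module
+//
+// The boolean return follows the convention above: true means failure.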
+
+/// addPassesToEmitMC - Add passes to the specified pass manager to get
+/// machine code emitted with the MCJIT. This method returns true if machine
+/// code is not supported. It fills the MCContext Ctx pointer which can be
+/// used to build a custom MCStreamer.
+///
+bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx,
+ raw_pwrite_stream &Out,
+ bool DisableVerify) {
+ // Add common CodeGen passes.
+ Ctx = addPassesToGenerateCode(this, PM, DisableVerify, nullptr, nullptr,
+ nullptr);
+ if (!Ctx)
+ return true;
+
+ if (Options.MCOptions.MCSaveTempLabels)
+ Ctx->setAllowTemporaryLabels(false);
+
+ // Create the code emitter for the target if it exists. If not, .o file
+ // emission fails.
+ const MCRegisterInfo &MRI = *getMCRegisterInfo();
+ MCCodeEmitter *MCE =
+ getTarget().createMCCodeEmitter(*getMCInstrInfo(), MRI, *Ctx);
+ MCAsmBackend *MAB =
+ getTarget().createMCAsmBackend(MRI, getTargetTriple().str(), TargetCPU);
+ if (!MCE || !MAB)
+ return true;
+
+ const Triple &T = getTargetTriple();
+ const MCSubtargetInfo &STI = *getMCSubtargetInfo();
+ std::unique_ptr<MCStreamer> AsmStreamer(getTarget().createMCObjectStreamer(
+ T, *Ctx, *MAB, Out, MCE, STI, Options.MCOptions.MCRelaxAll,
+ Options.MCOptions.MCIncrementalLinkerCompatible,
+ /*DWARFMustBeAtTheEnd*/ true));
+
+ // Create the AsmPrinter, which takes ownership of AsmStreamer if successful.
+ FunctionPass *Printer =
+ getTarget().createAsmPrinter(*this, std::move(AsmStreamer));
+ if (!Printer)
+ return true;
+
+ PM.add(Printer);
+
+ return false; // success!
+}
diff --git a/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
new file mode 100644
index 000000000000..43218492ed1c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LatencyPriorityQueue.cpp
@@ -0,0 +1,140 @@
+//===---- LatencyPriorityQueue.cpp - A latency-oriented priority queue ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LatencyPriorityQueue class, which is a
+// SchedulingPriorityQueue that schedules using latency information to
+// reduce the length of the critical path through the basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "scheduler"
+
+bool latency_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+ // The isScheduleHigh flag allows nodes with wraparound dependencies that
+ // cannot easily be modeled as edges with latencies to be scheduled as
+ // soon as possible in a top-down schedule.
+ if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+ return false;
+ if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+ return true;
+
+ unsigned LHSNum = LHS->NodeNum;
+ unsigned RHSNum = RHS->NodeNum;
+
+ // The most important heuristic is scheduling the critical path.
+ unsigned LHSLatency = PQ->getLatency(LHSNum);
+ unsigned RHSLatency = PQ->getLatency(RHSNum);
+ if (LHSLatency < RHSLatency) return true;
+ if (LHSLatency > RHSLatency) return false;
+
+ // After that, if two nodes have identical latencies, look to see if one will
+ // unblock more nodes than the other.
+ unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+ unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+ if (LHSBlocked < RHSBlocked) return true;
+ if (LHSBlocked > RHSBlocked) return false;
+
+ // Finally, just to provide a stable ordering, use the node number as a
+ // deciding factor.
+ return RHSNum < LHSNum;
+}
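+
+// Editorial note (not part of the upstream source): read as a priority rule,
+// the comparator above prefers, in decreasing order of importance:
+//   1. nodes flagged isScheduleHigh,
+//   2. nodes with the larger critical-path latency,
+//   3. nodes that are the sole blocker for more successors,
+//   4. the node with the smaller NodeNum (a stable tie-break).
+// Returning true means LHS should rank below RHS, which is the convention the
+// pop() scan at the bottom of this file relies on.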
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *LatencyPriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = nullptr;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit &Pred = *I->getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return nullptr;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+
+ return OnlyAvailablePred;
+}
+
+void LatencyPriorityQueue::push(SUnit *SU) {
+ // Look at all of the successors of this node. Count the number of nodes that
+ // this node is the sole unscheduled node for.
+ unsigned NumNodesBlocking = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+ }
+ NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+
+ Queue.push_back(SU);
+}
+
+
+// scheduledNode - As nodes are scheduled, we look to see if there are any
+// successor nodes that have a single unscheduled predecessor. If so, that
+// single predecessor has a higher priority, since scheduling it will make
+// the node available.
+void LatencyPriorityQueue::scheduledNode(SUnit *SU) {
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ AdjustPriorityOfUnscheduledPreds(I->getSUnit());
+ }
+}
+
+/// AdjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled. If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet. If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void LatencyPriorityQueue::AdjustPriorityOfUnscheduledPreds(SUnit *SU) {
+ if (SU->isAvailable) return; // All preds scheduled.
+
+ SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+ if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable) return;
+
+ // Okay, we found a single predecessor that is available, but not scheduled.
+ // Since it is available, it must be in the priority queue. First remove it.
+ remove(OnlyAvailablePred);
+
+ // Reinsert the node into the priority queue, which recomputes its
+ // NumNodesSolelyBlocking value.
+ push(OnlyAvailablePred);
+}
+
+SUnit *LatencyPriorityQueue::pop() {
+ if (empty()) return nullptr;
+ std::vector<SUnit *>::iterator Best = Queue.begin();
+ for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
+ E = Queue.end(); I != E; ++I)
+ if (Picker(*Best, *I))
+ Best = I;
+ SUnit *V = *Best;
+ if (Best != std::prev(Queue.end()))
+ std::swap(*Best, Queue.back());
+ Queue.pop_back();
+ return V;
+}
+
+void LatencyPriorityQueue::remove(SUnit *SU) {
+ assert(!Queue.empty() && "Queue is empty!");
+ std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(), SU);
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+ Queue.pop_back();
+}
diff --git a/contrib/llvm/lib/CodeGen/LexicalScopes.cpp b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
new file mode 100644
index 000000000000..b810176e6a18
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -0,0 +1,332 @@
+//===- LexicalScopes.cpp - Collecting lexical scope info ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements LexicalScopes analysis.
+//
+// This pass collects lexical scope information and maps machine instructions
+// to respective lexical scopes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "lexicalscopes"
+
+/// reset - Reset the instance so that it's prepared for another function.
+void LexicalScopes::reset() {
+ MF = nullptr;
+ CurrentFnLexicalScope = nullptr;
+ LexicalScopeMap.clear();
+ AbstractScopeMap.clear();
+ InlinedLexicalScopeMap.clear();
+ AbstractScopesList.clear();
+}
+
+/// initialize - Scan the machine function and construct the lexical scope nest.
+void LexicalScopes::initialize(const MachineFunction &Fn) {
+ reset();
+ MF = &Fn;
+ SmallVector<InsnRange, 4> MIRanges;
+ DenseMap<const MachineInstr *, LexicalScope *> MI2ScopeMap;
+ extractLexicalScopes(MIRanges, MI2ScopeMap);
+ if (CurrentFnLexicalScope) {
+ constructScopeNest(CurrentFnLexicalScope);
+ assignInstructionRanges(MIRanges, MI2ScopeMap);
+ }
+}
+
+/// extractLexicalScopes - Extract instruction ranges for each lexical scope
+/// for the given machine function.
+void LexicalScopes::extractLexicalScopes(
+ SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+
+ // Scan each instruction and create scopes. First build a working set of scopes.
+ for (const auto &MBB : *MF) {
+ const MachineInstr *RangeBeginMI = nullptr;
+ const MachineInstr *PrevMI = nullptr;
+ const DILocation *PrevDL = nullptr;
+ for (const auto &MInsn : MBB) {
+ // Check if instruction has valid location information.
+ const DILocation *MIDL = MInsn.getDebugLoc();
+ if (!MIDL) {
+ PrevMI = &MInsn;
+ continue;
+ }
+
+ // If scope has not changed then skip this instruction.
+ if (MIDL == PrevDL) {
+ PrevMI = &MInsn;
+ continue;
+ }
+
+ // Ignore DBG_VALUE. It does not contribute to any instruction in output.
+ if (MInsn.isDebugValue())
+ continue;
+
+ if (RangeBeginMI) {
+ // If we have already seen the beginning of an instruction range and the
+ // current instruction's scope does not match the scope of the first
+ // instruction in that range, close the range here and record it.
+ InsnRange R(RangeBeginMI, PrevMI);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ MIRanges.push_back(R);
+ }
+
+ // This is a beginning of a new instruction range.
+ RangeBeginMI = &MInsn;
+
+ // Reset previous markers.
+ PrevMI = &MInsn;
+ PrevDL = MIDL;
+ }
+
+ // Create last instruction range.
+ if (RangeBeginMI && PrevMI && PrevDL) {
+ InsnRange R(RangeBeginMI, PrevMI);
+ MIRanges.push_back(R);
+ MI2ScopeMap[RangeBeginMI] = getOrCreateLexicalScope(PrevDL);
+ }
+ }
+}
+
+/// findLexicalScope - Find lexical scope, either regular or inlined, for the
+/// given DebugLoc. Return NULL if not found.
+LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
+ DILocalScope *Scope = DL->getScope();
+ if (!Scope)
+ return nullptr;
+
+ // The scope that we were created with could have an extra file, which
+ // isn't what we care about in this case.
+ Scope = Scope->getNonLexicalBlockFileScope();
+
+ if (auto *IA = DL->getInlinedAt()) {
+ auto I = InlinedLexicalScopeMap.find(std::make_pair(Scope, IA));
+ return I != InlinedLexicalScopeMap.end() ? &I->second : nullptr;
+ }
+ return findLexicalScope(Scope);
+}
+
+/// getOrCreateLexicalScope - Find lexical scope for the given DebugLoc. If
+/// not available then create new lexical scope.
+LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
+ const DILocation *IA) {
+ if (IA) {
+ // Create an abstract scope for inlined function.
+ getOrCreateAbstractScope(Scope);
+ // Create an inlined scope for inlined function.
+ return getOrCreateInlinedScope(Scope, IA);
+ }
+
+ return getOrCreateRegularScope(Scope);
+}
+
+/// getOrCreateRegularScope - Find or create a regular lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateRegularScope(const DILocalScope *Scope) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+
+ auto I = LexicalScopeMap.find(Scope);
+ if (I != LexicalScopeMap.end())
+ return &I->second;
+
+ // FIXME: Should the following dyn_cast be DILexicalBlock?
+ LexicalScope *Parent = nullptr;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateLexicalScope(Block->getScope());
+ I = LexicalScopeMap.emplace(std::piecewise_construct,
+ std::forward_as_tuple(Scope),
+ std::forward_as_tuple(Parent, Scope, nullptr,
+ false)).first;
+
+ if (!Parent) {
+ assert(cast<DISubprogram>(Scope)->describes(MF->getFunction()));
+ assert(!CurrentFnLexicalScope);
+ CurrentFnLexicalScope = &I->second;
+ }
+
+ return &I->second;
+}
+
+/// getOrCreateInlinedScope - Find or create an inlined lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
+ const DILocation *InlinedAt) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+ std::pair<const DILocalScope *, const DILocation *> P(Scope, InlinedAt);
+ auto I = InlinedLexicalScopeMap.find(P);
+ if (I != InlinedLexicalScopeMap.end())
+ return &I->second;
+
+ LexicalScope *Parent;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateInlinedScope(Block->getScope(), InlinedAt);
+ else
+ Parent = getOrCreateLexicalScope(InlinedAt);
+
+ I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
+ std::forward_as_tuple(P),
+ std::forward_as_tuple(Parent, Scope,
+ InlinedAt, false))
+ .first;
+ return &I->second;
+}
+
+/// getOrCreateAbstractScope - Find or create an abstract lexical scope.
+LexicalScope *
+LexicalScopes::getOrCreateAbstractScope(const DILocalScope *Scope) {
+ assert(Scope && "Invalid Scope encoding!");
+ Scope = Scope->getNonLexicalBlockFileScope();
+ auto I = AbstractScopeMap.find(Scope);
+ if (I != AbstractScopeMap.end())
+ return &I->second;
+
+ // FIXME: Should the following isa be DILexicalBlock?
+ LexicalScope *Parent = nullptr;
+ if (auto *Block = dyn_cast<DILexicalBlockBase>(Scope))
+ Parent = getOrCreateAbstractScope(Block->getScope());
+
+ I = AbstractScopeMap.emplace(std::piecewise_construct,
+ std::forward_as_tuple(Scope),
+ std::forward_as_tuple(Parent, Scope,
+ nullptr, true)).first;
+ if (isa<DISubprogram>(Scope))
+ AbstractScopesList.push_back(&I->second);
+ return &I->second;
+}
+
+/// constructScopeNest
+void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
+ assert(Scope && "Unable to calculate scope dominance graph!");
+ SmallVector<LexicalScope *, 4> WorkStack;
+ WorkStack.push_back(Scope);
+ unsigned Counter = 0;
+ while (!WorkStack.empty()) {
+ LexicalScope *WS = WorkStack.back();
+ const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
+ bool visitedChildren = false;
+ for (SmallVectorImpl<LexicalScope *>::const_iterator SI = Children.begin(),
+ SE = Children.end();
+ SI != SE; ++SI) {
+ LexicalScope *ChildScope = *SI;
+ if (!ChildScope->getDFSOut()) {
+ WorkStack.push_back(ChildScope);
+ visitedChildren = true;
+ ChildScope->setDFSIn(++Counter);
+ break;
+ }
+ }
+ if (!visitedChildren) {
+ WorkStack.pop_back();
+ WS->setDFSOut(++Counter);
+ }
+ }
+}
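+
+// Editorial note (not part of the upstream source): the walk above is an
+// iterative DFS that numbers the scope tree so a parent's [DFSIn, DFSOut]
+// interval encloses those of its children. For a function scope F with two
+// lexical blocks A and B it assigns, for example:
+//
+//   A: DFSIn = 1, DFSOut = 2;  B: DFSIn = 3, DFSOut = 4;  F: DFSOut = 5
+//
+// (F keeps its default DFSIn of 0.) LexicalScope::dominates can then answer
+// nesting queries by comparing these interval endpoints instead of walking
+// parent links.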
+
+/// assignInstructionRanges - Find ranges of instructions covered by each
+/// lexical scope.
+void LexicalScopes::assignInstructionRanges(
+ SmallVectorImpl<InsnRange> &MIRanges,
+ DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
+
+ LexicalScope *PrevLexicalScope = nullptr;
+ for (SmallVectorImpl<InsnRange>::const_iterator RI = MIRanges.begin(),
+ RE = MIRanges.end();
+ RI != RE; ++RI) {
+ const InsnRange &R = *RI;
+ LexicalScope *S = MI2ScopeMap.lookup(R.first);
+ assert(S && "Lost LexicalScope for a machine instruction!");
+ if (PrevLexicalScope && !PrevLexicalScope->dominates(S))
+ PrevLexicalScope->closeInsnRange(S);
+ S->openInsnRange(R.first);
+ S->extendInsnRange(R.second);
+ PrevLexicalScope = S;
+ }
+
+ if (PrevLexicalScope)
+ PrevLexicalScope->closeInsnRange();
+}
+
+/// getMachineBasicBlocks - Populate given set using machine basic blocks which
+/// have machine instructions that belong to lexical scope identified by
+/// DebugLoc.
+void LexicalScopes::getMachineBasicBlocks(
+ const DILocation *DL, SmallPtrSetImpl<const MachineBasicBlock *> &MBBs) {
+ MBBs.clear();
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return;
+
+ if (Scope == CurrentFnLexicalScope) {
+ for (const auto &MBB : *MF)
+ MBBs.insert(&MBB);
+ return;
+ }
+
+ SmallVectorImpl<InsnRange> &InsnRanges = Scope->getRanges();
+ for (SmallVectorImpl<InsnRange>::iterator I = InsnRanges.begin(),
+ E = InsnRanges.end();
+ I != E; ++I) {
+ InsnRange &R = *I;
+ MBBs.insert(R.first->getParent());
+ }
+}
+
+/// dominates - Return true if DebugLoc's lexical scope dominates at least one
+/// machine instruction's lexical scope in a given machine basic block.
+bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
+ LexicalScope *Scope = getOrCreateLexicalScope(DL);
+ if (!Scope)
+ return false;
+
+ // Current function scope covers all basic blocks in the function.
+ if (Scope == CurrentFnLexicalScope && MBB->getParent() == MF)
+ return true;
+
+ bool Result = false;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+ ++I) {
+ if (const DILocation *IDL = I->getDebugLoc())
+ if (LexicalScope *IScope = getOrCreateLexicalScope(IDL))
+ if (Scope->dominates(IScope))
+ return true;
+ }
+ return Result;
+}
+
+/// dump - Print data structures.
+void LexicalScope::dump(unsigned Indent) const {
+#ifndef NDEBUG
+ raw_ostream &err = dbgs();
+ err.indent(Indent);
+ err << "DFSIn: " << DFSIn << " DFSOut: " << DFSOut << "\n";
+ const MDNode *N = Desc;
+ err.indent(Indent);
+ N->dump();
+ if (AbstractScope)
+ err << std::string(Indent, ' ') << "Abstract Scope\n";
+
+ if (!Children.empty())
+ err << std::string(Indent + 2, ' ') << "Children ...\n";
+ for (unsigned i = 0, e = Children.size(); i != e; ++i)
+ if (Children[i] != this)
+ Children[i]->dump(Indent + 2);
+#endif
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
new file mode 100644
index 000000000000..4ff88d528108
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -0,0 +1,516 @@
+//===------ LiveDebugValues.cpp - Tracking Debug Value MIs ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass implements a data flow analysis that propagates debug location
+/// information by inserting additional DBG_VALUE instructions into the machine
+/// instruction stream. The pass internally builds debug location liveness
+/// ranges to determine the points where additional DBG_VALUEs need to be
+/// inserted.
+///
+/// This is a separate pass from DbgValueHistoryCalculator to facilitate
+/// testing and improve modularity.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <list>
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "live-debug-values"
+
+STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
+
+namespace {
+
+// \brief If @MI is a DBG_VALUE whose debug value is described by a defined
+// register, returns the number of that register; otherwise returns 0.
+static unsigned isDbgValueDescribedByReg(const MachineInstr &MI) {
+ assert(MI.isDebugValue() && "expected a DBG_VALUE");
+ assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
+ // If the location of the variable is described using a register (directly
+ // or indirectly), that register is always the first operand.
+ return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : 0;
+}
+
+class LiveDebugValues : public MachineFunctionPass {
+
+private:
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+
+ /// Based on std::pair so it can be used as an index into a DenseMap.
+ typedef std::pair<const DILocalVariable *, const DILocation *>
+ DebugVariableBase;
+ /// A potentially inlined instance of a variable.
+ struct DebugVariable : public DebugVariableBase {
+ DebugVariable(const DILocalVariable *Var, const DILocation *InlinedAt)
+ : DebugVariableBase(Var, InlinedAt) {}
+
+ const DILocalVariable *getVar() const { return this->first; };
+ const DILocation *getInlinedAt() const { return this->second; };
+
+ bool operator<(const DebugVariable &DV) const {
+ if (getVar() == DV.getVar())
+ return getInlinedAt() < DV.getInlinedAt();
+ return getVar() < DV.getVar();
+ }
+ };
+
+ /// A pair of debug variable and value location.
+ struct VarLoc {
+ const DebugVariable Var;
+ const MachineInstr &MI; ///< Only used for cloning a new DBG_VALUE.
+
+ enum { InvalidKind = 0, RegisterKind } Kind;
+
+ /// The value location. Stored separately to avoid repeatedly
+ /// extracting it from MI.
+ union {
+ struct {
+ uint32_t RegNo;
+ uint32_t Offset;
+ } RegisterLoc;
+ uint64_t Hash;
+ } Loc;
+
+ VarLoc(const MachineInstr &MI)
+ : Var(MI.getDebugVariable(), MI.getDebugLoc()->getInlinedAt()), MI(MI),
+ Kind(InvalidKind) {
+ static_assert((sizeof(Loc) == sizeof(uint64_t)),
+ "hash does not cover all members of Loc");
+ assert(MI.isDebugValue() && "not a DBG_VALUE");
+ assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
+ if (int RegNo = isDbgValueDescribedByReg(MI)) {
+ Kind = RegisterKind;
+ Loc.RegisterLoc.RegNo = RegNo;
+ uint64_t Offset =
+ MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;
+ // We don't support offsets larger than 4GiB here. They are
+ // slated to be replaced with DIExpressions anyway.
+ if (Offset >= (1ULL << 32))
+ Kind = InvalidKind;
+ else
+ Loc.RegisterLoc.Offset = Offset;
+ }
+ }
+
+ /// If this variable is described by a register, return it,
+ /// otherwise return 0.
+ unsigned isDescribedByReg() const {
+ if (Kind == RegisterKind)
+ return Loc.RegisterLoc.RegNo;
+ return 0;
+ }
+
+ void dump() const { MI.dump(); }
+
+ bool operator==(const VarLoc &Other) const {
+ return Var == Other.Var && Loc.Hash == Other.Loc.Hash;
+ }
+
+ /// This operator guarantees that VarLocs are sorted by Variable first.
+ bool operator<(const VarLoc &Other) const {
+ if (Var == Other.Var)
+ return Loc.Hash < Other.Loc.Hash;
+ return Var < Other.Var;
+ }
+ };
+
+ typedef UniqueVector<VarLoc> VarLocMap;
+ typedef SparseBitVector<> VarLocSet;
+ typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB;
+
+ /// This holds the working set of currently open ranges. For fast
+ /// access, this is done both as a set of VarLocIDs, and a map of
+ /// DebugVariable to recent VarLocID. Note that a DBG_VALUE ends all
+ /// previous open ranges for the same variable.
+ class OpenRangesSet {
+ VarLocSet VarLocs;
+ SmallDenseMap<DebugVariableBase, unsigned, 8> Vars;
+
+ public:
+ const VarLocSet &getVarLocs() const { return VarLocs; }
+
+ /// Terminate all open ranges for Var by removing it from the set.
+ void erase(DebugVariable Var) {
+ auto It = Vars.find(Var);
+ if (It != Vars.end()) {
+ unsigned ID = It->second;
+ VarLocs.reset(ID);
+ Vars.erase(It);
+ }
+ }
+
+ /// Terminate all open ranges listed in \c KillSet by removing
+ /// them from the set.
+ void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs) {
+ VarLocs.intersectWithComplement(KillSet);
+ for (unsigned ID : KillSet)
+ Vars.erase(VarLocIDs[ID].Var);
+ }
+
+ /// Insert a new range into the set.
+ void insert(unsigned VarLocID, DebugVariableBase Var) {
+ VarLocs.set(VarLocID);
+ Vars.insert({Var, VarLocID});
+ }
+
+ /// Empty the set.
+ void clear() {
+ VarLocs.clear();
+ Vars.clear();
+ }
+
+ /// Return whether the set is empty or not.
+ bool empty() const {
+ assert(Vars.empty() == VarLocs.empty() && "open ranges are inconsistent");
+ return VarLocs.empty();
+ }
+ };
+
+ void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs);
+ void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ const VarLocMap &VarLocIDs);
+ bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
+ bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs);
+
+ bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
+ const VarLocMap &VarLocIDs);
+
+ bool ExtendRanges(MachineFunction &MF);
+
+public:
+ static char ID;
+
+ /// Default construct and initialize the pass.
+ LiveDebugValues();
+
+ /// Tell the pass manager which passes we depend on and what
+ /// information we preserve.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ /// Print to ostream with a message.
+ void printVarLocInMBB(const MachineFunction &MF, const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs, const char *msg,
+ raw_ostream &Out) const;
+
+ /// Calculate the liveness information for the given machine function.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation
+//===----------------------------------------------------------------------===//
+
+char LiveDebugValues::ID = 0;
+char &llvm::LiveDebugValuesID = LiveDebugValues::ID;
+INITIALIZE_PASS(LiveDebugValues, "livedebugvalues", "Live DEBUG_VALUE analysis",
+ false, false)
+
+/// Default construct and initialize the pass.
+LiveDebugValues::LiveDebugValues() : MachineFunctionPass(ID) {
+ initializeLiveDebugValuesPass(*PassRegistry::getPassRegistry());
+}
+
+/// Tell the pass manager which passes we depend on and what information we
+/// preserve.
+void LiveDebugValues::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+//===----------------------------------------------------------------------===//
+// Debug Range Extension Implementation
+//===----------------------------------------------------------------------===//
+
+void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
+ const VarLocInMBB &V,
+ const VarLocMap &VarLocIDs,
+ const char *msg,
+ raw_ostream &Out) const {
+ for (const MachineBasicBlock &BB : MF) {
+ const auto &L = V.lookup(&BB);
+ Out << "MBB: " << BB.getName() << ":\n";
+ for (unsigned VLL : L) {
+ const VarLoc &VL = VarLocIDs[VLL];
+ Out << " Var: " << VL.Var.getVar()->getName();
+ Out << " MI: ";
+ VL.dump();
+ Out << "\n";
+ }
+ }
+ Out << "\n";
+}
+
+/// End all previous ranges related to @MI and start a new range from @MI
+/// if it is a DBG_VALUE instr.
+void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocMap &VarLocIDs) {
+ if (!MI.isDebugValue())
+ return;
+ const DILocalVariable *Var = MI.getDebugVariable();
+ const DILocation *DebugLoc = MI.getDebugLoc();
+ const DILocation *InlinedAt = DebugLoc->getInlinedAt();
+ assert(Var->isValidLocationForIntrinsic(DebugLoc) &&
+ "Expected inlined-at fields to agree");
+
+ // End all previous ranges of Var.
+ DebugVariable V(Var, InlinedAt);
+ OpenRanges.erase(V);
+
+ // Add the VarLoc to OpenRanges from this DBG_VALUE.
+ // TODO: Currently handles DBG_VALUE which has only reg as location.
+ if (isDbgValueDescribedByReg(MI)) {
+ VarLoc VL(MI);
+ unsigned ID = VarLocIDs.insert(VL);
+ OpenRanges.insert(ID, VL.Var);
+ }
+}
+
+/// A definition of a register may mark the end of a range.
+void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ const VarLocMap &VarLocIDs) {
+ MachineFunction *MF = MI.getParent()->getParent();
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ SparseBitVector<> KillSet;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() &&
+ TRI->isPhysicalRegister(MO.getReg())) {
+ // Remove ranges of all aliased registers.
+ for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
+ for (unsigned ID : OpenRanges.getVarLocs())
+ if (VarLocIDs[ID].isDescribedByReg() == *RAI)
+ KillSet.set(ID);
+ } else if (MO.isRegMask()) {
+ // Remove ranges of all clobbered registers. Register masks don't usually
+ // list SP as preserved. While the debug info may be off for an
+ // instruction or two around callee-cleanup calls, transferring the
+ // DEBUG_VALUE across the call is still a better user experience.
+ for (unsigned ID : OpenRanges.getVarLocs()) {
+ unsigned Reg = VarLocIDs[ID].isDescribedByReg();
+ if (Reg && Reg != SP && MO.clobbersPhysReg(Reg))
+ KillSet.set(ID);
+ }
+ }
+ }
+ OpenRanges.erase(KillSet, VarLocIDs);
+}
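+
+// Editorial note (not part of the upstream source): as an example of the alias
+// handling above, on an x86-64 target a def of the sub-register EAX would also
+// close an open range whose location is RAX, because MCRegAliasIterator
+// enumerates every register overlapping the defined one (the register names
+// here are purely illustrative).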
+
+/// Terminate all open ranges at the end of the current basic block.
+bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
+ OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs,
+ const VarLocMap &VarLocIDs) {
+ bool Changed = false;
+ const MachineBasicBlock *CurMBB = MI.getParent();
+ if (!(MI.isTerminator() || (&MI == &CurMBB->instr_back())))
+ return false;
+
+ if (OpenRanges.empty())
+ return false;
+
+ DEBUG(for (unsigned ID : OpenRanges.getVarLocs()) {
+ // Copy OpenRanges to OutLocs, if not already present.
+ dbgs() << "Add to OutLocs: "; VarLocIDs[ID].dump();
+ });
+ VarLocSet &VLS = OutLocs[CurMBB];
+ Changed = VLS |= OpenRanges.getVarLocs();
+ OpenRanges.clear();
+ return Changed;
+}
+
+/// This routine creates OpenRanges and OutLocs.
+bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
+ VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) {
+ bool Changed = false;
+ transferDebugValue(MI, OpenRanges, VarLocIDs);
+ transferRegisterDef(MI, OpenRanges, VarLocIDs);
+ Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
+ return Changed;
+}
+
+/// This routine joins the analysis results from all incoming edges of @MBB by
+/// inserting a new DBG_VALUE instruction at the start of @MBB whenever the same
+/// source variable resides in the same location in all predecessors of @MBB.
+bool LiveDebugValues::join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs,
+ VarLocInMBB &InLocs, const VarLocMap &VarLocIDs) {
+ DEBUG(dbgs() << "join MBB: " << MBB.getName() << "\n");
+ bool Changed = false;
+
+ VarLocSet InLocsT; // Temporary incoming locations.
+
+ // For all predecessors of this MBB, find the set of VarLocs that
+ // can be joined.
+ for (auto p : MBB.predecessors()) {
+ auto OL = OutLocs.find(p);
+ // The join is empty if any predecessor has no OutLocs yet; bail out.
+ if (OL == OutLocs.end())
+ return false;
+
+ // Just copy over the Out locs to incoming locs for the first predecessor.
+ if (p == *MBB.pred_begin()) {
+ InLocsT = OL->second;
+ continue;
+ }
+ // Join with this predecessor.
+ InLocsT &= OL->second;
+ }
+
+ if (InLocsT.empty())
+ return false;
+
+ VarLocSet &ILS = InLocs[&MBB];
+
+ // Insert DBG_VALUE instructions, if not already inserted.
+ VarLocSet Diff = InLocsT;
+ Diff.intersectWithComplement(ILS);
+ for (auto ID : Diff) {
+ // This VarLoc is not found in InLocs, i.e. it has not been inserted yet. So a
+ // new range is started for the variable from the beginning of the MBB by
+ // inserting a new DBG_VALUE. transfer() will end this range as appropriate.
+ const VarLoc &DiffIt = VarLocIDs[ID];
+ const MachineInstr *DMI = &DiffIt.MI;
+ MachineInstr *MI =
+ BuildMI(MBB, MBB.instr_begin(), DMI->getDebugLoc(), DMI->getDesc(),
+ DMI->isIndirectDebugValue(), DMI->getOperand(0).getReg(), 0,
+ DMI->getDebugVariable(), DMI->getDebugExpression());
+ if (DMI->isIndirectDebugValue())
+ MI->getOperand(1).setImm(DMI->getOperand(1).getImm());
+ DEBUG(dbgs() << "Inserted: "; MI->dump(););
+ ILS.set(ID);
+ ++NumInserted;
+ Changed = true;
+ }
+ return Changed;
+}
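+
+// For illustration only: the set algebra above boils down to a meet
+// (intersection) across predecessors followed by "what is agreed on but not
+// yet installed". A standalone sketch with std::bitset standing in for the
+// SparseBitVector-backed VarLocSet (PredA, PredB and Installed are made-up
+// names):
+//
+//   #include <bitset>
+//
+//   std::bitset<8> PredA("01101100"), PredB("01001110"); // preds' OutLocs
+//   std::bitset<8> Installed("00001000");   // IDs already recorded in ILS
+//   std::bitset<8> Meet = PredA & PredB;    // InLocsT &= OL->second
+//   std::bitset<8> ToInsert = Meet & ~Installed; // intersectWithComplement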
+
+/// Calculate the liveness information for the given machine function and
+/// extend ranges across basic blocks.
+bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
+
+ DEBUG(dbgs() << "\nDebug Range Extension\n");
+
+ bool Changed = false;
+ bool OLChanged = false;
+ bool MBBJoined = false;
+
+ VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
+ OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
+ VarLocInMBB OutLocs; // Ranges that exist beyond bb.
+ VarLocInMBB InLocs; // Ranges that are incoming after joining.
+
+ DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
+ DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Worklist;
+ std::priority_queue<unsigned int, std::vector<unsigned int>,
+ std::greater<unsigned int>>
+ Pending;
+
+ // Initialize every mbb with OutLocs.
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
+ dbgs()));
+
+ ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+ unsigned int RPONumber = 0;
+ for (auto RI = RPOT.begin(), RE = RPOT.end(); RI != RE; ++RI) {
+ OrderToBB[RPONumber] = *RI;
+ BBToOrder[*RI] = RPONumber;
+ Worklist.push(RPONumber);
+ ++RPONumber;
+ }
+ // This is a standard "union of predecessor outs" dataflow problem.
+ // To solve it, we perform join() and transfer() using the two worklist method
+ // until the ranges converge.
+ // Ranges have converged when both worklists are empty.
+ while (!Worklist.empty() || !Pending.empty()) {
+ // We track what is on the pending worklist to avoid inserting the same
+ // thing twice. We could avoid this with a custom priority queue, but this
+ // is probably not worth it.
+ SmallPtrSet<MachineBasicBlock *, 16> OnPending;
+ while (!Worklist.empty()) {
+ MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
+ Worklist.pop();
+ MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs);
+
+ if (MBBJoined) {
+ MBBJoined = false;
+ Changed = true;
+ for (auto &MI : *MBB)
+ OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
+ "OutLocs after propagating", dbgs()));
+ DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs,
+ "InLocs after propagating", dbgs()));
+
+ if (OLChanged) {
+ OLChanged = false;
+ for (auto s : MBB->successors())
+ if (OnPending.insert(s).second) {
+ Pending.push(BBToOrder[s]);
+ }
+ }
+ }
+ }
+ Worklist.swap(Pending);
+    // At this point, Pending must be empty, since it was just swapped with
+    // the drained Worklist above.
+ assert(Pending.empty() && "Pending should be empty");
+ }
+
+ DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
+ DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
+ return Changed;
+}
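+
+// For illustration only: ExtendRanges is an instance of the generic
+// two-worklist, reverse-post-order dataflow loop. A self-contained sketch of
+// just the scheduling skeleton (solveTwoWorklists, JoinAndTransfer and Succs
+// are made-up names; blocks are identified by their RPO number):
+//
+//   #include <functional>
+//   #include <queue>
+//   #include <set>
+//   #include <vector>
+//
+//   void solveTwoWorklists(
+//       unsigned NumBlocks,
+//       const std::function<bool(unsigned)> &JoinAndTransfer,
+//       const std::function<std::vector<unsigned>(unsigned)> &Succs) {
+//     std::priority_queue<unsigned, std::vector<unsigned>,
+//                         std::greater<unsigned>> Worklist, Pending;
+//     for (unsigned RPO = 0; RPO != NumBlocks; ++RPO)
+//       Worklist.push(RPO);                 // seed with every block
+//     while (!Worklist.empty() || !Pending.empty()) {
+//       std::set<unsigned> OnPending;       // don't queue a block twice
+//       while (!Worklist.empty()) {
+//         unsigned BB = Worklist.top();
+//         Worklist.pop();
+//         if (JoinAndTransfer(BB))          // out-set changed: revisit succs
+//           for (unsigned S : Succs(BB))
+//             if (OnPending.insert(S).second)
+//               Pending.push(S);
+//       }
+//       Worklist.swap(Pending);             // Pending is drained afterwards
+//     }
+//   }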
+
+bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ bool Changed = false;
+
+ Changed |= ExtendRanges(MF);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
new file mode 100644
index 000000000000..966b4f1f4e4d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -0,0 +1,1047 @@
+//===- LiveDebugVariables.cpp - Tracking debug info variables -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveDebugVariables analysis.
+//
+// Remove all DBG_VALUE instructions referencing virtual registers and replace
+// them with a data structure tracking where live user variables are kept - in a
+// virtual register or in a stack slot.
+//
+// Allow the data structure to be updated during register allocation when values
+// are moved between registers and stack slots. Finally emit new DBG_VALUE
+// instructions after register allocation is complete.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "livedebug"
+
+static cl::opt<bool>
+EnableLDV("live-debug-variables", cl::init(true),
+ cl::desc("Enable the live debug variables pass"), cl::Hidden);
+
+STATISTIC(NumInsertedDebugValues, "Number of DBG_VALUEs inserted");
+char LiveDebugVariables::ID = 0;
+
+INITIALIZE_PASS_BEGIN(LiveDebugVariables, "livedebugvars",
+ "Debug Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(LiveDebugVariables, "livedebugvars",
+ "Debug Variable Analysis", false, false)
+
+void LiveDebugVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequiredTransitive<LiveIntervals>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID), pImpl(nullptr) {
+ initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+}
+
+/// LocMap - Map of where a user value is live, and its location.
+typedef IntervalMap<SlotIndex, unsigned, 4> LocMap;
+
+namespace {
+/// UserValueScopes - Keeps track of lexical scopes associated with a
+/// user value's source location.
+class UserValueScopes {
+ DebugLoc DL;
+ LexicalScopes &LS;
+ SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;
+
+public:
+ UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(std::move(D)), LS(L) {}
+
+ /// dominates - Return true if current scope dominates at least one machine
+ /// instruction in a given machine basic block.
+ bool dominates(MachineBasicBlock *MBB) {
+ if (LBlocks.empty())
+ LS.getMachineBasicBlocks(DL, LBlocks);
+ return LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB);
+ }
+};
+} // end anonymous namespace
+
+/// UserValue - A user value is a part of a debug info user variable.
+///
+/// A DBG_VALUE instruction notes that (a sub-register of) a virtual register
+/// holds part of a user variable. The part is identified by a byte offset.
+///
+/// UserValues are grouped into equivalence classes for easier searching. Two
+/// user values are related if they refer to the same variable, or if they are
+/// held by the same virtual register. The equivalence class is the transitive
+/// closure of that relation.
+namespace {
+class LDVImpl;
+class UserValue {
+ const MDNode *Variable; ///< The debug info variable we are part of.
+ const MDNode *Expression; ///< Any complex address expression.
+ unsigned offset; ///< Byte offset into variable.
+ bool IsIndirect; ///< true if this is a register-indirect+offset value.
+ DebugLoc dl; ///< The debug location for the variable. This is
+ ///< used by dwarf writer to find lexical scope.
+ UserValue *leader; ///< Equivalence class leader.
+ UserValue *next; ///< Next value in equivalence class, or null.
+
+ /// Numbered locations referenced by locmap.
+ SmallVector<MachineOperand, 4> locations;
+
+ /// Map of slot indices where this value is live.
+ LocMap locInts;
+
+ /// coalesceLocation - After LocNo was changed, check if it has become
+ /// identical to another location, and coalesce them. This may cause LocNo or
+ /// a later location to be erased, but no earlier location will be erased.
+ void coalesceLocation(unsigned LocNo);
+
+ /// insertDebugValue - Insert a DBG_VALUE into MBB at Idx for LocNo.
+ void insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx, unsigned LocNo,
+ LiveIntervals &LIS, const TargetInstrInfo &TII);
+
+ /// splitLocation - Replace OldLocNo ranges with NewRegs ranges where NewRegs
+ /// is live. Returns true if any changes were made.
+ bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
+
+public:
+ /// UserValue - Create a new UserValue.
+ UserValue(const MDNode *var, const MDNode *expr, unsigned o, bool i,
+ DebugLoc L, LocMap::Allocator &alloc)
+ : Variable(var), Expression(expr), offset(o), IsIndirect(i),
+ dl(std::move(L)), leader(this), next(nullptr), locInts(alloc) {}
+
+ /// getLeader - Get the leader of this value's equivalence class.
+ UserValue *getLeader() {
+ UserValue *l = leader;
+ while (l != l->leader)
+ l = l->leader;
+ return leader = l;
+ }
+
+ /// getNext - Return the next UserValue in the equivalence class.
+ UserValue *getNext() const { return next; }
+
+ /// match - Does this UserValue match the parameters?
+ bool match(const MDNode *Var, const MDNode *Expr, const DILocation *IA,
+ unsigned Offset, bool indirect) const {
+ return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA &&
+ Offset == offset && indirect == IsIndirect;
+ }
+
+ /// merge - Merge equivalence classes.
+ static UserValue *merge(UserValue *L1, UserValue *L2) {
+ L2 = L2->getLeader();
+ if (!L1)
+ return L2;
+ L1 = L1->getLeader();
+ if (L1 == L2)
+ return L1;
+ // Splice L2 before L1's members.
+ UserValue *End = L2;
+ while (End->next) {
+ End->leader = L1;
+ End = End->next;
+ }
+ End->leader = L1;
+ End->next = L1->next;
+ L1->next = L2;
+ return L1;
+ }
+
+ /// getLocationNo - Return the location number that matches Loc.
+ unsigned getLocationNo(const MachineOperand &LocMO) {
+ if (LocMO.isReg()) {
+ if (LocMO.getReg() == 0)
+ return ~0u;
+      // For register locations we don't care about use/def and other flags.
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (locations[i].isReg() &&
+ locations[i].getReg() == LocMO.getReg() &&
+ locations[i].getSubReg() == LocMO.getSubReg())
+ return i;
+ } else
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (LocMO.isIdenticalTo(locations[i]))
+ return i;
+ locations.push_back(LocMO);
+ // We are storing a MachineOperand outside a MachineInstr.
+ locations.back().clearParent();
+ // Don't store def operands.
+ if (locations.back().isReg())
+ locations.back().setIsUse();
+ return locations.size() - 1;
+ }
+
+ /// mapVirtRegs - Ensure that all virtual register locations are mapped.
+ void mapVirtRegs(LDVImpl *LDV);
+
+ /// addDef - Add a definition point to this value.
+ void addDef(SlotIndex Idx, const MachineOperand &LocMO) {
+ // Add a singular (Idx,Idx) -> Loc mapping.
+ LocMap::iterator I = locInts.find(Idx);
+ if (!I.valid() || I.start() != Idx)
+ I.insert(Idx, Idx.getNextSlot(), getLocationNo(LocMO));
+ else
+ // A later DBG_VALUE at the same SlotIndex overrides the old location.
+ I.setValue(getLocationNo(LocMO));
+ }
+
+ /// extendDef - Extend the current definition as far as possible down the
+ /// dominator tree. Stop when meeting an existing def or when leaving the live
+ /// range of VNI.
+ /// End points where VNI is no longer live are added to Kills.
+ /// @param Idx Starting point for the definition.
+ /// @param LocNo Location number to propagate.
+ /// @param LR Restrict liveness to where LR has the value VNI. May be null.
+ /// @param VNI When LR is not null, this is the value to restrict to.
+ /// @param Kills Append end points of VNI's live range to Kills.
+ /// @param LIS Live intervals analysis.
+ /// @param MDT Dominator tree.
+ void extendDef(SlotIndex Idx, unsigned LocNo,
+ LiveRange *LR, const VNInfo *VNI,
+ SmallVectorImpl<SlotIndex> *Kills,
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS);
+
+  /// addDefsFromCopies - The value in LI/LocNo may be copied to other
+ /// registers. Determine if any of the copies are available at the kill
+ /// points, and add defs if possible.
+ /// @param LI Scan for copies of the value in LI->reg.
+ /// @param LocNo Location number of LI->reg.
+ /// @param Kills Points where the range of LocNo could be extended.
+ /// @param NewDefs Append (Idx, LocNo) of inserted defs here.
+ void addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
+ const SmallVectorImpl<SlotIndex> &Kills,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> > &NewDefs,
+ MachineRegisterInfo &MRI,
+ LiveIntervals &LIS);
+
+ /// computeIntervals - Compute the live intervals of all locations after
+ /// collecting all their def points.
+ void computeIntervals(MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS);
+
+ /// splitRegister - Replace OldReg ranges with NewRegs ranges where NewRegs is
+ /// live. Returns true if any changes were made.
+  bool splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
+
+ /// rewriteLocations - Rewrite virtual register locations according to the
+ /// provided virtual register map.
+ void rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI);
+
+ /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+ void emitDebugValues(VirtRegMap *VRM,
+                       LiveIntervals &LIS, const TargetInstrInfo &TII);
+
+ /// getDebugLoc - Return DebugLoc of this UserValue.
+  DebugLoc getDebugLoc() { return dl; }
+ void print(raw_ostream &, const TargetRegisterInfo *);
+};
+} // namespace
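+
+// For illustration only: the leader/next fields above implement a union-find
+// over UserValues, with path compression in getLeader() and list splicing in
+// merge(). The core lookup, with a made-up Node type:
+//
+//   struct Node {
+//     Node *Leader = this;  // equivalence-class representative
+//     Node *Next = nullptr; // chain of class members
+//   };
+//
+//   Node *getLeader(Node *N) {
+//     Node *L = N->Leader;
+//     while (L != L->Leader)
+//       L = L->Leader;      // walk up to the representative
+//     return N->Leader = L; // path compression for the next query
+//   }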
+
+/// LDVImpl - Implementation of the LiveDebugVariables pass.
+namespace {
+class LDVImpl {
+ LiveDebugVariables &pass;
+ LocMap::Allocator allocator;
+ MachineFunction *MF;
+ LiveIntervals *LIS;
+ LexicalScopes LS;
+ MachineDominatorTree *MDT;
+ const TargetRegisterInfo *TRI;
+
+ /// Whether emitDebugValues is called.
+ bool EmitDone;
+ /// Whether the machine function is modified during the pass.
+ bool ModifiedMF;
+
+ /// userValues - All allocated UserValue instances.
+ SmallVector<std::unique_ptr<UserValue>, 8> userValues;
+
+ /// Map virtual register to eq class leader.
+ typedef DenseMap<unsigned, UserValue*> VRMap;
+ VRMap virtRegToEqClass;
+
+ /// Map user variable to eq class leader.
+ typedef DenseMap<const MDNode *, UserValue*> UVMap;
+ UVMap userVarMap;
+
+ /// getUserValue - Find or create a UserValue.
+ UserValue *getUserValue(const MDNode *Var, const MDNode *Expr,
+ unsigned Offset, bool IsIndirect, const DebugLoc &DL);
+
+ /// lookupVirtReg - Find the EC leader for VirtReg or null.
+ UserValue *lookupVirtReg(unsigned VirtReg);
+
+ /// handleDebugValue - Add DBG_VALUE instruction to our maps.
+ /// @param MI DBG_VALUE instruction
+  /// @param Idx   Last valid SlotIndex before instruction.
+ /// @return True if the DBG_VALUE instruction should be deleted.
+ bool handleDebugValue(MachineInstr &MI, SlotIndex Idx);
+
+ /// collectDebugValues - Collect and erase all DBG_VALUE instructions, adding
+ /// a UserValue def for each instruction.
+ /// @param mf MachineFunction to be scanned.
+ /// @return True if any debug values were found.
+ bool collectDebugValues(MachineFunction &mf);
+
+ /// computeIntervals - Compute the live intervals of all user values after
+ /// collecting all their def points.
+ void computeIntervals();
+
+public:
+ LDVImpl(LiveDebugVariables *ps)
+ : pass(*ps), MF(nullptr), EmitDone(false), ModifiedMF(false) {}
+ bool runOnMachineFunction(MachineFunction &mf);
+
+ /// clear - Release all memory.
+ void clear() {
+ MF = nullptr;
+ userValues.clear();
+ virtRegToEqClass.clear();
+ userVarMap.clear();
+ // Make sure we call emitDebugValues if the machine function was modified.
+ assert((!ModifiedMF || EmitDone) &&
+ "Dbg values are not emitted in LDV");
+ EmitDone = false;
+ ModifiedMF = false;
+ LS.reset();
+ }
+
+ /// mapVirtReg - Map virtual register to an equivalence class.
+ void mapVirtReg(unsigned VirtReg, UserValue *EC);
+
+ /// splitRegister - Replace all references to OldReg with NewRegs.
+ void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
+
+ /// emitDebugValues - Recreate DBG_VALUE instruction from data structures.
+ void emitDebugValues(VirtRegMap *VRM);
+
+ void print(raw_ostream&);
+};
+} // namespace
+
+static void printDebugLoc(const DebugLoc &DL, raw_ostream &CommentOS,
+ const LLVMContext &Ctx) {
+ if (!DL)
+ return;
+
+ auto *Scope = cast<DIScope>(DL.getScope());
+ // Omit the directory, because it's likely to be long and uninteresting.
+ CommentOS << Scope->getFilename();
+ CommentOS << ':' << DL.getLine();
+ if (DL.getCol() != 0)
+ CommentOS << ':' << DL.getCol();
+
+ DebugLoc InlinedAtDL = DL.getInlinedAt();
+ if (!InlinedAtDL)
+ return;
+
+ CommentOS << " @[ ";
+ printDebugLoc(InlinedAtDL, CommentOS, Ctx);
+ CommentOS << " ]";
+}
+
+static void printExtendedName(raw_ostream &OS, const DILocalVariable *V,
+ const DILocation *DL) {
+ const LLVMContext &Ctx = V->getContext();
+ StringRef Res = V->getName();
+ if (!Res.empty())
+ OS << Res << "," << V->getLine();
+ if (auto *InlinedAt = DL->getInlinedAt()) {
+ if (DebugLoc InlinedAtDL = InlinedAt) {
+ OS << " @[";
+ printDebugLoc(InlinedAtDL, OS, Ctx);
+ OS << "]";
+ }
+ }
+}
+
+void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+ auto *DV = cast<DILocalVariable>(Variable);
+ OS << "!\"";
+ printExtendedName(OS, DV, dl);
+
+ OS << "\"\t";
+ if (offset)
+ OS << '+' << offset;
+ for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
+ OS << " [" << I.start() << ';' << I.stop() << "):";
+ if (I.value() == ~0u)
+ OS << "undef";
+ else
+ OS << I.value();
+ }
+ for (unsigned i = 0, e = locations.size(); i != e; ++i) {
+ OS << " Loc" << i << '=';
+ locations[i].print(OS, TRI);
+ }
+ OS << '\n';
+}
+
+void LDVImpl::print(raw_ostream &OS) {
+ OS << "********** DEBUG VARIABLES **********\n";
+ for (unsigned i = 0, e = userValues.size(); i != e; ++i)
+ userValues[i]->print(OS, TRI);
+}
+
+void UserValue::coalesceLocation(unsigned LocNo) {
+ unsigned KeepLoc = 0;
+ for (unsigned e = locations.size(); KeepLoc != e; ++KeepLoc) {
+ if (KeepLoc == LocNo)
+ continue;
+ if (locations[KeepLoc].isIdenticalTo(locations[LocNo]))
+ break;
+ }
+ // No matches.
+ if (KeepLoc == locations.size())
+ return;
+
+ // Keep the smaller location, erase the larger one.
+ unsigned EraseLoc = LocNo;
+ if (KeepLoc > EraseLoc)
+ std::swap(KeepLoc, EraseLoc);
+ locations.erase(locations.begin() + EraseLoc);
+
+ // Rewrite values.
+ for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
+ unsigned v = I.value();
+ if (v == EraseLoc)
+ I.setValue(KeepLoc); // Coalesce when possible.
+ else if (v > EraseLoc)
+ I.setValueUnchecked(v-1); // Avoid coalescing with untransformed values.
+ }
+}
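+
+// For illustration only: the renumbering above is the usual erase-and-shift
+// pattern for index-based references. A sketch with std:: containers
+// (eraseLocation, Locations and Refs are made-up names):
+//
+//   #include <vector>
+//
+//   void eraseLocation(std::vector<int> &Locations,
+//                      std::vector<unsigned> &Refs,
+//                      unsigned KeepLoc, unsigned EraseLoc) {
+//     Locations.erase(Locations.begin() + EraseLoc);
+//     for (unsigned &R : Refs) {
+//       if (R == EraseLoc)
+//         R = KeepLoc;      // redirect to the surviving duplicate
+//       else if (R > EraseLoc)
+//         --R;              // indices above the erased slot shift down
+//     }
+//   }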
+
+void UserValue::mapVirtRegs(LDVImpl *LDV) {
+ for (unsigned i = 0, e = locations.size(); i != e; ++i)
+ if (locations[i].isReg() &&
+ TargetRegisterInfo::isVirtualRegister(locations[i].getReg()))
+ LDV->mapVirtReg(locations[i].getReg(), this);
+}
+
+UserValue *LDVImpl::getUserValue(const MDNode *Var, const MDNode *Expr,
+ unsigned Offset, bool IsIndirect,
+ const DebugLoc &DL) {
+ UserValue *&Leader = userVarMap[Var];
+ if (Leader) {
+ UserValue *UV = Leader->getLeader();
+ Leader = UV;
+ for (; UV; UV = UV->getNext())
+ if (UV->match(Var, Expr, DL->getInlinedAt(), Offset, IsIndirect))
+ return UV;
+ }
+
+ userValues.push_back(
+ make_unique<UserValue>(Var, Expr, Offset, IsIndirect, DL, allocator));
+ UserValue *UV = userValues.back().get();
+ Leader = UserValue::merge(Leader, UV);
+ return UV;
+}
+
+void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) && "Only map VirtRegs");
+ UserValue *&Leader = virtRegToEqClass[VirtReg];
+ Leader = UserValue::merge(Leader, EC);
+}
+
+UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
+ if (UserValue *UV = virtRegToEqClass.lookup(VirtReg))
+ return UV->getLeader();
+ return nullptr;
+}
+
+bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
+ // DBG_VALUE loc, offset, variable
+ if (MI.getNumOperands() != 4 ||
+ !(MI.getOperand(1).isReg() || MI.getOperand(1).isImm()) ||
+ !MI.getOperand(2).isMetadata()) {
+ DEBUG(dbgs() << "Can't handle " << MI);
+ return false;
+ }
+
+ // Get or create the UserValue for (variable,offset).
+ bool IsIndirect = MI.isIndirectDebugValue();
+ unsigned Offset = IsIndirect ? MI.getOperand(1).getImm() : 0;
+ const MDNode *Var = MI.getDebugVariable();
+ const MDNode *Expr = MI.getDebugExpression();
+ UserValue *UV = getUserValue(Var, Expr, Offset, IsIndirect, MI.getDebugLoc());
+ UV->addDef(Idx, MI.getOperand(0));
+ return true;
+}
+
+bool LDVImpl::collectDebugValues(MachineFunction &mf) {
+ bool Changed = false;
+ for (MachineFunction::iterator MFI = mf.begin(), MFE = mf.end(); MFI != MFE;
+ ++MFI) {
+ MachineBasicBlock *MBB = &*MFI;
+ for (MachineBasicBlock::iterator MBBI = MBB->begin(), MBBE = MBB->end();
+ MBBI != MBBE;) {
+ if (!MBBI->isDebugValue()) {
+ ++MBBI;
+ continue;
+ }
+ // DBG_VALUE has no slot index, use the previous instruction instead.
+ SlotIndex Idx =
+ MBBI == MBB->begin()
+ ? LIS->getMBBStartIdx(MBB)
+ : LIS->getInstructionIndex(*std::prev(MBBI)).getRegSlot();
+ // Handle consecutive DBG_VALUE instructions with the same slot index.
+ do {
+ if (handleDebugValue(*MBBI, Idx)) {
+ MBBI = MBB->erase(MBBI);
+ Changed = true;
+ } else
+ ++MBBI;
+ } while (MBBI != MBBE && MBBI->isDebugValue());
+ }
+ }
+ return Changed;
+}
+
+/// We only propagate DBG_VALUEs locally here. LiveDebugValues performs a
+/// data-flow analysis to propagate them beyond basic block boundaries.
+void UserValue::extendDef(SlotIndex Idx, unsigned LocNo, LiveRange *LR,
+ const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills,
+ LiveIntervals &LIS, MachineDominatorTree &MDT,
+ UserValueScopes &UVS) {
+ SlotIndex Start = Idx;
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(Start);
+ SlotIndex Stop = LIS.getMBBEndIdx(MBB);
+ LocMap::iterator I = locInts.find(Start);
+
+ // Limit to VNI's live range.
+ bool ToEnd = true;
+ if (LR && VNI) {
+ LiveInterval::Segment *Segment = LR->getSegmentContaining(Start);
+ if (!Segment || Segment->valno != VNI) {
+ if (Kills)
+ Kills->push_back(Start);
+ return;
+ }
+ if (Segment->end < Stop) {
+ Stop = Segment->end;
+ ToEnd = false;
+ }
+ }
+
+ // There could already be a short def at Start.
+ if (I.valid() && I.start() <= Start) {
+ // Stop when meeting a different location or an already extended interval.
+ Start = Start.getNextSlot();
+ if (I.value() != LocNo || I.stop() != Start)
+ return;
+ // This is a one-slot placeholder. Just skip it.
+ ++I;
+ }
+
+ // Limited by the next def.
+ if (I.valid() && I.start() < Stop) {
+ Stop = I.start();
+ ToEnd = false;
+ }
+ // Limited by VNI's live range.
+ else if (!ToEnd && Kills)
+ Kills->push_back(Stop);
+
+ if (Start < Stop)
+ I.insert(Start, Stop, LocNo);
+}
+
+void
+UserValue::addDefsFromCopies(LiveInterval *LI, unsigned LocNo,
+ const SmallVectorImpl<SlotIndex> &Kills,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> > &NewDefs,
+ MachineRegisterInfo &MRI, LiveIntervals &LIS) {
+ if (Kills.empty())
+ return;
+ // Don't track copies from physregs, there are too many uses.
+ if (!TargetRegisterInfo::isVirtualRegister(LI->reg))
+ return;
+
+ // Collect all the (vreg, valno) pairs that are copies of LI.
+ SmallVector<std::pair<LiveInterval*, const VNInfo*>, 8> CopyValues;
+ for (MachineOperand &MO : MRI.use_nodbg_operands(LI->reg)) {
+ MachineInstr *MI = MO.getParent();
+ // Copies of the full value.
+ if (MO.getSubReg() || !MI->isCopy())
+ continue;
+ unsigned DstReg = MI->getOperand(0).getReg();
+
+ // Don't follow copies to physregs. These are usually setting up call
+ // arguments, and the argument registers are always call clobbered. We are
+ // better off in the source register which could be a callee-saved register,
+ // or it could be spilled.
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ continue;
+
+ // Is LocNo extended to reach this copy? If not, another def may be blocking
+ // it, or we are looking at a wrong value of LI.
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
+ LocMap::iterator I = locInts.find(Idx.getRegSlot(true));
+ if (!I.valid() || I.value() != LocNo)
+ continue;
+
+ if (!LIS.hasInterval(DstReg))
+ continue;
+ LiveInterval *DstLI = &LIS.getInterval(DstReg);
+ const VNInfo *DstVNI = DstLI->getVNInfoAt(Idx.getRegSlot());
+ assert(DstVNI && DstVNI->def == Idx.getRegSlot() && "Bad copy value");
+ CopyValues.push_back(std::make_pair(DstLI, DstVNI));
+ }
+
+ if (CopyValues.empty())
+ return;
+
+ DEBUG(dbgs() << "Got " << CopyValues.size() << " copies of " << *LI << '\n');
+
+ // Try to add defs of the copied values for each kill point.
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i) {
+ SlotIndex Idx = Kills[i];
+ for (unsigned j = 0, e = CopyValues.size(); j != e; ++j) {
+ LiveInterval *DstLI = CopyValues[j].first;
+ const VNInfo *DstVNI = CopyValues[j].second;
+ if (DstLI->getVNInfoAt(Idx) != DstVNI)
+ continue;
+ // Check that there isn't already a def at Idx
+ LocMap::iterator I = locInts.find(Idx);
+ if (I.valid() && I.start() <= Idx)
+ continue;
+ DEBUG(dbgs() << "Kill at " << Idx << " covered by valno #"
+ << DstVNI->id << " in " << *DstLI << '\n');
+ MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
+ assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
+ unsigned LocNo = getLocationNo(CopyMI->getOperand(0));
+ I.insert(Idx, Idx.getNextSlot(), LocNo);
+ NewDefs.push_back(std::make_pair(Idx, LocNo));
+ break;
+ }
+ }
+}
+
+void
+UserValue::computeIntervals(MachineRegisterInfo &MRI,
+ const TargetRegisterInfo &TRI,
+ LiveIntervals &LIS,
+ MachineDominatorTree &MDT,
+ UserValueScopes &UVS) {
+ SmallVector<std::pair<SlotIndex, unsigned>, 16> Defs;
+
+  // Collect all defs to be extended (skipping undefs).
+ for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I)
+ if (I.value() != ~0u)
+ Defs.push_back(std::make_pair(I.start(), I.value()));
+
+ // Extend all defs, and possibly add new ones along the way.
+ for (unsigned i = 0; i != Defs.size(); ++i) {
+ SlotIndex Idx = Defs[i].first;
+ unsigned LocNo = Defs[i].second;
+ const MachineOperand &Loc = locations[LocNo];
+
+ if (!Loc.isReg()) {
+ extendDef(Idx, LocNo, nullptr, nullptr, nullptr, LIS, MDT, UVS);
+ continue;
+ }
+
+ // Register locations are constrained to where the register value is live.
+ if (TargetRegisterInfo::isVirtualRegister(Loc.getReg())) {
+ LiveInterval *LI = nullptr;
+ const VNInfo *VNI = nullptr;
+ if (LIS.hasInterval(Loc.getReg())) {
+ LI = &LIS.getInterval(Loc.getReg());
+ VNI = LI->getVNInfoAt(Idx);
+ }
+ SmallVector<SlotIndex, 16> Kills;
+ extendDef(Idx, LocNo, LI, VNI, &Kills, LIS, MDT, UVS);
+ if (LI)
+ addDefsFromCopies(LI, LocNo, Kills, Defs, MRI, LIS);
+ continue;
+ }
+
+ // For physregs, use the live range of the first regunit as a guide.
+ unsigned Unit = *MCRegUnitIterator(Loc.getReg(), &TRI);
+ LiveRange *LR = &LIS.getRegUnit(Unit);
+ const VNInfo *VNI = LR->getVNInfoAt(Idx);
+ // Don't track copies from physregs, it is too expensive.
+ extendDef(Idx, LocNo, LR, VNI, nullptr, LIS, MDT, UVS);
+ }
+
+ // Finally, erase all the undefs.
+ for (LocMap::iterator I = locInts.begin(); I.valid();)
+ if (I.value() == ~0u)
+ I.erase();
+ else
+ ++I;
+}
+
+void LDVImpl::computeIntervals() {
+ for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+ UserValueScopes UVS(userValues[i]->getDebugLoc(), LS);
+ userValues[i]->computeIntervals(MF->getRegInfo(), *TRI, *LIS, *MDT, UVS);
+ userValues[i]->mapVirtRegs(this);
+ }
+}
+
+bool LDVImpl::runOnMachineFunction(MachineFunction &mf) {
+ clear();
+ MF = &mf;
+ LIS = &pass.getAnalysis<LiveIntervals>();
+ MDT = &pass.getAnalysis<MachineDominatorTree>();
+ TRI = mf.getSubtarget().getRegisterInfo();
+ LS.initialize(mf);
+ DEBUG(dbgs() << "********** COMPUTING LIVE DEBUG VARIABLES: "
+ << mf.getName() << " **********\n");
+
+ bool Changed = collectDebugValues(mf);
+ computeIntervals();
+ DEBUG(print(dbgs()));
+ ModifiedMF = Changed;
+ return Changed;
+}
+
+static void removeDebugValues(MachineFunction &mf) {
+ for (MachineBasicBlock &MBB : mf) {
+ for (auto MBBI = MBB.begin(), MBBE = MBB.end(); MBBI != MBBE; ) {
+ if (!MBBI->isDebugValue()) {
+ ++MBBI;
+ continue;
+ }
+ MBBI = MBB.erase(MBBI);
+ }
+ }
+}
+
+bool LiveDebugVariables::runOnMachineFunction(MachineFunction &mf) {
+ if (!EnableLDV)
+ return false;
+ if (!mf.getFunction()->getSubprogram()) {
+ removeDebugValues(mf);
+ return false;
+ }
+ if (!pImpl)
+ pImpl = new LDVImpl(this);
+ return static_cast<LDVImpl*>(pImpl)->runOnMachineFunction(mf);
+}
+
+void LiveDebugVariables::releaseMemory() {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->clear();
+}
+
+LiveDebugVariables::~LiveDebugVariables() {
+ if (pImpl)
+ delete static_cast<LDVImpl*>(pImpl);
+}
+
+//===----------------------------------------------------------------------===//
+// Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+bool
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ LiveIntervals& LIS) {
+ DEBUG({
+ dbgs() << "Splitting Loc" << OldLocNo << '\t';
+ print(dbgs(), nullptr);
+ });
+ bool DidChange = false;
+ LocMap::iterator LocMapI;
+ LocMapI.setMap(locInts);
+ for (unsigned i = 0; i != NewRegs.size(); ++i) {
+ LiveInterval *LI = &LIS.getInterval(NewRegs[i]);
+ if (LI->empty())
+ continue;
+
+ // Don't allocate the new LocNo until it is needed.
+ unsigned NewLocNo = ~0u;
+
+ // Iterate over the overlaps between locInts and LI.
+ LocMapI.find(LI->beginIndex());
+ if (!LocMapI.valid())
+ continue;
+ LiveInterval::iterator LII = LI->advanceTo(LI->begin(), LocMapI.start());
+ LiveInterval::iterator LIE = LI->end();
+ while (LocMapI.valid() && LII != LIE) {
+ // At this point, we know that LocMapI.stop() > LII->start.
+ LII = LI->advanceTo(LII, LocMapI.start());
+ if (LII == LIE)
+ break;
+
+ // Now LII->end > LocMapI.start(). Do we have an overlap?
+ if (LocMapI.value() == OldLocNo && LII->start < LocMapI.stop()) {
+ // Overlapping correct location. Allocate NewLocNo now.
+ if (NewLocNo == ~0u) {
+ MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
+ MO.setSubReg(locations[OldLocNo].getSubReg());
+ NewLocNo = getLocationNo(MO);
+ DidChange = true;
+ }
+
+ SlotIndex LStart = LocMapI.start();
+ SlotIndex LStop = LocMapI.stop();
+
+ // Trim LocMapI down to the LII overlap.
+ if (LStart < LII->start)
+ LocMapI.setStartUnchecked(LII->start);
+ if (LStop > LII->end)
+ LocMapI.setStopUnchecked(LII->end);
+
+ // Change the value in the overlap. This may trigger coalescing.
+ LocMapI.setValue(NewLocNo);
+
+ // Re-insert any removed OldLocNo ranges.
+ if (LStart < LocMapI.start()) {
+ LocMapI.insert(LStart, LocMapI.start(), OldLocNo);
+ ++LocMapI;
+ assert(LocMapI.valid() && "Unexpected coalescing");
+ }
+ if (LStop > LocMapI.stop()) {
+ ++LocMapI;
+ LocMapI.insert(LII->end, LStop, OldLocNo);
+ --LocMapI;
+ }
+ }
+
+ // Advance to the next overlap.
+ if (LII->end < LocMapI.stop()) {
+ if (++LII == LIE)
+ break;
+ LocMapI.advanceTo(LII->start);
+ } else {
+ ++LocMapI;
+ if (!LocMapI.valid())
+ break;
+ LII = LI->advanceTo(LII, LocMapI.start());
+ }
+ }
+ }
+
+ // Finally, remove any remaining OldLocNo intervals and OldLocNo itself.
+ locations.erase(locations.begin() + OldLocNo);
+ LocMapI.goToBegin();
+ while (LocMapI.valid()) {
+ unsigned v = LocMapI.value();
+ if (v == OldLocNo) {
+ DEBUG(dbgs() << "Erasing [" << LocMapI.start() << ';'
+ << LocMapI.stop() << ")\n");
+ LocMapI.erase();
+ } else {
+ if (v > OldLocNo)
+ LocMapI.setValueUnchecked(v-1);
+ ++LocMapI;
+ }
+ }
+
+ DEBUG({dbgs() << "Split result: \t"; print(dbgs(), nullptr);});
+ return DidChange;
+}
+
+bool
+UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS) {
+ bool DidChange = false;
+ // Split locations referring to OldReg. Iterate backwards so splitLocation can
+ // safely erase unused locations.
+ for (unsigned i = locations.size(); i ; --i) {
+ unsigned LocNo = i-1;
+ const MachineOperand *Loc = &locations[LocNo];
+ if (!Loc->isReg() || Loc->getReg() != OldReg)
+ continue;
+ DidChange |= splitLocation(LocNo, NewRegs, LIS);
+ }
+ return DidChange;
+}
+
+void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
+ bool DidChange = false;
+ for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
+ DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
+
+ if (!DidChange)
+ return;
+
+ // Map all of the new virtual registers.
+ UserValue *UV = lookupVirtReg(OldReg);
+ for (unsigned i = 0; i != NewRegs.size(); ++i)
+ mapVirtReg(NewRegs[i], UV);
+}
+
+void LiveDebugVariables::
+splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs, LiveIntervals &LIS) {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
+}
+
+void
+UserValue::rewriteLocations(VirtRegMap &VRM, const TargetRegisterInfo &TRI) {
+  // Iterating over locations in reverse makes it easier to handle coalescing.
+ for (unsigned i = locations.size(); i ; --i) {
+ unsigned LocNo = i-1;
+ MachineOperand &Loc = locations[LocNo];
+ // Only virtual registers are rewritten.
+ if (!Loc.isReg() || !Loc.getReg() ||
+ !TargetRegisterInfo::isVirtualRegister(Loc.getReg()))
+ continue;
+ unsigned VirtReg = Loc.getReg();
+ if (VRM.isAssignedReg(VirtReg) &&
+ TargetRegisterInfo::isPhysicalRegister(VRM.getPhys(VirtReg))) {
+ // This can create a %noreg operand in rare cases when the sub-register
+ // index is no longer available. That means the user value is in a
+ // non-existent sub-register, and %noreg is exactly what we want.
+ Loc.substPhysReg(VRM.getPhys(VirtReg), TRI);
+ } else if (VRM.getStackSlot(VirtReg) != VirtRegMap::NO_STACK_SLOT) {
+ // FIXME: Translate SubIdx to a stackslot offset.
+ Loc = MachineOperand::CreateFI(VRM.getStackSlot(VirtReg));
+ } else {
+ Loc.setReg(0);
+ Loc.setSubReg(0);
+ }
+ coalesceLocation(LocNo);
+ }
+}
+
+/// findInsertLocation - Find an iterator for inserting a DBG_VALUE
+/// instruction.
+static MachineBasicBlock::iterator
+findInsertLocation(MachineBasicBlock *MBB, SlotIndex Idx,
+ LiveIntervals &LIS) {
+ SlotIndex Start = LIS.getMBBStartIdx(MBB);
+ Idx = Idx.getBaseIndex();
+
+ // Try to find an insert location by going backwards from Idx.
+ MachineInstr *MI;
+ while (!(MI = LIS.getInstructionFromIndex(Idx))) {
+ // We've reached the beginning of MBB.
+ if (Idx == Start) {
+ MachineBasicBlock::iterator I = MBB->SkipPHIsAndLabels(MBB->begin());
+ return I;
+ }
+ Idx = Idx.getPrevIndex();
+ }
+
+ // Don't insert anything after the first terminator, though.
+ return MI->isTerminator() ? MBB->getFirstTerminator() :
+ std::next(MachineBasicBlock::iterator(MI));
+}
+
+void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex Idx,
+ unsigned LocNo,
+ LiveIntervals &LIS,
+ const TargetInstrInfo &TII) {
+ MachineBasicBlock::iterator I = findInsertLocation(MBB, Idx, LIS);
+ MachineOperand &Loc = locations[LocNo];
+ ++NumInsertedDebugValues;
+
+ assert(cast<DILocalVariable>(Variable)
+ ->isValidLocationForIntrinsic(getDebugLoc()) &&
+ "Expected inlined-at fields to agree");
+ if (Loc.isReg())
+ BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
+ IsIndirect, Loc.getReg(), offset, Variable, Expression);
+ else
+ BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE))
+ .addOperand(Loc)
+ .addImm(offset)
+ .addMetadata(Variable)
+ .addMetadata(Expression);
+}
+
+void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
+ const TargetInstrInfo &TII) {
+ MachineFunction::iterator MFEnd = VRM->getMachineFunction().end();
+
+ for (LocMap::const_iterator I = locInts.begin(); I.valid();) {
+ SlotIndex Start = I.start();
+ SlotIndex Stop = I.stop();
+ unsigned LocNo = I.value();
+ DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << LocNo);
+ MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
+ SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB);
+
+ DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd);
+ insertDebugValue(&*MBB, Start, LocNo, LIS, TII);
+ // This interval may span multiple basic blocks.
+ // Insert a DBG_VALUE into each one.
+    while (Stop > MBBEnd) {
+ // Move to the next block.
+ Start = MBBEnd;
+ if (++MBB == MFEnd)
+ break;
+ MBBEnd = LIS.getMBBEndIdx(&*MBB);
+ DEBUG(dbgs() << " BB#" << MBB->getNumber() << '-' << MBBEnd);
+ insertDebugValue(&*MBB, Start, LocNo, LIS, TII);
+ }
+ DEBUG(dbgs() << '\n');
+ if (MBB == MFEnd)
+ break;
+
+ ++I;
+ }
+}
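+
+// For illustration only: the loop above re-emits one DBG_VALUE per basic
+// block that an interval crosses. The same walking pattern over plain
+// indices (emitPerBlock, BlockEndAfter and EmitAt are made-up names; the
+// sketch assumes every index lies inside some block):
+//
+//   #include <functional>
+//
+//   void emitPerBlock(unsigned Start, unsigned Stop,
+//                     const std::function<unsigned(unsigned)> &BlockEndAfter,
+//                     const std::function<void(unsigned)> &EmitAt) {
+//     unsigned BlockEnd = BlockEndAfter(Start);
+//     EmitAt(Start);              // value becomes available here
+//     while (Stop > BlockEnd) {   // interval spills into the next block
+//       Start = BlockEnd;         // re-emit at the next block's entry
+//       BlockEnd = BlockEndAfter(Start);
+//       EmitAt(Start);
+//     }
+//   }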
+
+void LDVImpl::emitDebugValues(VirtRegMap *VRM) {
+ DEBUG(dbgs() << "********** EMITTING LIVE DEBUG VARIABLES **********\n");
+ if (!MF)
+ return;
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ for (unsigned i = 0, e = userValues.size(); i != e; ++i) {
+ DEBUG(userValues[i]->print(dbgs(), TRI));
+ userValues[i]->rewriteLocations(*VRM, *TRI);
+ userValues[i]->emitDebugValues(VRM, *LIS, *TII);
+ }
+ EmitDone = true;
+}
+
+void LiveDebugVariables::emitDebugValues(VirtRegMap *VRM) {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->emitDebugValues(VRM);
+}
+
+bool LiveDebugVariables::doInitialization(Module &M) {
+ return Pass::doInitialization(M);
+}
+
+#ifndef NDEBUG
+LLVM_DUMP_METHOD void LiveDebugVariables::dump() {
+ if (pImpl)
+ static_cast<LDVImpl*>(pImpl)->print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveDebugVariables.h b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
new file mode 100644
index 000000000000..afe87a52544d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveDebugVariables.h
@@ -0,0 +1,75 @@
+//===- LiveDebugVariables.h - Tracking debug info variables ----*- c++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the interface to the LiveDebugVariables analysis.
+//
+// The analysis removes DBG_VALUE instructions for virtual registers and tracks
+// live user variables in a data structure that can be updated during register
+// allocation.
+//
+// After register allocation new DBG_VALUE instructions are emitted to reflect
+// the new locations of user variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+#define LLVM_LIB_CODEGEN_LIVEDEBUGVARIABLES_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/IR/DebugInfo.h"
+
+namespace llvm {
+
+template <typename T> class ArrayRef;
+class LiveInterval;
+class LiveIntervals;
+class VirtRegMap;
+
+class LLVM_LIBRARY_VISIBILITY LiveDebugVariables : public MachineFunctionPass {
+ void *pImpl;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+
+ LiveDebugVariables();
+ ~LiveDebugVariables() override;
+
+ /// renameRegister - Move any user variables in OldReg to NewReg:SubIdx.
+ /// @param OldReg Old virtual register that is going away.
+ /// @param NewReg New register holding the user variables.
+ /// @param SubIdx If NewReg is a virtual register, SubIdx may indicate a sub-
+ /// register.
+ void renameRegister(unsigned OldReg, unsigned NewReg, unsigned SubIdx);
+
+ /// splitRegister - Move any user variables in OldReg to the live ranges in
+ /// NewRegs where they are live. Mark the values as unavailable where no new
+ /// register is live.
+ void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ LiveIntervals &LIS);
+
+ /// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
+ /// that happened during register allocation.
+ /// @param VRM Rename virtual registers according to map.
+ void emitDebugValues(VirtRegMap *VRM);
+
+ /// dump - Print data structures to dbgs().
+ void dump();
+
+private:
+
+ bool runOnMachineFunction(MachineFunction &) override;
+ void releaseMemory() override;
+ void getAnalysisUsage(AnalysisUsage &) const override;
+ bool doInitialization(Module &) override;
+
+};
+
+} // namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveInterval.cpp b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
new file mode 100644
index 000000000000..93c5ca785ac9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveInterval.cpp
@@ -0,0 +1,1250 @@
+//===-- LiveInterval.cpp - Live Interval Representation -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveRange and LiveInterval classes. Given some
+// numbering of the machine instructions, an interval [i, j) is said to be a
+// live range for register v if there is no instruction with number j' >= j
+// such that v is live at j' and there is no instruction with number i' < i such
+// that v is live at i'. In this implementation ranges can have holes,
+// i.e. a range might look like [1,20), [50,65), [1000,1001). Each
+// individual segment is represented as an instance of LiveRange::Segment,
+// and the whole range is represented as an instance of LiveRange.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveInterval.h"
+
+#include "LiveRangeUtils.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+namespace {
+//===----------------------------------------------------------------------===//
+// Implementation of various methods necessary for calculation of live ranges.
+// The implementation of the methods abstracts from the concrete type of the
+// segment collection.
+//
+// Implementation of the class follows the Template design pattern. The base
+// class contains generic algorithms that call collection-specific methods,
+// which are provided in concrete subclasses. In order to avoid virtual calls
+// these methods are provided by means of C++ template instantiation.
+// The base class calls the methods of the subclass through method impl(),
+// which casts 'this' pointer to the type of the subclass.
+//
+//===----------------------------------------------------------------------===//
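+
+// For illustration only, this is the curiously recurring template pattern
+// (CRTP); a minimal self-contained sketch (CalcBase, VectorImpl and step are
+// made-up names):
+//
+//   template <typename ImplT> struct CalcBase {
+//     int twiceStep() { return 2 * impl().step(); } // generic algorithm
+//   private:
+//     ImplT &impl() { return *static_cast<ImplT *>(this); }
+//   };
+//
+//   struct VectorImpl : CalcBase<VectorImpl> {
+//     int step() { return 21; }                     // subclass-specific part
+//   };
+//
+//   // VectorImpl().twiceStep() == 42, resolved statically, no virtual calls.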
+
+template <typename ImplT, typename IteratorT, typename CollectionT>
+class CalcLiveRangeUtilBase {
+protected:
+ LiveRange *LR;
+
+protected:
+ CalcLiveRangeUtilBase(LiveRange *LR) : LR(LR) {}
+
+public:
+ typedef LiveRange::Segment Segment;
+ typedef IteratorT iterator;
+
+ VNInfo *createDeadDef(SlotIndex Def, VNInfo::Allocator &VNInfoAllocator) {
+ assert(!Def.isDead() && "Cannot define a value at the dead slot");
+
+ iterator I = impl().find(Def);
+ if (I == segments().end()) {
+ VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+ impl().insertAtEnd(Segment(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+ }
+
+ Segment *S = segmentAt(I);
+ if (SlotIndex::isSameInstr(Def, S->start)) {
+ assert(S->valno->def == S->start && "Inconsistent existing value def");
+
+ // It is possible to have both normal and early-clobber defs of the same
+ // register on an instruction. It doesn't make a lot of sense, but it is
+ // possible to specify in inline assembly.
+ //
+ // Just convert everything to early-clobber.
+ Def = std::min(Def, S->start);
+ if (Def != S->start)
+ S->start = S->valno->def = Def;
+ return S->valno;
+ }
+ assert(SlotIndex::isEarlierInstr(Def, S->start) && "Already live at def");
+ VNInfo *VNI = LR->getNextValue(Def, VNInfoAllocator);
+ segments().insert(I, Segment(Def, Def.getDeadSlot(), VNI));
+ return VNI;
+ }
+
+ VNInfo *extendInBlock(SlotIndex StartIdx, SlotIndex Use) {
+ if (segments().empty())
+ return nullptr;
+ iterator I =
+ impl().findInsertPos(Segment(Use.getPrevSlot(), Use, nullptr));
+ if (I == segments().begin())
+ return nullptr;
+ --I;
+ if (I->end <= StartIdx)
+ return nullptr;
+ if (I->end < Use)
+ extendSegmentEndTo(I, Use);
+ return I->valno;
+ }
+
+ /// This method is used when we want to extend the segment specified
+ /// by I to end at the specified endpoint. To do this, we should
+ /// merge and eliminate all segments that this will overlap
+ /// with. The iterator is not invalidated.
+ void extendSegmentEndTo(iterator I, SlotIndex NewEnd) {
+ assert(I != segments().end() && "Not a valid segment!");
+ Segment *S = segmentAt(I);
+ VNInfo *ValNo = I->valno;
+
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = std::next(I);
+ for (; MergeTo != segments().end() && NewEnd >= MergeTo->end; ++MergeTo)
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+
+ // If NewEnd was in the middle of a segment, make sure to get its endpoint.
+ S->end = std::max(NewEnd, std::prev(MergeTo)->end);
+
+ // If the newly formed segment now touches the segment after it and if they
+ // have the same value number, merge the two segments into one segment.
+ if (MergeTo != segments().end() && MergeTo->start <= I->end &&
+ MergeTo->valno == ValNo) {
+ S->end = MergeTo->end;
+ ++MergeTo;
+ }
+
+ // Erase any dead segments.
+ segments().erase(std::next(I), MergeTo);
+ }
+
+ /// This method is used when we want to extend the segment specified
+ /// by I to start at the specified endpoint. To do this, we should
+ /// merge and eliminate all segments that this will overlap with.
+ iterator extendSegmentStartTo(iterator I, SlotIndex NewStart) {
+ assert(I != segments().end() && "Not a valid segment!");
+ Segment *S = segmentAt(I);
+ VNInfo *ValNo = I->valno;
+
+ // Search for the first segment that we can't merge with.
+ iterator MergeTo = I;
+ do {
+ if (MergeTo == segments().begin()) {
+ S->start = NewStart;
+ segments().erase(MergeTo, I);
+ return I;
+ }
+ assert(MergeTo->valno == ValNo && "Cannot merge with differing values!");
+ --MergeTo;
+ } while (NewStart <= MergeTo->start);
+
+ // If we start in the middle of another segment, just delete a range and
+ // extend that segment.
+ if (MergeTo->end >= NewStart && MergeTo->valno == ValNo) {
+ segmentAt(MergeTo)->end = S->end;
+ } else {
+ // Otherwise, extend the segment right after.
+ ++MergeTo;
+ Segment *MergeToSeg = segmentAt(MergeTo);
+ MergeToSeg->start = NewStart;
+ MergeToSeg->end = S->end;
+ }
+
+ segments().erase(std::next(MergeTo), std::next(I));
+ return MergeTo;
+ }
+
+ iterator addSegment(Segment S) {
+ SlotIndex Start = S.start, End = S.end;
+ iterator I = impl().findInsertPos(S);
+
+ // If the inserted segment starts in the middle or right at the end of
+ // another segment, just extend that segment to contain the segment of S.
+ if (I != segments().begin()) {
+ iterator B = std::prev(I);
+ if (S.valno == B->valno) {
+ if (B->start <= Start && B->end >= Start) {
+ extendSegmentEndTo(B, End);
+ return B;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live segments with
+ // different valno's.
+ assert(B->end <= Start &&
+ "Cannot overlap two segments with differing ValID's"
+ " (did you def the same reg twice in a MachineInstr?)");
+ }
+ }
+
+ // Otherwise, if this segment ends in the middle of, or right next
+ // to, another segment, merge it into that segment.
+ if (I != segments().end()) {
+ if (S.valno == I->valno) {
+ if (I->start <= End) {
+ I = extendSegmentStartTo(I, Start);
+
+ // If S is a complete superset of a segment, we may need to grow its
+ // endpoint as well.
+ if (End > I->end)
+ extendSegmentEndTo(I, End);
+ return I;
+ }
+ } else {
+ // Check to make sure that we are not overlapping two live segments with
+ // different valno's.
+ assert(I->start >= End &&
+ "Cannot overlap two segments with differing ValID's");
+ }
+ }
+
+ // Otherwise, this is just a new segment that doesn't interact with
+ // anything.
+ // Insert it.
+ return segments().insert(I, S);
+ }
+
+private:
+ ImplT &impl() { return *static_cast<ImplT *>(this); }
+
+ CollectionT &segments() { return impl().segmentsColl(); }
+
+ Segment *segmentAt(iterator I) { return const_cast<Segment *>(&(*I)); }
+};
+
+//===----------------------------------------------------------------------===//
+// Instantiation of the methods for calculation of live ranges
+// based on a segment vector.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilVector;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilVector, LiveRange::iterator,
+ LiveRange::Segments> CalcLiveRangeUtilVectorBase;
+
+class CalcLiveRangeUtilVector : public CalcLiveRangeUtilVectorBase {
+public:
+ CalcLiveRangeUtilVector(LiveRange *LR) : CalcLiveRangeUtilVectorBase(LR) {}
+
+private:
+ friend CalcLiveRangeUtilVectorBase;
+
+ LiveRange::Segments &segmentsColl() { return LR->segments; }
+
+ void insertAtEnd(const Segment &S) { LR->segments.push_back(S); }
+
+ iterator find(SlotIndex Pos) { return LR->find(Pos); }
+
+ iterator findInsertPos(Segment S) {
+ return std::upper_bound(LR->begin(), LR->end(), S.start);
+ }
+};
+
+//===----------------------------------------------------------------------===//
+// Instantiation of the methods for calculation of live ranges
+// based on a segment set.
+//===----------------------------------------------------------------------===//
+
+class CalcLiveRangeUtilSet;
+typedef CalcLiveRangeUtilBase<CalcLiveRangeUtilSet,
+ LiveRange::SegmentSet::iterator,
+ LiveRange::SegmentSet> CalcLiveRangeUtilSetBase;
+
+class CalcLiveRangeUtilSet : public CalcLiveRangeUtilSetBase {
+public:
+ CalcLiveRangeUtilSet(LiveRange *LR) : CalcLiveRangeUtilSetBase(LR) {}
+
+private:
+ friend CalcLiveRangeUtilSetBase;
+
+ LiveRange::SegmentSet &segmentsColl() { return *LR->segmentSet; }
+
+ void insertAtEnd(const Segment &S) {
+ LR->segmentSet->insert(LR->segmentSet->end(), S);
+ }
+
+ iterator find(SlotIndex Pos) {
+ iterator I =
+ LR->segmentSet->upper_bound(Segment(Pos, Pos.getNextSlot(), nullptr));
+ if (I == LR->segmentSet->begin())
+ return I;
+ iterator PrevI = std::prev(I);
+ if (Pos < (*PrevI).end)
+ return PrevI;
+ return I;
+ }
+
+ iterator findInsertPos(Segment S) {
+ iterator I = LR->segmentSet->upper_bound(S);
+ if (I != LR->segmentSet->end() && !(S.start < *I))
+ ++I;
+ return I;
+ }
+};
+} // namespace
+
+//===----------------------------------------------------------------------===//
+// LiveRange methods
+//===----------------------------------------------------------------------===//
+
+LiveRange::iterator LiveRange::find(SlotIndex Pos) {
+ // This algorithm is basically std::upper_bound.
+ // Unfortunately, std::upper_bound cannot be used with mixed types until we
+ // adopt C++0x. Many libraries can do it, but not all.
+ if (empty() || Pos >= endIndex())
+ return end();
+ iterator I = begin();
+ size_t Len = size();
+ do {
+ size_t Mid = Len >> 1;
+ if (Pos < I[Mid].end) {
+ Len = Mid;
+ } else {
+ I += Mid + 1;
+ Len -= Mid + 1;
+ }
+ } while (Len);
+ return I;
+}
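+
+// For illustration only: with a heterogeneous comparator, the loop above is
+// equivalent to std::upper_bound over the segment end points (Seg and findSeg
+// are made-up names; segments are sorted and disjoint):
+//
+//   #include <algorithm>
+//   #include <vector>
+//
+//   struct Seg { unsigned Start, End; };        // half-open [Start, End)
+//
+//   std::vector<Seg>::const_iterator findSeg(const std::vector<Seg> &Segs,
+//                                            unsigned Pos) {
+//     // First segment whose End is strictly greater than Pos.
+//     return std::upper_bound(
+//         Segs.begin(), Segs.end(), Pos,
+//         [](unsigned P, const Seg &S) { return P < S.End; });
+//   }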
+
+VNInfo *LiveRange::createDeadDef(SlotIndex Def,
+ VNInfo::Allocator &VNInfoAllocator) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).createDeadDef(Def, VNInfoAllocator);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).createDeadDef(Def, VNInfoAllocator);
+}
+
+// overlaps - Return true if the intersection of the two live ranges is
+// not empty.
+//
+// An example for overlaps():
+//
+// 0: A = ...
+// 4: B = ...
+// 8: C = A + B ;; last use of A
+//
+// The live ranges should look like:
+//
+// A = [3, 11)
+// B = [7, x)
+// C = [11, y)
+//
+// A->overlaps(C) should return false since we want to be able to join
+// A and C.
+//
+bool LiveRange::overlapsFrom(const LiveRange& other,
+ const_iterator StartPos) const {
+ assert(!empty() && "empty range");
+ const_iterator i = begin();
+ const_iterator ie = end();
+ const_iterator j = StartPos;
+ const_iterator je = other.end();
+
+ assert((StartPos->start <= i->start || StartPos == other.begin()) &&
+ StartPos != other.end() && "Bogus start position hint!");
+
+ if (i->start < j->start) {
+ i = std::upper_bound(i, ie, j->start);
+ if (i != begin()) --i;
+ } else if (j->start < i->start) {
+ ++StartPos;
+ if (StartPos != other.end() && StartPos->start <= i->start) {
+ assert(StartPos < other.end() && i < end());
+ j = std::upper_bound(j, je, i->start);
+ if (j != other.begin()) --j;
+ }
+ } else {
+ return true;
+ }
+
+ if (j == je) return false;
+
+ while (i != ie) {
+ if (i->start > j->start) {
+ std::swap(i, j);
+ std::swap(ie, je);
+ }
+
+ if (i->end > j->start)
+ return true;
+ ++i;
+ }
+
+ return false;
+}
+
+bool LiveRange::overlaps(const LiveRange &Other, const CoalescerPair &CP,
+ const SlotIndexes &Indexes) const {
+ assert(!empty() && "empty range");
+ if (Other.empty())
+ return false;
+
+ // Use binary searches to find initial positions.
+ const_iterator I = find(Other.beginIndex());
+ const_iterator IE = end();
+ if (I == IE)
+ return false;
+ const_iterator J = Other.find(I->start);
+ const_iterator JE = Other.end();
+ if (J == JE)
+ return false;
+
+ for (;;) {
+ // J has just been advanced to satisfy:
+ assert(J->end >= I->start);
+ // Check for an overlap.
+ if (J->start < I->end) {
+ // I and J are overlapping. Find the later start.
+ SlotIndex Def = std::max(I->start, J->start);
+ // Allow the overlap if Def is a coalescable copy.
+ if (Def.isBlock() ||
+ !CP.isCoalescable(Indexes.getInstructionFromIndex(Def)))
+ return true;
+ }
+ // Advance the iterator that ends first to check for more overlaps.
+ if (J->end > I->end) {
+ std::swap(I, J);
+ std::swap(IE, JE);
+ }
+ // Advance J until J->end >= I->start.
+ do
+ if (++J == JE)
+ return false;
+ while (J->end < I->start);
+ }
+}
+
+/// overlaps - Return true if the live range overlaps an interval specified
+/// by [Start, End).
+bool LiveRange::overlaps(SlotIndex Start, SlotIndex End) const {
+ assert(Start < End && "Invalid range");
+ const_iterator I = std::lower_bound(begin(), end(), End);
+ return I != begin() && (--I)->end > Start;
+}
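+
+// For illustration only: the same overlap test against a sorted, disjoint set
+// of half-open segments, spelled out with std:: types (Seg and overlapsRange
+// are made-up names):
+//
+//   #include <algorithm>
+//   #include <iterator>
+//   #include <vector>
+//
+//   struct Seg { unsigned Start, End; };
+//
+//   bool overlapsRange(const std::vector<Seg> &Segs,
+//                      unsigned Start, unsigned End) {
+//     // First segment starting at or after End; only its predecessor can
+//     // overlap [Start, End).
+//     auto I = std::lower_bound(
+//         Segs.begin(), Segs.end(), End,
+//         [](const Seg &S, unsigned V) { return S.Start < V; });
+//     return I != Segs.begin() && std::prev(I)->End > Start;
+//   }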
+
+bool LiveRange::covers(const LiveRange &Other) const {
+ if (empty())
+ return Other.empty();
+
+ const_iterator I = begin();
+ for (const Segment &O : Other.segments) {
+ I = advanceTo(I, O.start);
+ if (I == end() || I->start > O.start)
+ return false;
+
+ // Check adjacent live segments and see if we can get behind O.end.
+ while (I->end < O.end) {
+ const_iterator Last = I;
+ // Get next segment and abort if it was not adjacent.
+ ++I;
+ if (I == end() || Last->end != I->start)
+ return false;
+ }
+ }
+ return true;
+}
+
+/// ValNo is dead, remove it. If it is the largest value number, just nuke it
+/// (and any other deleted values neighboring it), otherwise mark it as ~1U so
+/// it can be nuked later.
+void LiveRange::markValNoForDeletion(VNInfo *ValNo) {
+ if (ValNo->id == getNumValNums()-1) {
+ do {
+ valnos.pop_back();
+ } while (!valnos.empty() && valnos.back()->isUnused());
+ } else {
+ ValNo->markUnused();
+ }
+}
+
+/// RenumberValues - Renumber all values in order of appearance and delete the
+/// remaining unused values.
+void LiveRange::RenumberValues() {
+ SmallPtrSet<VNInfo*, 8> Seen;
+ valnos.clear();
+ for (const Segment &S : segments) {
+ VNInfo *VNI = S.valno;
+ if (!Seen.insert(VNI).second)
+ continue;
+ assert(!VNI->isUnused() && "Unused valno used by live segment");
+ VNI->id = (unsigned)valnos.size();
+ valnos.push_back(VNI);
+ }
+}
+
+void LiveRange::addSegmentToSet(Segment S) {
+ CalcLiveRangeUtilSet(this).addSegment(S);
+}
+
+LiveRange::iterator LiveRange::addSegment(Segment S) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr) {
+ addSegmentToSet(S);
+ return end();
+ }
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).addSegment(S);
+}
+
+void LiveRange::append(const Segment S) {
+ // Check that the segment belongs to the back of the list.
+ assert(segments.empty() || segments.back().end <= S.start);
+ segments.push_back(S);
+}
+
+/// extendInBlock - If this range is live before Kill in the basic
+/// block that starts at StartIdx, extend it to be live up to Kill and return
+/// the value. If there is no live range before Kill, return NULL.
+VNInfo *LiveRange::extendInBlock(SlotIndex StartIdx, SlotIndex Kill) {
+ // Use the segment set, if it is available.
+ if (segmentSet != nullptr)
+ return CalcLiveRangeUtilSet(this).extendInBlock(StartIdx, Kill);
+ // Otherwise use the segment vector.
+ return CalcLiveRangeUtilVector(this).extendInBlock(StartIdx, Kill);
+}
+
+/// Remove the specified segment from this range. Note that the span
+/// [Start, End) must lie within a single Segment in its entirety.
+void LiveRange::removeSegment(SlotIndex Start, SlotIndex End,
+ bool RemoveDeadValNo) {
+ // Find the Segment containing this span.
+ iterator I = find(Start);
+ assert(I != end() && "Segment is not in range!");
+ assert(I->containsInterval(Start, End)
+ && "Segment is not entirely in range!");
+
+ // If the span we are removing is at the start of the Segment, adjust it.
+ VNInfo *ValNo = I->valno;
+ if (I->start == Start) {
+ if (I->end == End) {
+ if (RemoveDeadValNo) {
+ // Check if val# is dead.
+ bool isDead = true;
+ for (const_iterator II = begin(), EE = end(); II != EE; ++II)
+ if (II != I && II->valno == ValNo) {
+ isDead = false;
+ break;
+ }
+ if (isDead) {
+ // Now that ValNo is dead, remove it.
+ markValNoForDeletion(ValNo);
+ }
+ }
+
+ segments.erase(I); // Removed the whole Segment.
+ } else
+ I->start = End;
+ return;
+ }
+
+ // Otherwise if the span we are removing is at the end of the Segment,
+ // adjust the other way.
+ if (I->end == End) {
+ I->end = Start;
+ return;
+ }
+
+ // Otherwise, we are splitting the Segment into two pieces.
+ SlotIndex OldEnd = I->end;
+ I->end = Start; // Trim the old segment.
+
+ // Insert the new one.
+ segments.insert(std::next(I), Segment(End, OldEnd, ValNo));
+}
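+
+// Examples (illustrative): removing [4,8) from a segment [0,12) splits it
+// into [0,4) and [8,12); removing [0,4) just trims it to [4,12); removing
+// [0,12) erases the segment entirely (and, if requested, a now-dead value
+// number along with it).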
+
+/// removeValNo - Remove all the segments defined by the specified value#.
+/// Also remove the value# from value# list.
+void LiveRange::removeValNo(VNInfo *ValNo) {
+ if (empty()) return;
+ segments.erase(std::remove_if(begin(), end(), [ValNo](const Segment &S) {
+ return S.valno == ValNo;
+ }), end());
+ // Now that ValNo is dead, remove it.
+ markValNoForDeletion(ValNo);
+}
+
+void LiveRange::join(LiveRange &Other,
+ const int *LHSValNoAssignments,
+ const int *RHSValNoAssignments,
+ SmallVectorImpl<VNInfo *> &NewVNInfo) {
+ verify();
+
+ // Determine if any of our values are mapped. This is uncommon, so we want
+ // to avoid the range scan if not.
+ bool MustMapCurValNos = false;
+ unsigned NumVals = getNumValNums();
+ unsigned NumNewVals = NewVNInfo.size();
+ for (unsigned i = 0; i != NumVals; ++i) {
+ unsigned LHSValID = LHSValNoAssignments[i];
+ if (i != LHSValID ||
+ (NewVNInfo[LHSValID] && NewVNInfo[LHSValID] != getValNumInfo(i))) {
+ MustMapCurValNos = true;
+ break;
+ }
+ }
+
+ // If we have to apply a mapping to our base range assignment, rewrite it now.
+ if (MustMapCurValNos && !empty()) {
+ // Map the first live range.
+
+ iterator OutIt = begin();
+ OutIt->valno = NewVNInfo[LHSValNoAssignments[OutIt->valno->id]];
+ for (iterator I = std::next(OutIt), E = end(); I != E; ++I) {
+ VNInfo* nextValNo = NewVNInfo[LHSValNoAssignments[I->valno->id]];
+ assert(nextValNo && "Huh?");
+
+ // If this live range has the same value # as its immediate predecessor,
+ // and if they are neighbors, remove one Segment. This happens when we
+ // have [0,4:0)[4,7:1) and map 0/1 onto the same value #.
+ if (OutIt->valno == nextValNo && OutIt->end == I->start) {
+ OutIt->end = I->end;
+ } else {
+ // Didn't merge. Move OutIt to the next segment.
+ ++OutIt;
+ OutIt->valno = nextValNo;
+ if (OutIt != I) {
+ OutIt->start = I->start;
+ OutIt->end = I->end;
+ }
+ }
+ }
+ // If we merge some segments, chop off the end.
+ ++OutIt;
+ segments.erase(OutIt, end());
+ }
+
+ // Rewrite Other values before changing the VNInfo ids.
+ // This can leave Other in an invalid state because we're not coalescing
+ // touching segments that now have identical values. That's OK since Other is
+ // not supposed to be valid after calling join().
+ for (Segment &S : Other.segments)
+ S.valno = NewVNInfo[RHSValNoAssignments[S.valno->id]];
+
+ // Update val# info. Renumber them and make sure they all belong to this
+ // LiveRange now. Also remove dead val#'s.
+ unsigned NumValNos = 0;
+ for (unsigned i = 0; i < NumNewVals; ++i) {
+ VNInfo *VNI = NewVNInfo[i];
+ if (VNI) {
+ if (NumValNos >= NumVals)
+ valnos.push_back(VNI);
+ else
+ valnos[NumValNos] = VNI;
+ VNI->id = NumValNos++; // Renumber val#.
+ }
+ }
+ if (NumNewVals < NumVals)
+ valnos.resize(NumNewVals); // shrinkify
+
+ // Okay, now insert the RHS live segments into the LHS.
+ LiveRangeUpdater Updater(this);
+ for (Segment &S : Other.segments)
+ Updater.add(S);
+}
+
+/// Merge all of the segments in RHS into this live range as the specified
+/// value number. The segments in RHS are allowed to overlap with segments in
+/// the current range, but only if the overlapping segments have the
+/// specified value number.
+void LiveRange::MergeSegmentsInAsValue(const LiveRange &RHS,
+ VNInfo *LHSValNo) {
+ LiveRangeUpdater Updater(this);
+ for (const Segment &S : RHS.segments)
+ Updater.add(S.start, S.end, LHSValNo);
+}
+
+/// MergeValueInAsValue - Merge all of the live segments of a specific val#
+/// in RHS into this live range as the specified value number.
+/// The segments in RHS are allowed to overlap with segments in the
+/// current range; the value numbers of the overlapped segments are
+/// replaced with the specified value number.
+void LiveRange::MergeValueInAsValue(const LiveRange &RHS,
+ const VNInfo *RHSValNo,
+ VNInfo *LHSValNo) {
+ LiveRangeUpdater Updater(this);
+ for (const Segment &S : RHS.segments)
+ if (S.valno == RHSValNo)
+ Updater.add(S.start, S.end, LHSValNo);
+}
+
+/// MergeValueNumberInto - This method is called when two value numbers
+/// are found to be equivalent. This eliminates V1, replacing all
+/// segments with the V1 value number with the V2 value number. This can
+/// cause merging of V1/V2 value numbers and compaction of the value space.
+VNInfo *LiveRange::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
+ assert(V1 != V2 && "Identical value#'s are always equivalent!");
+
+ // This code actually merges the (numerically) larger value number into the
+ // smaller value number, which is likely to allow us to compactify the value
+ // space. The only thing we have to be careful of is to preserve the
+ // instruction that defines the result value.
+
+ // Make sure V2 is smaller than V1.
+ if (V1->id < V2->id) {
+ V1->copyFrom(*V2);
+ std::swap(V1, V2);
+ }
+
+ // Merge V1 segments into V2.
+ for (iterator I = begin(); I != end(); ) {
+ iterator S = I++;
+ if (S->valno != V1) continue; // Not a V1 Segment.
+
+ // Okay, we found a V1 live range. If it had a previous, touching, V2 live
+ // range, extend it.
+ if (S != begin()) {
+ iterator Prev = S-1;
+ if (Prev->valno == V2 && Prev->end == S->start) {
+ Prev->end = S->end;
+
+ // Erase this live-range.
+ segments.erase(S);
+ I = Prev+1;
+ S = Prev;
+ }
+ }
+
+ // Okay, now we have a V1 or V2 live range that is maximally merged forward.
+ // Ensure that it is a V2 live-range.
+ S->valno = V2;
+
+ // If we can merge it into later V2 segments, do so now. We ignore any
+ // following V1 segments, as they will be merged in subsequent iterations
+ // of the loop.
+ if (I != end()) {
+ if (I->start == S->end && I->valno == V2) {
+ S->end = I->end;
+ segments.erase(I);
+ I = S+1;
+ }
+ }
+ }
+
+ // Now that V1 is dead, remove it.
+ markValNoForDeletion(V1);
+
+ return V2;
+}
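+
+// Example (illustrative): merging V1 into V2 in a range printed as
+// [0,4:1)[4,8:0), where value number 1 is V1 and 0 is V2, first rewrites
+// the V1 segment to use V2 and then coalesces the now-touching segments
+// into [0,8:0).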
+
+void LiveRange::flushSegmentSet() {
+ assert(segmentSet != nullptr && "segment set must have been created");
+ assert(
+ segments.empty() &&
+ "segment set can be used only initially before switching to the array");
+ segments.append(segmentSet->begin(), segmentSet->end());
+ segmentSet = nullptr;
+ verify();
+}
+
+bool LiveRange::isLiveAtIndexes(ArrayRef<SlotIndex> Slots) const {
+ ArrayRef<SlotIndex>::iterator SlotI = Slots.begin();
+ ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+ // If there are no regmask slots, we have nothing to search.
+ if (SlotI == SlotE)
+ return false;
+
+ // Start our search at the first segment that ends after the first slot.
+ const_iterator SegmentI = find(*SlotI);
+ const_iterator SegmentE = end();
+
+ // If there are no segments that end after the first slot, we're done.
+ if (SegmentI == SegmentE)
+ return false;
+
+ // Look for each slot in the live range.
+ for ( ; SlotI != SlotE; ++SlotI) {
+ // Go to the next segment that ends after the current slot.
+ // The slot may be within a hole in the range.
+ SegmentI = advanceTo(SegmentI, *SlotI);
+ if (SegmentI == SegmentE)
+ return false;
+
+ // If this segment contains the slot, we're done.
+ if (SegmentI->contains(*SlotI))
+ return true;
+ // Otherwise, look for the next slot.
+ }
+
+ // We didn't find a segment containing any of the slots.
+ return false;
+}
+
+void LiveInterval::freeSubRange(SubRange *S) {
+ S->~SubRange();
+ // Memory was allocated with BumpPtr allocator and is not freed here.
+}
+
+void LiveInterval::removeEmptySubRanges() {
+ SubRange **NextPtr = &SubRanges;
+ SubRange *I = *NextPtr;
+ while (I != nullptr) {
+ if (!I->empty()) {
+ NextPtr = &I->Next;
+ I = *NextPtr;
+ continue;
+ }
+ // Skip empty subranges until we find the first nonempty one.
+ do {
+ SubRange *Next = I->Next;
+ freeSubRange(I);
+ I = Next;
+ } while (I != nullptr && I->empty());
+ *NextPtr = I;
+ }
+}
+
+void LiveInterval::clearSubRanges() {
+ for (SubRange *I = SubRanges, *Next; I != nullptr; I = Next) {
+ Next = I->Next;
+ freeSubRange(I);
+ }
+ SubRanges = nullptr;
+}
+
+unsigned LiveInterval::getSize() const {
+ unsigned Sum = 0;
+ for (const Segment &S : segments)
+ Sum += S.start.distance(S.end);
+ return Sum;
+}
+
+raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange::Segment &S) {
+ return os << '[' << S.start << ',' << S.end << ':' << S.valno->id << ')';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveRange::Segment::dump() const {
+ dbgs() << *this << '\n';
+}
+#endif
+
+void LiveRange::print(raw_ostream &OS) const {
+ if (empty())
+ OS << "EMPTY";
+ else {
+ for (const Segment &S : segments) {
+ OS << S;
+ assert(S.valno == getValNumInfo(S.valno->id) && "Bad VNInfo");
+ }
+ }
+
+ // Print value number info.
+ if (getNumValNums()) {
+ OS << " ";
+ unsigned vnum = 0;
+ for (const_vni_iterator i = vni_begin(), e = vni_end(); i != e;
+ ++i, ++vnum) {
+ const VNInfo *vni = *i;
+ if (vnum) OS << ' ';
+ OS << vnum << '@';
+ if (vni->isUnused()) {
+ OS << 'x';
+ } else {
+ OS << vni->def;
+ if (vni->isPHIDef())
+ OS << "-phi";
+ }
+ }
+ }
+}
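+
+// For reference (an illustrative rendering, following the format produced
+// above): a range with two segments of one value defined at index 16r
+// might print as "[16r,48r:0)[80r,96r:0) 0@16r"; an unused value number
+// prints its def as 'x', and a PHI def carries a "-phi" suffix.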
+
+void LiveInterval::SubRange::print(raw_ostream &OS) const {
+ OS << " L" << PrintLaneMask(LaneMask) << ' '
+ << static_cast<const LiveRange&>(*this);
+}
+
+void LiveInterval::print(raw_ostream &OS) const {
+ OS << PrintReg(reg) << ' ';
+ super::print(OS);
+ // Print subranges
+ for (const SubRange &SR : subranges())
+ OS << SR;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void LiveRange::dump() const {
+ dbgs() << *this << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveInterval::SubRange::dump() const {
+ dbgs() << *this << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveInterval::dump() const {
+ dbgs() << *this << '\n';
+}
+#endif
+
+#ifndef NDEBUG
+void LiveRange::verify() const {
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ assert(I->start.isValid());
+ assert(I->end.isValid());
+ assert(I->start < I->end);
+ assert(I->valno != nullptr);
+ assert(I->valno->id < valnos.size());
+ assert(I->valno == valnos[I->valno->id]);
+ if (std::next(I) != E) {
+ assert(I->end <= std::next(I)->start);
+ if (I->end == std::next(I)->start)
+ assert(I->valno != std::next(I)->valno);
+ }
+ }
+}
+
+void LiveInterval::verify(const MachineRegisterInfo *MRI) const {
+ super::verify();
+
+ // Make sure SubRanges are fine and LaneMasks are disjoint.
+ LaneBitmask Mask = 0;
+ LaneBitmask MaxMask = MRI != nullptr ? MRI->getMaxLaneMaskForVReg(reg) : ~0u;
+ for (const SubRange &SR : subranges()) {
+ // The subrange lane mask should be disjoint from all previous subrange masks.
+ assert((Mask & SR.LaneMask) == 0);
+ Mask |= SR.LaneMask;
+
+ // The subrange mask should be contained in the maximum lane mask for the vreg.
+ assert((Mask & ~MaxMask) == 0);
+ // empty subranges must be removed.
+ assert(!SR.empty());
+
+ SR.verify();
+ // The main live range should cover the subrange.
+ assert(covers(SR));
+ }
+}
+#endif
+
+
+//===----------------------------------------------------------------------===//
+// LiveRangeUpdater class
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeUpdater class always maintains these invariants:
+//
+// - When LastStart is invalid, Spills is empty and the iterators are invalid.
+// This is the initial state, and the state created by flush().
+// In this state, isDirty() returns false.
+//
+// Otherwise, segments are kept in three separate areas:
+//
+// 1. [begin; WriteI) at the front of LR.
+// 2. [ReadI; end) at the back of LR.
+// 3. Spills.
+//
+// - LR.begin() <= WriteI <= ReadI <= LR.end().
+// - Segments in all three areas are fully ordered and coalesced.
+// - Segments in area 1 precede and can't coalesce with segments in area 2.
+// - Segments in Spills precede and can't coalesce with segments in area 2.
+// - No coalescing is possible between segments in Spills and segments in area
+// 1, and there are no overlapping segments.
+//
+// The segments in Spills are not ordered with respect to the segments in area
+// 1. They need to be merged.
+//
+// When they exist, Spills.back().start <= LastStart,
+// and WriteI[-1].start <= LastStart.
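+//
+// A rough illustration (not from the original comment): after a sequence
+// of add() calls with non-decreasing start indexes, the segments may be
+// laid out as
+//
+//   area 1 [begin;WriteI) | gap [WriteI;ReadI) | area 2 [ReadI;end)
+//   Spills: new segments that have not fit into the gap yet
+//
+// flush() resizes the gap to match Spills, merges the spilled segments
+// into it, and erases any gap entries that are left over.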
+
+void LiveRangeUpdater::print(raw_ostream &OS) const {
+ if (!isDirty()) {
+ if (LR)
+ OS << "Clean updater: " << *LR << '\n';
+ else
+ OS << "Null updater.\n";
+ return;
+ }
+ assert(LR && "Can't have null LR in dirty updater.");
+ OS << " updater with gap = " << (ReadI - WriteI)
+ << ", last start = " << LastStart
+ << ":\n Area 1:";
+ for (const auto &S : make_range(LR->begin(), WriteI))
+ OS << ' ' << S;
+ OS << "\n Spills:";
+ for (unsigned I = 0, E = Spills.size(); I != E; ++I)
+ OS << ' ' << Spills[I];
+ OS << "\n Area 2:";
+ for (const auto &S : make_range(ReadI, LR->end()))
+ OS << ' ' << S;
+ OS << '\n';
+}
+
+LLVM_DUMP_METHOD void LiveRangeUpdater::dump() const {
+ print(errs());
+}
+
+// Determine if A and B should be coalesced.
+static inline bool coalescable(const LiveRange::Segment &A,
+ const LiveRange::Segment &B) {
+ assert(A.start <= B.start && "Unordered live segments.");
+ if (A.end == B.start)
+ return A.valno == B.valno;
+ if (A.end < B.start)
+ return false;
+ assert(A.valno == B.valno && "Cannot overlap different values");
+ return true;
+}
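+
+// Examples (illustrative): with the same value number, [0,8) and [8,16)
+// coalesce because they touch, and [0,12) and [8,16) coalesce because they
+// overlap; [0,8) and [12,16) do not because of the gap between them, and
+// touching segments with different value numbers do not coalesce either.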
+
+void LiveRangeUpdater::add(LiveRange::Segment Seg) {
+ assert(LR && "Cannot add to a null destination");
+
+ // Fall back to the regular add method if the live range
+ // is using the segment set instead of the segment vector.
+ if (LR->segmentSet != nullptr) {
+ LR->addSegmentToSet(Seg);
+ return;
+ }
+
+ // Flush the state if Start moves backwards.
+ if (!LastStart.isValid() || LastStart > Seg.start) {
+ if (isDirty())
+ flush();
+ // This brings us to an uninitialized state. Reinitialize.
+ assert(Spills.empty() && "Leftover spilled segments");
+ WriteI = ReadI = LR->begin();
+ }
+
+ // Remember start for next time.
+ LastStart = Seg.start;
+
+ // Advance ReadI until it ends after Seg.start.
+ LiveRange::iterator E = LR->end();
+ if (ReadI != E && ReadI->end <= Seg.start) {
+ // First try to close the gap between WriteI and ReadI with spills.
+ if (ReadI != WriteI)
+ mergeSpills();
+ // Then advance ReadI.
+ if (ReadI == WriteI)
+ ReadI = WriteI = LR->find(Seg.start);
+ else
+ while (ReadI != E && ReadI->end <= Seg.start)
+ *WriteI++ = *ReadI++;
+ }
+
+ assert(ReadI == E || ReadI->end > Seg.start);
+
+ // Check if the ReadI segment begins early.
+ if (ReadI != E && ReadI->start <= Seg.start) {
+ assert(ReadI->valno == Seg.valno && "Cannot overlap different values");
+ // Bail if Seg is completely contained in ReadI.
+ if (ReadI->end >= Seg.end)
+ return;
+ // Coalesce into Seg.
+ Seg.start = ReadI->start;
+ ++ReadI;
+ }
+
+ // Coalesce as much as possible from ReadI into Seg.
+ while (ReadI != E && coalescable(Seg, *ReadI)) {
+ Seg.end = std::max(Seg.end, ReadI->end);
+ ++ReadI;
+ }
+
+ // Try coalescing Spills.back() into Seg.
+ if (!Spills.empty() && coalescable(Spills.back(), Seg)) {
+ Seg.start = Spills.back().start;
+ Seg.end = std::max(Spills.back().end, Seg.end);
+ Spills.pop_back();
+ }
+
+ // Try coalescing Seg into WriteI[-1].
+ if (WriteI != LR->begin() && coalescable(WriteI[-1], Seg)) {
+ WriteI[-1].end = std::max(WriteI[-1].end, Seg.end);
+ return;
+ }
+
+ // Seg doesn't coalesce with anything, and needs to be inserted somewhere.
+ if (WriteI != ReadI) {
+ *WriteI++ = Seg;
+ return;
+ }
+
+ // Finally, append to LR or Spills.
+ if (WriteI == E) {
+ LR->segments.push_back(Seg);
+ WriteI = ReadI = LR->end();
+ } else
+ Spills.push_back(Seg);
+}
+
+// Merge as many spilled segments as possible into the gap between WriteI
+// and ReadI. Advance WriteI to reflect the inserted segments.
+void LiveRangeUpdater::mergeSpills() {
+ // Perform a backwards merge of Spills and [SpillI;WriteI).
+ size_t GapSize = ReadI - WriteI;
+ size_t NumMoved = std::min(Spills.size(), GapSize);
+ LiveRange::iterator Src = WriteI;
+ LiveRange::iterator Dst = Src + NumMoved;
+ LiveRange::iterator SpillSrc = Spills.end();
+ LiveRange::iterator B = LR->begin();
+
+ // This is the new WriteI position after merging spills.
+ WriteI = Dst;
+
+ // Now merge Src and Spills backwards.
+ while (Src != Dst) {
+ if (Src != B && Src[-1].start > SpillSrc[-1].start)
+ *--Dst = *--Src;
+ else
+ *--Dst = *--SpillSrc;
+ }
+ assert(NumMoved == size_t(Spills.end() - SpillSrc));
+ Spills.erase(SpillSrc, Spills.end());
+}
+
+void LiveRangeUpdater::flush() {
+ if (!isDirty())
+ return;
+ // Clear the dirty state.
+ LastStart = SlotIndex();
+
+ assert(LR && "Cannot add to a null destination");
+
+ // Nothing to merge?
+ if (Spills.empty()) {
+ LR->segments.erase(WriteI, ReadI);
+ LR->verify();
+ return;
+ }
+
+ // Resize the WriteI - ReadI gap to match Spills.
+ size_t GapSize = ReadI - WriteI;
+ if (GapSize < Spills.size()) {
+ // The gap is too small. Make some room.
+ size_t WritePos = WriteI - LR->begin();
+ LR->segments.insert(ReadI, Spills.size() - GapSize, LiveRange::Segment());
+ // This also invalidated ReadI, but it is recomputed below.
+ WriteI = LR->begin() + WritePos;
+ } else {
+ // Shrink the gap if necessary.
+ LR->segments.erase(WriteI + Spills.size(), ReadI);
+ }
+ ReadI = WriteI + Spills.size();
+ mergeSpills();
+ LR->verify();
+}
+
+unsigned ConnectedVNInfoEqClasses::Classify(const LiveRange &LR) {
+ // Create initial equivalence classes.
+ EqClass.clear();
+ EqClass.grow(LR.getNumValNums());
+
+ const VNInfo *used = nullptr, *unused = nullptr;
+
+ // Determine connections.
+ for (const VNInfo *VNI : LR.valnos) {
+ // Group all unused values into one class.
+ if (VNI->isUnused()) {
+ if (unused)
+ EqClass.join(unused->id, VNI->id);
+ unused = VNI;
+ continue;
+ }
+ used = VNI;
+ if (VNI->isPHIDef()) {
+ const MachineBasicBlock *MBB = LIS.getMBBFromIndex(VNI->def);
+ assert(MBB && "Phi-def has no defining MBB");
+ // Connect to values live out of predecessors.
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI)
+ if (const VNInfo *PVNI = LR.getVNInfoBefore(LIS.getMBBEndIdx(*PI)))
+ EqClass.join(VNI->id, PVNI->id);
+ } else {
+ // Normal value defined by an instruction. Check for two-addr redef.
+ // FIXME: This could be coincidental. Should we really check for a tied
+ // operand constraint?
+ // Note that VNI->def may be a use slot for an early clobber def.
+ if (const VNInfo *UVNI = LR.getVNInfoBefore(VNI->def))
+ EqClass.join(VNI->id, UVNI->id);
+ }
+ }
+
+ // Lump all the unused values in with the last used value.
+ if (used && unused)
+ EqClass.join(used->id, unused->id);
+
+ EqClass.compress();
+ return EqClass.getNumClasses();
+}
+
+void ConnectedVNInfoEqClasses::Distribute(LiveInterval &LI, LiveInterval *LIV[],
+ MachineRegisterInfo &MRI) {
+ // Rewrite instructions.
+ for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LI.reg),
+ RE = MRI.reg_end(); RI != RE;) {
+ MachineOperand &MO = *RI;
+ MachineInstr *MI = RI->getParent();
+ ++RI;
+ // DBG_VALUE instructions don't have slot indexes, so get the index of the
+ // instruction before them.
+ // Normally, DBG_VALUE instructions are removed before this function is
+ // called, but it is not a requirement.
+ SlotIndex Idx;
+ if (MI->isDebugValue())
+ Idx = LIS.getSlotIndexes()->getIndexBefore(*MI);
+ else
+ Idx = LIS.getInstructionIndex(*MI);
+ LiveQueryResult LRQ = LI.Query(Idx);
+ const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined();
+ // In the case of an <undef> use that isn't tied to any def, VNI will be
+ // NULL. If the use is tied to a def, VNI will be the defined value.
+ if (!VNI)
+ continue;
+ if (unsigned EqClass = getEqClass(VNI))
+ MO.setReg(LIV[EqClass-1]->reg);
+ }
+
+ // Distribute subregister liveranges.
+ if (LI.hasSubRanges()) {
+ unsigned NumComponents = EqClass.getNumClasses();
+ SmallVector<unsigned, 8> VNIMapping;
+ SmallVector<LiveInterval::SubRange*, 8> SubRanges;
+ BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ // Create new subranges in the split intervals and construct a mapping
+ // for the VNInfos in the subrange.
+ unsigned NumValNos = SR.valnos.size();
+ VNIMapping.clear();
+ VNIMapping.reserve(NumValNos);
+ SubRanges.clear();
+ SubRanges.resize(NumComponents-1, nullptr);
+ for (unsigned I = 0; I < NumValNos; ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ unsigned ComponentNum;
+ if (VNI.isUnused()) {
+ ComponentNum = 0;
+ } else {
+ const VNInfo *MainRangeVNI = LI.getVNInfoAt(VNI.def);
+ assert(MainRangeVNI != nullptr
+ && "SubRange def must have corresponding main range def");
+ ComponentNum = getEqClass(MainRangeVNI);
+ if (ComponentNum > 0 && SubRanges[ComponentNum-1] == nullptr) {
+ SubRanges[ComponentNum-1]
+ = LIV[ComponentNum-1]->createSubRange(Allocator, SR.LaneMask);
+ }
+ }
+ VNIMapping.push_back(ComponentNum);
+ }
+ DistributeRange(SR, SubRanges.data(), VNIMapping);
+ }
+ LI.removeEmptySubRanges();
+ }
+
+ // Distribute main liverange.
+ DistributeRange(LI, LIV, EqClass);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
new file mode 100644
index 000000000000..5f3281f6771d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -0,0 +1,1575 @@
+//===-- LiveIntervalAnalysis.cpp - Live Interval Analysis -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveInterval analysis pass which is used
+// by the Linear Scan Register allocator. This pass linearizes the
+// basic blocks of the function in DFS order and computes live intervals for
+// each virtual and physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "LiveRangeCalc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/BlockFrequency.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+char LiveIntervals::ID = 0;
+char &llvm::LiveIntervalsID = LiveIntervals::ID;
+INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals",
+ "Live Interval Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveIntervals, "liveintervals",
+ "Live Interval Analysis", false, false)
+
+#ifndef NDEBUG
+static cl::opt<bool> EnablePrecomputePhysRegs(
+ "precompute-phys-liveness", cl::Hidden,
+ cl::desc("Eagerly compute live intervals for all physreg units."));
+#else
+static bool EnablePrecomputePhysRegs = false;
+#endif // NDEBUG
+
+static cl::opt<bool> EnableSubRegLiveness(
+ "enable-subreg-liveness", cl::Hidden, cl::init(true),
+ cl::desc("Enable subregister liveness tracking."));
+
+namespace llvm {
+cl::opt<bool> UseSegmentSetForPhysRegs(
+ "use-segment-set-for-physregs", cl::Hidden, cl::init(true),
+ cl::desc(
+ "Use segment set for the computation of the live ranges of physregs."));
+}
+
+void LiveIntervals::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addRequiredTransitiveID(MachineDominatorsID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequiredTransitive<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+LiveIntervals::LiveIntervals() : MachineFunctionPass(ID),
+ DomTree(nullptr), LRCalc(nullptr) {
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+}
+
+LiveIntervals::~LiveIntervals() {
+ delete LRCalc;
+}
+
+void LiveIntervals::releaseMemory() {
+ // Free the live intervals themselves.
+ for (unsigned i = 0, e = VirtRegIntervals.size(); i != e; ++i)
+ delete VirtRegIntervals[TargetRegisterInfo::index2VirtReg(i)];
+ VirtRegIntervals.clear();
+ RegMaskSlots.clear();
+ RegMaskBits.clear();
+ RegMaskBlocks.clear();
+
+ for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+ delete RegUnitRanges[i];
+ RegUnitRanges.clear();
+
+ // Release VNInfo memory regions, VNInfo objects don't need to be dtor'd.
+ VNInfoAllocator.Reset();
+}
+
+/// runOnMachineFunction - calculates LiveIntervals
+///
+bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ MRI = &MF->getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ Indexes = &getAnalysis<SlotIndexes>();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+
+ if (EnableSubRegLiveness && MF->getSubtarget().enableSubRegLiveness())
+ MRI->enableSubRegLiveness(true);
+
+ if (!LRCalc)
+ LRCalc = new LiveRangeCalc();
+
+ // Allocate space for all virtual registers.
+ VirtRegIntervals.resize(MRI->getNumVirtRegs());
+
+ computeVirtRegs();
+ computeRegMasks();
+ computeLiveInRegUnits();
+
+ if (EnablePrecomputePhysRegs) {
+ // For stress testing, precompute live ranges of all physical register
+ // units, including reserved registers.
+ for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+ getRegUnit(i);
+ }
+ DEBUG(dump());
+ return true;
+}
+
+/// print - Implement the dump method.
+void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
+ OS << "********** INTERVALS **********\n";
+
+ // Dump the regunits.
+ for (unsigned i = 0, e = RegUnitRanges.size(); i != e; ++i)
+ if (LiveRange *LR = RegUnitRanges[i])
+ OS << PrintRegUnit(i, TRI) << ' ' << *LR << '\n';
+
+ // Dump the virtregs.
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (hasInterval(Reg))
+ OS << getInterval(Reg) << '\n';
+ }
+
+ OS << "RegMasks:";
+ for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+ OS << ' ' << RegMaskSlots[i];
+ OS << '\n';
+
+ printInstrs(OS);
+}
+
+void LiveIntervals::printInstrs(raw_ostream &OS) const {
+ OS << "********** MACHINEINSTRS **********\n";
+ MF->print(OS, Indexes);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void LiveIntervals::dumpInstrs() const {
+ printInstrs(dbgs());
+}
+#endif
+
+LiveInterval* LiveIntervals::createInterval(unsigned reg) {
+ float Weight = TargetRegisterInfo::isPhysicalRegister(reg) ?
+ llvm::huge_valf : 0.0F;
+ return new LiveInterval(reg, Weight);
+}
+
+
+/// computeVirtRegInterval - Compute the live interval of a virtual register,
+/// based on defs and uses.
+void LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
+ assert(LRCalc && "LRCalc not initialized.");
+ assert(LI.empty() && "Should only compute empty intervals.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LRCalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
+ computeDeadValues(LI, nullptr);
+}
+
+void LiveIntervals::computeVirtRegs() {
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ createAndComputeVirtRegInterval(Reg);
+ }
+}
+
+void LiveIntervals::computeRegMasks() {
+ RegMaskBlocks.resize(MF->getNumBlockIDs());
+
+ // Find all instructions with regmask operands.
+ for (MachineBasicBlock &MBB : *MF) {
+ std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB.getNumber()];
+ RMB.first = RegMaskSlots.size();
+
+ // Some block starts, such as EH funclets, create masks.
+ if (const uint32_t *Mask = MBB.getBeginClobberMask(TRI)) {
+ RegMaskSlots.push_back(Indexes->getMBBStartIdx(&MBB));
+ RegMaskBits.push_back(Mask);
+ }
+
+ for (MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot());
+ RegMaskBits.push_back(MO.getRegMask());
+ }
+ }
+
+ // Some block ends, such as funclet returns, create masks. Put the mask on
+ // the last instruction of the block, because MBB slot index intervals are
+ // half-open.
+ if (const uint32_t *Mask = MBB.getEndClobberMask(TRI)) {
+ assert(!MBB.empty() && "empty return block?");
+ RegMaskSlots.push_back(
+ Indexes->getInstructionIndex(MBB.back()).getRegSlot());
+ RegMaskBits.push_back(Mask);
+ }
+
+ // Compute the number of register mask instructions in this block.
+ RMB.second = RegMaskSlots.size() - RMB.first;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Register Unit Liveness
+//===----------------------------------------------------------------------===//
+//
+// Fixed interference typically comes from ABI boundaries: Function arguments
+// and return values are passed in fixed registers, and so are exception
+// pointers entering landing pads. Certain instructions require values to be
+// present in specific registers. That is also represented through fixed
+// interference.
+//
+
+/// computeRegUnitRange - Compute the live range of a register unit, based
+/// on the uses and defs of aliasing registers. The range should be empty,
+/// or contain only dead phi-defs from ABI blocks.
+void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+
+ // The physregs aliasing Unit are the roots and their super-registers.
+ // Create all values as dead defs before extending to uses. Note that roots
+ // may share super-registers. That's OK because createDeadDefs() is
+ // idempotent. It is very rare for a register unit to have multiple roots, so
+ // uniquing super-registers is probably not worthwhile.
+ for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+ for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
+ Supers.isValid(); ++Supers) {
+ if (!MRI->reg_empty(*Supers))
+ LRCalc->createDeadDefs(LR, *Supers);
+ }
+ }
+
+ // Now extend LR to reach all uses.
+ // Ignore uses of reserved registers. We only track defs of those.
+ for (MCRegUnitRootIterator Roots(Unit, TRI); Roots.isValid(); ++Roots) {
+ for (MCSuperRegIterator Supers(*Roots, TRI, /*IncludeSelf=*/true);
+ Supers.isValid(); ++Supers) {
+ unsigned Reg = *Supers;
+ if (!MRI->isReserved(Reg) && !MRI->reg_empty(Reg))
+ LRCalc->extendToUses(LR, Reg);
+ }
+ }
+
+ // Flush the segment set to the segment vector.
+ if (UseSegmentSetForPhysRegs)
+ LR.flushSegmentSet();
+}
+
+
+/// computeLiveInRegUnits - Precompute the live ranges of any register units
+/// that are live-in to an ABI block somewhere. Register values can appear
+/// without a corresponding def when entering the entry block or a landing pad.
+///
+void LiveIntervals::computeLiveInRegUnits() {
+ RegUnitRanges.resize(TRI->getNumRegUnits());
+ DEBUG(dbgs() << "Computing live-in reg-units in ABI blocks.\n");
+
+ // Keep track of the live range sets allocated.
+ SmallVector<unsigned, 8> NewRanges;
+
+ // Check all basic blocks for live-ins.
+ for (MachineFunction::const_iterator MFI = MF->begin(), MFE = MF->end();
+ MFI != MFE; ++MFI) {
+ const MachineBasicBlock *MBB = &*MFI;
+
+ // We only care about ABI blocks: Entry + landing pads.
+ if ((MFI != MF->begin() && !MBB->isEHPad()) || MBB->livein_empty())
+ continue;
+
+ // Create phi-defs at Begin for all live-in registers.
+ SlotIndex Begin = Indexes->getMBBStartIdx(MBB);
+ DEBUG(dbgs() << Begin << "\tBB#" << MBB->getNumber());
+ for (const auto &LI : MBB->liveins()) {
+ for (MCRegUnitIterator Units(LI.PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = *Units;
+ LiveRange *LR = RegUnitRanges[Unit];
+ if (!LR) {
+ // Use segment set to speed-up initial computation of the live range.
+ LR = RegUnitRanges[Unit] = new LiveRange(UseSegmentSetForPhysRegs);
+ NewRanges.push_back(Unit);
+ }
+ VNInfo *VNI = LR->createDeadDef(Begin, getVNInfoAllocator());
+ (void)VNI;
+ DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << '#' << VNI->id);
+ }
+ }
+ DEBUG(dbgs() << '\n');
+ }
+ DEBUG(dbgs() << "Created " << NewRanges.size() << " new intervals.\n");
+
+ // Compute the 'normal' part of the ranges.
+ for (unsigned i = 0, e = NewRanges.size(); i != e; ++i) {
+ unsigned Unit = NewRanges[i];
+ computeRegUnitRange(*RegUnitRanges[Unit], Unit);
+ }
+}
+
+
+static void createSegmentsForValues(LiveRange &LR,
+ iterator_range<LiveInterval::vni_iterator> VNIs) {
+ for (auto VNI : VNIs) {
+ if (VNI->isUnused())
+ continue;
+ SlotIndex Def = VNI->def;
+ LR.addSegment(LiveRange::Segment(Def, Def.getDeadSlot(), VNI));
+ }
+}
+
+typedef SmallVector<std::pair<SlotIndex, VNInfo*>, 16> ShrinkToUsesWorkList;
+
+static void extendSegmentsToUses(LiveRange &LR, const SlotIndexes &Indexes,
+ ShrinkToUsesWorkList &WorkList,
+ const LiveRange &OldRange) {
+ // Keep track of the PHIs that are in use.
+ SmallPtrSet<VNInfo*, 8> UsedPHIs;
+ // Blocks that have already been added to WorkList as live-out.
+ SmallPtrSet<MachineBasicBlock*, 16> LiveOut;
+
+ // Extend intervals to reach all uses in WorkList.
+ while (!WorkList.empty()) {
+ SlotIndex Idx = WorkList.back().first;
+ VNInfo *VNI = WorkList.back().second;
+ WorkList.pop_back();
+ const MachineBasicBlock *MBB = Indexes.getMBBFromIndex(Idx.getPrevSlot());
+ SlotIndex BlockStart = Indexes.getMBBStartIdx(MBB);
+
+ // Extend the live range for VNI to be live at Idx.
+ if (VNInfo *ExtVNI = LR.extendInBlock(BlockStart, Idx)) {
+ assert(ExtVNI == VNI && "Unexpected existing value number");
+ (void)ExtVNI;
+ // Is this a PHIDef we haven't seen before?
+ if (!VNI->isPHIDef() || VNI->def != BlockStart ||
+ !UsedPHIs.insert(VNI).second)
+ continue;
+ // The PHI is live, make sure the predecessors are live-out.
+ for (auto &Pred : MBB->predecessors()) {
+ if (!LiveOut.insert(Pred).second)
+ continue;
+ SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+ // A predecessor is not required to have a live-out value for a PHI.
+ if (VNInfo *PVNI = OldRange.getVNInfoBefore(Stop))
+ WorkList.push_back(std::make_pair(Stop, PVNI));
+ }
+ continue;
+ }
+
+ // VNI is live-in to MBB.
+ DEBUG(dbgs() << " live-in at " << BlockStart << '\n');
+ LR.addSegment(LiveRange::Segment(BlockStart, Idx, VNI));
+
+ // Make sure VNI is live-out from the predecessors.
+ for (auto &Pred : MBB->predecessors()) {
+ if (!LiveOut.insert(Pred).second)
+ continue;
+ SlotIndex Stop = Indexes.getMBBEndIdx(Pred);
+ assert(OldRange.getVNInfoBefore(Stop) == VNI &&
+ "Wrong value out of predecessor");
+ WorkList.push_back(std::make_pair(Stop, VNI));
+ }
+ }
+}
+
+bool LiveIntervals::shrinkToUses(LiveInterval *li,
+ SmallVectorImpl<MachineInstr*> *dead) {
+ DEBUG(dbgs() << "Shrink: " << *li << '\n');
+ assert(TargetRegisterInfo::isVirtualRegister(li->reg)
+ && "Can only shrink virtual registers");
+
+ // Shrink subregister live ranges.
+ bool NeedsCleanup = false;
+ for (LiveInterval::SubRange &S : li->subranges()) {
+ shrinkToUses(S, li->reg);
+ if (S.empty())
+ NeedsCleanup = true;
+ }
+ if (NeedsCleanup)
+ li->removeEmptySubRanges();
+
+ // Find all the values used, including PHI kills.
+ ShrinkToUsesWorkList WorkList;
+
+ // Visit all instructions reading li->reg.
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = MRI->reg_instr_begin(li->reg), E = MRI->reg_instr_end();
+ I != E; ) {
+ MachineInstr *UseMI = &*(I++);
+ if (UseMI->isDebugValue() || !UseMI->readsVirtualRegister(li->reg))
+ continue;
+ SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
+ LiveQueryResult LRQ = li->Query(Idx);
+ VNInfo *VNI = LRQ.valueIn();
+ if (!VNI) {
+ // This shouldn't happen: readsVirtualRegister returns true, but there is
+ // no live value. It is likely caused by a target getting <undef> flags
+ // wrong.
+ DEBUG(dbgs() << Idx << '\t' << *UseMI
+ << "Warning: Instr claims to read non-existent value in "
+ << *li << '\n');
+ continue;
+ }
+ // Special case: An early-clobber tied operand reads and writes the
+ // register one slot early.
+ if (VNInfo *DefVNI = LRQ.valueDefined())
+ Idx = DefVNI->def;
+
+ WorkList.push_back(std::make_pair(Idx, VNI));
+ }
+
+ // Create new live ranges with only minimal live segments per def.
+ LiveRange NewLR;
+ createSegmentsForValues(NewLR, make_range(li->vni_begin(), li->vni_end()));
+ extendSegmentsToUses(NewLR, *Indexes, WorkList, *li);
+
+ // Move the trimmed segments back.
+ li->segments.swap(NewLR.segments);
+
+ // Handle dead values.
+ bool CanSeparate = computeDeadValues(*li, dead);
+ DEBUG(dbgs() << "Shrunk: " << *li << '\n');
+ return CanSeparate;
+}
+
+bool LiveIntervals::computeDeadValues(LiveInterval &LI,
+ SmallVectorImpl<MachineInstr*> *dead) {
+ bool MayHaveSplitComponents = false;
+ for (auto VNI : LI.valnos) {
+ if (VNI->isUnused())
+ continue;
+ SlotIndex Def = VNI->def;
+ LiveRange::iterator I = LI.FindSegmentContaining(Def);
+ assert(I != LI.end() && "Missing segment for VNI");
+
+ // Is the register live before? Otherwise we may have to add a read-undef
+ // flag for subregister defs.
+ unsigned VReg = LI.reg;
+ if (MRI->shouldTrackSubRegLiveness(VReg)) {
+ if ((I == LI.begin() || std::prev(I)->end < Def) && !VNI->isPHIDef()) {
+ MachineInstr *MI = getInstructionFromIndex(Def);
+ MI->setRegisterDefReadUndef(VReg);
+ }
+ }
+
+ if (I->end != Def.getDeadSlot())
+ continue;
+ if (VNI->isPHIDef()) {
+ // This is a dead PHI. Remove it.
+ VNI->markUnused();
+ LI.removeSegment(I);
+ DEBUG(dbgs() << "Dead PHI at " << Def << " may separate interval\n");
+ MayHaveSplitComponents = true;
+ } else {
+ // This is a dead def. Make sure the instruction knows.
+ MachineInstr *MI = getInstructionFromIndex(Def);
+ assert(MI && "No instruction defining live value");
+ MI->addRegisterDead(LI.reg, TRI);
+ if (dead && MI->allDefsAreDead()) {
+ DEBUG(dbgs() << "All defs dead: " << Def << '\t' << *MI);
+ dead->push_back(MI);
+ }
+ }
+ }
+ return MayHaveSplitComponents;
+}
+
+void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg)
+{
+ DEBUG(dbgs() << "Shrink: " << SR << '\n');
+ assert(TargetRegisterInfo::isVirtualRegister(Reg)
+ && "Can only shrink virtual registers");
+ // Find all the values used, including PHI kills.
+ ShrinkToUsesWorkList WorkList;
+
+ // Visit all instructions reading Reg.
+ SlotIndex LastIdx;
+ for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+ MachineInstr *UseMI = MO.getParent();
+ if (UseMI->isDebugValue())
+ continue;
+ // Maybe the operand is for a subregister we don't care about.
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ LaneBitmask LaneMask = TRI->getSubRegIndexLaneMask(SubReg);
+ if ((LaneMask & SR.LaneMask) == 0)
+ continue;
+ }
+ // We only need to visit each instruction once.
+ SlotIndex Idx = getInstructionIndex(*UseMI).getRegSlot();
+ if (Idx == LastIdx)
+ continue;
+ LastIdx = Idx;
+
+ LiveQueryResult LRQ = SR.Query(Idx);
+ VNInfo *VNI = LRQ.valueIn();
+ // For subranges it is possible that only undef values are left in that
+ // part of the subregister, so there is no real live range at the use.
+ if (!VNI)
+ continue;
+
+ // Special case: An early-clobber tied operand reads and writes the
+ // register one slot early.
+ if (VNInfo *DefVNI = LRQ.valueDefined())
+ Idx = DefVNI->def;
+
+ WorkList.push_back(std::make_pair(Idx, VNI));
+ }
+
+ // Create a new live range with only minimal live segments per def.
+ LiveRange NewLR;
+ createSegmentsForValues(NewLR, make_range(SR.vni_begin(), SR.vni_end()));
+ extendSegmentsToUses(NewLR, *Indexes, WorkList, SR);
+
+ // Move the trimmed ranges back.
+ SR.segments.swap(NewLR.segments);
+
+ // Remove dead PHI value numbers
+ for (auto VNI : SR.valnos) {
+ if (VNI->isUnused())
+ continue;
+ const LiveRange::Segment *Segment = SR.getSegmentContaining(VNI->def);
+ assert(Segment != nullptr && "Missing segment for VNI");
+ if (Segment->end != VNI->def.getDeadSlot())
+ continue;
+ if (VNI->isPHIDef()) {
+ // This is a dead PHI. Remove it.
+ DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n");
+ VNI->markUnused();
+ SR.removeSegment(*Segment);
+ }
+ }
+
+ DEBUG(dbgs() << "Shrunk: " << SR << '\n');
+}
+
+void LiveIntervals::extendToIndices(LiveRange &LR,
+ ArrayRef<SlotIndex> Indices) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+ LRCalc->extend(LR, Indices[i]);
+}
+
+void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints) {
+ LiveQueryResult LRQ = LR.Query(Kill);
+ VNInfo *VNI = LRQ.valueOutOrDead();
+ if (!VNI)
+ return;
+
+ MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(KillMBB);
+
+ // If VNI isn't live out from KillMBB, the value is trivially pruned.
+ if (LRQ.endPoint() < MBBEnd) {
+ LR.removeSegment(Kill, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ return;
+ }
+
+ // VNI is live out of KillMBB.
+ LR.removeSegment(Kill, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+
+ // Find all blocks that are reachable from KillMBB without leaving VNI's live
+ // range. It is possible that KillMBB itself is reachable, so start a DFS
+ // from each successor.
+ typedef SmallPtrSet<MachineBasicBlock*, 9> VisitedTy;
+ VisitedTy Visited;
+ for (MachineBasicBlock::succ_iterator
+ SuccI = KillMBB->succ_begin(), SuccE = KillMBB->succ_end();
+ SuccI != SuccE; ++SuccI) {
+ for (df_ext_iterator<MachineBasicBlock*, VisitedTy>
+ I = df_ext_begin(*SuccI, Visited), E = df_ext_end(*SuccI, Visited);
+ I != E;) {
+ MachineBasicBlock *MBB = *I;
+
+ // Check if VNI is live in to MBB.
+ SlotIndex MBBStart, MBBEnd;
+ std::tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
+ LiveQueryResult LRQ = LR.Query(MBBStart);
+ if (LRQ.valueIn() != VNI) {
+ // This block isn't part of the VNI segment. Prune the search.
+ I.skipChildren();
+ continue;
+ }
+
+ // Prune the search if VNI is killed in MBB.
+ if (LRQ.endPoint() < MBBEnd) {
+ LR.removeSegment(MBBStart, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ I.skipChildren();
+ continue;
+ }
+
+ // VNI is live through MBB.
+ LR.removeSegment(MBBStart, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+ ++I;
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Register allocator hooks.
+//
+
+void LiveIntervals::addKillFlags(const VirtRegMap *VRM) {
+ // Keep track of regunit ranges.
+ SmallVector<std::pair<const LiveRange*, LiveRange::const_iterator>, 8> RU;
+ // Keep track of subregister ranges.
+ SmallVector<std::pair<const LiveInterval::SubRange*,
+ LiveRange::const_iterator>, 4> SRs;
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ const LiveInterval &LI = getInterval(Reg);
+ if (LI.empty())
+ continue;
+
+ // Find the regunit intervals for the assigned register. They may overlap
+ // the virtual register live range, cancelling any kills.
+ RU.clear();
+ for (MCRegUnitIterator Units(VRM->getPhys(Reg), TRI); Units.isValid();
+ ++Units) {
+ const LiveRange &RURange = getRegUnit(*Units);
+ if (RURange.empty())
+ continue;
+ RU.push_back(std::make_pair(&RURange, RURange.find(LI.begin()->end)));
+ }
+
+ if (MRI->subRegLivenessEnabled()) {
+ SRs.clear();
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ SRs.push_back(std::make_pair(&SR, SR.find(LI.begin()->end)));
+ }
+ }
+
+ // Every instruction that kills Reg corresponds to a segment range end
+ // point.
+ for (LiveInterval::const_iterator RI = LI.begin(), RE = LI.end(); RI != RE;
+ ++RI) {
+ // A block index indicates an MBB edge.
+ if (RI->end.isBlock())
+ continue;
+ MachineInstr *MI = getInstructionFromIndex(RI->end);
+ if (!MI)
+ continue;
+
+ // Check if any of the regunits are live beyond the end of RI. That could
+ // happen when a physreg is defined as a copy of a virtreg:
+ //
+ // %EAX = COPY %vreg5
+ // FOO %vreg5 <--- MI, cancel kill because %EAX is live.
+ // BAR %EAX<kill>
+ //
+ // There should be no kill flag on FOO when %vreg5 is rewritten as %EAX.
+ for (auto &RUP : RU) {
+ const LiveRange &RURange = *RUP.first;
+ LiveRange::const_iterator &I = RUP.second;
+ if (I == RURange.end())
+ continue;
+ I = RURange.advanceTo(I, RI->end);
+ if (I == RURange.end() || I->start >= RI->end)
+ continue;
+ // I is overlapping RI.
+ goto CancelKill;
+ }
+
+ if (MRI->subRegLivenessEnabled()) {
+ // When reading a partial undefined value we must not add a kill flag.
+ // The regalloc might have used the undef lane for something else.
+ // Example:
+ // %vreg1 = ... ; R32: %vreg1
+ // %vreg2:high16 = ... ; R64: %vreg2
+ // = read %vreg2<kill> ; R64: %vreg2
+ // = read %vreg1 ; R32: %vreg1
+ // The <kill> flag is correct for %vreg2, but the register allocator may
+ // assign R0L to %vreg1, and R0 to %vreg2 because the low 32bits of R0
+ // are actually never written by %vreg2. After assignment the <kill>
+ // flag at the read instruction is invalid.
+ LaneBitmask DefinedLanesMask;
+ if (!SRs.empty()) {
+ // Compute a mask of lanes that are defined.
+ DefinedLanesMask = 0;
+ for (auto &SRP : SRs) {
+ const LiveInterval::SubRange &SR = *SRP.first;
+ LiveRange::const_iterator &I = SRP.second;
+ if (I == SR.end())
+ continue;
+ I = SR.advanceTo(I, RI->end);
+ if (I == SR.end() || I->start >= RI->end)
+ continue;
+ // I is overlapping RI
+ DefinedLanesMask |= SR.LaneMask;
+ }
+ } else
+ DefinedLanesMask = ~0u;
+
+ bool IsFullWrite = false;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (MO.isUse()) {
+ // Reading any undefined lanes?
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ if ((UseMask & ~DefinedLanesMask) != 0)
+ goto CancelKill;
+ } else if (MO.getSubReg() == 0) {
+ // Writing to the full register?
+ assert(MO.isDef());
+ IsFullWrite = true;
+ }
+ }
+
+ // If an instruction writes to a subregister, a new segment starts in
+ // the LiveInterval. But as this is only overriding part of the register,
+ // adding kill flags is not correct here after registers have been
+ // assigned.
+ if (!IsFullWrite) {
+ // Next segment has to be adjacent in the subregister write case.
+ LiveRange::const_iterator N = std::next(RI);
+ if (N != LI.end() && N->start == RI->end)
+ goto CancelKill;
+ }
+ }
+
+ MI->addRegisterKilled(Reg, nullptr);
+ continue;
+CancelKill:
+ MI->clearRegisterKills(Reg, nullptr);
+ }
+ }
+}
+
+MachineBasicBlock*
+LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const {
+ // A local live range must be fully contained inside the block, meaning it is
+ // defined and killed at instructions, not at block boundaries. It is not
+ // live in or out of any block.
+ //
+ // It is technically possible to have a PHI-defined live range identical to a
+ // single block, but we are going to return false in that case.
+
+ SlotIndex Start = LI.beginIndex();
+ if (Start.isBlock())
+ return nullptr;
+
+ SlotIndex Stop = LI.endIndex();
+ if (Stop.isBlock())
+ return nullptr;
+
+ // getMBBFromIndex doesn't need to search the MBB table when both indexes
+ // belong to proper instructions.
+ MachineBasicBlock *MBB1 = Indexes->getMBBFromIndex(Start);
+ MachineBasicBlock *MBB2 = Indexes->getMBBFromIndex(Stop);
+ return MBB1 == MBB2 ? MBB1 : nullptr;
+}
+
+bool
+LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const {
+ for (const VNInfo *PHI : LI.valnos) {
+ if (PHI->isUnused() || !PHI->isPHIDef())
+ continue;
+ const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def);
+ // Conservatively return true instead of scanning huge predecessor lists.
+ if (PHIMBB->pred_size() > 100)
+ return true;
+ for (MachineBasicBlock::const_pred_iterator
+ PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI)
+ if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI)))
+ return true;
+ }
+ return false;
+}
+
+float LiveIntervals::getSpillWeight(bool isDef, bool isUse,
+ const MachineBlockFrequencyInfo *MBFI,
+ const MachineInstr &MI) {
+ BlockFrequency Freq = MBFI->getBlockFreq(MI.getParent());
+ const float Scale = 1.0f / MBFI->getEntryFreq();
+ return (isDef + isUse) * (Freq.getFrequency() * Scale);
+}
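+
+// Worked example (illustrative numbers): an operand that is both read and
+// written (isDef = isUse = true) in a block running at the entry block's
+// frequency gets weight (1 + 1) * 1.0 = 2.0; a pure use in a block at half
+// the entry frequency gets 1 * 0.5 = 0.5.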
+
+LiveRange::Segment
+LiveIntervals::addSegmentToEndOfBlock(unsigned reg, MachineInstr &startInst) {
+ LiveInterval& Interval = createEmptyInterval(reg);
+ VNInfo *VN = Interval.getNextValue(
+ SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getVNInfoAllocator());
+ LiveRange::Segment S(SlotIndex(getInstructionIndex(startInst).getRegSlot()),
+ getMBBEndIdx(startInst.getParent()), VN);
+ Interval.addSegment(S);
+
+ return S;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Register mask functions
+//===----------------------------------------------------------------------===//
+
+bool LiveIntervals::checkRegMaskInterference(LiveInterval &LI,
+ BitVector &UsableRegs) {
+ if (LI.empty())
+ return false;
+ LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end();
+
+ // Use smaller arrays for local live ranges.
+ ArrayRef<SlotIndex> Slots;
+ ArrayRef<const uint32_t*> Bits;
+ if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
+ Slots = getRegMaskSlotsInBlock(MBB->getNumber());
+ Bits = getRegMaskBitsInBlock(MBB->getNumber());
+ } else {
+ Slots = getRegMaskSlots();
+ Bits = getRegMaskBits();
+ }
+
+ // We are going to enumerate all the register mask slots contained in LI.
+ // Start with a binary search of RegMaskSlots to find a starting point.
+ ArrayRef<SlotIndex>::iterator SlotI =
+ std::lower_bound(Slots.begin(), Slots.end(), LiveI->start);
+ ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+ // No slots in range, LI begins after the last call.
+ if (SlotI == SlotE)
+ return false;
+
+ bool Found = false;
+ for (;;) {
+ assert(*SlotI >= LiveI->start);
+ // Loop over all slots overlapping this segment.
+ while (*SlotI < LiveI->end) {
+ // *SlotI overlaps LI. Collect mask bits.
+ if (!Found) {
+ // This is the first overlap. Initialize UsableRegs to all ones.
+ UsableRegs.clear();
+ UsableRegs.resize(TRI->getNumRegs(), true);
+ Found = true;
+ }
+ // Remove usable registers clobbered by this mask.
+ UsableRegs.clearBitsNotInMask(Bits[SlotI-Slots.begin()]);
+ if (++SlotI == SlotE)
+ return Found;
+ }
+ // *SlotI is beyond the current LI segment.
+ LiveI = LI.advanceTo(LiveI, *SlotI);
+ if (LiveI == LiveE)
+ return Found;
+ // Advance SlotI until it overlaps.
+ while (*SlotI < LiveI->start)
+ if (++SlotI == SlotE)
+ return Found;
+ }
+}
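+
+// Illustration (hypothetical values): with LI segments [10,20)[40,50) and
+// regmask slots {12, 25, 44}, the loop above clears UsableRegs bits for
+// the masks at 12 and 44, which fall inside segments, and skips 25, which
+// lies in the hole between the two segments.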
+
+//===----------------------------------------------------------------------===//
+// IntervalUpdate class.
+//===----------------------------------------------------------------------===//
+
+// HMEditor is a toolkit used by handleMove to trim or extend live intervals.
+class LiveIntervals::HMEditor {
+private:
+ LiveIntervals& LIS;
+ const MachineRegisterInfo& MRI;
+ const TargetRegisterInfo& TRI;
+ SlotIndex OldIdx;
+ SlotIndex NewIdx;
+ SmallPtrSet<LiveRange*, 8> Updated;
+ bool UpdateFlags;
+
+public:
+ HMEditor(LiveIntervals& LIS, const MachineRegisterInfo& MRI,
+ const TargetRegisterInfo& TRI,
+ SlotIndex OldIdx, SlotIndex NewIdx, bool UpdateFlags)
+ : LIS(LIS), MRI(MRI), TRI(TRI), OldIdx(OldIdx), NewIdx(NewIdx),
+ UpdateFlags(UpdateFlags) {}
+
+ // FIXME: UpdateFlags is a workaround that creates live intervals for all
+ // physregs, even those that aren't needed for regalloc, in order to update
+ // kill flags. This is wasteful. Eventually, LiveVariables will strip all kill
+ // flags, and postRA passes will use a live register utility instead.
+ LiveRange *getRegUnitLI(unsigned Unit) {
+ if (UpdateFlags)
+ return &LIS.getRegUnit(Unit);
+ return LIS.getCachedRegUnit(Unit);
+ }
+
+ /// Update all live ranges touched by MI, assuming a move from OldIdx to
+ /// NewIdx.
+ void updateAllRanges(MachineInstr *MI) {
+ DEBUG(dbgs() << "handleMove " << OldIdx << " -> " << NewIdx << ": " << *MI);
+ bool hasRegMask = false;
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isRegMask())
+ hasRegMask = true;
+ if (!MO.isReg())
+ continue;
+ if (MO.isUse()) {
+ if (!MO.readsReg())
+ continue;
+ // Aggressively clear all kill flags.
+ // They are reinserted by VirtRegRewriter.
+ MO.setIsKill(false);
+ }
+
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ LiveInterval &LI = LIS.getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg);
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask) == 0)
+ continue;
+ updateRange(S, Reg, S.LaneMask);
+ }
+ }
+ updateRange(LI, Reg, 0);
+ continue;
+ }
+
+ // For physregs, only update the regunits that actually have a
+ // precomputed live range.
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ if (LiveRange *LR = getRegUnitLI(*Units))
+ updateRange(*LR, *Units, 0);
+ }
+ if (hasRegMask)
+ updateRegMaskSlots();
+ }
+
+private:
+ /// Update a single live range, assuming an instruction has been moved from
+ /// OldIdx to NewIdx.
+ void updateRange(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
+ if (!Updated.insert(&LR).second)
+ return;
+ DEBUG({
+ dbgs() << " ";
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ dbgs() << PrintReg(Reg);
+ if (LaneMask != 0)
+ dbgs() << " L" << PrintLaneMask(LaneMask);
+ } else {
+ dbgs() << PrintRegUnit(Reg, &TRI);
+ }
+ dbgs() << ":\t" << LR << '\n';
+ });
+ if (SlotIndex::isEarlierInstr(OldIdx, NewIdx))
+ handleMoveDown(LR);
+ else
+ handleMoveUp(LR, Reg, LaneMask);
+ DEBUG(dbgs() << " -->\t" << LR << '\n');
+ LR.verify();
+ }
+
+ /// Update LR to reflect an instruction has been moved downwards from OldIdx
+ /// to NewIdx (OldIdx < NewIdx).
+ void handleMoveDown(LiveRange &LR) {
+ LiveRange::iterator E = LR.end();
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
+ return;
+
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
+ // If the live-in value already extends to NewIdx, there is nothing to do.
+ if (SlotIndex::isEarlierEqualInstr(NewIdx, OldIdxIn->end))
+ return;
+ // Aggressively remove all kill flags from the old kill point.
+ // Kill flags shouldn't be used while live intervals exist, they will be
+ // reinserted by VirtRegRewriter.
+ if (MachineInstr *KillMI = LIS.getInstructionFromIndex(OldIdxIn->end))
+ for (MIBundleOperands MO(*KillMI); MO.isValid(); ++MO)
+ if (MO->isReg() && MO->isUse())
+ MO->setIsKill(false);
+
+ // Is there a def before NewIdx which is not OldIdx?
+ LiveRange::iterator Next = std::next(OldIdxIn);
+ if (Next != E && !SlotIndex::isSameInstr(OldIdx, Next->start) &&
+ SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // If we are here then OldIdx was just a use but not a def. We only have
+ // to ensure liveness extends to NewIdx.
+ LiveRange::iterator NewIdxIn =
+ LR.advanceTo(Next, NewIdx.getBaseIndex());
+ // Extend the segment before NewIdx if necessary.
+ if (NewIdxIn == E ||
+ !SlotIndex::isEarlierInstr(NewIdxIn->start, NewIdx)) {
+ LiveRange::iterator Prev = std::prev(NewIdxIn);
+ Prev->end = NewIdx.getRegSlot();
+ }
+ return;
+ }
+
+ // Adjust OldIdxIn->end to reach NewIdx. This may temporarily make LR
+ // invalid by overlapping ranges.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ OldIdxIn->end = NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber());
+ // If this was not a kill, then there was no def and we're done.
+ if (!isKill)
+ return;
+
+ // Did we have a Def at OldIdx?
+ OldIdxOut = Next;
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
+ return;
+ } else {
+ OldIdxOut = OldIdxIn;
+ }
+
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+
+ // If the defined value extends beyond NewIdx, just move the beginning
+ // of the segment to NewIdx.
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ if (SlotIndex::isEarlierInstr(NewIdxDef, OldIdxOut->end)) {
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = OldIdxVNI->def;
+ return;
+ }
+
+ // If we are here then we have a Definition at OldIdx which ends before
+ // NewIdx.
+
+ // Is there an existing Def at NewIdx?
+ LiveRange::iterator AfterNewIdx
+ = LR.advanceTo(OldIdxOut, NewIdx.getRegSlot());
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+ if (!OldIdxDefIsDead &&
+ SlotIndex::isEarlierInstr(OldIdxOut->end, NewIdxDef)) {
+ // OldIdx is not a dead def, and NewIdxDef is inside a new interval.
+ VNInfo *DefVNI;
+ if (OldIdxOut != LR.begin() &&
+ !SlotIndex::isEarlierInstr(std::prev(OldIdxOut)->end,
+ OldIdxOut->start)) {
+ // There is no gap between OldIdxOut and its predecessor anymore,
+ // merge them.
+ LiveRange::iterator IPrev = std::prev(OldIdxOut);
+ DefVNI = OldIdxVNI;
+ IPrev->end = OldIdxOut->end;
+ } else {
+ // The value is live in to OldIdx
+ LiveRange::iterator INext = std::next(OldIdxOut);
+ assert(INext != E && "Must have following segment");
+ // We merge OldIdxOut and its successor. As we're dealing with subreg
+ // reordering, there is always a successor to OldIdxOut in the same BB.
+ // We don't need INext->valno anymore and will reuse it for the new
+ // segment we create later.
+ DefVNI = OldIdxVNI;
+ INext->start = OldIdxOut->end;
+ INext->valno->def = INext->start;
+ }
+ // If NewIdx is behind the last segment, extend that and append a new one.
+ if (AfterNewIdx == E) {
+ // OldIdxOut is undef at this point, Slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn -| end
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS -| end
+ std::copy(std::next(OldIdxOut), E, OldIdxOut);
+ // The last segment is undefined now, reuse it for a dead def.
+ LiveRange::iterator NewSegment = std::prev(E);
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ DefVNI);
+ DefVNI->def = NewIdxDef;
+
+ LiveRange::iterator Prev = std::prev(NewSegment);
+ Prev->end = NewIdxDef;
+ } else {
+ // OldIdxOut is undef at this point, Slide (OldIdxOut;AfterNewIdx] up
+ // one position.
+ // |- ?/OldIdxOut -| |- X0 -| ... |- Xn/AfterNewIdx -| |- Next -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- Xn/AfterNewIdx -| |- Next -|
+ std::copy(std::next(OldIdxOut), std::next(AfterNewIdx), OldIdxOut);
+ LiveRange::iterator Prev = std::prev(AfterNewIdx);
+ // We have two cases:
+ if (SlotIndex::isEarlierInstr(Prev->start, NewIdxDef)) {
+ // Case 1: NewIdx is inside a liverange. Split this liverange at
+ // NewIdxDef into the segment "Prev" followed by "NewSegment".
+ LiveRange::iterator NewSegment = AfterNewIdx;
+ *NewSegment = LiveRange::Segment(NewIdxDef, Prev->end, Prev->valno);
+ Prev->valno->def = NewIdxDef;
+
+ *Prev = LiveRange::Segment(Prev->start, NewIdxDef, DefVNI);
+ DefVNI->def = Prev->start;
+ } else {
+ // Case 2: NewIdx is in a lifetime hole. Keep AfterNewIdx as is and
+ // turn Prev into a segment from NewIdx to AfterNewIdx->start.
+ *Prev = LiveRange::Segment(NewIdxDef, AfterNewIdx->start, DefVNI);
+ DefVNI->def = NewIdxDef;
+ assert(DefVNI != AfterNewIdx->valno);
+ }
+ }
+ return;
+ }
+
+ if (AfterNewIdx != E &&
+ SlotIndex::isSameInstr(AfterNewIdx->start, NewIdxDef)) {
+ // There is an existing def at NewIdx. The def at OldIdx is coalesced into
+ // that value.
+ assert(AfterNewIdx->valno != OldIdxVNI && "Multiple defs of value?");
+ LR.removeValNo(OldIdxVNI);
+ } else {
+ // There was no existing def at NewIdx. We need to create a dead def
+ // at NewIdx. Shift segments over the old OldIdxOut segment, this frees
+ // a new segment at the place where we want to construct the dead def.
+ // |- OldIdxOut -| |- X0 -| ... |- Xn -| |- AfterNewIdx -|
+ // => |- X0/OldIdxOut -| ... |- Xn -| |- undef/NewS. -| |- AfterNewIdx -|
+ assert(AfterNewIdx != OldIdxOut && "Inconsistent iterators");
+ std::copy(std::next(OldIdxOut), AfterNewIdx, OldIdxOut);
+ // We can reuse OldIdxVNI now.
+ LiveRange::iterator NewSegment = std::prev(AfterNewIdx);
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ NewSegmentVNI->def = NewIdxDef;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ }
+ }
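+
+ // Illustrative effect of handleMoveDown (hypothetical slot indices, for
+ // exposition only): moving a def from OldIdx=16 down to NewIdx=32 turns a
+ // segment [16r,48r:0) into [32r,48r:0) when the value is live past NewIdx
+ // (only the start moves), and turns a dead def [16r,16d:0) into a dead def
+ // [32r,32d:0) when nothing else is live there.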
+
+ /// Update LR to reflect an instruction has been moved upwards from OldIdx
+ /// to NewIdx (NewIdx < OldIdx).
+ void handleMoveUp(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask) {
+ LiveRange::iterator E = LR.end();
+ // Segment going into OldIdx.
+ LiveRange::iterator OldIdxIn = LR.find(OldIdx.getBaseIndex());
+
+ // No value live before or after OldIdx? Nothing to do.
+ if (OldIdxIn == E || SlotIndex::isEarlierInstr(OldIdx, OldIdxIn->start))
+ return;
+
+ LiveRange::iterator OldIdxOut;
+ // Do we have a value live-in to OldIdx?
+ if (SlotIndex::isEarlierInstr(OldIdxIn->start, OldIdx)) {
+ // If the live-in value isn't killed here, then we have no Def at
+ // OldIdx, moreover the value must be live at NewIdx so there is nothing
+ // to do.
+ bool isKill = SlotIndex::isSameInstr(OldIdx, OldIdxIn->end);
+ if (!isKill)
+ return;
+
+ // At this point we have to move OldIdxIn->end back to the nearest
+ // previous use or (dead-)def but no further than NewIdx.
+ SlotIndex DefBeforeOldIdx
+ = std::max(OldIdxIn->start.getDeadSlot(),
+ NewIdx.getRegSlot(OldIdxIn->end.isEarlyClobber()));
+ OldIdxIn->end = findLastUseBefore(DefBeforeOldIdx, Reg, LaneMask);
+
+ // Did we have a Def at OldIdx? If not we are done now.
+ OldIdxOut = std::next(OldIdxIn);
+ if (OldIdxOut == E || !SlotIndex::isSameInstr(OldIdx, OldIdxOut->start))
+ return;
+ } else {
+ OldIdxOut = OldIdxIn;
+ OldIdxIn = OldIdxOut != LR.begin() ? std::prev(OldIdxOut) : E;
+ }
+
+ // If we are here then there is a Definition at OldIdx. OldIdxOut points
+ // to the segment starting there.
+ assert(OldIdxOut != E && SlotIndex::isSameInstr(OldIdx, OldIdxOut->start) &&
+ "No def?");
+ VNInfo *OldIdxVNI = OldIdxOut->valno;
+ assert(OldIdxVNI->def == OldIdxOut->start && "Inconsistent def");
+ bool OldIdxDefIsDead = OldIdxOut->end.isDead();
+
+ // Is there an existing def at NewIdx?
+ SlotIndex NewIdxDef = NewIdx.getRegSlot(OldIdxOut->start.isEarlyClobber());
+ LiveRange::iterator NewIdxOut = LR.find(NewIdx.getRegSlot());
+ if (SlotIndex::isSameInstr(NewIdxOut->start, NewIdx)) {
+ assert(NewIdxOut->valno != OldIdxVNI &&
+ "Same value defined more than once?");
+ // If OldIdx was a dead def remove it.
+ if (!OldIdxDefIsDead) {
+ // Remove segment starting at NewIdx and move begin of OldIdxOut to
+ // NewIdx so it can take its place.
+ OldIdxVNI->def = NewIdxDef;
+ OldIdxOut->start = NewIdxDef;
+ LR.removeValNo(NewIdxOut->valno);
+ } else {
+ // Simply remove the dead def at OldIdx.
+ LR.removeValNo(OldIdxVNI);
+ }
+ } else {
+ // Previously nothing was live after NewIdx, so all we have to do now is
+ // move the begin of OldIdxOut to NewIdx.
+ if (!OldIdxDefIsDead) {
+ // Do we have any intermediate Defs between OldIdx and NewIdx?
+ if (OldIdxIn != E &&
+ SlotIndex::isEarlierInstr(NewIdxDef, OldIdxIn->start)) {
+ // OldIdx is not a dead def and NewIdx is before predecessor start.
+ LiveRange::iterator NewIdxIn = NewIdxOut;
+ assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
+ const SlotIndex SplitPos = NewIdxDef;
+
+ // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+ *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
+ OldIdxIn->valno);
+ // OldIdxIn and OldIdxVNI are now undef and can be overridden.
+ // We Slide [NewIdxIn, OldIdxIn) down one position.
+ // |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
+ // => |- undef/NewIdxIn -| |- X0 -| ... |- Xn-1 -| |- Xn/OldIdxOut -|
+ std::copy_backward(NewIdxIn, OldIdxIn, OldIdxOut);
+ // NewIdxIn is now considered undef so we can reuse it for the moved
+ // value.
+ LiveRange::iterator NewSegment = NewIdxIn;
+ LiveRange::iterator Next = std::next(NewSegment);
+ if (SlotIndex::isEarlierInstr(Next->start, NewIdx)) {
+ // There is no gap between NewSegment and its predecessor.
+ *NewSegment = LiveRange::Segment(Next->start, SplitPos,
+ Next->valno);
+ *Next = LiveRange::Segment(SplitPos, Next->end, OldIdxVNI);
+ Next->valno->def = SplitPos;
+ } else {
+ // There is a gap between NewSegment and its predecessor
+ // Value becomes live in.
+ *NewSegment = LiveRange::Segment(SplitPos, Next->start, OldIdxVNI);
+ NewSegment->valno->def = SplitPos;
+ }
+ } else {
+ // Leave the end point of a live def.
+ OldIdxOut->start = NewIdxDef;
+ OldIdxVNI->def = NewIdxDef;
+ if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
+ OldIdxIn->end = NewIdx.getRegSlot();
+ }
+ } else {
+ // OldIdxVNI is a dead def. It may have been moved across other values
+ // in LR, so move OldIdxOut up to NewIdxOut. Slide [NewIdxOut;OldIdxOut)
+ // down one position.
+ // |- X0/NewIdxOut -| ... |- Xn-1 -| |- Xn/OldIdxOut -| |- next - |
+ // => |- undef/NewIdxOut -| |- X0 -| ... |- Xn-1 -| |- next -|
+ std::copy_backward(NewIdxOut, OldIdxOut, std::next(OldIdxOut));
+ // OldIdxVNI can be reused now to build a new dead def segment.
+ LiveRange::iterator NewSegment = NewIdxOut;
+ VNInfo *NewSegmentVNI = OldIdxVNI;
+ *NewSegment = LiveRange::Segment(NewIdxDef, NewIdxDef.getDeadSlot(),
+ NewSegmentVNI);
+ NewSegmentVNI->def = NewIdxDef;
+ }
+ }
+ }
+
+ void updateRegMaskSlots() {
+ SmallVectorImpl<SlotIndex>::iterator RI =
+ std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
+ OldIdx);
+ assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+ "No RegMask at OldIdx.");
+ *RI = NewIdx.getRegSlot();
+ assert((RI == LIS.RegMaskSlots.begin() ||
+ SlotIndex::isEarlierInstr(*std::prev(RI), *RI)) &&
+ "Cannot move regmask instruction above another call");
+ assert((std::next(RI) == LIS.RegMaskSlots.end() ||
+ SlotIndex::isEarlierInstr(*RI, *std::next(RI))) &&
+ "Cannot move regmask instruction below another call");
+ }
+
+ // Return the last use of Reg between Before and OldIdx, or Before if there
+ // is no such use.
+ SlotIndex findLastUseBefore(SlotIndex Before, unsigned Reg,
+ LaneBitmask LaneMask) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ SlotIndex LastUse = Before;
+ for (MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0 && LaneMask != 0
+ && (TRI.getSubRegIndexLaneMask(SubReg) & LaneMask) == 0)
+ continue;
+
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
+ if (InstSlot > LastUse && InstSlot < OldIdx)
+ LastUse = InstSlot.getRegSlot();
+ }
+ return LastUse;
+ }
+
+ // This is a regunit interval, so scanning the use list could be very
+ // expensive. Scan upwards from OldIdx instead.
+ assert(Before < OldIdx && "Expected upwards move");
+ SlotIndexes *Indexes = LIS.getSlotIndexes();
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(Before);
+
+ // OldIdx may not correspond to an instruction any longer, so set MII to
+ // point to the next instruction after OldIdx, or MBB->end().
+ MachineBasicBlock::iterator MII = MBB->end();
+ if (MachineInstr *MI = Indexes->getInstructionFromIndex(
+ Indexes->getNextNonNullIndex(OldIdx)))
+ if (MI->getParent() == MBB)
+ MII = MI;
+
+ MachineBasicBlock::iterator Begin = MBB->begin();
+ while (MII != Begin) {
+ if ((--MII)->isDebugValue())
+ continue;
+ SlotIndex Idx = Indexes->getInstructionIndex(*MII);
+
+ // Stop searching when Before is reached.
+ if (!SlotIndex::isEarlierInstr(Before, Idx))
+ return Before;
+
+ // Check if MII uses Reg.
+ for (MIBundleOperands MO(*MII); MO.isValid(); ++MO)
+ if (MO->isReg() && !MO->isUndef() &&
+ TargetRegisterInfo::isPhysicalRegister(MO->getReg()) &&
+ TRI.hasRegUnit(MO->getReg(), Reg))
+ return Idx.getRegSlot();
+ }
+ // Didn't reach Before. It must be the first instruction in the block.
+ return Before;
+ }
+};
+
+void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) {
+ assert(!MI.isBundled() && "Can't handle bundled instructions yet.");
+ SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+ Indexes->removeMachineInstrFromMaps(MI);
+ SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(MI);
+ assert(getMBBStartIdx(MI.getParent()) <= OldIndex &&
+ OldIndex < getMBBEndIdx(MI.getParent()) &&
+ "Cannot handle moves across basic block boundaries.");
+
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(&MI);
+}
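+
+// Illustrative caller sketch (assumed client such as a machine scheduler, not
+// part of this file; InsertPos is a hypothetical iterator): after physically
+// moving MI within its basic block, the pass must notify LiveIntervals so the
+// slot index maps and live ranges stay consistent:
+//   MBB->remove(&MI);
+//   MBB->insert(InsertPos, &MI);
+//   LIS->handleMove(MI, /*UpdateFlags=*/false);
+// The move must stay within one basic block, as asserted above.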
+
+void LiveIntervals::handleMoveIntoBundle(MachineInstr &MI,
+ MachineInstr &BundleStart,
+ bool UpdateFlags) {
+ SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
+ SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(&MI);
+}
+
+void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
+ const MachineBasicBlock::iterator End,
+ const SlotIndex endIdx,
+ LiveRange &LR, const unsigned Reg,
+ LaneBitmask LaneMask) {
+ LiveInterval::iterator LII = LR.find(endIdx);
+ SlotIndex lastUseIdx;
+ if (LII != LR.end() && LII->start < endIdx)
+ lastUseIdx = LII->end;
+ else
+ --LII;
+
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (MI.isDebugValue())
+ continue;
+
+ SlotIndex instrIdx = getInstructionIndex(MI);
+ bool isStartValid = getInstructionFromIndex(LII->start);
+ bool isEndValid = getInstructionFromIndex(LII->end);
+
+ // FIXME: This doesn't currently handle early-clobber or multiple removed
+ // defs inside of the region to repair.
+ for (MachineInstr::mop_iterator OI = MI.operands_begin(),
+ OE = MI.operands_end();
+ OI != OE; ++OI) {
+ const MachineOperand &MO = *OI;
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubReg);
+ if ((Mask & LaneMask) == 0)
+ continue;
+
+ if (MO.isDef()) {
+ if (!isStartValid) {
+ if (LII->end.isDead()) {
+ SlotIndex prevStart;
+ if (LII != LR.begin())
+ prevStart = std::prev(LII)->start;
+
+ // FIXME: This could be more efficient if there was a
+ // removeSegment method that returned an iterator.
+ LR.removeSegment(*LII, true);
+ if (prevStart.isValid())
+ LII = LR.find(prevStart);
+ else
+ LII = LR.begin();
+ } else {
+ LII->start = instrIdx.getRegSlot();
+ LII->valno->def = instrIdx.getRegSlot();
+ if (MO.getSubReg() && !MO.isUndef())
+ lastUseIdx = instrIdx.getRegSlot();
+ else
+ lastUseIdx = SlotIndex();
+ continue;
+ }
+ }
+
+ if (!lastUseIdx.isValid()) {
+ VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+ LiveRange::Segment S(instrIdx.getRegSlot(),
+ instrIdx.getDeadSlot(), VNI);
+ LII = LR.addSegment(S);
+ } else if (LII->start != instrIdx.getRegSlot()) {
+ VNInfo *VNI = LR.getNextValue(instrIdx.getRegSlot(), VNInfoAllocator);
+ LiveRange::Segment S(instrIdx.getRegSlot(), lastUseIdx, VNI);
+ LII = LR.addSegment(S);
+ }
+
+ if (MO.getSubReg() && !MO.isUndef())
+ lastUseIdx = instrIdx.getRegSlot();
+ else
+ lastUseIdx = SlotIndex();
+ } else if (MO.isUse()) {
+ // FIXME: This should probably be handled outside of this branch,
+ // either as part of the def case (for defs inside of the region) or
+ // after the loop over the region.
+ if (!isEndValid && !LII->end.isBlock())
+ LII->end = instrIdx.getRegSlot();
+ if (!lastUseIdx.isValid())
+ lastUseIdx = instrIdx.getRegSlot();
+ }
+ }
+ }
+}
+
+void
+LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ ArrayRef<unsigned> OrigRegs) {
+ // Find anchor points, which are at the beginning/end of blocks or at
+ // instructions that already have indexes.
+ while (Begin != MBB->begin() && !Indexes->hasIndex(*Begin))
+ --Begin;
+ while (End != MBB->end() && !Indexes->hasIndex(*End))
+ ++End;
+
+ SlotIndex endIdx;
+ if (End == MBB->end())
+ endIdx = getMBBEndIdx(MBB).getPrevSlot();
+ else
+ endIdx = getInstructionIndex(*End);
+
+ Indexes->repairIndexesInRange(MBB, Begin, End);
+
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (MI.isDebugValue())
+ continue;
+ for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
+ MOE = MI.operands_end();
+ MOI != MOE; ++MOI) {
+ if (MOI->isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MOI->getReg()) &&
+ !hasInterval(MOI->getReg())) {
+ createAndComputeVirtRegInterval(MOI->getReg());
+ }
+ }
+ }
+
+ for (unsigned i = 0, e = OrigRegs.size(); i != e; ++i) {
+ unsigned Reg = OrigRegs[i];
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ LiveInterval &LI = getInterval(Reg);
+ // FIXME: Should we support undefs that gain defs?
+ if (!LI.hasAtLeastOneValue())
+ continue;
+
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ repairOldRegInRange(Begin, End, endIdx, S, Reg, S.LaneMask);
+ }
+ repairOldRegInRange(Begin, End, endIdx, LI, Reg);
+ }
+}
+
+void LiveIntervals::removePhysRegDefAt(unsigned Reg, SlotIndex Pos) {
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ if (LiveRange *LR = getCachedRegUnit(*Units))
+ if (VNInfo *VNI = LR->getVNInfoAt(Pos))
+ LR->removeValNo(VNI);
+ }
+}
+
+void LiveIntervals::removeVRegDefAt(LiveInterval &LI, SlotIndex Pos) {
+ VNInfo *VNI = LI.getVNInfoAt(Pos);
+ if (VNI == nullptr)
+ return;
+ LI.removeValNo(VNI);
+
+ // Also remove the value in subranges.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (VNInfo *SVNI = S.getVNInfoAt(Pos))
+ S.removeValNo(SVNI);
+ }
+ LI.removeEmptySubRanges();
+}
+
+void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
+ SmallVectorImpl<LiveInterval*> &SplitLIs) {
+ ConnectedVNInfoEqClasses ConEQ(*this);
+ unsigned NumComp = ConEQ.Classify(LI);
+ if (NumComp <= 1)
+ return;
+ DEBUG(dbgs() << " Split " << NumComp << " components: " << LI << '\n');
+ unsigned Reg = LI.reg;
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ for (unsigned I = 1; I < NumComp; ++I) {
+ unsigned NewVReg = MRI->createVirtualRegister(RegClass);
+ LiveInterval &NewLI = createEmptyInterval(NewVReg);
+ SplitLIs.push_back(&NewLI);
+ }
+ ConEQ.Distribute(LI, SplitLIs.data(), *MRI);
+}
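+
+// Illustrative caller sketch (assumed client, for exposition only): after
+// edits that may disconnect a virtual register's live range, the split-off
+// intervals are typically collected and re-processed like so:
+//   SmallVector<LiveInterval*, 8> SplitLIs;
+//   LIS->splitSeparateComponents(LI, SplitLIs);
+//   for (LiveInterval *SplitLI : SplitLIs)
+//     enqueueForAllocation(SplitLI->reg); // hypothetical helper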
+
+void LiveIntervals::constructMainRangeFromSubranges(LiveInterval &LI) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LRCalc->constructMainRangeFromSubranges(LI);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
new file mode 100644
index 000000000000..025d99ce7881
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveIntervalUnion.cpp
@@ -0,0 +1,205 @@
+//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// LiveIntervalUnion represents a coalesced set of live intervals. This may be
+// used during coalescing to represent a congruence class, or during register
+// allocation to model liveness of a physical register.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+
+// Merge a LiveInterval's segments. Guarantee no overlaps.
+void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
+ if (Range.empty())
+ return;
+ ++Tag;
+
+ // Insert each of the virtual register's live segments into the map.
+ LiveRange::const_iterator RegPos = Range.begin();
+ LiveRange::const_iterator RegEnd = Range.end();
+ SegmentIter SegPos = Segments.find(RegPos->start);
+
+ while (SegPos.valid()) {
+ SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+ if (++RegPos == RegEnd)
+ return;
+ SegPos.advanceTo(RegPos->start);
+ }
+
+ // We have reached the end of Segments, so it is no longer necessary to search
+ // for the insertion position.
+ // It is faster to insert the end first.
+ --RegEnd;
+ SegPos.insert(RegEnd->start, RegEnd->end, &VirtReg);
+ for (; RegPos != RegEnd; ++RegPos, ++SegPos)
+ SegPos.insert(RegPos->start, RegPos->end, &VirtReg);
+}
+
+// Remove a live virtual register's segments from this union.
+void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
+ if (Range.empty())
+ return;
+ ++Tag;
+
+ // Remove each of the virtual register's live segments from the map.
+ LiveRange::const_iterator RegPos = Range.begin();
+ LiveRange::const_iterator RegEnd = Range.end();
+ SegmentIter SegPos = Segments.find(RegPos->start);
+
+ for (;;) {
+ assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
+ SegPos.erase();
+ if (!SegPos.valid())
+ return;
+
+ // Skip all segments that may have been coalesced.
+ RegPos = Range.advanceTo(RegPos, SegPos.start());
+ if (RegPos == RegEnd)
+ return;
+
+ SegPos.advanceTo(RegPos->start);
+ }
+}
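+
+// Illustrative sketch (assumed register-allocator client, for exposition
+// only): a physical register is commonly modelled as one LiveIntervalUnion
+// per register unit, so assigning and evicting a virtual register reduce to:
+//   Unions[Unit].unify(VirtReg, VirtReg);   // assign
+//   Unions[Unit].extract(VirtReg, VirtReg); // evict
+// where Unions is a hypothetical per-unit array kept by the allocator.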
+
+void
+LiveIntervalUnion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) const {
+ if (empty()) {
+ OS << " empty\n";
+ return;
+ }
+ for (LiveSegments::const_iterator SI = Segments.begin(); SI.valid(); ++SI) {
+ OS << " [" << SI.start() << ' ' << SI.stop() << "):"
+ << PrintReg(SI.value()->reg, TRI);
+ }
+ OS << '\n';
+}
+
+#ifndef NDEBUG
+// Verify the live intervals in this union and add them to the visited set.
+void LiveIntervalUnion::verify(LiveVirtRegBitSet& VisitedVRegs) {
+ for (SegmentIter SI = Segments.begin(); SI.valid(); ++SI)
+ VisitedVRegs.set(SI.value()->reg);
+}
+#endif //!NDEBUG
+
+// Scan the vector of interfering virtual registers in this union. Assume it's
+// quite small.
+bool LiveIntervalUnion::Query::isSeenInterference(LiveInterval *VirtReg) const {
+ SmallVectorImpl<LiveInterval*>::const_iterator I =
+ std::find(InterferingVRegs.begin(), InterferingVRegs.end(), VirtReg);
+ return I != InterferingVRegs.end();
+}
+
+// Collect virtual registers in this union that interfere with this
+// query's live virtual register.
+//
+// The query state is one of:
+//
+// 1. CheckedFirstInterference == false: Iterators are uninitialized.
+// 2. SeenAllInterferences == true: InterferingVRegs complete, iterators unused.
+// 3. Iterators left at the last seen intersection.
+//
+unsigned LiveIntervalUnion::Query::
+collectInterferingVRegs(unsigned MaxInterferingRegs) {
+ // Fast path return if we already have the desired information.
+ if (SeenAllInterferences || InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+
+ // Set up iterators on the first call.
+ if (!CheckedFirstInterference) {
+ CheckedFirstInterference = true;
+
+ // Quickly skip interference check for empty sets.
+ if (VirtReg->empty() || LiveUnion->empty()) {
+ SeenAllInterferences = true;
+ return 0;
+ }
+
+ // In most cases, the union will start before VirtReg.
+ VirtRegI = VirtReg->begin();
+ LiveUnionI.setMap(LiveUnion->getMap());
+ LiveUnionI.find(VirtRegI->start);
+ }
+
+ LiveInterval::iterator VirtRegEnd = VirtReg->end();
+ LiveInterval *RecentReg = nullptr;
+ while (LiveUnionI.valid()) {
+ assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+
+ // Check for overlapping interference.
+ while (VirtRegI->start < LiveUnionI.stop() &&
+ VirtRegI->end > LiveUnionI.start()) {
+ // This is an overlap, record the interfering register.
+ LiveInterval *VReg = LiveUnionI.value();
+ if (VReg != RecentReg && !isSeenInterference(VReg)) {
+ RecentReg = VReg;
+ InterferingVRegs.push_back(VReg);
+ if (InterferingVRegs.size() >= MaxInterferingRegs)
+ return InterferingVRegs.size();
+ }
+ // This LiveUnion segment is no longer interesting.
+ if (!(++LiveUnionI).valid()) {
+ SeenAllInterferences = true;
+ return InterferingVRegs.size();
+ }
+ }
+
+ // The iterators are now not overlapping, LiveUnionI has been advanced
+ // beyond VirtRegI.
+ assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
+
+ // Advance the iterator that ends first.
+ VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
+ if (VirtRegI == VirtRegEnd)
+ break;
+
+ // Detect overlap, handle above.
+ if (VirtRegI->start < LiveUnionI.stop())
+ continue;
+
+ // Still not overlapping. Catch up LiveUnionI.
+ LiveUnionI.advanceTo(VirtRegI->start);
+ }
+ SeenAllInterferences = true;
+ return InterferingVRegs.size();
+}
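+
+// Illustrative sketch (assumed caller, for exposition only; Q is a Query that
+// has already been set up for a virtual register and one union, and
+// MaxCandidates is a hypothetical limit): allocators usually cap the search,
+//   if (Q.collectInterferingVRegs(MaxCandidates) >= MaxCandidates)
+//     return false; // too much interference, give up on this register
+// and only inspect the collected interferences when eviction looks viable.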
+
+void LiveIntervalUnion::Array::init(LiveIntervalUnion::Allocator &Alloc,
+ unsigned NSize) {
+ // Reuse existing allocation.
+ if (NSize == Size)
+ return;
+ clear();
+ Size = NSize;
+ LIUs = static_cast<LiveIntervalUnion*>(
+ malloc(sizeof(LiveIntervalUnion)*NSize));
+ for (unsigned i = 0; i != Size; ++i)
+ new(LIUs + i) LiveIntervalUnion(Alloc);
+}
+
+void LiveIntervalUnion::Array::clear() {
+ if (!LIUs)
+ return;
+ for (unsigned i = 0; i != Size; ++i)
+ LIUs[i].~LiveIntervalUnion();
+ free(LIUs);
+ Size = 0;
+ LIUs = nullptr;
+}
diff --git a/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
new file mode 100644
index 000000000000..4e2528f47568
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -0,0 +1,189 @@
+//===--- LivePhysRegs.cpp - Live Physical Register Set --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LivePhysRegs utility for tracking liveness of
+// physical registers across machine instructions in forward or backward order.
+// A more detailed description can be found in the corresponding header file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+
+/// \brief Remove all registers from the set that get clobbered by the register
+/// mask.
+/// The clobbers set will be the list of live registers clobbered
+/// by the regmask.
+void LivePhysRegs::removeRegsInMask(const MachineOperand &MO,
+ SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> *Clobbers) {
+ SparseSet<unsigned>::iterator LRI = LiveRegs.begin();
+ while (LRI != LiveRegs.end()) {
+ if (MO.clobbersPhysReg(*LRI)) {
+ if (Clobbers)
+ Clobbers->push_back(std::make_pair(*LRI, &MO));
+ LRI = LiveRegs.erase(LRI);
+ } else
+ ++LRI;
+ }
+}
+
+/// Simulates liveness when stepping backwards over an instruction (bundle):
+/// Remove Defs, add uses. This is the recommended way of calculating liveness.
+void LivePhysRegs::stepBackward(const MachineInstr &MI) {
+ // Remove defined registers and regmask kills from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ if (!O->isDef())
+ continue;
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ removeReg(Reg);
+ } else if (O->isRegMask())
+ removeRegsInMask(*O, nullptr);
+ }
+
+ // Add uses to the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (!O->isReg() || !O->readsReg())
+ continue;
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ addReg(Reg);
+ }
+}
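+
+// Illustrative sketch of the recommended backward scan (assumed client pass,
+// for exposition only): seed the set with the block's live-outs, then step
+// over each instruction in reverse,
+//   LivePhysRegs LiveRegs(TRI);
+//   LiveRegs.addLiveOuts(MBB);
+//   for (MachineBasicBlock::reverse_iterator I = MBB.rbegin(), E = MBB.rend();
+//        I != E; ++I)
+//     LiveRegs.stepBackward(*I);
+// after which LiveRegs holds the registers live before the last visited
+// instruction.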
+
+/// Simulates liveness when stepping forward over an instruction (bundle):
+/// remove killed uses, add defs. This is not the recommended way, because it
+/// depends on accurate kill flags. If possible, use stepBackward() instead of
+/// this function.
+void LivePhysRegs::stepForward(const MachineInstr &MI,
+ SmallVectorImpl<std::pair<unsigned, const MachineOperand*>> &Clobbers) {
+ // Remove killed registers from the set.
+ for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
+ if (O->isReg()) {
+ unsigned Reg = O->getReg();
+ if (Reg == 0)
+ continue;
+ if (O->isDef()) {
+ // Note, dead defs are still recorded. The caller should decide how to
+ // handle them.
+ Clobbers.push_back(std::make_pair(Reg, &*O));
+ } else {
+ if (!O->isKill())
+ continue;
+ assert(O->isUse());
+ removeReg(Reg);
+ }
+ } else if (O->isRegMask())
+ removeRegsInMask(*O, &Clobbers);
+ }
+
+ // Add defs to the set.
+ for (auto Reg : Clobbers) {
+ // Skip dead defs. They shouldn't be added to the set.
+ if (Reg.second->isReg() && Reg.second->isDead())
+ continue;
+ addReg(Reg.first);
+ }
+}
+
+/// Print the currently live registers to OS.
+void LivePhysRegs::print(raw_ostream &OS) const {
+ OS << "Live Registers:";
+ if (!TRI) {
+ OS << " (uninitialized)\n";
+ return;
+ }
+
+ if (empty()) {
+ OS << " (empty)\n";
+ return;
+ }
+
+ for (const_iterator I = begin(), E = end(); I != E; ++I)
+ OS << " " << PrintReg(*I, TRI);
+ OS << "\n";
+}
+
+/// Dumps the currently live registers to the debug output.
+LLVM_DUMP_METHOD void LivePhysRegs::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << " " << *this;
+#endif
+}
+
+bool LivePhysRegs::available(const MachineRegisterInfo &MRI,
+ unsigned Reg) const {
+ if (LiveRegs.count(Reg))
+ return false;
+ if (MRI.isReserved(Reg))
+ return false;
+ for (MCRegAliasIterator R(Reg, TRI, false); R.isValid(); ++R) {
+ if (LiveRegs.count(*R))
+ return false;
+ }
+ return true;
+}
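+
+// Illustrative sketch (assumed client, for exposition only): a pass searching
+// for a free scratch register at the current point of a backward scan can
+// combine the tracked set with this query,
+//   for (unsigned CandReg : Candidates) // hypothetical candidate order
+//     if (LiveRegs.available(MRI, CandReg))
+//       return CandReg;
+// which also rejects reserved registers and aliases of live registers.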
+
+/// Add live-in registers of basic block \p MBB to \p LiveRegs.
+static void addLiveIns(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) {
+ for (const auto &LI : MBB.liveins())
+ LiveRegs.addReg(LI.PhysReg);
+}
+
+/// Add pristine registers to the given \p LiveRegs: all callee-saved registers
+/// are added first, then the registers that are actually saved by the function
+/// (and therefore clobbered) are removed again.
+static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,
+ const MachineFrameInfo &MFI,
+ const TargetRegisterInfo &TRI) {
+ for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+ LiveRegs.addReg(*CSR);
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
+ LiveRegs.removeReg(Info.getReg());
+}
+
+void LivePhysRegs::addLiveOutsNoPristines(const MachineBasicBlock &MBB) {
+ // To get the live-outs we simply merge the live-ins of all successors.
+ for (const MachineBasicBlock *Succ : MBB.successors())
+ ::addLiveIns(*this, *Succ);
+}
+
+void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid()) {
+ if (MBB.isReturnBlock()) {
+ // The return block has no successors whose live-ins we could merge
+ // below. So instead we add the callee saved registers manually.
+ for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
+ addReg(*I);
+ } else {
+ addPristines(*this, MF, MFI, *TRI);
+ }
+ }
+
+ addLiveOutsNoPristines(MBB);
+}
+
+void LivePhysRegs::addLiveIns(const MachineBasicBlock &MBB) {
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ if (MFI.isCalleeSavedInfoValid())
+ addPristines(*this, MF, MFI, *TRI);
+ ::addLiveIns(*this, MBB);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
new file mode 100644
index 000000000000..db91ca113dc1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -0,0 +1,484 @@
+//===---- LiveRangeCalc.cpp - Calculate live ranges -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveRangeCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveRangeCalc.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+void LiveRangeCalc::resetLiveOutMap() {
+ unsigned NumBlocks = MF->getNumBlockIDs();
+ Seen.clear();
+ Seen.resize(NumBlocks);
+ Map.resize(NumBlocks);
+}
+
+void LiveRangeCalc::reset(const MachineFunction *mf,
+ SlotIndexes *SI,
+ MachineDominatorTree *MDT,
+ VNInfo::Allocator *VNIA) {
+ MF = mf;
+ MRI = &MF->getRegInfo();
+ Indexes = SI;
+ DomTree = MDT;
+ Alloc = VNIA;
+ resetLiveOutMap();
+ LiveIn.clear();
+}
+
+
+static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
+ LiveRange &LR, const MachineOperand &MO) {
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex DefIdx =
+ Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+
+ // Create the def in LR. This may find an existing def.
+ LR.createDeadDef(DefIdx, Alloc);
+}
+
+void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
+ assert(MRI && Indexes && "call reset() first");
+
+ // Step 1: Create minimal live segments for every definition of Reg.
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // createDeadDef() will deduplicate.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ unsigned Reg = LI.reg;
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) {
+ LaneBitmask Mask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+
+ // If this is the first time we see a subregister def, initialize
+ // subranges by creating a copy of the main range.
+ if (!LI.hasSubRanges() && !LI.empty()) {
+ LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+ LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+ }
+
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ // A Mask for subregs common to the existing subrange and current def.
+ LaneBitmask Common = S.LaneMask & Mask;
+ if (Common == 0)
+ continue;
+ // A Mask for subregs covered by the subrange but not the current def.
+ LaneBitmask LRest = S.LaneMask & ~Mask;
+ LiveInterval::SubRange *CommonRange;
+ if (LRest != 0) {
+ // Split current subrange into Common and LRest ranges.
+ S.LaneMask = LRest;
+ CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
+ } else {
+ assert(Common == S.LaneMask);
+ CommonRange = &S;
+ }
+ if (MO.isDef())
+ createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
+ Mask &= ~Common;
+ }
+ // Create a new SubRange for subregs we did not cover yet.
+ if (Mask != 0) {
+ LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
+ if (MO.isDef())
+ createDeadDef(*Indexes, *Alloc, *NewRange, MO);
+ }
+ }
+
+ // Create the def in the main liverange. We do not have to do this if
+ // subranges are tracked as we recreate the main range later in this case.
+ if (MO.isDef() && !LI.hasSubRanges())
+ createDeadDef(*Indexes, *Alloc, LI, MO);
+ }
+
+ // We may have created empty live ranges for partially undefined uses; we
+ // can't keep them because we won't find defs in them later.
+ LI.removeEmptySubRanges();
+
+ // Step 2: Extend live segments to all uses, constructing SSA form as
+ // necessary.
+ if (LI.hasSubRanges()) {
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ resetLiveOutMap();
+ extendToUses(S, Reg, S.LaneMask);
+ }
+ LI.clear();
+ constructMainRangeFromSubranges(LI);
+ } else {
+ resetLiveOutMap();
+ extendToUses(LI, Reg, ~0u);
+ }
+}
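+
+// Illustrative lane-mask sketch for the subrange splitting above (hypothetical
+// masks, for exposition only): with an existing subrange S.LaneMask = 0x3 and
+// a def writing Mask = 0x1, Common = 0x1 and LRest = 0x2, so S keeps its
+// segments under the reduced mask 0x2 while a copy with mask 0x1 is created
+// to receive the new dead def.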
+
+void LiveRangeCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
+ // First create dead defs at all defs found in subranges.
+ LiveRange &MainRange = LI;
+ assert(MainRange.segments.empty() && MainRange.valnos.empty() &&
+ "Expect empty main liverange");
+
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ for (const VNInfo *VNI : SR.valnos) {
+ if (!VNI->isUnused() && !VNI->isPHIDef())
+ MainRange.createDeadDef(VNI->def, *Alloc);
+ }
+ }
+
+ resetLiveOutMap();
+ extendToUses(MainRange, LI.reg);
+}
+
+void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
+ assert(MRI && Indexes && "call reset() first");
+
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // LR.createDeadDef() will deduplicate.
+ for (MachineOperand &MO : MRI->def_operands(Reg))
+ createDeadDef(*Indexes, *Alloc, LR, MO);
+}
+
+
+void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg,
+ LaneBitmask Mask) {
+ // Visit all operands that read Reg. This may include partial defs.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ // Clear all kill flags. They will be reinserted after register allocation
+ // by LiveIntervalAnalysis::addKillFlags().
+ if (MO.isUse())
+ MO.setIsKill(false);
+ else {
+ // We only care about uses, but on the main range (mask ~0u) this includes
+ // the "virtual" reads happening for subregister defs.
+ if (Mask != ~0u)
+ continue;
+ }
+
+ if (!MO.readsReg())
+ continue;
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(SubReg);
+ // Ignore uses not covering the current subrange.
+ if ((SubRegMask & Mask) == 0)
+ continue;
+ }
+
+ // Determine the actual place of the use.
+ const MachineInstr *MI = MO.getParent();
+ unsigned OpNo = (&MO - &MI->getOperand(0));
+ SlotIndex UseIdx;
+ if (MI->isPHI()) {
+ assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
+ // The actual place where a phi operand is used is the end of the pred
+ // MBB. PHI operands are paired: (Reg, PredMBB).
+ UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
+ } else {
+ // Check for early-clobber redefs.
+ bool isEarlyClobber = false;
+ unsigned DefIdx;
+ if (MO.isDef())
+ isEarlyClobber = MO.isEarlyClobber();
+ else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
+ // FIXME: This would be a lot easier if tied early-clobber uses also
+ // had an early-clobber flag.
+ isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
+ }
+ UseIdx = Indexes->getInstructionIndex(*MI).getRegSlot(isEarlyClobber);
+ }
+
+ // MI is reading Reg. We may have visited MI before if it happens to be
+ // reading Reg multiple times. That is OK, extend() is idempotent.
+ extend(LR, UseIdx, Reg);
+ }
+}
+
+
+void LiveRangeCalc::updateFromLiveIns() {
+ LiveRangeUpdater Updater;
+ for (const LiveInBlock &I : LiveIn) {
+ if (!I.DomNode)
+ continue;
+ MachineBasicBlock *MBB = I.DomNode->getBlock();
+ assert(I.Value && "No live-in value found");
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(MBB);
+
+ if (I.Kill.isValid())
+ // Value is killed inside this block.
+ End = I.Kill;
+ else {
+ // The value is live-through, update LiveOut as well.
+ // Defer the Domtree lookup until it is needed.
+ assert(Seen.test(MBB->getNumber()));
+ Map[MBB] = LiveOutPair(I.Value, nullptr);
+ }
+ Updater.setDest(&I.LR);
+ Updater.add(Start, End, I.Value);
+ }
+ LiveIn.clear();
+}
+
+
+void LiveRangeCalc::extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg) {
+ assert(Use.isValid() && "Invalid SlotIndex");
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+ MachineBasicBlock *UseMBB = Indexes->getMBBFromIndex(Use.getPrevSlot());
+ assert(UseMBB && "No MBB at Use");
+
+ // Is there a def in the same MBB we can extend?
+ if (LR.extendInBlock(Indexes->getMBBStartIdx(UseMBB), Use))
+ return;
+
+ // Find the single reaching def, or determine if Use is jointly dominated by
+ // multiple values, and we may need to create even more phi-defs to preserve
+ // VNInfo SSA form. Perform a search for all predecessor blocks where we
+ // know the dominating VNInfo.
+ if (findReachingDefs(LR, *UseMBB, Use, PhysReg))
+ return;
+
+ // When there were multiple different values, we may need new PHIs.
+ calculateValues();
+}
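+
+// Illustrative caller sketch (assumed client, for exposition only; LRCalc,
+// UseIdx and friends are hypothetical names): after inserting a new use of a
+// virtual register, its live range can be grown to the use like so:
+//   LRCalc.reset(&MF, Indexes, DomTree, &VNIAlloc);
+//   LRCalc.extend(LI, UseIdx.getRegSlot());
+// This reuses a value defined earlier in the same block when possible and
+// otherwise searches reaching defs, inserting PHI-defs as needed.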
+
+
+// This function is called by a client after using the low-level API to add
+// live-out and live-in blocks. The unique value optimization is not
+// available, SplitEditor::transferValues handles that case directly anyway.
+void LiveRangeCalc::calculateValues() {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+ updateSSA();
+ updateFromLiveIns();
+}
+
+
+bool LiveRangeCalc::findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
+ SlotIndex Use, unsigned PhysReg) {
+ unsigned UseMBBNum = UseMBB.getNumber();
+
+ // Block numbers where LR should be live-in.
+ SmallVector<unsigned, 16> WorkList(1, UseMBBNum);
+
+ // Remember if we have seen more than one value.
+ bool UniqueVNI = true;
+ VNInfo *TheVNI = nullptr;
+
+ // Using Seen as a visited set, perform a BFS for all reaching defs.
+ for (unsigned i = 0; i != WorkList.size(); ++i) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(WorkList[i]);
+
+#ifndef NDEBUG
+ if (MBB->pred_empty()) {
+ MBB->getParent()->verify();
+ errs() << "Use of " << PrintReg(PhysReg)
+ << " does not have a corresponding definition on every path:\n";
+ const MachineInstr *MI = Indexes->getInstructionFromIndex(Use);
+ if (MI != nullptr)
+ errs() << Use << " " << *MI;
+ llvm_unreachable("Use not jointly dominated by defs.");
+ }
+
+ if (TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+ !MBB->isLiveIn(PhysReg)) {
+ MBB->getParent()->verify();
+ errs() << "The register " << PrintReg(PhysReg)
+ << " needs to be live in to BB#" << MBB->getNumber()
+ << ", but is missing from the live-in list.\n";
+ llvm_unreachable("Invalid global physical register");
+ }
+#endif
+
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ MachineBasicBlock *Pred = *PI;
+
+ // Is this a known live-out block?
+ if (Seen.test(Pred->getNumber())) {
+ if (VNInfo *VNI = Map[Pred].first) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ }
+ continue;
+ }
+
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(Pred);
+
+ // First time we see Pred. Try to determine the live-out value, but set
+ // it as null if Pred is live-through with an unknown value.
+ VNInfo *VNI = LR.extendInBlock(Start, End);
+ setLiveOutValue(Pred, VNI);
+ if (VNI) {
+ if (TheVNI && TheVNI != VNI)
+ UniqueVNI = false;
+ TheVNI = VNI;
+ continue;
+ }
+
+ // No, we need a live-in value for Pred as well
+ if (Pred != &UseMBB)
+ WorkList.push_back(Pred->getNumber());
+ else
+ // Loopback to UseMBB, so value is really live through.
+ Use = SlotIndex();
+ }
+ }
+
+ LiveIn.clear();
+
+ // Both updateSSA() and LiveRangeUpdater benefit from ordered blocks, but
+ // neither requires it. Skip the sorting overhead for small updates.
+ if (WorkList.size() > 4)
+ array_pod_sort(WorkList.begin(), WorkList.end());
+
+ // If a unique reaching def was found, blit in the live ranges immediately.
+ if (UniqueVNI) {
+ LiveRangeUpdater Updater(&LR);
+ for (SmallVectorImpl<unsigned>::const_iterator I = WorkList.begin(),
+ E = WorkList.end(); I != E; ++I) {
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(*I);
+ // Trim the live range in UseMBB.
+ if (*I == UseMBBNum && Use.isValid())
+ End = Use;
+ else
+ Map[MF->getBlockNumbered(*I)] = LiveOutPair(TheVNI, nullptr);
+ Updater.add(Start, End, TheVNI);
+ }
+ return true;
+ }
+
+ // Multiple values were found, so transfer the work list to the LiveIn array
+ // where UpdateSSA will use it as a work list.
+ LiveIn.reserve(WorkList.size());
+ for (SmallVectorImpl<unsigned>::const_iterator
+ I = WorkList.begin(), E = WorkList.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = MF->getBlockNumbered(*I);
+ addLiveInBlock(LR, DomTree->getNode(MBB));
+ if (MBB == &UseMBB)
+ LiveIn.back().Kill = Use;
+ }
+
+ return false;
+}
+
+
+// This is essentially the same iterative algorithm that SSAUpdater uses,
+// except we already have a dominator tree, so we don't have to recompute it.
+void LiveRangeCalc::updateSSA() {
+ assert(Indexes && "Missing SlotIndexes");
+ assert(DomTree && "Missing dominator tree");
+
+ // Iterate until convergence.
+ unsigned Changes;
+ do {
+ Changes = 0;
+ // Propagate live-out values down the dominator tree, inserting phi-defs
+ // when necessary.
+ for (LiveInBlock &I : LiveIn) {
+ MachineDomTreeNode *Node = I.DomNode;
+ // Skip block if the live-in value has already been determined.
+ if (!Node)
+ continue;
+ MachineBasicBlock *MBB = Node->getBlock();
+ MachineDomTreeNode *IDom = Node->getIDom();
+ LiveOutPair IDomValue;
+
+ // We need a live-in value to a block with no immediate dominator?
+ // This is probably an unreachable block that has survived somehow.
+ bool needPHI = !IDom || !Seen.test(IDom->getBlock()->getNumber());
+
+ // IDom dominates all of our predecessors, but it may not be their
+ // immediate dominator. Check if any of them have live-out values that are
+ // properly dominated by IDom. If so, we need a phi-def here.
+ if (!needPHI) {
+ IDomValue = Map[IDom->getBlock()];
+
+ // Cache the DomTree node that defined the value.
+ if (IDomValue.first && !IDomValue.second)
+ Map[IDom->getBlock()].second = IDomValue.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(IDomValue.first->def));
+
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ LiveOutPair &Value = Map[*PI];
+ if (!Value.first || Value.first == IDomValue.first)
+ continue;
+
+ // Cache the DomTree node that defined the value.
+ if (!Value.second)
+ Value.second =
+ DomTree->getNode(Indexes->getMBBFromIndex(Value.first->def));
+
+ // This predecessor is carrying something other than IDomValue.
+ // It could be because IDomValue hasn't propagated yet, or it could be
+ // because MBB is in the dominance frontier of that value.
+ if (DomTree->dominates(IDom, Value.second)) {
+ needPHI = true;
+ break;
+ }
+ }
+ }
+
+ // The value may be live-through even if Kill is set, as can happen when
+ // we are called from extendRange. In that case LiveOutSeen is true, and
+ // LiveOut indicates a foreign or missing value.
+ LiveOutPair &LOP = Map[MBB];
+
+ // Create a phi-def if required.
+ if (needPHI) {
+ ++Changes;
+ assert(Alloc && "Need VNInfo allocator to create PHI-defs");
+ SlotIndex Start, End;
+ std::tie(Start, End) = Indexes->getMBBRange(MBB);
+ LiveRange &LR = I.LR;
+ VNInfo *VNI = LR.getNextValue(Start, *Alloc);
+ I.Value = VNI;
+ // This block is done, we know the final value.
+ I.DomNode = nullptr;
+
+ // Add liveness since updateFromLiveIns now skips this node.
+ if (I.Kill.isValid())
+ LR.addSegment(LiveInterval::Segment(Start, I.Kill, VNI));
+ else {
+ LR.addSegment(LiveInterval::Segment(Start, End, VNI));
+ LOP = LiveOutPair(VNI, Node);
+ }
+ } else if (IDomValue.first) {
+ // No phi-def here. Remember incoming value.
+ I.Value = IDomValue.first;
+
+ // If the IDomValue is killed in the block, don't propagate through.
+ if (I.Kill.isValid())
+ continue;
+
+ // Propagate IDomValue if it isn't killed:
+ // MBB is live-out and doesn't define its own value.
+ if (LOP.first == IDomValue.first)
+ continue;
+ ++Changes;
+ LOP = IDomValue;
+ }
+ }
+ } while (Changes);
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeCalc.h b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
new file mode 100644
index 000000000000..9de48b722881
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeCalc.h
@@ -0,0 +1,248 @@
+//===---- LiveRangeCalc.h - Calculate live ranges ---------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeCalc class can be used to compute live ranges from scratch. It
+// caches information about values in the CFG to speed up repeated operations
+// on the same live range. The cache can be shared by non-overlapping live
+// ranges. SplitKit uses that when computing the live range of split products.
+//
+// A low-level interface is available to clients that know where a variable is
+// live, but don't know which value it has at every point. LiveRangeCalc will
+// propagate values down the dominator tree, and even insert PHI-defs where
+// needed. SplitKit uses this faster interface when possible.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVERANGECALC_H
+#define LLVM_LIB_CODEGEN_LIVERANGECALC_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Forward declarations for MachineDominators.h:
+class MachineDominatorTree;
+template <class NodeT> class DomTreeNodeBase;
+typedef DomTreeNodeBase<MachineBasicBlock> MachineDomTreeNode;
+
+class LiveRangeCalc {
+ const MachineFunction *MF;
+ const MachineRegisterInfo *MRI;
+ SlotIndexes *Indexes;
+ MachineDominatorTree *DomTree;
+ VNInfo::Allocator *Alloc;
+
+ /// LiveOutPair - A value and the block that defined it. The domtree node is
+ /// redundant, it can be computed as: MDT[Indexes.getMBBFromIndex(VNI->def)].
+ typedef std::pair<VNInfo*, MachineDomTreeNode*> LiveOutPair;
+
+ /// LiveOutMap - Map basic blocks to the value leaving the block.
+ typedef IndexedMap<LiveOutPair, MBB2NumberFunctor> LiveOutMap;
+
+ /// Bit vector of active entries in LiveOut, also used as a visited set by
+ /// findReachingDefs. One entry per basic block, indexed by block number.
+ /// This is kept as a separate bit vector because it can be cleared quickly
+ /// when switching live ranges.
+ BitVector Seen;
+
+ /// Map each basic block where a live range is live out to the live-out value
+ /// and its defining block.
+ ///
+ /// For every basic block, MBB, one of these conditions shall be true:
+ ///
+ /// 1. !Seen.count(MBB->getNumber())
+ /// Blocks without a Seen bit are ignored.
+ /// 2. LiveOut[MBB].second.getNode() == MBB
+ /// The live-out value is defined in MBB.
+ /// 3. forall P in preds(MBB): LiveOut[P] == LiveOut[MBB]
+ /// The live-out value passes through MBB. All predecessors must carry
+ /// the same value.
+ ///
+ /// The domtree node may be null, it can be computed.
+ ///
+ /// The map can be shared by multiple live ranges as long as no two are
+ /// live-out of the same block.
+ LiveOutMap Map;
+
+ /// LiveInBlock - Information about a basic block where a live range is known
+ /// to be live-in, but the value has not yet been determined.
+ struct LiveInBlock {
+ // The live range set that is live-in to this block. The algorithms can
+ // handle multiple non-overlapping live ranges simultaneously.
+ LiveRange &LR;
+
+ // DomNode - Dominator tree node for the block.
+ // Cleared when the final value has been determined and LI has been updated.
+ MachineDomTreeNode *DomNode;
+
+ // Position in block where the live-in range ends, or SlotIndex() if the
+ // range passes through the block. When the final value has been
+ // determined, the range from the block start to Kill will be added to LI.
+ SlotIndex Kill;
+
+ // Live-in value filled in by updateSSA once it is known.
+ VNInfo *Value;
+
+ LiveInBlock(LiveRange &LR, MachineDomTreeNode *node, SlotIndex kill)
+ : LR(LR), DomNode(node), Kill(kill), Value(nullptr) {}
+ };
+
+ /// LiveIn - Work list of blocks where the live-in value has yet to be
+ /// determined. This list is typically computed by findReachingDefs() and
+ /// used as a work list by updateSSA(). The low-level interface may also be
+ /// used to add entries directly.
+ SmallVector<LiveInBlock, 16> LiveIn;
+
+ /// Assuming that @p LR is live-in to @p UseMBB, find the set of defs that can
+ /// reach it.
+ ///
+ /// If only one def can reach @p UseMBB, all paths from the def to @p UseMBB
+ /// are added to @p LR, and the function returns true.
+ ///
+ /// If multiple values can reach @p UseMBB, the blocks that need @p LR to be
+ /// live in are added to the LiveIn array, and the function returns false.
+ ///
+ /// PhysReg, when set, is used to verify live-in lists on basic blocks.
+ bool findReachingDefs(LiveRange &LR, MachineBasicBlock &UseMBB,
+ SlotIndex Kill, unsigned PhysReg);
+
+ /// updateSSA - Compute the values that will be live in to all requested
+ /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form.
+ ///
+ /// Every live-in block must be jointly dominated by the added live-out
+ /// blocks. No values are read from the live ranges.
+ void updateSSA();
+
+ /// Transfer information from the LiveIn vector to the live ranges and update
+ /// the given @p LiveOuts.
+ void updateFromLiveIns();
+
+ /// Extend the live range of @p LR to reach all uses of Reg.
+ ///
+ /// All uses must be jointly dominated by existing liveness. PHI-defs are
+ /// inserted as needed to preserve SSA form.
+ void extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask LaneMask);
+
+ /// Reset Map and Seen fields.
+ void resetLiveOutMap();
+
+public:
+ LiveRangeCalc() : MF(nullptr), MRI(nullptr), Indexes(nullptr),
+ DomTree(nullptr), Alloc(nullptr) {}
+
+ //===--------------------------------------------------------------------===//
+ // High-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Calculate live ranges from scratch.
+ //
+
+ /// reset - Prepare caches for a new set of non-overlapping live ranges. The
+ /// caches must be reset before attempting calculations with a live range
+ /// that may overlap a previously computed live range, and before the first
+ /// live range in a function. If live ranges are not known to be
+ /// non-overlapping, call reset before each.
+ void reset(const MachineFunction *MF,
+ SlotIndexes*,
+ MachineDominatorTree*,
+ VNInfo::Allocator*);
+
+ //===--------------------------------------------------------------------===//
+ // Mid-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // Modify existing live ranges.
+ //
+
+ /// Extend the live range of @p LR to reach @p Use.
+ ///
+ /// The existing values in @p LR must be live so they jointly dominate @p Use.
+ /// If @p Use is not dominated by a single existing value, PHI-defs are
+ /// inserted as required to preserve SSA form.
+ ///
+ /// PhysReg, when set, is used to verify live-in lists on basic blocks.
+ void extend(LiveRange &LR, SlotIndex Use, unsigned PhysReg = 0);
+
+ /// createDeadDefs - Create a dead def in LI for every def operand of Reg.
+ /// Each instruction defining Reg gets a new VNInfo with a corresponding
+ /// minimal live range.
+ void createDeadDefs(LiveRange &LR, unsigned Reg);
+
+ /// Extend the live range of @p LR to reach all uses of Reg.
+ ///
+ /// All uses must be jointly dominated by existing liveness. PHI-defs are
+ /// inserted as needed to preserve SSA form.
+ void extendToUses(LiveRange &LR, unsigned PhysReg) {
+ extendToUses(LR, PhysReg, ~0u);
+ }
+
+ /// Calculates liveness for the register specified in live interval @p LI.
+ /// Creates subregister live ranges as needed if subreg liveness tracking is
+ /// enabled.
+ void calculate(LiveInterval &LI, bool TrackSubRegs);
+
+ /// For live interval \p LI with correct SubRanges construct matching
+ /// information for the main live range. Expects the main live range to not
+ /// have any segments or value numbers.
+ void constructMainRangeFromSubranges(LiveInterval &LI);
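+
+ // Editorial sketch (not part of the original header): how the high- and
+ // mid-level interface above is typically driven. Indexes, DomTree and Alloc
+ // stand for the analysis results this object is normally reset with, and the
+ // MachineRegisterInfo query is assumed from its usual API.
+ //
+ //   LRCalc.reset(&MF, Indexes, DomTree, &Alloc);
+ //   LRCalc.calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
+ //   // Later, extend an existing range so it covers one more use:
+ //   LRCalc.extend(LR, UseIdx.getRegSlot());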
+
+ //===--------------------------------------------------------------------===//
+ // Low-level interface.
+ //===--------------------------------------------------------------------===//
+ //
+ // These functions can be used to compute live ranges where the live-in and
+ // live-out blocks are already known, but the SSA value in each block is
+ // unknown.
+ //
+ // After calling reset(), add known live-out values and known live-in blocks.
+ // Then call calculateValues() to compute the actual value that is
+ // live-in to each block, and add liveness to the live ranges.
+ //
+
+ /// setLiveOutValue - Indicate that VNI is live out from MBB. The
+ /// calculateValues() function will not add liveness for MBB; the caller
+ /// should take care of that.
+ ///
+ /// VNI may be null only if MBB is a live-through block also passed to
+ /// addLiveInBlock().
+ void setLiveOutValue(MachineBasicBlock *MBB, VNInfo *VNI) {
+ Seen.set(MBB->getNumber());
+ Map[MBB] = LiveOutPair(VNI, nullptr);
+ }
+
+ /// addLiveInBlock - Add a block with an unknown live-in value. This
+ /// function can only be called once per basic block. Once the live-in value
+ /// has been determined, calculateValues() will add liveness to LR.
+ ///
+ /// @param LR The live range that is live-in to the block.
+ /// @param DomNode The domtree node for the block.
+ /// @param Kill Index in block where LR is killed. If the value is
+ /// live-through, set Kill = SlotIndex() and also call
+ /// setLiveOutValue(MBB, nullptr).
+ void addLiveInBlock(LiveRange &LR,
+ MachineDomTreeNode *DomNode,
+ SlotIndex Kill = SlotIndex()) {
+ LiveIn.push_back(LiveInBlock(LR, DomNode, Kill));
+ }
+
+ /// calculateValues - Calculate the value that will be live-in to each block
+ /// added with addLiveInBlock. Add PHI-def values as needed to preserve SSA
+ /// form. Add liveness to all live-in blocks up to the Kill point, or the
+ /// whole block for live-through blocks.
+ ///
+ /// Every predecessor of a live-in block must have been given a value with
+ /// setLiveOutValue; the value may be null for live-through blocks.
+ void calculateValues();
+};
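+
+// Editorial sketch (not from the original header): the call sequence for the
+// low-level interface documented above. DefMBB, UseMBB, ThruMBB, DefVNI and
+// KillIdx are placeholders.
+//
+//   LiveRangeCalc LRC;
+//   LRC.reset(&MF, Indexes, DomTree, &Alloc);
+//   // Known live-out value in the defining block.
+//   LRC.setLiveOutValue(DefMBB, DefVNI);
+//   // A block where the value is live-in and killed at KillIdx.
+//   LRC.addLiveInBlock(LR, DomTree->getNode(UseMBB), KillIdx);
+//   // A live-through block: unknown live-in value, no kill.
+//   LRC.setLiveOutValue(ThruMBB, nullptr);
+//   LRC.addLiveInBlock(LR, DomTree->getNode(ThruMBB));
+//   // Resolve the SSA values and fill in the live ranges.
+//   LRC.calculateValues();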
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
new file mode 100644
index 000000000000..b35c0adfacad
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -0,0 +1,449 @@
+//===-- LiveRangeEdit.cpp - Basic tools for editing a register live range -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The LiveRangeEdit class represents changes done to a virtual register when it
+// is spilled or split.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumDCEDeleted, "Number of instructions deleted by DCE");
+STATISTIC(NumDCEFoldedLoads, "Number of single use loads folded after DCE");
+STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE");
+
+void LiveRangeEdit::Delegate::anchor() { }
+
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
+ unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ if (VRM) {
+ VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+ }
+ LiveInterval &LI = LIS.createEmptyInterval(VReg);
+ return LI;
+}
+
+unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
+ unsigned VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
+ if (VRM) {
+ VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
+ }
+ return VReg;
+}
+
+bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
+ const MachineInstr *DefMI,
+ AliasAnalysis *aa) {
+ assert(DefMI && "Missing instruction");
+ ScannedRemattable = true;
+ if (!TII.isTriviallyReMaterializable(*DefMI, aa))
+ return false;
+ Remattable.insert(VNI);
+ return true;
+}
+
+void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
+ for (VNInfo *VNI : getParent().valnos) {
+ if (VNI->isUnused())
+ continue;
+ unsigned Original = VRM->getOriginal(getReg());
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(VNI->def);
+ MachineInstr *DefMI = LIS.getInstructionFromIndex(OrigVNI->def);
+ if (!DefMI)
+ continue;
+ checkRematerializable(OrigVNI, DefMI, aa);
+ }
+ ScannedRemattable = true;
+}
+
+bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
+ if (!ScannedRemattable)
+ scanRemattable(aa);
+ return !Remattable.empty();
+}
+
+/// allUsesAvailableAt - Return true if all registers used by OrigMI at
+/// OrigIdx are also available with the same value at UseIdx.
+bool LiveRangeEdit::allUsesAvailableAt(const MachineInstr *OrigMI,
+ SlotIndex OrigIdx,
+ SlotIndex UseIdx) const {
+ OrigIdx = OrigIdx.getRegSlot(true);
+ UseIdx = UseIdx.getRegSlot(true);
+ for (unsigned i = 0, e = OrigMI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = OrigMI->getOperand(i);
+ if (!MO.isReg() || !MO.getReg() || !MO.readsReg())
+ continue;
+
+ // We can't remat physreg uses unless the physreg is constant.
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ if (MRI.isConstantPhysReg(MO.getReg(), *OrigMI->getParent()->getParent()))
+ continue;
+ return false;
+ }
+
+ LiveInterval &li = LIS.getInterval(MO.getReg());
+ const VNInfo *OVNI = li.getVNInfoAt(OrigIdx);
+ if (!OVNI)
+ continue;
+
+ // Don't allow rematerialization immediately after the original def.
+ // It would be incorrect if OrigMI redefines the register.
+ // See PR14098.
+ if (SlotIndex::isSameInstr(OrigIdx, UseIdx))
+ return false;
+
+ if (OVNI != li.getVNInfoAt(UseIdx))
+ return false;
+ }
+ return true;
+}
+
+bool LiveRangeEdit::canRematerializeAt(Remat &RM, VNInfo *OrigVNI,
+ SlotIndex UseIdx, bool cheapAsAMove) {
+ assert(ScannedRemattable && "Call anyRematerializable first");
+
+ // Use scanRemattable info.
+ if (!Remattable.count(OrigVNI))
+ return false;
+
+ // The defining instruction must be provided by the caller in RM.OrigMI.
+ SlotIndex DefIdx;
+ assert(RM.OrigMI && "No defining instruction for remattable value");
+ DefIdx = LIS.getInstructionIndex(*RM.OrigMI);
+
+ // If only cheap remats were requested, bail out early.
+ if (cheapAsAMove && !TII.isAsCheapAsAMove(*RM.OrigMI))
+ return false;
+
+ // Verify that all used registers are available with the same values.
+ if (!allUsesAvailableAt(RM.OrigMI, DefIdx, UseIdx))
+ return false;
+
+ return true;
+}
+
+SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ unsigned DestReg,
+ const Remat &RM,
+ const TargetRegisterInfo &tri,
+ bool Late) {
+ assert(RM.OrigMI && "Invalid remat");
+ TII.reMaterialize(MBB, MI, DestReg, 0, *RM.OrigMI, tri);
+ // DestReg of the cloned instruction cannot be Dead. Set isDead of DestReg
+ // to false anyway in case the isDead flag of RM.OrigMI's dest register
+ // is true.
+ (*--MI).getOperand(0).setIsDead(false);
+ Rematted.insert(RM.ParentVNI);
+ return LIS.getSlotIndexes()->insertMachineInstrInMaps(*MI, Late).getRegSlot();
+}
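+
+// Editorial sketch of the rematerialization flow above, roughly as a spiller
+// would drive it. The Remat setup follows the declaration in LiveRangeEdit.h
+// (ParentVNI set at construction, OrigMI filled in by the caller); UseMI,
+// UseIdx, NewReg and DefMI are placeholders.
+//
+//   if (Edit.anyRematerializable(AA)) {
+//     LiveRangeEdit::Remat RM(ParentVNI);
+//     RM.OrigMI = DefMI;
+//     if (Edit.canRematerializeAt(RM, OrigVNI, UseIdx, /*cheapAsAMove=*/true)) {
+//       SlotIndex DefIdx = Edit.rematerializeAt(*UseMI->getParent(), UseMI,
+//                                               NewReg, RM, TRI);
+//       (void)DefIdx; // location of the new def, used to extend NewReg's range
+//     }
+//   }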
+
+void LiveRangeEdit::eraseVirtReg(unsigned Reg) {
+ if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg))
+ LIS.removeInterval(Reg);
+}
+
+bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
+ SmallVectorImpl<MachineInstr*> &Dead) {
+ MachineInstr *DefMI = nullptr, *UseMI = nullptr;
+
+ // Check that there is a single def and a single use.
+ for (MachineOperand &MO : MRI.reg_nodbg_operands(LI->reg)) {
+ MachineInstr *MI = MO.getParent();
+ if (MO.isDef()) {
+ if (DefMI && DefMI != MI)
+ return false;
+ if (!MI->canFoldAsLoad())
+ return false;
+ DefMI = MI;
+ } else if (!MO.isUndef()) {
+ if (UseMI && UseMI != MI)
+ return false;
+ // FIXME: Targets don't know how to fold subreg uses.
+ if (MO.getSubReg())
+ return false;
+ UseMI = MI;
+ }
+ }
+ if (!DefMI || !UseMI)
+ return false;
+
+ // Since we're moving the DefMI load, make sure we're not extending any live
+ // ranges.
+ if (!allUsesAvailableAt(DefMI, LIS.getInstructionIndex(*DefMI),
+ LIS.getInstructionIndex(*UseMI)))
+ return false;
+
+ // We also need to make sure it is safe to move the load.
+ // Assume there are stores between DefMI and UseMI.
+ bool SawStore = true;
+ if (!DefMI->isSafeToMove(nullptr, SawStore))
+ return false;
+
+ DEBUG(dbgs() << "Try to fold single def: " << *DefMI
+ << " into single use: " << *UseMI);
+
+ SmallVector<unsigned, 8> Ops;
+ if (UseMI->readsWritesVirtualRegister(LI->reg, &Ops).second)
+ return false;
+
+ MachineInstr *FoldMI = TII.foldMemoryOperand(*UseMI, Ops, *DefMI, &LIS);
+ if (!FoldMI)
+ return false;
+ DEBUG(dbgs() << " folded: " << *FoldMI);
+ LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
+ UseMI->eraseFromParent();
+ DefMI->addRegisterDead(LI->reg, nullptr);
+ Dead.push_back(DefMI);
+ ++NumDCEFoldedLoads;
+ return true;
+}
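+
+// Editorial illustration (machine IR shown schematically, x86-flavoured):
+// foldAsLoad looks for a single-def / single-use pair such as
+//
+//   %vreg1 = MOV32rm <fi#0>          ; the only def, a foldable load
+//   %vreg2 = ADD32rr %vreg0, %vreg1  ; the only use
+//
+// and, when the target can fold the memory operand, rewrites it to
+//
+//   %vreg2 = ADD32rm %vreg0, <fi#0>
+//
+// leaving the now-dead load on the Dead list for eliminateDeadDefs().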
+
+bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
+ const MachineOperand &MO) const {
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex Idx = LIS.getInstructionIndex(MI).getRegSlot();
+ if (LI.Query(Idx).isKill())
+ return true;
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned SubReg = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubReg);
+ for (const LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & LaneMask) != 0 && S.Query(Idx).isKill())
+ return true;
+ }
+ return false;
+}
+
+/// Find all live intervals that need to shrink, then remove the instruction.
+void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
+ AliasAnalysis *AA) {
+ assert(MI->allDefsAreDead() && "Def isn't really dead");
+ SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
+
+ // Never delete a bundled instruction.
+ if (MI->isBundled()) {
+ return;
+ }
+ // Never delete inline asm.
+ if (MI->isInlineAsm()) {
+ DEBUG(dbgs() << "Won't delete: " << Idx << '\t' << *MI);
+ return;
+ }
+
+ // Use the same criteria as DeadMachineInstructionElim.
+ bool SawStore = false;
+ if (!MI->isSafeToMove(nullptr, SawStore)) {
+ DEBUG(dbgs() << "Can't delete: " << Idx << '\t' << *MI);
+ return;
+ }
+
+ DEBUG(dbgs() << "Deleting dead def " << Idx << '\t' << *MI);
+
+ // Collect virtual registers to be erased after MI is gone.
+ SmallVector<unsigned, 8> RegsToErase;
+ bool ReadsPhysRegs = false;
+ bool isOrigDef = false;
+ unsigned Dest;
+ if (VRM && MI->getOperand(0).isReg()) {
+ Dest = MI->getOperand(0).getReg();
+ unsigned Original = VRM->getOriginal(Dest);
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(Idx);
+ // The original live-range may have been shrunk to an empty live-range. This
+ // happens when the value is dead, but the interval is kept around so that
+ // other values which depend on it can still be rematerialized.
+ if (OrigVNI)
+ isOrigDef = SlotIndex::isSameInstr(OrigVNI->def, Idx);
+ }
+
+ // Check for live intervals that may shrink
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ if (!MOI->isReg())
+ continue;
+ unsigned Reg = MOI->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // Check if MI reads any unreserved physregs.
+ if (Reg && MOI->readsReg() && !MRI.isReserved(Reg))
+ ReadsPhysRegs = true;
+ else if (MOI->isDef())
+ LIS.removePhysRegDefAt(Reg, Idx);
+ continue;
+ }
+ LiveInterval &LI = LIS.getInterval(Reg);
+
+ // Shrink read registers, unless it is likely to be expensive and
+ // unlikely to change anything. We typically don't want to shrink the
+ // PIC base register that has lots of uses everywhere.
+ // Always shrink COPY uses that probably come from live range splitting.
+ if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MOI->isDef())) ||
+ (MOI->readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, *MOI))))
+ ToShrink.insert(&LI);
+
+ // Remove defined value.
+ if (MOI->isDef()) {
+ if (TheDelegate && LI.getVNInfoAt(Idx) != nullptr)
+ TheDelegate->LRE_WillShrinkVirtReg(LI.reg);
+ LIS.removeVRegDefAt(LI, Idx);
+ if (LI.empty())
+ RegsToErase.push_back(Reg);
+ }
+ }
+
+ // Currently, we don't support DCE of physreg live ranges. If MI reads
+ // any unreserved physregs, don't erase the instruction, but turn it into
+ // a KILL instead. This way, the physreg live ranges don't end up
+ // dangling.
+ // FIXME: It would be better to have something like shrinkToUses() for
+ // physregs. That could potentially enable more DCE and it would free up
+ // the physreg. It would not happen often, though.
+ if (ReadsPhysRegs) {
+ MI->setDesc(TII.get(TargetOpcode::KILL));
+ // Remove all operands that aren't physregs.
+ for (unsigned i = MI->getNumOperands(); i; --i) {
+ const MachineOperand &MO = MI->getOperand(i-1);
+ if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ continue;
+ MI->RemoveOperand(i-1);
+ }
+ DEBUG(dbgs() << "Converted physregs to:\t" << *MI);
+ } else {
+ // If the dest of MI is an original reg and MI is reMaterializable,
+ // don't delete the inst. Replace the dest with a new reg, and keep
+ // the inst for remat of other siblings. The inst is saved in
+ // LiveRangeEdit::DeadRemats and will be deleted after all the
+ // allocations of the func are done.
+ if (isOrigDef && DeadRemats && TII.isTriviallyReMaterializable(*MI, AA)) {
+ LiveInterval &NewLI = createEmptyIntervalFrom(Dest);
+ VNInfo *VNI = NewLI.getNextValue(Idx, LIS.getVNInfoAllocator());
+ NewLI.addSegment(LiveInterval::Segment(Idx, Idx.getDeadSlot(), VNI));
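+ // This appears to undo the NewRegs entry that createEmptyIntervalFrom just
+ // added via the MRI delegate: the temporary register only keeps the remat
+ // source alive and is not meant to be queued for allocation again.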
+ pop_back();
+ markDeadRemat(MI);
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ MI->substituteRegister(Dest, NewLI.reg, 0, TRI);
+ MI->getOperand(0).setIsDead(true);
+ } else {
+ if (TheDelegate)
+ TheDelegate->LRE_WillEraseInstruction(MI);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ ++NumDCEDeleted;
+ }
+ }
+
+ // Erase any virtregs that are now empty and unused. There may be <undef>
+ // uses around. Keep the empty live range in that case.
+ for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
+ unsigned Reg = RegsToErase[i];
+ if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
+ ToShrink.remove(&LIS.getInterval(Reg));
+ eraseVirtReg(Reg);
+ }
+ }
+}
+
+void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
+ ArrayRef<unsigned> RegsBeingSpilled,
+ AliasAnalysis *AA) {
+ ToShrinkSet ToShrink;
+
+ for (;;) {
+ // Erase all dead defs.
+ while (!Dead.empty())
+ eliminateDeadDef(Dead.pop_back_val(), ToShrink, AA);
+
+ if (ToShrink.empty())
+ break;
+
+ // Shrink just one live interval. Then delete new dead defs.
+ LiveInterval *LI = ToShrink.back();
+ ToShrink.pop_back();
+ if (foldAsLoad(LI, Dead))
+ continue;
+ unsigned VReg = LI->reg;
+ if (TheDelegate)
+ TheDelegate->LRE_WillShrinkVirtReg(VReg);
+ if (!LIS.shrinkToUses(LI, &Dead))
+ continue;
+
+ // Don't create new intervals for a register being spilled.
+ // The new intervals would have to be spilled anyway, so it's not worth it.
+ // Also, they currently aren't spilled, so creating them and not spilling
+ // them results in incorrect code.
+ bool BeingSpilled = false;
+ for (unsigned i = 0, e = RegsBeingSpilled.size(); i != e; ++i) {
+ if (VReg == RegsBeingSpilled[i]) {
+ BeingSpilled = true;
+ break;
+ }
+ }
+
+ if (BeingSpilled) continue;
+
+ // LI may have been separated, create new intervals.
+ LI->RenumberValues();
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS.splitSeparateComponents(*LI, SplitLIs);
+ if (!SplitLIs.empty())
+ ++NumFracRanges;
+
+ unsigned Original = VRM ? VRM->getOriginal(VReg) : 0;
+ for (const LiveInterval *SplitLI : SplitLIs) {
+ // If LI is an original interval that hasn't been split yet, make the new
+ // intervals their own originals instead of referring to LI. The original
+ // interval must contain all the split products, and LI doesn't.
+ if (Original != VReg && Original != 0)
+ VRM->setIsSplitFromReg(SplitLI->reg, Original);
+ if (TheDelegate)
+ TheDelegate->LRE_DidCloneVirtReg(SplitLI->reg, VReg);
+ }
+ }
+}
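+
+// Editorial sketch: how a caller (e.g. a spiller) typically drives the DCE
+// loop above. Edit, AA and DeadCopy are placeholders; RegsBeingSpilled is
+// left empty here.
+//
+//   SmallVector<MachineInstr *, 8> DeadInsts;
+//   DeadInsts.push_back(DeadCopy);
+//   Edit.eliminateDeadDefs(DeadInsts, /*RegsBeingSpilled=*/None, AA);
+//   // On return DeadInsts is empty; any intervals separated while shrinking
+//   // have been split into new registers and the delegate was notified.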
+
+// Keep track of new virtual registers created via
+// MachineRegisterInfo::createVirtualRegister.
+void LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg) {
+ if (VRM)
+ VRM->grow();
+
+ NewRegs.push_back(VReg);
+}
+
+void LiveRangeEdit::calculateRegClassAndHint(MachineFunction &MF,
+ const MachineLoopInfo &Loops,
+ const MachineBlockFrequencyInfo &MBFI) {
+ VirtRegAuxInfo VRAI(MF, LIS, VRM, Loops, MBFI);
+ for (unsigned I = 0, Size = size(); I < Size; ++I) {
+ LiveInterval &LI = LIS.getInterval(get(I));
+ if (MRI.recomputeRegClass(LI.reg))
+ DEBUG({
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ dbgs() << "Inflated " << PrintReg(LI.reg) << " to "
+ << TRI->getRegClassName(MRI.getRegClass(LI.reg)) << '\n';
+ });
+ VRAI.calculateSpillWeightAndHint(LI);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveRangeUtils.h b/contrib/llvm/lib/CodeGen/LiveRangeUtils.h
new file mode 100644
index 000000000000..bd57609c3d84
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRangeUtils.h
@@ -0,0 +1,62 @@
+//===-- LiveRangeUtils.h - Live Range modification utilities ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// This file contains helper functions to modify live ranges.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+#define LLVM_LIB_CODEGEN_LIVERANGEUTILS_H
+
+#include "llvm/CodeGen/LiveInterval.h"
+
+namespace llvm {
+
+/// Helper function that distributes live range value numbers and the
+/// corresponding segments of a master live range \p LR to a list of newly
+/// created live ranges \p SplitLRs. \p VNIClasses maps each value number in
+/// \p LR to 0, meaning it stays in \p LR, or to 1..N, meaning it moves to the
+/// corresponding live range in the \p SplitLRs array.
+template<typename LiveRangeT, typename EqClassesT>
+static void DistributeRange(LiveRangeT &LR, LiveRangeT *SplitLRs[],
+ EqClassesT VNIClasses) {
+ // Move segments to new intervals.
+ typename LiveRangeT::iterator J = LR.begin(), E = LR.end();
+ while (J != E && VNIClasses[J->valno->id] == 0)
+ ++J;
+ for (typename LiveRangeT::iterator I = J; I != E; ++I) {
+ if (unsigned eq = VNIClasses[I->valno->id]) {
+ assert((SplitLRs[eq-1]->empty() || SplitLRs[eq-1]->expiredAt(I->start)) &&
+ "New intervals should be empty");
+ SplitLRs[eq-1]->segments.push_back(*I);
+ } else
+ *J++ = *I;
+ }
+ LR.segments.erase(J, E);
+
+ // Transfer VNInfos to their new owners and renumber them.
+ unsigned j = 0, e = LR.getNumValNums();
+ while (j != e && VNIClasses[j] == 0)
+ ++j;
+ for (unsigned i = j; i != e; ++i) {
+ VNInfo *VNI = LR.getValNumInfo(i);
+ if (unsigned eq = VNIClasses[i]) {
+ VNI->id = SplitLRs[eq-1]->getNumValNums();
+ SplitLRs[eq-1]->valnos.push_back(VNI);
+ } else {
+ VNI->id = j;
+ LR.valnos[j++] = VNI;
+ }
+ }
+ LR.valnos.resize(j);
+}
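+
+// Editorial example of the mapping described above: for a live range with four
+// value numbers and VNIClasses = {0, 1, 1, 2},
+//
+//   LiveInterval *SplitLRs[] = {&NewLR0, &NewLR1};
+//   DistributeRange(LR, SplitLRs, VNIClasses);
+//
+// value number 0 and its segments stay in LR, value numbers 1 and 2 move to
+// NewLR0, value number 3 moves to NewLR1, and the ids in every range are
+// renumbered densely from 0.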
+
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
new file mode 100644
index 000000000000..7ee87c1e650f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveRegMatrix.cpp
@@ -0,0 +1,197 @@
+//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the LiveRegMatrix analysis pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumAssigned, "Number of registers assigned");
+STATISTIC(NumUnassigned, "Number of registers unassigned");
+
+char LiveRegMatrix::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
+ "Live Register Matrix", false, false)
+
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
+ UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+
+void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<LiveIntervals>();
+ AU.addRequiredTransitive<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool LiveRegMatrix::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getSubtarget().getRegisterInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
+
+ unsigned NumRegUnits = TRI->getNumRegUnits();
+ if (NumRegUnits != Matrix.size())
+ Queries.reset(new LiveIntervalUnion::Query[NumRegUnits]);
+ Matrix.init(LIUAlloc, NumRegUnits);
+
+ // Make sure no stale queries get reused.
+ invalidateVirtRegs();
+ return false;
+}
+
+void LiveRegMatrix::releaseMemory() {
+ for (unsigned i = 0, e = Matrix.size(); i != e; ++i) {
+ Matrix[i].clear();
+ // No need to clear Queries here, since LiveIntervalUnion::Query doesn't
+ // have anything important to clear and LiveRegMatrix's runOnMachineFunction()
+ // does a std::unique_ptr::reset anyway.
+ }
+}
+
+template<typename Callable>
+bool foreachUnit(const TargetRegisterInfo *TRI, LiveInterval &VRegInterval,
+ unsigned PhysReg, Callable Func) {
+ if (VRegInterval.hasSubRanges()) {
+ for (MCRegUnitMaskIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ unsigned Unit = (*Units).first;
+ LaneBitmask Mask = (*Units).second;
+ for (LiveInterval::SubRange &S : VRegInterval.subranges()) {
+ if (S.LaneMask & Mask) {
+ if (Func(Unit, S))
+ return true;
+ break;
+ }
+ }
+ }
+ } else {
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ if (Func(*Units, VRegInterval))
+ return true;
+ }
+ }
+ return false;
+}
+
+void LiveRegMatrix::assign(LiveInterval &VirtReg, unsigned PhysReg) {
+ DEBUG(dbgs() << "assigning " << PrintReg(VirtReg.reg, TRI)
+ << " to " << PrintReg(PhysReg, TRI) << ':');
+ assert(!VRM->hasPhys(VirtReg.reg) && "Duplicate VirtReg assignment");
+ VRM->assignVirt2Phys(VirtReg.reg, PhysReg);
+
+ foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+ const LiveRange &Range) {
+ DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI) << ' ' << Range);
+ Matrix[Unit].unify(VirtReg, Range);
+ return false;
+ });
+
+ ++NumAssigned;
+ DEBUG(dbgs() << '\n');
+}
+
+void LiveRegMatrix::unassign(LiveInterval &VirtReg) {
+ unsigned PhysReg = VRM->getPhys(VirtReg.reg);
+ DEBUG(dbgs() << "unassigning " << PrintReg(VirtReg.reg, TRI)
+ << " from " << PrintReg(PhysReg, TRI) << ':');
+ VRM->clearVirt(VirtReg.reg);
+
+ foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+ const LiveRange &Range) {
+ DEBUG(dbgs() << ' ' << PrintRegUnit(Unit, TRI));
+ Matrix[Unit].extract(VirtReg, Range);
+ return false;
+ });
+
+ ++NumUnassigned;
+ DEBUG(dbgs() << '\n');
+}
+
+bool LiveRegMatrix::isPhysRegUsed(unsigned PhysReg) const {
+ for (MCRegUnitIterator Unit(PhysReg, TRI); Unit.isValid(); ++Unit) {
+ if (!Matrix[*Unit].empty())
+ return true;
+ }
+ return false;
+}
+
+bool LiveRegMatrix::checkRegMaskInterference(LiveInterval &VirtReg,
+ unsigned PhysReg) {
+ // Check if the cached information is valid.
+ // The same BitVector can be reused for all PhysRegs.
+ // We could cache multiple VirtRegs if it becomes necessary.
+ if (RegMaskVirtReg != VirtReg.reg || RegMaskTag != UserTag) {
+ RegMaskVirtReg = VirtReg.reg;
+ RegMaskTag = UserTag;
+ RegMaskUsable.clear();
+ LIS->checkRegMaskInterference(VirtReg, RegMaskUsable);
+ }
+
+ // The BitVector is indexed by PhysReg, not register unit.
+ // Regmask interference is more fine grained than regunits.
+ // For example, a Win64 call can clobber %ymm8 yet preserve %xmm8.
+ return !RegMaskUsable.empty() && (!PhysReg || !RegMaskUsable.test(PhysReg));
+}
+
+bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
+ unsigned PhysReg) {
+ if (VirtReg.empty())
+ return false;
+ CoalescerPair CP(VirtReg.reg, PhysReg, *TRI);
+
+ bool Result = foreachUnit(TRI, VirtReg, PhysReg, [&](unsigned Unit,
+ const LiveRange &Range) {
+ const LiveRange &UnitRange = LIS->getRegUnit(Unit);
+ return Range.overlaps(UnitRange, CP, *LIS->getSlotIndexes());
+ });
+ return Result;
+}
+
+LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+ unsigned RegUnit) {
+ LiveIntervalUnion::Query &Q = Queries[RegUnit];
+ Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+ return Q;
+}
+
+LiveRegMatrix::InterferenceKind
+LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
+ if (VirtReg.empty())
+ return IK_Free;
+
+ // Regmask interference is the fastest check.
+ if (checkRegMaskInterference(VirtReg, PhysReg))
+ return IK_RegMask;
+
+ // Check for fixed interference.
+ if (checkRegUnitInterference(VirtReg, PhysReg))
+ return IK_RegUnit;
+
+ // Check the matrix for virtual register interference.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+ if (query(VirtReg, *Units).checkInterference())
+ return IK_VirtReg;
+
+ return IK_Free;
+}
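+
+// Editorial sketch: the way a register allocator typically consults this
+// analysis. Matrix, VirtReg and PhysReg come from the allocator's own state.
+//
+//   switch (Matrix->checkInterference(VirtReg, PhysReg)) {
+//   case LiveRegMatrix::IK_Free:
+//     Matrix->assign(VirtReg, PhysReg); // no conflict, commit the assignment
+//     break;
+//   case LiveRegMatrix::IK_VirtReg:
+//     // Some other virtual register is in the way; it could be evicted with
+//     // Matrix->unassign() and this assignment retried.
+//     break;
+//   default: // IK_RegMask or IK_RegUnit: fixed interference, try another reg.
+//     break;
+//   }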
diff --git a/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
new file mode 100644
index 000000000000..dbf1f96102d1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveStackAnalysis.cpp
@@ -0,0 +1,88 @@
+//===-- LiveStackAnalysis.cpp - Live Stack Slot Analysis ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the live stack slot analysis pass. It is analogous to
+// live interval analysis except it's analyzing liveness of stack slots rather
+// than registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "livestacks"
+
+char LiveStacks::ID = 0;
+INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks",
+ "Live Stack Slot Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveStacks, "livestacks",
+ "Live Stack Slot Analysis", false, false)
+
+char &llvm::LiveStacksID = LiveStacks::ID;
+
+void LiveStacks::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequiredTransitive<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LiveStacks::releaseMemory() {
+ // Release VNInfo memory regions; VNInfo objects don't need to be dtor'd.
+ VNInfoAllocator.Reset();
+ S2IMap.clear();
+ S2RCMap.clear();
+}
+
+bool LiveStacks::runOnMachineFunction(MachineFunction &MF) {
+ TRI = MF.getSubtarget().getRegisterInfo();
+ // FIXME: No analysis is being done right now. We are relying on the
+ // register allocators to provide the information.
+ return false;
+}
+
+LiveInterval &
+LiveStacks::getOrCreateInterval(int Slot, const TargetRegisterClass *RC) {
+ assert(Slot >= 0 && "Spill slot index must be >= 0");
+ SS2IntervalMap::iterator I = S2IMap.find(Slot);
+ if (I == S2IMap.end()) {
+ I = S2IMap.emplace(std::piecewise_construct, std::forward_as_tuple(Slot),
+ std::forward_as_tuple(
+ TargetRegisterInfo::index2StackSlot(Slot), 0.0F))
+ .first;
+ S2RCMap.insert(std::make_pair(Slot, RC));
+ } else {
+ // Use the largest common subclass register class.
+ const TargetRegisterClass *OldRC = S2RCMap[Slot];
+ S2RCMap[Slot] = TRI->getCommonSubClass(OldRC, RC);
+ }
+ return I->second;
+}
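+
+// Editorial sketch: how spill code typically registers a slot with this
+// analysis (the getVNInfoAllocator() accessor is assumed from
+// LiveStackAnalysis.h; Slot and RC come from the caller).
+//
+//   LiveInterval &SI = LSS->getOrCreateInterval(Slot, RC);
+//   if (!SI.hasAtLeastOneValue())
+//     SI.getNextValue(SlotIndex(), LSS->getVNInfoAllocator());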
+
+/// print - Implement the dump method.
+void LiveStacks::print(raw_ostream &OS, const Module*) const {
+
+ OS << "********** INTERVALS **********\n";
+ for (const_iterator I = begin(), E = end(); I != E; ++I) {
+ I->second.print(OS);
+ int Slot = I->first;
+ const TargetRegisterClass *RC = getIntervalRegClass(Slot);
+ if (RC)
+ OS << " [" << TRI->getRegClassName(RC) << "]\n";
+ else
+ OS << " [Unknown]\n";
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/LiveVariables.cpp b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
new file mode 100644
index 000000000000..dd87216f5e6b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LiveVariables.cpp
@@ -0,0 +1,810 @@
+//===-- LiveVariables.cpp - Live Variable Analysis for Machine Code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the LiveVariables analysis pass. For each machine
+// instruction in the function, this pass calculates the set of registers that
+// are immediately dead after the instruction (i.e., the instruction calculates
+// the value, but it is never used) and the set of registers that are used by
+// the instruction, but are never used after the instruction (i.e., they are
+// killed).
+//
+// This class computes live variables using a sparse implementation based on
+// the machine code SSA form. This class computes live variable information for
+// each virtual and _register allocatable_ physical register in a function. It
+// uses the dominance properties of SSA form to efficiently compute live
+// variables for virtual registers, and assumes that physical registers are only
+// live within a single basic block (allowing it to do a single local analysis
+// to resolve physical register lifetimes in each basic block). If a physical
+// register is not register allocatable, it is not tracked. This is useful for
+// things like the stack pointer and condition codes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+char LiveVariables::ID = 0;
+char &llvm::LiveVariablesID = LiveVariables::ID;
+INITIALIZE_PASS_BEGIN(LiveVariables, "livevars",
+ "Live Variable Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(UnreachableMachineBlockElim)
+INITIALIZE_PASS_END(LiveVariables, "livevars",
+ "Live Variable Analysis", false, false)
+
+
+void LiveVariables::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequiredID(UnreachableMachineBlockElimID);
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *
+LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+ if (Kills[i]->getParent() == MBB)
+ return Kills[i];
+ return nullptr;
+}
+
+LLVM_DUMP_METHOD void LiveVariables::VarInfo::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << " Alive in blocks: ";
+ for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
+ E = AliveBlocks.end(); I != E; ++I)
+ dbgs() << *I << ", ";
+ dbgs() << "\n Killed by:";
+ if (Kills.empty())
+ dbgs() << " No instructions.\n";
+ else {
+ for (unsigned i = 0, e = Kills.size(); i != e; ++i)
+ dbgs() << "\n #" << i << ": " << *Kills[i];
+ dbgs() << "\n";
+ }
+#endif
+}
+
+/// getVarInfo - Get (possibly creating) a VarInfo object for the given vreg.
+LiveVariables::VarInfo &LiveVariables::getVarInfo(unsigned RegIdx) {
+ assert(TargetRegisterInfo::isVirtualRegister(RegIdx) &&
+ "getVarInfo: not a virtual register!");
+ VirtRegInfo.grow(RegIdx);
+ return VirtRegInfo[RegIdx];
+}
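+
+// Editorial sketch: typical queries a client makes against the information
+// this pass computes (Reg is a virtual register, MBB a block of interest).
+//
+//   LiveVariables &LV = getAnalysis<LiveVariables>();
+//   LiveVariables::VarInfo &VI = LV.getVarInfo(Reg);
+//   if (VI.AliveBlocks.test(MBB->getNumber())) {
+//     // Reg is live-through MBB.
+//   } else if (MachineInstr *Kill = VI.findKill(MBB)) {
+//     // Reg dies in MBB; Kill is its last use there.
+//   }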
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo& VRInfo,
+ MachineBasicBlock *DefBlock,
+ MachineBasicBlock *MBB,
+ std::vector<MachineBasicBlock*> &WorkList) {
+ unsigned BBNum = MBB->getNumber();
+
+ // Check to see if this basic block is one of the killing blocks. If so,
+ // remove it.
+ for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+ if (VRInfo.Kills[i]->getParent() == MBB) {
+ VRInfo.Kills.erase(VRInfo.Kills.begin()+i); // Erase entry
+ break;
+ }
+
+ if (MBB == DefBlock) return; // Terminate recursion
+
+ if (VRInfo.AliveBlocks.test(BBNum))
+ return; // We already know the block is live
+
+ // Mark the variable known alive in this bb
+ VRInfo.AliveBlocks.set(BBNum);
+
+ assert(MBB != &MF->front() && "Can't find reaching def for virtreg");
+ WorkList.insert(WorkList.end(), MBB->pred_rbegin(), MBB->pred_rend());
+}
+
+void LiveVariables::MarkVirtRegAliveInBlock(VarInfo &VRInfo,
+ MachineBasicBlock *DefBlock,
+ MachineBasicBlock *MBB) {
+ std::vector<MachineBasicBlock*> WorkList;
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, MBB, WorkList);
+
+ while (!WorkList.empty()) {
+ MachineBasicBlock *Pred = WorkList.back();
+ WorkList.pop_back();
+ MarkVirtRegAliveInBlock(VRInfo, DefBlock, Pred, WorkList);
+ }
+}
+
+void LiveVariables::HandleVirtRegUse(unsigned reg, MachineBasicBlock *MBB,
+ MachineInstr &MI) {
+ assert(MRI->getVRegDef(reg) && "Register use before def!");
+
+ unsigned BBNum = MBB->getNumber();
+
+ VarInfo& VRInfo = getVarInfo(reg);
+
+ // Check to see if this basic block is already a kill block.
+ if (!VRInfo.Kills.empty() && VRInfo.Kills.back()->getParent() == MBB) {
+ // Yes, this register is killed in this basic block already. Increase the
+ // live range by updating the kill instruction.
+ VRInfo.Kills.back() = &MI;
+ return;
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = VRInfo.Kills.size(); i != e; ++i)
+ assert(VRInfo.Kills[i]->getParent() != MBB && "entry should be at end!");
+#endif
+
+ // This situation can occur:
+ //
+ // ,------.
+ // | |
+ // | v
+ // | t2 = phi ... t1 ...
+ // | |
+ // | v
+ // | t1 = ...
+ // | ... = ... t1 ...
+ // | |
+ // `------'
+ //
+ // where there is a use in a PHI node that's a predecessor to the defining
+ // block. We don't want to mark all predecessors as having the value "alive"
+ // in this case.
+ if (MBB == MRI->getVRegDef(reg)->getParent()) return;
+
+ // Add a new kill entry for this basic block. If this virtual register is
+ // already marked as alive in this basic block, that means it is alive in at
+ // least one of the successor blocks, it's not a kill.
+ if (!VRInfo.AliveBlocks.test(BBNum))
+ VRInfo.Kills.push_back(&MI);
+
+ // Update all dominating blocks to mark them as "known live".
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ E = MBB->pred_end(); PI != E; ++PI)
+ MarkVirtRegAliveInBlock(VRInfo, MRI->getVRegDef(reg)->getParent(), *PI);
+}
+
+void LiveVariables::HandleVirtRegDef(unsigned Reg, MachineInstr &MI) {
+ VarInfo &VRInfo = getVarInfo(Reg);
+
+ if (VRInfo.AliveBlocks.empty())
+ // If vr is not alive in any block, then defaults to dead.
+ VRInfo.Kills.push_back(&MI);
+}
+
+/// FindLastPartialDef - Return the last partial def of the specified register.
+/// Also returns the sub-registers that are defined by the instruction.
+MachineInstr *LiveVariables::FindLastPartialDef(unsigned Reg,
+ SmallSet<unsigned,4> &PartDefRegs) {
+ unsigned LastDefReg = 0;
+ unsigned LastDefDist = 0;
+ MachineInstr *LastDef = nullptr;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (!Def)
+ continue;
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastDefDist) {
+ LastDefReg = SubReg;
+ LastDef = Def;
+ LastDefDist = Dist;
+ }
+ }
+
+ if (!LastDef)
+ return nullptr;
+
+ PartDefRegs.insert(LastDefReg);
+ for (unsigned i = 0, e = LastDef->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = LastDef->getOperand(i);
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() == 0)
+ continue;
+ unsigned DefReg = MO.getReg();
+ if (TRI->isSubRegister(Reg, DefReg)) {
+ for (MCSubRegIterator SubRegs(DefReg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ PartDefRegs.insert(*SubRegs);
+ }
+ }
+ return LastDef;
+}
+
+/// HandlePhysRegUse - Turn previous partial defs into read/mod/writes. Add
+/// implicit defs to a machine instruction if there was an earlier def of its
+/// super-register.
+void LiveVariables::HandlePhysRegUse(unsigned Reg, MachineInstr &MI) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ // If there was a previous use or a "full" def all is well.
+ if (!LastDef && !PhysRegUse[Reg]) {
+ // Otherwise, the last sub-register def implicitly defines this register.
+ // e.g.
+ // AH =
+ // AL = ... <imp-def EAX>, <imp-kill AH>
+ // = AH
+ // ...
+ // = EAX
+ // All of the sub-registers must have been defined before the use of Reg!
+ SmallSet<unsigned, 4> PartDefRegs;
+ MachineInstr *LastPartialDef = FindLastPartialDef(Reg, PartDefRegs);
+ // If LastPartialDef is NULL, it must be using a livein register.
+ if (LastPartialDef) {
+ LastPartialDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[Reg] = LastPartialDef;
+ SmallSet<unsigned, 8> Processed;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ if (Processed.count(SubReg))
+ continue;
+ if (PartDefRegs.count(SubReg))
+ continue;
+ // This part of Reg was defined before the last partial def. It's killed
+ // here.
+ LastPartialDef->addOperand(MachineOperand::CreateReg(SubReg,
+ false/*IsDef*/,
+ true/*IsImp*/));
+ PhysRegDef[SubReg] = LastPartialDef;
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+ Processed.insert(*SS);
+ }
+ }
+ } else if (LastDef && !PhysRegUse[Reg] &&
+ !LastDef->findRegisterDefOperand(Reg))
+ // Last def defines the super register, add an implicit def of reg.
+ LastDef->addOperand(MachineOperand::CreateReg(Reg, true/*IsDef*/,
+ true/*IsImp*/));
+
+ // Remember this use.
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ PhysRegUse[*SubRegs] = &MI;
+}
+
+/// FindLastRefOrPartRef - Return the last reference or partial reference of
+/// the specified register.
+MachineInstr *LiveVariables::FindLastRefOrPartRef(unsigned Reg) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ MachineInstr *LastUse = PhysRegUse[Reg];
+ if (!LastDef && !LastUse)
+ return nullptr;
+
+ MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+ unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+ unsigned LastPartDefDist = 0;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (Def && Def != LastDef) {
+ // There was a def of this sub-register in between. This is a partial
+ // def, keep track of the last one.
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastPartDefDist)
+ LastPartDefDist = Dist;
+ } else if (MachineInstr *Use = PhysRegUse[SubReg]) {
+ unsigned Dist = DistanceMap[Use];
+ if (Dist > LastRefOrPartRefDist) {
+ LastRefOrPartRefDist = Dist;
+ LastRefOrPartRef = Use;
+ }
+ }
+ }
+
+ return LastRefOrPartRef;
+}
+
+bool LiveVariables::HandlePhysRegKill(unsigned Reg, MachineInstr *MI) {
+ MachineInstr *LastDef = PhysRegDef[Reg];
+ MachineInstr *LastUse = PhysRegUse[Reg];
+ if (!LastDef && !LastUse)
+ return false;
+
+ MachineInstr *LastRefOrPartRef = LastUse ? LastUse : LastDef;
+ unsigned LastRefOrPartRefDist = DistanceMap[LastRefOrPartRef];
+ // The whole register is used.
+ // AL =
+ // AH =
+ //
+ // = AX
+ // = AL, AX<imp-use, kill>
+ // AX =
+ //
+ // Or whole register is defined, but not used at all.
+ // AX<dead> =
+ // ...
+ // AX =
+ //
+ // Or whole register is defined, but only partly used.
+ // AX<dead> = AL<imp-def>
+ // = AL<kill>
+ // AX =
+ MachineInstr *LastPartDef = nullptr;
+ unsigned LastPartDefDist = 0;
+ SmallSet<unsigned, 8> PartUses;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ MachineInstr *Def = PhysRegDef[SubReg];
+ if (Def && Def != LastDef) {
+ // There was a def of this sub-register in between. This is a partial
+ // def, keep track of the last one.
+ unsigned Dist = DistanceMap[Def];
+ if (Dist > LastPartDefDist) {
+ LastPartDefDist = Dist;
+ LastPartDef = Def;
+ }
+ continue;
+ }
+ if (MachineInstr *Use = PhysRegUse[SubReg]) {
+ for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true); SS.isValid();
+ ++SS)
+ PartUses.insert(*SS);
+ unsigned Dist = DistanceMap[Use];
+ if (Dist > LastRefOrPartRefDist) {
+ LastRefOrPartRefDist = Dist;
+ LastRefOrPartRef = Use;
+ }
+ }
+ }
+
+ if (!PhysRegUse[Reg]) {
+ // Partial uses. Mark register def dead and add implicit def of
+ // sub-registers which are used.
+ // EAX<dead> = op AL<imp-def>
+ // That is, EAX def is dead but AL def extends past it.
+ PhysRegDef[Reg]->addRegisterDead(Reg, TRI, true);
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ if (!PartUses.count(SubReg))
+ continue;
+ bool NeedDef = true;
+ if (PhysRegDef[Reg] == PhysRegDef[SubReg]) {
+ MachineOperand *MO = PhysRegDef[Reg]->findRegisterDefOperand(SubReg);
+ if (MO) {
+ NeedDef = false;
+ assert(!MO->isDead());
+ }
+ }
+ if (NeedDef)
+ PhysRegDef[Reg]->addOperand(MachineOperand::CreateReg(SubReg,
+ true/*IsDef*/, true/*IsImp*/));
+ MachineInstr *LastSubRef = FindLastRefOrPartRef(SubReg);
+ if (LastSubRef)
+ LastSubRef->addRegisterKilled(SubReg, TRI, true);
+ else {
+ LastRefOrPartRef->addRegisterKilled(SubReg, TRI, true);
+ for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true);
+ SS.isValid(); ++SS)
+ PhysRegUse[*SS] = LastRefOrPartRef;
+ }
+ for (MCSubRegIterator SS(SubReg, TRI); SS.isValid(); ++SS)
+ PartUses.erase(*SS);
+ }
+ } else if (LastRefOrPartRef == PhysRegDef[Reg] && LastRefOrPartRef != MI) {
+ if (LastPartDef)
+ // The last partial def kills the register.
+ LastPartDef->addOperand(MachineOperand::CreateReg(Reg, false/*IsDef*/,
+ true/*IsImp*/, true/*IsKill*/));
+ else {
+ MachineOperand *MO =
+ LastRefOrPartRef->findRegisterDefOperand(Reg, false, TRI);
+ bool NeedEC = MO->isEarlyClobber() && MO->getReg() != Reg;
+ // If the last reference is the last def, then it's not used at all.
+ // That is, unless we are currently processing the last reference itself.
+ LastRefOrPartRef->addRegisterDead(Reg, TRI, true);
+ if (NeedEC) {
+ // If we are adding a subreg def and the superreg def is marked early
+ // clobber, add an early clobber marker to the subreg def.
+ MO = LastRefOrPartRef->findRegisterDefOperand(Reg);
+ if (MO)
+ MO->setIsEarlyClobber();
+ }
+ }
+ } else
+ LastRefOrPartRef->addRegisterKilled(Reg, TRI, true);
+ return true;
+}
+
+void LiveVariables::HandleRegMask(const MachineOperand &MO) {
+ // Call HandlePhysRegKill() for all live registers clobbered by Mask.
+ // Clobbered registers are always dead, so there is no need to use
+ // HandlePhysRegDef().
+ for (unsigned Reg = 1, NumRegs = TRI->getNumRegs(); Reg != NumRegs; ++Reg) {
+ // Skip dead regs.
+ if (!PhysRegDef[Reg] && !PhysRegUse[Reg])
+ continue;
+ // Skip mask-preserved regs.
+ if (!MO.clobbersPhysReg(Reg))
+ continue;
+ // Kill the largest clobbered super-register.
+ // This avoids needless implicit operands.
+ unsigned Super = Reg;
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR)
+ if ((PhysRegDef[*SR] || PhysRegUse[*SR]) && MO.clobbersPhysReg(*SR))
+ Super = *SR;
+ HandlePhysRegKill(Super, nullptr);
+ }
+}
+
+void LiveVariables::HandlePhysRegDef(unsigned Reg, MachineInstr *MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ // What parts of the register are previously defined?
+ SmallSet<unsigned, 32> Live;
+ if (PhysRegDef[Reg] || PhysRegUse[Reg]) {
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ Live.insert(*SubRegs);
+ } else {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ // If a register isn't itself defined, but all the parts that make it up
+ // are defined, then consider it defined as well.
+ // e.g.
+ // AL =
+ // AH =
+ // = AX
+ if (Live.count(SubReg))
+ continue;
+ if (PhysRegDef[SubReg] || PhysRegUse[SubReg]) {
+ for (MCSubRegIterator SS(SubReg, TRI, /*IncludeSelf=*/true);
+ SS.isValid(); ++SS)
+ Live.insert(*SS);
+ }
+ }
+ }
+
+ // Start from the largest piece, find the last time any part of the register
+ // is referenced.
+ HandlePhysRegKill(Reg, MI);
+ // Only some of the sub-registers are used.
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ if (!Live.count(SubReg))
+ // Skip if this sub-register isn't defined.
+ continue;
+ HandlePhysRegKill(SubReg, MI);
+ }
+
+ if (MI)
+ Defs.push_back(Reg); // Remember this def.
+}
+
+void LiveVariables::UpdatePhysRegDefs(MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ while (!Defs.empty()) {
+ unsigned Reg = Defs.back();
+ Defs.pop_back();
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ PhysRegDef[SubReg] = &MI;
+ PhysRegUse[SubReg] = nullptr;
+ }
+ }
+}
+
+void LiveVariables::runOnInstr(MachineInstr &MI,
+ SmallVectorImpl<unsigned> &Defs) {
+ assert(!MI.isDebugValue());
+ // Process all of the operands of the instruction...
+ unsigned NumOperandsToProcess = MI.getNumOperands();
+
+ // Unless it is a PHI node. In this case, ONLY process the DEF, not any
+ // of the uses. They will be handled in other basic blocks.
+ if (MI.isPHI())
+ NumOperandsToProcess = 1;
+
+ // Clear kill and dead markers. LV will recompute them.
+ SmallVector<unsigned, 4> UseRegs;
+ SmallVector<unsigned, 4> DefRegs;
+ SmallVector<unsigned, 1> RegMasks;
+ for (unsigned i = 0; i != NumOperandsToProcess; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isRegMask()) {
+ RegMasks.push_back(i);
+ continue;
+ }
+ if (!MO.isReg() || MO.getReg() == 0)
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MO.isUse()) {
+ if (!(TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ MRI->isReserved(MOReg)))
+ MO.setIsKill(false);
+ if (MO.readsReg())
+ UseRegs.push_back(MOReg);
+ } else {
+ assert(MO.isDef());
+ // FIXME: We should not remove any dead flags. However the MIPS RDDSP
+ // instruction needs it at the moment: http://llvm.org/PR27116.
+ if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ !MRI->isReserved(MOReg))
+ MO.setIsDead(false);
+ DefRegs.push_back(MOReg);
+ }
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ // Process all uses.
+ for (unsigned i = 0, e = UseRegs.size(); i != e; ++i) {
+ unsigned MOReg = UseRegs[i];
+ if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ HandleVirtRegUse(MOReg, MBB, MI);
+ else if (!MRI->isReserved(MOReg))
+ HandlePhysRegUse(MOReg, MI);
+ }
+
+ // Process all masked registers. (Call clobbers).
+ for (unsigned i = 0, e = RegMasks.size(); i != e; ++i)
+ HandleRegMask(MI.getOperand(RegMasks[i]));
+
+ // Process all defs.
+ for (unsigned i = 0, e = DefRegs.size(); i != e; ++i) {
+ unsigned MOReg = DefRegs[i];
+ if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ HandleVirtRegDef(MOReg, MI);
+ else if (!MRI->isReserved(MOReg))
+ HandlePhysRegDef(MOReg, &MI, Defs);
+ }
+ UpdatePhysRegDefs(MI, Defs);
+}
+
+void LiveVariables::runOnBlock(MachineBasicBlock *MBB, const unsigned NumRegs) {
+ // Mark live-in registers as live-in.
+ SmallVector<unsigned, 4> Defs;
+ for (const auto &LI : MBB->liveins()) {
+ assert(TargetRegisterInfo::isPhysicalRegister(LI.PhysReg) &&
+ "Cannot have a live-in virtual register!");
+ HandlePhysRegDef(LI.PhysReg, nullptr, Defs);
+ }
+
+ // Loop over all of the instructions, processing them.
+ DistanceMap.clear();
+ unsigned Dist = 0;
+ for (MachineInstr &MI : *MBB) {
+ if (MI.isDebugValue())
+ continue;
+ DistanceMap.insert(std::make_pair(&MI, Dist++));
+
+ runOnInstr(MI, Defs);
+ }
+
+ // Handle any virtual assignments from PHI nodes which might be at the
+ // bottom of this basic block. We check all of our successor blocks to see
+ // if they have PHI nodes, and if so, we simulate an assignment at the end
+ // of the current block.
+ if (!PHIVarInfo[MBB->getNumber()].empty()) {
+ SmallVectorImpl<unsigned> &VarInfoVec = PHIVarInfo[MBB->getNumber()];
+
+ for (SmallVectorImpl<unsigned>::iterator I = VarInfoVec.begin(),
+ E = VarInfoVec.end(); I != E; ++I)
+ // Mark it alive only in the block we are representing.
+ MarkVirtRegAliveInBlock(getVarInfo(*I),MRI->getVRegDef(*I)->getParent(),
+ MBB);
+ }
+
+ // MachineCSE may CSE instructions which write to non-allocatable physical
+ // registers across MBBs. Remember if any reserved register is liveout.
+ SmallSet<unsigned, 4> LiveOuts;
+ for (MachineBasicBlock::const_succ_iterator SI = MBB->succ_begin(),
+ SE = MBB->succ_end(); SI != SE; ++SI) {
+ MachineBasicBlock *SuccMBB = *SI;
+ if (SuccMBB->isEHPad())
+ continue;
+ for (const auto &LI : SuccMBB->liveins()) {
+ if (!TRI->isInAllocatableClass(LI.PhysReg))
+ // Ignore other live-ins, e.g. those that are live into landing pads.
+ LiveOuts.insert(LI.PhysReg);
+ }
+ }
+
+ // Loop over PhysRegDef / PhysRegUse, killing any registers that are
+ // available at the end of the basic block.
+ for (unsigned i = 0; i != NumRegs; ++i)
+ if ((PhysRegDef[i] || PhysRegUse[i]) && !LiveOuts.count(i))
+ HandlePhysRegDef(i, nullptr, Defs);
+}
+
+bool LiveVariables::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ MRI = &mf.getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+
+ const unsigned NumRegs = TRI->getNumRegs();
+ PhysRegDef.assign(NumRegs, nullptr);
+ PhysRegUse.assign(NumRegs, nullptr);
+ PHIVarInfo.resize(MF->getNumBlockIDs());
+ PHIJoins.clear();
+
+ // FIXME: LiveIntervals will be updated to remove its dependence on
+ // LiveVariables to improve compilation time and eliminate bizarre pass
+ // dependencies. Until then, we can't change much in -O0.
+ if (!MRI->isSSA())
+ report_fatal_error("regalloc=... not currently supported with -O0");
+
+ analyzePHINodes(mf);
+
+ // Calculate live variable information in depth first order on the CFG of the
+ // function. This guarantees that we will see the definition of a virtual
+ // register before its uses due to dominance properties of SSA (except for PHI
+ // nodes, which are treated as a special case).
+ MachineBasicBlock *Entry = &MF->front();
+ SmallPtrSet<MachineBasicBlock*,16> Visited;
+
+ for (MachineBasicBlock *MBB : depth_first_ext(Entry, Visited)) {
+ runOnBlock(MBB, NumRegs);
+
+ PhysRegDef.assign(NumRegs, nullptr);
+ PhysRegUse.assign(NumRegs, nullptr);
+ }
+
+ // Convert and transfer the dead/killed information we have gathered in
+ // VirtRegInfo onto the MachineInstrs.
+ for (unsigned i = 0, e1 = VirtRegInfo.size(); i != e1; ++i) {
+ const unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ for (unsigned j = 0, e2 = VirtRegInfo[Reg].Kills.size(); j != e2; ++j)
+ if (VirtRegInfo[Reg].Kills[j] == MRI->getVRegDef(Reg))
+ VirtRegInfo[Reg].Kills[j]->addRegisterDead(Reg, TRI);
+ else
+ VirtRegInfo[Reg].Kills[j]->addRegisterKilled(Reg, TRI);
+ }
+
+ // Check to make sure there are no unreachable blocks in the MC CFG for the
+ // function. If there are, it is due to a bug in the instruction selector or
+ // some other part of the code generator.
+#ifndef NDEBUG
+ for(MachineFunction::iterator i = MF->begin(), e = MF->end(); i != e; ++i)
+ assert(Visited.count(&*i) != 0 && "unreachable basic block found");
+#endif
+
+ PhysRegDef.clear();
+ PhysRegUse.clear();
+ PHIVarInfo.clear();
+
+ return false;
+}
+
+/// replaceKillInstruction - Update register kill info by replacing a kill
+/// instruction with a new one.
+void LiveVariables::replaceKillInstruction(unsigned Reg, MachineInstr &OldMI,
+ MachineInstr &NewMI) {
+ VarInfo &VI = getVarInfo(Reg);
+ std::replace(VI.Kills.begin(), VI.Kills.end(), &OldMI, &NewMI);
+}
+
+/// removeVirtualRegistersKilled - Remove all killed info for the specified
+/// instruction.
+void LiveVariables::removeVirtualRegistersKilled(MachineInstr &MI) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isKill()) {
+ MO.setIsKill(false);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ bool removed = getVarInfo(Reg).removeKill(MI);
+ assert(removed && "kill not in register's VarInfo?");
+ (void)removed;
+ }
+ }
+ }
+}
+
+/// analyzePHINodes - Gather information about the PHI nodes in this function.
+/// In particular, we map each virtual register used by a PHI node to the
+/// basic block the value is coming from.
+///
+void LiveVariables::analyzePHINodes(const MachineFunction& Fn) {
+ for (const auto &MBB : Fn)
+ for (const auto &BBI : MBB) {
+ if (!BBI.isPHI())
+ break;
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2)
+ if (BBI.getOperand(i).readsReg())
+ PHIVarInfo[BBI.getOperand(i + 1).getMBB()->getNumber()]
+ .push_back(BBI.getOperand(i).getReg());
+ }
+}
+
+bool LiveVariables::VarInfo::isLiveIn(const MachineBasicBlock &MBB,
+ unsigned Reg,
+ MachineRegisterInfo &MRI) {
+ unsigned Num = MBB.getNumber();
+
+ // Reg is live-through.
+ if (AliveBlocks.test(Num))
+ return true;
+
+ // Registers defined in MBB cannot be live in.
+ const MachineInstr *Def = MRI.getVRegDef(Reg);
+ if (Def && Def->getParent() == &MBB)
+ return false;
+
+ // Reg was not defined in MBB, was it killed here?
+ return findKill(&MBB);
+}
+
+bool LiveVariables::isLiveOut(unsigned Reg, const MachineBasicBlock &MBB) {
+ LiveVariables::VarInfo &VI = getVarInfo(Reg);
+
+ SmallPtrSet<const MachineBasicBlock *, 8> Kills;
+ for (unsigned i = 0, e = VI.Kills.size(); i != e; ++i)
+ Kills.insert(VI.Kills[i]->getParent());
+
+ // Loop over all of the successors of the basic block, checking to see if
+ // the value is either live in the block, or if it is killed in the block.
+ for (const MachineBasicBlock *SuccMBB : MBB.successors()) {
+ // Is it alive in this successor?
+ unsigned SuccIdx = SuccMBB->getNumber();
+ if (VI.AliveBlocks.test(SuccIdx))
+ return true;
+ // Or is it live because there is a use in a successor that kills it?
+ if (Kills.count(SuccMBB))
+ return true;
+ }
+
+ return false;
+}
+
+/// addNewBlock - Add a new basic block BB as an empty successor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB.
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *DomBB,
+ MachineBasicBlock *SuccBB) {
+ const unsigned NumNew = BB->getNumber();
+
+ SmallSet<unsigned, 16> Defs, Kills;
+
+ MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
+ for (; BBI != BBE && BBI->isPHI(); ++BBI) {
+ // Record the def of the PHI node.
+ Defs.insert(BBI->getOperand(0).getReg());
+
+ // All registers used by PHI nodes in SuccBB must be live through BB.
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ if (BBI->getOperand(i+1).getMBB() == BB)
+ getVarInfo(BBI->getOperand(i).getReg()).AliveBlocks.set(NumNew);
+ }
+
+ // Record all vreg defs and kills of all instructions in SuccBB.
+ for (; BBI != BBE; ++BBI) {
+ for (MachineInstr::mop_iterator I = BBI->operands_begin(),
+ E = BBI->operands_end(); I != E; ++I) {
+ if (I->isReg() && TargetRegisterInfo::isVirtualRegister(I->getReg())) {
+ if (I->isDef())
+ Defs.insert(I->getReg());
+ else if (I->isKill())
+ Kills.insert(I->getReg());
+ }
+ }
+ }
+
+ // Update info for all live variables
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // If the register is defined in the successor, it can't be live through BB.
+ if (Defs.count(Reg))
+ continue;
+
+ // If the register is either killed in or live through SuccBB it's also live
+ // through BB.
+ VarInfo &VI = getVarInfo(Reg);
+ if (Kills.count(Reg) || VI.AliveBlocks.test(SuccBB->getNumber()))
+ VI.AliveBlocks.set(NumNew);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
new file mode 100644
index 000000000000..af7392f4435b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -0,0 +1,420 @@
+//===- LocalStackSlotAllocation.cpp - Pre-allocate locals to stack slots --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass assigns local frame indices to stack slots relative to one another
+// and allocates additional base registers to access them when the target
+// estimates they are likely to be out of range of stack pointer and frame
+// pointer relative addressing.
+//
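+// Sketch of the idea (pseudo machine IR; exact opcodes are target specific):
+// a frame index reference that the target reports as out of range, such as
+// STORE %v0, <fi#1>, 4096
+// is rewritten to go through a virtual base register materialized in the
+// entry block:
+// %base = FRAME-BASE <fi#1>
+// STORE %v0, %base, 0
+//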
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "localstackalloc"
+
+STATISTIC(NumAllocations, "Number of frame indices allocated into local block");
+STATISTIC(NumBaseRegisters, "Number of virtual frame base registers allocated");
+STATISTIC(NumReplacements, "Number of frame index references replaced");
+
+namespace {
+ class FrameRef {
+ MachineBasicBlock::iterator MI; // Instr referencing the frame
+ int64_t LocalOffset; // Local offset of the frame idx referenced
+ int FrameIdx; // The frame index
+ public:
+ FrameRef(MachineBasicBlock::iterator I, int64_t Offset, int Idx) :
+ MI(I), LocalOffset(Offset), FrameIdx(Idx) {}
+ bool operator<(const FrameRef &RHS) const {
+ return LocalOffset < RHS.LocalOffset;
+ }
+ MachineBasicBlock::iterator getMachineInstr() const { return MI; }
+ int64_t getLocalOffset() const { return LocalOffset; }
+ int getFrameIndex() const { return FrameIdx; }
+ };
+
+ class LocalStackSlotPass: public MachineFunctionPass {
+ SmallVector<int64_t,16> LocalOffsets;
+ /// StackObjSet - A set of stack object indexes
+ typedef SmallSetVector<int, 8> StackObjSet;
+
+ void AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx, int64_t &Offset,
+ bool StackGrowsDown, unsigned &MaxAlign);
+ void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo *MFI, bool StackGrowsDown,
+ int64_t &Offset, unsigned &MaxAlign);
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+ bool insertFrameReferenceRegisters(MachineFunction &Fn);
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LocalStackSlotPass() : MachineFunctionPass(ID) {
+ initializeLocalStackSlotPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<StackProtector>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ };
+} // end anonymous namespace
+
+char LocalStackSlotPass::ID = 0;
+char &llvm::LocalStackSlotAllocationID = LocalStackSlotPass::ID;
+INITIALIZE_PASS_BEGIN(LocalStackSlotPass, "localstackalloc",
+ "Local Stack Slot Allocation", false, false)
+INITIALIZE_PASS_DEPENDENCY(StackProtector)
+INITIALIZE_PASS_END(LocalStackSlotPass, "localstackalloc",
+ "Local Stack Slot Allocation", false, false)
+
+
+bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned LocalObjectCount = MFI->getObjectIndexEnd();
+
+ // If the target doesn't want/need this pass, or if there are no locals
+ // to consider, early exit.
+ if (!TRI->requiresVirtualBaseRegisters(MF) || LocalObjectCount == 0)
+ return true;
+
+ // Make sure we have enough space to store the local offsets.
+ LocalOffsets.resize(MFI->getObjectIndexEnd());
+
+ // Lay out the local blob.
+ calculateFrameObjectOffsets(MF);
+
+ // Insert virtual base registers to resolve frame index references.
+ bool UsedBaseRegs = insertFrameReferenceRegisters(MF);
+
+ // Tell MFI whether any base registers were allocated. PEI will only
+ // want to use the local block allocations from this pass if there were any.
+ // Otherwise, PEI can do a bit better job of getting the alignment right
+ // without a hole at the start since it knows the alignment of the stack
+ // at the start of local allocation, and this pass doesn't.
+ MFI->setUseLocalStackAllocationBlock(UsedBaseRegs);
+
+ return true;
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo *MFI,
+ int FrameIdx, int64_t &Offset,
+ bool StackGrowsDown,
+ unsigned &MaxAlign) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = (Offset + Align - 1) / Align * Align;
+
+ int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
+ DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
+ << LocalOffset << "\n");
+ // Keep the offset available for base register allocation
+ LocalOffsets[FrameIdx] = LocalOffset;
+ // And tell MFI about it for PEI to use later
+ MFI->mapLocalFrameObject(FrameIdx, LocalOffset);
+
+ if (!StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ ++NumAllocations;
+}
+
+/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
+/// those required to be close to the Stack Protector) to stack offsets.
+void LocalStackSlotPass::AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo *MFI,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign) {
+
+ for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
+ E = UnassignedObjs.end(); I != E; ++I) {
+ int i = *I;
+ AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+ ProtectedObjs.insert(i);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+ int64_t Offset = 0;
+ unsigned MaxAlign = 0;
+ StackProtector *SP = &getAnalysis<StackProtector>();
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ SmallSet<int, 16> ProtectedObjs;
+ if (MFI->getStackProtectorIndex() >= 0) {
+ StackObjSet LargeArrayObjs;
+ StackObjSet SmallArrayObjs;
+ StackObjSet AddrOfObjs;
+
+ AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), Offset,
+ StackGrowsDown, MaxAlign);
+
+ // Assign large stack objects first.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i)
+ continue;
+
+ switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
+ case StackProtector::SSPLK_None:
+ continue;
+ case StackProtector::SSPLK_SmallArray:
+ SmallArrayObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_AddrOf:
+ AddrOfObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_LargeArray:
+ LargeArrayObjs.insert(i);
+ continue;
+ }
+ llvm_unreachable("Unexpected SSPLayoutKind.");
+ }
+
+ AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign);
+ }
+
+ // Then assign frame offsets to stack objects that are not used to spill
+ // callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i)
+ continue;
+ if (ProtectedObjs.count(i))
+ continue;
+
+ AdjustStackOffset(MFI, i, Offset, StackGrowsDown, MaxAlign);
+ }
+
+ // Remember how big this blob of stack space is
+ MFI->setLocalFrameSize(Offset);
+ MFI->setLocalFrameMaxAlign(MaxAlign);
+}
+
+static inline bool
+lookupCandidateBaseReg(unsigned BaseReg,
+ int64_t BaseOffset,
+ int64_t FrameSizeAdjust,
+ int64_t LocalFrameOffset,
+ const MachineInstr &MI,
+ const TargetRegisterInfo *TRI) {
+ // Check whether the offset from the base register to the target address is
+ // in range for the instruction.
+ int64_t Offset = FrameSizeAdjust + LocalFrameOffset - BaseOffset;
+ return TRI->isFrameOffsetLegal(&MI, BaseReg, Offset);
+}
+
+bool LocalStackSlotPass::insertFrameReferenceRegisters(MachineFunction &Fn) {
+ // Scan the function's instructions looking for frame index references.
+ // For each, ask the target if it wants a virtual base register for it
+ // based on what we can tell it about where the local will end up in the
+ // stack frame. If it wants one, re-use a suitable one we've previously
+ // allocated, or if there isn't one that fits the bill, allocate a new one
+ // and ask the target to create a defining instruction for it.
+ bool UsedBaseReg = false;
+
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Collect all of the instructions in the function that reference
+ // a frame index. Also store the frame index referenced to ease later
+ // lookup. (For any insn that has more than one FI reference, we arbitrarily
+ // choose the first one).
+ SmallVector<FrameRef, 64> FrameReferenceInsns;
+
+ for (MachineBasicBlock &BB : Fn) {
+ for (MachineInstr &MI : BB) {
+ // Debug value, statepoint, stackmap and patchpoint instructions can't be
+ // out of range, so they don't need any updates.
+ if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STATEPOINT ||
+ MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT)
+ continue;
+
+ // For now, allocate the base register(s) within the basic block
+ // where they're used, and don't try to keep them around outside
+ // of that. It may be beneficial to try sharing them more broadly
+ // than that, but the increased register pressure makes that a
+ // tricky thing to balance. Investigate if re-materializing these
+ // becomes an issue.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ // Consider replacing all frame index operands that reference
+ // an object allocated in the local block.
+ if (MI.getOperand(i).isFI()) {
+ // Don't try this with values not in the local block.
+ if (!MFI->isObjectPreAllocated(MI.getOperand(i).getIndex()))
+ break;
+ int Idx = MI.getOperand(i).getIndex();
+ int64_t LocalOffset = LocalOffsets[Idx];
+ if (!TRI->needsFrameBaseReg(&MI, LocalOffset))
+ break;
+ FrameReferenceInsns.push_back(FrameRef(&MI, LocalOffset, Idx));
+ break;
+ }
+ }
+ }
+ }
+
+ // Sort the frame references by local offset
+ array_pod_sort(FrameReferenceInsns.begin(), FrameReferenceInsns.end());
+
+ MachineBasicBlock *Entry = &Fn.front();
+
+ unsigned BaseReg = 0;
+ int64_t BaseOffset = 0;
+
+ // Loop through the frame references and allocate for them as necessary.
+ for (int ref = 0, e = FrameReferenceInsns.size(); ref < e ; ++ref) {
+ FrameRef &FR = FrameReferenceInsns[ref];
+ MachineInstr &MI = *FR.getMachineInstr();
+ int64_t LocalOffset = FR.getLocalOffset();
+ int FrameIdx = FR.getFrameIndex();
+ assert(MFI->isObjectPreAllocated(FrameIdx) &&
+ "Only pre-allocated locals expected!");
+
+ DEBUG(dbgs() << "Considering: " << MI);
+
+ unsigned idx = 0;
+ for (unsigned f = MI.getNumOperands(); idx != f; ++idx) {
+ if (!MI.getOperand(idx).isFI())
+ continue;
+
+ if (FrameIdx == MI.getOperand(idx).getIndex())
+ break;
+ }
+
+ assert(idx < MI.getNumOperands() && "Cannot find FI operand");
+
+ int64_t Offset = 0;
+ int64_t FrameSizeAdjust = StackGrowsDown ? MFI->getLocalFrameSize() : 0;
+
+ DEBUG(dbgs() << " Replacing FI in: " << MI);
+
+ // If we have a suitable base register available, use it; otherwise
+ // create a new one. Note that any offset encoded in the
+ // instruction itself will be taken into account by the target,
+ // so we don't have to adjust for it here when reusing a base
+ // register.
+ if (UsedBaseReg &&
+ lookupCandidateBaseReg(BaseReg, BaseOffset, FrameSizeAdjust,
+ LocalOffset, MI, TRI)) {
+ DEBUG(dbgs() << " Reusing base register " << BaseReg << "\n");
+ // We found a register to reuse.
+ Offset = FrameSizeAdjust + LocalOffset - BaseOffset;
+ } else {
+ // No previously defined register was in range, so create a new one.
+ int64_t InstrOffset = TRI->getFrameIndexInstrOffset(&MI, idx);
+
+ int64_t PrevBaseOffset = BaseOffset;
+ BaseOffset = FrameSizeAdjust + LocalOffset + InstrOffset;
+
+ // We'd like to avoid creating single-use virtual base registers.
+ // Because the FrameRefs are in sorted order, and we've already
+ // processed all FrameRefs before this one, just check whether or not
+ // the next FrameRef will be able to reuse this new register. If not,
+ // then don't bother creating it.
+ if (ref + 1 >= e ||
+ !lookupCandidateBaseReg(
+ BaseReg, BaseOffset, FrameSizeAdjust,
+ FrameReferenceInsns[ref + 1].getLocalOffset(),
+ *FrameReferenceInsns[ref + 1].getMachineInstr(), TRI)) {
+ BaseOffset = PrevBaseOffset;
+ continue;
+ }
+
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetRegisterClass *RC = TRI->getPointerRegClass(*MF);
+ BaseReg = Fn.getRegInfo().createVirtualRegister(RC);
+
+ DEBUG(dbgs() << " Materializing base register " << BaseReg <<
+ " at frame local offset " << LocalOffset + InstrOffset << "\n");
+
+ // Tell the target to insert the instruction to initialize
+ // the base register.
+ // MachineBasicBlock::iterator InsertionPt = Entry->begin();
+ TRI->materializeFrameBaseRegister(Entry, BaseReg, FrameIdx,
+ InstrOffset);
+
+ // The base register already includes any offset specified
+ // by the instruction, so account for that so it doesn't get
+ // applied twice.
+ Offset = -InstrOffset;
+
+ ++NumBaseRegisters;
+ UsedBaseReg = true;
+ }
+ assert(BaseReg != 0 && "Unable to allocate virtual base register!");
+
+ // Modify the instruction to use the new base register rather
+ // than the frame index operand.
+ TRI->resolveFrameIndex(MI, BaseReg, Offset);
+ DEBUG(dbgs() << "Resolved: " << MI);
+
+ ++NumReplacements;
+ }
+
+ return UsedBaseReg;
+}
diff --git a/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
new file mode 100644
index 000000000000..6966c8ca4a5f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -0,0 +1,162 @@
+//===- LowerEmuTLS.cpp - Add __emutls_[vt].* variables --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is required for targets depending on libgcc style
+// emulated thread local storage variables. For every defined TLS variable xyz,
+// an __emutls_v.xyz is generated. If there is a non-zero initializer,
+// an __emutls_t.xyz is also generated.
+//
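+// For example (a conceptual sketch; "word" is the target's pointer-sized
+// integer type), a defined TLS variable such as
+// @x = thread_local global i32 7, align 4
+// roughly becomes
+// @__emutls_t.x = constant i32 7, align 4
+// @__emutls_v.x = global { word, word, i8*, i32* }
+// { word 4, word 4, i8* null, i32* @__emutls_t.x }
+//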
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loweremutls"
+
+namespace {
+
+class LowerEmuTLS : public ModulePass {
+ const TargetMachine *TM;
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit LowerEmuTLS() : ModulePass(ID), TM(nullptr) { }
+ explicit LowerEmuTLS(const TargetMachine *TM)
+ : ModulePass(ID), TM(TM) {
+ initializeLowerEmuTLSPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnModule(Module &M) override;
+private:
+ bool addEmuTlsVar(Module &M, const GlobalVariable *GV);
+ static void copyLinkageVisibility(Module &M,
+ const GlobalVariable *from,
+ GlobalVariable *to) {
+ to->setLinkage(from->getLinkage());
+ to->setVisibility(from->getVisibility());
+ if (from->hasComdat()) {
+ to->setComdat(M.getOrInsertComdat(to->getName()));
+ to->getComdat()->setSelectionKind(from->getComdat()->getSelectionKind());
+ }
+ }
+};
+} // end anonymous namespace
+
+char LowerEmuTLS::ID = 0;
+
+INITIALIZE_PASS(LowerEmuTLS, "loweremutls",
+ "Add __emutls_[vt]. variables for emultated TLS model",
+ false, false)
+
+ModulePass *llvm::createLowerEmuTLSPass(const TargetMachine *TM) {
+ return new LowerEmuTLS(TM);
+}
+
+bool LowerEmuTLS::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ if (!TM || !TM->Options.EmulatedTLS)
+ return false;
+
+ bool Changed = false;
+ SmallVector<const GlobalVariable*, 8> TlsVars;
+ for (const auto &G : M.globals()) {
+ if (G.isThreadLocal())
+ TlsVars.append({&G});
+ }
+ for (const auto G : TlsVars)
+ Changed |= addEmuTlsVar(M, G);
+ return Changed;
+}
+
+bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
+ LLVMContext &C = M.getContext();
+ PointerType *VoidPtrType = Type::getInt8PtrTy(C);
+
+ std::string EmuTlsVarName = ("__emutls_v." + GV->getName()).str();
+ GlobalVariable *EmuTlsVar = M.getNamedGlobal(EmuTlsVarName);
+ if (EmuTlsVar)
+ return false; // It has been added before.
+
+ const DataLayout &DL = M.getDataLayout();
+ Constant *NullPtr = ConstantPointerNull::get(VoidPtrType);
+
+ // Get non-zero initializer from GV's initializer.
+ const Constant *InitValue = nullptr;
+ if (GV->hasInitializer()) {
+ InitValue = GV->getInitializer();
+ const ConstantInt *InitIntValue = dyn_cast<ConstantInt>(InitValue);
+ // When GV's init value is all 0, omit the EmuTlsTmplVar and let
+ // the emutls library function reset newly allocated TLS variables.
+ if (isa<ConstantAggregateZero>(InitValue) ||
+ (InitIntValue && InitIntValue->isZero()))
+ InitValue = nullptr;
+ }
+
+ // Create the __emutls_v. symbol, whose type has 4 fields:
+ // word size; // size of GV in bytes
+ // word align; // alignment of GV
+ // void *ptr; // initialized to 0; set at run time per thread.
+ // void *templ; // 0 or point to __emutls_t.*
+ // sizeof(word) should be the same as sizeof(void*) on target.
+ IntegerType *WordType = DL.getIntPtrType(C);
+ PointerType *InitPtrType = InitValue ?
+ PointerType::getUnqual(InitValue->getType()) : VoidPtrType;
+ Type *ElementTypes[4] = {WordType, WordType, VoidPtrType, InitPtrType};
+ ArrayRef<Type*> ElementTypeArray(ElementTypes, 4);
+ StructType *EmuTlsVarType = StructType::create(ElementTypeArray);
+ EmuTlsVar = cast<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsVarName, EmuTlsVarType));
+ copyLinkageVisibility(M, GV, EmuTlsVar);
+
+ // Define "__emutls_t.*" and "__emutls_v.*" only if GV is defined.
+ if (!GV->hasInitializer())
+ return true;
+
+ Type *GVType = GV->getValueType();
+ unsigned GVAlignment = GV->getAlignment();
+ if (!GVAlignment) {
+ // When LLVM IR declares a variable without alignment, use
+ // the ABI default alignment for the type.
+ GVAlignment = DL.getABITypeAlignment(GVType);
+ }
+
+ // Define "__emutls_t.*" if there is InitValue
+ GlobalVariable *EmuTlsTmplVar = nullptr;
+ if (InitValue) {
+ std::string EmuTlsTmplName = ("__emutls_t." + GV->getName()).str();
+ EmuTlsTmplVar = dyn_cast_or_null<GlobalVariable>(
+ M.getOrInsertGlobal(EmuTlsTmplName, GVType));
+ assert(EmuTlsTmplVar && "Failed to create emulated TLS initializer");
+ EmuTlsTmplVar->setConstant(true);
+ EmuTlsTmplVar->setInitializer(const_cast<Constant*>(InitValue));
+ EmuTlsTmplVar->setAlignment(GVAlignment);
+ copyLinkageVisibility(M, GV, EmuTlsTmplVar);
+ }
+
+ // Define "__emutls_v.*" with initializer and alignment.
+ Constant *ElementValues[4] = {
+ ConstantInt::get(WordType, DL.getTypeStoreSize(GVType)),
+ ConstantInt::get(WordType, GVAlignment),
+ NullPtr, EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr
+ };
+ ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
+ EmuTlsVar->setInitializer(
+ ConstantStruct::get(EmuTlsVarType, ElementValueArray));
+ unsigned MaxAlignment = std::max(
+ DL.getABITypeAlignment(WordType),
+ DL.getABITypeAlignment(VoidPtrType));
+ EmuTlsVar->setAlignment(MaxAlignment);
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
new file mode 100644
index 000000000000..6e3de52f1a9c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -0,0 +1,612 @@
+//===- MILexer.cpp - Machine instructions lexer implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the lexing of machine instructions.
+//
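+// For example, a machine instruction string such as
+// %eax = MOV32rm %rip, 1, _, @G, _
+// lexes into NamedRegister, equal, Identifier, NamedRegister, IntegerLiteral,
+// underscore and NamedGlobalValue tokens, separated by comma tokens.
+//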
+//===----------------------------------------------------------------------===//
+
+#include "MILexer.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include <cctype>
+
+using namespace llvm;
+
+namespace {
+
+typedef function_ref<void(StringRef::iterator Loc, const Twine &)>
+ ErrorCallbackType;
+
+/// This class provides a way to iterate and get characters from the source
+/// string.
+class Cursor {
+ const char *Ptr;
+ const char *End;
+
+public:
+ Cursor(NoneType) : Ptr(nullptr), End(nullptr) {}
+
+ explicit Cursor(StringRef Str) {
+ Ptr = Str.data();
+ End = Ptr + Str.size();
+ }
+
+ bool isEOF() const { return Ptr == End; }
+
+ char peek(int I = 0) const { return End - Ptr <= I ? 0 : Ptr[I]; }
+
+ void advance(unsigned I = 1) { Ptr += I; }
+
+ StringRef remaining() const { return StringRef(Ptr, End - Ptr); }
+
+ StringRef upto(Cursor C) const {
+ assert(C.Ptr >= Ptr && C.Ptr <= End);
+ return StringRef(Ptr, C.Ptr - Ptr);
+ }
+
+ StringRef::iterator location() const { return Ptr; }
+
+ operator bool() const { return Ptr != nullptr; }
+};
+
+} // end anonymous namespace
+
+MIToken &MIToken::reset(TokenKind Kind, StringRef Range) {
+ this->Kind = Kind;
+ this->Range = Range;
+ return *this;
+}
+
+MIToken &MIToken::setStringValue(StringRef StrVal) {
+ StringValue = StrVal;
+ return *this;
+}
+
+MIToken &MIToken::setOwnedStringValue(std::string StrVal) {
+ StringValueStorage = std::move(StrVal);
+ StringValue = StringValueStorage;
+ return *this;
+}
+
+MIToken &MIToken::setIntegerValue(APSInt IntVal) {
+ this->IntVal = std::move(IntVal);
+ return *this;
+}
+
+/// Skip the leading whitespace characters and return the updated cursor.
+static Cursor skipWhitespace(Cursor C) {
+ while (isblank(C.peek()))
+ C.advance();
+ return C;
+}
+
+static bool isNewlineChar(char C) { return C == '\n' || C == '\r'; }
+
+/// Skip a line comment and return the updated cursor.
+static Cursor skipComment(Cursor C) {
+ if (C.peek() != ';')
+ return C;
+ while (!isNewlineChar(C.peek()) && !C.isEOF())
+ C.advance();
+ return C;
+}
+
+/// Return true if the given character satisfies the following regular
+/// expression: [-a-zA-Z$._0-9]
+static bool isIdentifierChar(char C) {
+ return isalpha(C) || isdigit(C) || C == '_' || C == '-' || C == '.' ||
+ C == '$';
+}
+
+/// Unescapes the given string value.
+///
+/// Expects the string value to be quoted.
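+///
+/// For example, "a\\b" unescapes to a\b, and "\41" to A (a backslash followed
+/// by two hex digits encodes that byte value).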
+static std::string unescapeQuotedString(StringRef Value) {
+ assert(Value.front() == '"' && Value.back() == '"');
+ Cursor C = Cursor(Value.substr(1, Value.size() - 2));
+
+ std::string Str;
+ Str.reserve(C.remaining().size());
+ while (!C.isEOF()) {
+ char Char = C.peek();
+ if (Char == '\\') {
+ if (C.peek(1) == '\\') {
+ // Two '\' become one
+ Str += '\\';
+ C.advance(2);
+ continue;
+ }
+ if (isxdigit(C.peek(1)) && isxdigit(C.peek(2))) {
+ Str += hexDigitValue(C.peek(1)) * 16 + hexDigitValue(C.peek(2));
+ C.advance(3);
+ continue;
+ }
+ }
+ Str += Char;
+ C.advance();
+ }
+ return Str;
+}
+
+/// Lex a string constant using the following regular expression: \"[^\"]*\"
+static Cursor lexStringConstant(Cursor C, ErrorCallbackType ErrorCallback) {
+ assert(C.peek() == '"');
+ for (C.advance(); C.peek() != '"'; C.advance()) {
+ if (C.isEOF() || isNewlineChar(C.peek())) {
+ ErrorCallback(
+ C.location(),
+ "end of machine instruction reached before the closing '\"'");
+ return None;
+ }
+ }
+ C.advance();
+ return C;
+}
+
+static Cursor lexName(Cursor C, MIToken &Token, MIToken::TokenKind Type,
+ unsigned PrefixLength, ErrorCallbackType ErrorCallback) {
+ auto Range = C;
+ C.advance(PrefixLength);
+ if (C.peek() == '"') {
+ if (Cursor R = lexStringConstant(C, ErrorCallback)) {
+ StringRef String = Range.upto(R);
+ Token.reset(Type, String)
+ .setOwnedStringValue(
+ unescapeQuotedString(String.drop_front(PrefixLength)));
+ return R;
+ }
+ Token.reset(MIToken::Error, Range.remaining());
+ return Range;
+ }
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ Token.reset(Type, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(PrefixLength));
+ return C;
+}
+
+static Cursor maybeLexIntegerType(Cursor C, MIToken &Token) {
+ if (C.peek() != 'i' || !isdigit(C.peek(1)))
+ return None;
+ auto Range = C;
+ C.advance(); // Skip 'i'
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::IntegerType, Range.upto(C));
+ return C;
+}
+
+static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
+ return StringSwitch<MIToken::TokenKind>(Identifier)
+ .Case("_", MIToken::underscore)
+ .Case("implicit", MIToken::kw_implicit)
+ .Case("implicit-def", MIToken::kw_implicit_define)
+ .Case("def", MIToken::kw_def)
+ .Case("dead", MIToken::kw_dead)
+ .Case("killed", MIToken::kw_killed)
+ .Case("undef", MIToken::kw_undef)
+ .Case("internal", MIToken::kw_internal)
+ .Case("early-clobber", MIToken::kw_early_clobber)
+ .Case("debug-use", MIToken::kw_debug_use)
+ .Case("tied-def", MIToken::kw_tied_def)
+ .Case("frame-setup", MIToken::kw_frame_setup)
+ .Case("debug-location", MIToken::kw_debug_location)
+ .Case(".cfi_same_value", MIToken::kw_cfi_same_value)
+ .Case(".cfi_offset", MIToken::kw_cfi_offset)
+ .Case(".cfi_def_cfa_register", MIToken::kw_cfi_def_cfa_register)
+ .Case(".cfi_def_cfa_offset", MIToken::kw_cfi_def_cfa_offset)
+ .Case(".cfi_def_cfa", MIToken::kw_cfi_def_cfa)
+ .Case("blockaddress", MIToken::kw_blockaddress)
+ .Case("target-index", MIToken::kw_target_index)
+ .Case("half", MIToken::kw_half)
+ .Case("float", MIToken::kw_float)
+ .Case("double", MIToken::kw_double)
+ .Case("x86_fp80", MIToken::kw_x86_fp80)
+ .Case("fp128", MIToken::kw_fp128)
+ .Case("ppc_fp128", MIToken::kw_ppc_fp128)
+ .Case("target-flags", MIToken::kw_target_flags)
+ .Case("volatile", MIToken::kw_volatile)
+ .Case("non-temporal", MIToken::kw_non_temporal)
+ .Case("invariant", MIToken::kw_invariant)
+ .Case("align", MIToken::kw_align)
+ .Case("stack", MIToken::kw_stack)
+ .Case("got", MIToken::kw_got)
+ .Case("jump-table", MIToken::kw_jump_table)
+ .Case("constant-pool", MIToken::kw_constant_pool)
+ .Case("call-entry", MIToken::kw_call_entry)
+ .Case("liveout", MIToken::kw_liveout)
+ .Case("address-taken", MIToken::kw_address_taken)
+ .Case("landing-pad", MIToken::kw_landing_pad)
+ .Case("liveins", MIToken::kw_liveins)
+ .Case("successors", MIToken::kw_successors)
+ .Default(MIToken::Identifier);
+}
+
+static Cursor maybeLexIdentifier(Cursor C, MIToken &Token) {
+ if (!isalpha(C.peek()) && C.peek() != '_' && C.peek() != '.')
+ return None;
+ auto Range = C;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ auto Identifier = Range.upto(C);
+ Token.reset(getIdentifierKind(Identifier), Identifier)
+ .setStringValue(Identifier);
+ return C;
+}
+
+static Cursor maybeLexMachineBasicBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ bool IsReference = C.remaining().startswith("%bb.");
+ if (!IsReference && !C.remaining().startswith("bb."))
+ return None;
+ auto Range = C;
+ unsigned PrefixLength = IsReference ? 4 : 3;
+ C.advance(PrefixLength); // Skip '%bb.' or 'bb.'
+ if (!isdigit(C.peek())) {
+ Token.reset(MIToken::Error, C.remaining());
+ ErrorCallback(C.location(), "expected a number after '%bb.'");
+ return C;
+ }
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ StringRef Number = NumberRange.upto(C);
+ unsigned StringOffset = PrefixLength + Number.size(); // Drop '%bb.<id>'
+ if (C.peek() == '.') {
+ C.advance(); // Skip '.'
+ ++StringOffset;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ }
+ Token.reset(IsReference ? MIToken::MachineBasicBlock
+ : MIToken::MachineBasicBlockLabel,
+ Range.upto(C))
+ .setIntegerValue(APSInt(Number))
+ .setStringValue(Range.upto(C).drop_front(StringOffset));
+ return C;
+}
+
+static Cursor maybeLexIndex(Cursor C, MIToken &Token, StringRef Rule,
+ MIToken::TokenKind Kind) {
+ if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size())))
+ return None;
+ auto Range = C;
+ C.advance(Rule.size());
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(Kind, Range.upto(C)).setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+static Cursor maybeLexIndexAndName(Cursor C, MIToken &Token, StringRef Rule,
+ MIToken::TokenKind Kind) {
+ if (!C.remaining().startswith(Rule) || !isdigit(C.peek(Rule.size())))
+ return None;
+ auto Range = C;
+ C.advance(Rule.size());
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ StringRef Number = NumberRange.upto(C);
+ unsigned StringOffset = Rule.size() + Number.size();
+ if (C.peek() == '.') {
+ C.advance();
+ ++StringOffset;
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ }
+ Token.reset(Kind, Range.upto(C))
+ .setIntegerValue(APSInt(Number))
+ .setStringValue(Range.upto(C).drop_front(StringOffset));
+ return C;
+}
+
+static Cursor maybeLexJumpTableIndex(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%jump-table.", MIToken::JumpTableIndex);
+}
+
+static Cursor maybeLexStackObject(Cursor C, MIToken &Token) {
+ return maybeLexIndexAndName(C, Token, "%stack.", MIToken::StackObject);
+}
+
+static Cursor maybeLexFixedStackObject(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%fixed-stack.", MIToken::FixedStackObject);
+}
+
+static Cursor maybeLexConstantPoolItem(Cursor C, MIToken &Token) {
+ return maybeLexIndex(C, Token, "%const.", MIToken::ConstantPoolItem);
+}
+
+static Cursor maybeLexSubRegisterIndex(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%subreg.";
+ if (!C.remaining().startswith(Rule))
+ return None;
+ return lexName(C, Token, MIToken::SubRegisterIndex, Rule.size(),
+ ErrorCallback);
+}
+
+static Cursor maybeLexIRBlock(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%ir-block.";
+ if (!C.remaining().startswith(Rule))
+ return None;
+ if (isdigit(C.peek(Rule.size())))
+ return maybeLexIndex(C, Token, Rule, MIToken::IRBlock);
+ return lexName(C, Token, MIToken::NamedIRBlock, Rule.size(), ErrorCallback);
+}
+
+static Cursor maybeLexIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ const StringRef Rule = "%ir.";
+ if (!C.remaining().startswith(Rule))
+ return None;
+ if (isdigit(C.peek(Rule.size())))
+ return maybeLexIndex(C, Token, Rule, MIToken::IRValue);
+ return lexName(C, Token, MIToken::NamedIRValue, Rule.size(), ErrorCallback);
+}
+
+static Cursor lexVirtualRegister(Cursor C, MIToken &Token) {
+ auto Range = C;
+ C.advance(); // Skip '%'
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::VirtualRegister, Range.upto(C))
+ .setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+static Cursor maybeLexRegister(Cursor C, MIToken &Token) {
+ if (C.peek() != '%')
+ return None;
+ if (isdigit(C.peek(1)))
+ return lexVirtualRegister(C, Token);
+ auto Range = C;
+ C.advance(); // Skip '%'
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ Token.reset(MIToken::NamedRegister, Range.upto(C))
+ .setStringValue(Range.upto(C).drop_front(1)); // Drop the '%'
+ return C;
+}
+
+static Cursor maybeLexGlobalValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '@')
+ return None;
+ if (!isdigit(C.peek(1)))
+ return lexName(C, Token, MIToken::NamedGlobalValue, /*PrefixLength=*/1,
+ ErrorCallback);
+ auto Range = C;
+ C.advance(1); // Skip the '@'
+ auto NumberRange = C;
+ while (isdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::GlobalValue, Range.upto(C))
+ .setIntegerValue(APSInt(NumberRange.upto(C)));
+ return C;
+}
+
+static Cursor maybeLexExternalSymbol(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '$')
+ return None;
+ return lexName(C, Token, MIToken::ExternalSymbol, /*PrefixLength=*/1,
+ ErrorCallback);
+}
+
+static bool isValidHexFloatingPointPrefix(char C) {
+ return C == 'H' || C == 'K' || C == 'L' || C == 'M';
+}
+
+static Cursor maybeLexHexFloatingPointLiteral(Cursor C, MIToken &Token) {
+ if (C.peek() != '0' || C.peek(1) != 'x')
+ return None;
+ Cursor Range = C;
+ C.advance(2); // Skip '0x'
+ if (isValidHexFloatingPointPrefix(C.peek()))
+ C.advance();
+ while (isxdigit(C.peek()))
+ C.advance();
+ Token.reset(MIToken::FloatingPointLiteral, Range.upto(C));
+ return C;
+}
+
+static Cursor lexFloatingPointLiteral(Cursor Range, Cursor C, MIToken &Token) {
+ C.advance();
+ // Skip over [0-9]*([eE][-+]?[0-9]+)?
+ while (isdigit(C.peek()))
+ C.advance();
+ if ((C.peek() == 'e' || C.peek() == 'E') &&
+ (isdigit(C.peek(1)) ||
+ ((C.peek(1) == '-' || C.peek(1) == '+') && isdigit(C.peek(2))))) {
+ C.advance(2);
+ while (isdigit(C.peek()))
+ C.advance();
+ }
+ Token.reset(MIToken::FloatingPointLiteral, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexNumericalLiteral(Cursor C, MIToken &Token) {
+ if (!isdigit(C.peek()) && (C.peek() != '-' || !isdigit(C.peek(1))))
+ return None;
+ auto Range = C;
+ C.advance();
+ while (isdigit(C.peek()))
+ C.advance();
+ if (C.peek() == '.')
+ return lexFloatingPointLiteral(Range, C, Token);
+ StringRef StrVal = Range.upto(C);
+ Token.reset(MIToken::IntegerLiteral, StrVal).setIntegerValue(APSInt(StrVal));
+ return C;
+}
+
+static MIToken::TokenKind getMetadataKeywordKind(StringRef Identifier) {
+ return StringSwitch<MIToken::TokenKind>(Identifier)
+ .Case("!tbaa", MIToken::md_tbaa)
+ .Case("!alias.scope", MIToken::md_alias_scope)
+ .Case("!noalias", MIToken::md_noalias)
+ .Case("!range", MIToken::md_range)
+ .Default(MIToken::Error);
+}
+
+static Cursor maybeLexExlaim(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '!')
+ return None;
+ auto Range = C;
+ C.advance(1);
+ if (isdigit(C.peek()) || !isIdentifierChar(C.peek())) {
+ Token.reset(MIToken::exclaim, Range.upto(C));
+ return C;
+ }
+ while (isIdentifierChar(C.peek()))
+ C.advance();
+ StringRef StrVal = Range.upto(C);
+ Token.reset(getMetadataKeywordKind(StrVal), StrVal);
+ if (Token.isError())
+ ErrorCallback(Token.location(),
+ "use of unknown metadata keyword '" + StrVal + "'");
+ return C;
+}
+
+static MIToken::TokenKind symbolToken(char C) {
+ switch (C) {
+ case ',':
+ return MIToken::comma;
+ case '=':
+ return MIToken::equal;
+ case ':':
+ return MIToken::colon;
+ case '(':
+ return MIToken::lparen;
+ case ')':
+ return MIToken::rparen;
+ case '{':
+ return MIToken::lbrace;
+ case '}':
+ return MIToken::rbrace;
+ case '+':
+ return MIToken::plus;
+ case '-':
+ return MIToken::minus;
+ case '<':
+ return MIToken::less;
+ case '>':
+ return MIToken::greater;
+ default:
+ return MIToken::Error;
+ }
+}
+
+static Cursor maybeLexSymbol(Cursor C, MIToken &Token) {
+ MIToken::TokenKind Kind;
+ unsigned Length = 1;
+ if (C.peek() == ':' && C.peek(1) == ':') {
+ Kind = MIToken::coloncolon;
+ Length = 2;
+ } else
+ Kind = symbolToken(C.peek());
+ if (Kind == MIToken::Error)
+ return None;
+ auto Range = C;
+ C.advance(Length);
+ Token.reset(Kind, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexNewline(Cursor C, MIToken &Token) {
+ if (!isNewlineChar(C.peek()))
+ return None;
+ auto Range = C;
+ C.advance();
+ Token.reset(MIToken::Newline, Range.upto(C));
+ return C;
+}
+
+static Cursor maybeLexEscapedIRValue(Cursor C, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ if (C.peek() != '`')
+ return None;
+ auto Range = C;
+ C.advance();
+ auto StrRange = C;
+ while (C.peek() != '`') {
+ if (C.isEOF() || isNewlineChar(C.peek())) {
+ ErrorCallback(
+ C.location(),
+ "end of machine instruction reached before the closing '`'");
+ Token.reset(MIToken::Error, Range.remaining());
+ return C;
+ }
+ C.advance();
+ }
+ StringRef Value = StrRange.upto(C);
+ C.advance();
+ Token.reset(MIToken::QuotedIRValue, Range.upto(C)).setStringValue(Value);
+ return C;
+}
+
+StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
+ ErrorCallbackType ErrorCallback) {
+ auto C = skipComment(skipWhitespace(Cursor(Source)));
+ if (C.isEOF()) {
+ Token.reset(MIToken::Eof, C.remaining());
+ return C.remaining();
+ }
+
+ if (Cursor R = maybeLexIntegerType(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIdentifier(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexJumpTableIndex(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexStackObject(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexFixedStackObject(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexConstantPoolItem(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexSubRegisterIndex(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIRBlock(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexIRValue(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexRegister(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexGlobalValue(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexExternalSymbol(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexHexFloatingPointLiteral(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexNumericalLiteral(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexExlaim(C, Token, ErrorCallback))
+ return R.remaining();
+ if (Cursor R = maybeLexSymbol(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexNewline(C, Token))
+ return R.remaining();
+ if (Cursor R = maybeLexEscapedIRValue(C, Token, ErrorCallback))
+ return R.remaining();
+
+ Token.reset(MIToken::Error, C.remaining());
+ ErrorCallback(C.location(),
+ Twine("unexpected character '") + Twine(C.peek()) + "'");
+ return C.remaining();
+}
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
new file mode 100644
index 000000000000..32fc8ab271e6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -0,0 +1,196 @@
+//===- MILexer.h - Lexer for machine instructions -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the function that lexes the machine instruction source
+// string.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
+#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
+
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include <functional>
+
+namespace llvm {
+
+class Twine;
+
+/// A token produced by the machine instruction lexer.
+struct MIToken {
+ enum TokenKind {
+ // Markers
+ Eof,
+ Error,
+ Newline,
+
+ // Tokens with no info.
+ comma,
+ equal,
+ underscore,
+ colon,
+ coloncolon,
+ exclaim,
+ lparen,
+ rparen,
+ lbrace,
+ rbrace,
+ plus,
+ minus,
+ less,
+ greater,
+
+ // Keywords
+ kw_implicit,
+ kw_implicit_define,
+ kw_def,
+ kw_dead,
+ kw_killed,
+ kw_undef,
+ kw_internal,
+ kw_early_clobber,
+ kw_debug_use,
+ kw_tied_def,
+ kw_frame_setup,
+ kw_debug_location,
+ kw_cfi_same_value,
+ kw_cfi_offset,
+ kw_cfi_def_cfa_register,
+ kw_cfi_def_cfa_offset,
+ kw_cfi_def_cfa,
+ kw_blockaddress,
+ kw_target_index,
+ kw_half,
+ kw_float,
+ kw_double,
+ kw_x86_fp80,
+ kw_fp128,
+ kw_ppc_fp128,
+ kw_target_flags,
+ kw_volatile,
+ kw_non_temporal,
+ kw_invariant,
+ kw_align,
+ kw_stack,
+ kw_got,
+ kw_jump_table,
+ kw_constant_pool,
+ kw_call_entry,
+ kw_liveout,
+ kw_address_taken,
+ kw_landing_pad,
+ kw_liveins,
+ kw_successors,
+
+ // Named metadata keywords
+ md_tbaa,
+ md_alias_scope,
+ md_noalias,
+ md_range,
+
+ // Identifier tokens
+ Identifier,
+ IntegerType,
+ NamedRegister,
+ MachineBasicBlockLabel,
+ MachineBasicBlock,
+ StackObject,
+ FixedStackObject,
+ NamedGlobalValue,
+ GlobalValue,
+ ExternalSymbol,
+
+ // Other tokens
+ IntegerLiteral,
+ FloatingPointLiteral,
+ VirtualRegister,
+ ConstantPoolItem,
+ JumpTableIndex,
+ NamedIRBlock,
+ IRBlock,
+ NamedIRValue,
+ IRValue,
+ QuotedIRValue, // `<constant value>`
+ SubRegisterIndex
+ };
+
+private:
+ TokenKind Kind;
+ StringRef Range;
+ StringRef StringValue;
+ std::string StringValueStorage;
+ APSInt IntVal;
+
+public:
+ MIToken() : Kind(Error) {}
+
+ MIToken &reset(TokenKind Kind, StringRef Range);
+
+ MIToken &setStringValue(StringRef StrVal);
+ MIToken &setOwnedStringValue(std::string StrVal);
+ MIToken &setIntegerValue(APSInt IntVal);
+
+ TokenKind kind() const { return Kind; }
+
+ bool isError() const { return Kind == Error; }
+
+ bool isNewlineOrEOF() const { return Kind == Newline || Kind == Eof; }
+
+ bool isErrorOrEOF() const { return Kind == Error || Kind == Eof; }
+
+ bool isRegister() const {
+ return Kind == NamedRegister || Kind == underscore ||
+ Kind == VirtualRegister;
+ }
+
+ bool isRegisterFlag() const {
+ return Kind == kw_implicit || Kind == kw_implicit_define ||
+ Kind == kw_def || Kind == kw_dead || Kind == kw_killed ||
+ Kind == kw_undef || Kind == kw_internal ||
+ Kind == kw_early_clobber || Kind == kw_debug_use;
+ }
+
+ bool isMemoryOperandFlag() const {
+ return Kind == kw_volatile || Kind == kw_non_temporal ||
+ Kind == kw_invariant;
+ }
+
+ bool is(TokenKind K) const { return Kind == K; }
+
+ bool isNot(TokenKind K) const { return Kind != K; }
+
+ StringRef::iterator location() const { return Range.begin(); }
+
+ StringRef range() const { return Range; }
+
+ /// Return the token's string value.
+ StringRef stringValue() const { return StringValue; }
+
+ const APSInt &integerValue() const { return IntVal; }
+
+ bool hasIntegerValue() const {
+ return Kind == IntegerLiteral || Kind == MachineBasicBlock ||
+ Kind == MachineBasicBlockLabel || Kind == StackObject ||
+ Kind == FixedStackObject || Kind == GlobalValue ||
+ Kind == VirtualRegister || Kind == ConstantPoolItem ||
+ Kind == JumpTableIndex || Kind == IRBlock || Kind == IRValue;
+ }
+};
+
+/// Consume a single machine instruction token in the given source and return
+/// the remaining source string.
+StringRef lexMIToken(
+ StringRef Source, MIToken &Token,
+ function_ref<void(StringRef::iterator, const Twine &)> ErrorCallback);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
new file mode 100644
index 000000000000..b3fd16f15889
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -0,0 +1,2094 @@
+//===- MIParser.cpp - Machine instructions parser implementation ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the parsing of machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIParser.h"
+#include "MILexer.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
+ SourceMgr &SM, const SlotMapping &IRSlots)
+ : MF(MF), SM(&SM), IRSlots(IRSlots) {
+}
+
+namespace {
+
+/// A wrapper struct around the 'MachineOperand' struct that includes a source
+/// range and other attributes.
+struct ParsedMachineOperand {
+ MachineOperand Operand;
+ StringRef::iterator Begin;
+ StringRef::iterator End;
+ Optional<unsigned> TiedDefIdx;
+
+ ParsedMachineOperand(const MachineOperand &Operand, StringRef::iterator Begin,
+ StringRef::iterator End, Optional<unsigned> &TiedDefIdx)
+ : Operand(Operand), Begin(Begin), End(End), TiedDefIdx(TiedDefIdx) {
+ if (TiedDefIdx)
+ assert(Operand.isReg() && Operand.isUse() &&
+ "Only used register operands can be tied");
+ }
+};
+
+class MIParser {
+ MachineFunction &MF;
+ SMDiagnostic &Error;
+ StringRef Source, CurrentSource;
+ MIToken Token;
+ const PerFunctionMIParsingState &PFS;
+ /// Maps from instruction names to op codes.
+ StringMap<unsigned> Names2InstrOpCodes;
+ /// Maps from register names to registers.
+ StringMap<unsigned> Names2Regs;
+ /// Maps from register mask names to register masks.
+ StringMap<const uint32_t *> Names2RegMasks;
+ /// Maps from subregister names to subregister indices.
+ StringMap<unsigned> Names2SubRegIndices;
+ /// Maps from slot numbers to function's unnamed basic blocks.
+ DenseMap<unsigned, const BasicBlock *> Slots2BasicBlocks;
+ /// Maps from slot numbers to function's unnamed values.
+ DenseMap<unsigned, const Value *> Slots2Values;
+ /// Maps from target index names to target indices.
+ StringMap<int> Names2TargetIndices;
+ /// Maps from direct target flag names to the direct target flag values.
+ StringMap<unsigned> Names2DirectTargetFlags;
+ /// Maps from bitmask target flag names to the bitmask target flag values.
+ StringMap<unsigned> Names2BitmaskTargetFlags;
+
+public:
+ MIParser(const PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source);
+
+ /// \p SkipChar gives the number of characters to skip before looking
+ /// for the next token.
+ void lex(unsigned SkipChar = 0);
+
+ /// Report an error at the current location with the given message.
+ ///
+ /// This function always returns true.
+ bool error(const Twine &Msg);
+
+ /// Report an error at the given location with the given message.
+ ///
+ /// This function always returns true.
+ bool error(StringRef::iterator Loc, const Twine &Msg);
+
+ bool
+ parseBasicBlockDefinitions(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
+ bool parseBasicBlocks();
+ bool parse(MachineInstr *&MI);
+ bool parseStandaloneMBB(MachineBasicBlock *&MBB);
+ bool parseStandaloneNamedRegister(unsigned &Reg);
+ bool parseStandaloneVirtualRegister(unsigned &Reg);
+ bool parseStandaloneStackObject(int &FI);
+ bool parseStandaloneMDNode(MDNode *&Node);
+
+ bool
+ parseBasicBlockDefinition(DenseMap<unsigned, MachineBasicBlock *> &MBBSlots);
+ bool parseBasicBlock(MachineBasicBlock &MBB);
+ bool parseBasicBlockLiveins(MachineBasicBlock &MBB);
+ bool parseBasicBlockSuccessors(MachineBasicBlock &MBB);
+
+ bool parseRegister(unsigned &Reg);
+ bool parseRegisterFlag(unsigned &Flags);
+ bool parseSubRegisterIndex(unsigned &SubReg);
+ bool parseRegisterTiedDefIndex(unsigned &TiedDefIdx);
+ bool parseSize(unsigned &Size);
+ bool parseRegisterOperand(MachineOperand &Dest,
+ Optional<unsigned> &TiedDefIdx, bool IsDef = false);
+ bool parseImmediateOperand(MachineOperand &Dest);
+ bool parseIRConstant(StringRef::iterator Loc, StringRef Source,
+ const Constant *&C);
+ bool parseIRConstant(StringRef::iterator Loc, const Constant *&C);
+ bool parseIRType(StringRef::iterator Loc, StringRef Source, unsigned &Read,
+ Type *&Ty);
+ // \p MustBeSized defines whether or not \p Ty must be sized.
+ bool parseIRType(StringRef::iterator Loc, Type *&Ty, bool MustBeSized = true);
+ bool parseTypedImmediateOperand(MachineOperand &Dest);
+ bool parseFPImmediateOperand(MachineOperand &Dest);
+ bool parseMBBReference(MachineBasicBlock *&MBB);
+ bool parseMBBOperand(MachineOperand &Dest);
+ bool parseStackFrameIndex(int &FI);
+ bool parseStackObjectOperand(MachineOperand &Dest);
+ bool parseFixedStackFrameIndex(int &FI);
+ bool parseFixedStackObjectOperand(MachineOperand &Dest);
+ bool parseGlobalValue(GlobalValue *&GV);
+ bool parseGlobalAddressOperand(MachineOperand &Dest);
+ bool parseConstantPoolIndexOperand(MachineOperand &Dest);
+ bool parseSubRegisterIndexOperand(MachineOperand &Dest);
+ bool parseJumpTableIndexOperand(MachineOperand &Dest);
+ bool parseExternalSymbolOperand(MachineOperand &Dest);
+ bool parseMDNode(MDNode *&Node);
+ bool parseMetadataOperand(MachineOperand &Dest);
+ bool parseCFIOffset(int &Offset);
+ bool parseCFIRegister(unsigned &Reg);
+ bool parseCFIOperand(MachineOperand &Dest);
+ bool parseIRBlock(BasicBlock *&BB, const Function &F);
+ bool parseBlockAddressOperand(MachineOperand &Dest);
+ bool parseTargetIndexOperand(MachineOperand &Dest);
+ bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
+ bool parseMachineOperand(MachineOperand &Dest,
+ Optional<unsigned> &TiedDefIdx);
+ bool parseMachineOperandAndTargetFlags(MachineOperand &Dest,
+ Optional<unsigned> &TiedDefIdx);
+ bool parseOffset(int64_t &Offset);
+ bool parseAlignment(unsigned &Alignment);
+ bool parseOperandsOffset(MachineOperand &Op);
+ bool parseIRValue(const Value *&V);
+ bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
+ bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
+ bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+ bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
+
+private:
+ /// Convert the integer literal in the current token into an unsigned integer.
+ ///
+ /// Return true if an error occurred.
+ bool getUnsigned(unsigned &Result);
+
+ /// Convert the integer literal in the current token into a uint64.
+ ///
+ /// Return true if an error occurred.
+ bool getUint64(uint64_t &Result);
+
+ /// If the current token is of the given kind, consume it and return false.
+ /// Otherwise report an error and return true.
+ bool expectAndConsume(MIToken::TokenKind TokenKind);
+
+ /// If the current token is of the given kind, consume it and return true.
+ /// Otherwise return false.
+ bool consumeIfPresent(MIToken::TokenKind TokenKind);
+
+ void initNames2InstrOpCodes();
+
+ /// Try to convert an instruction name to an opcode. Return true if the
+ /// instruction name is invalid.
+ bool parseInstrName(StringRef InstrName, unsigned &OpCode);
+
+ bool parseInstruction(unsigned &OpCode, unsigned &Flags);
+
+ bool assignRegisterTies(MachineInstr &MI,
+ ArrayRef<ParsedMachineOperand> Operands);
+
+ bool verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands,
+ const MCInstrDesc &MCID);
+
+ void initNames2Regs();
+
+ /// Try to convert a register name to a register number. Return true if the
+ /// register name is invalid.
+ bool getRegisterByName(StringRef RegName, unsigned &Reg);
+
+ void initNames2RegMasks();
+
+ /// Check if the given identifier is a name of a register mask.
+ ///
+ /// Return null if the identifier isn't a register mask.
+ const uint32_t *getRegMask(StringRef Identifier);
+
+ void initNames2SubRegIndices();
+
+ /// Check if the given identifier is a name of a subregister index.
+ ///
+ /// Return 0 if the name isn't a subregister index.
+ unsigned getSubRegIndex(StringRef Name);
+
+ const BasicBlock *getIRBlock(unsigned Slot);
+ const BasicBlock *getIRBlock(unsigned Slot, const Function &F);
+
+ const Value *getIRValue(unsigned Slot);
+
+ void initNames2TargetIndices();
+
+ /// Try to convert a target index name to the corresponding target index.
+ ///
+ /// Return true if the name isn't a name of a target index.
+ bool getTargetIndex(StringRef Name, int &Index);
+
+ void initNames2DirectTargetFlags();
+
+ /// Try to convert a name of a direct target flag to the corresponding
+ /// target flag.
+ ///
+ /// Return true if the name isn't a name of a direct flag.
+ bool getDirectTargetFlag(StringRef Name, unsigned &Flag);
+
+ void initNames2BitmaskTargetFlags();
+
+ /// Try to convert a name of a bitmask target flag to the corresponding
+ /// target flag.
+ ///
+ /// Return true if the name isn't a name of a bitmask target flag.
+ bool getBitmaskTargetFlag(StringRef Name, unsigned &Flag);
+};
+
+} // end anonymous namespace
+
+MIParser::MIParser(const PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
+ StringRef Source)
+ : MF(PFS.MF), Error(Error), Source(Source), CurrentSource(Source), PFS(PFS)
+{}
+
+void MIParser::lex(unsigned SkipChar) {
+ CurrentSource = lexMIToken(
+ CurrentSource.data() + SkipChar, Token,
+ [this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); });
+}
+
+bool MIParser::error(const Twine &Msg) { return error(Token.location(), Msg); }
+
+bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) {
+ const SourceMgr &SM = *PFS.SM;
+ assert(Loc >= Source.data() && Loc <= (Source.data() + Source.size()));
+ const MemoryBuffer &Buffer = *SM.getMemoryBuffer(SM.getMainFileID());
+ if (Loc >= Buffer.getBufferStart() && Loc <= Buffer.getBufferEnd()) {
+ // Create an ordinary diagnostic when the source manager's buffer is the
+ // source string.
+ Error = SM.GetMessage(SMLoc::getFromPointer(Loc), SourceMgr::DK_Error, Msg);
+ return true;
+ }
+ // Create a diagnostic for a YAML string literal.
+ Error = SMDiagnostic(SM, SMLoc(), Buffer.getBufferIdentifier(), 1,
+ Loc - Source.data(), SourceMgr::DK_Error, Msg.str(),
+ Source, None, None);
+ return true;
+}
+
+static const char *toString(MIToken::TokenKind TokenKind) {
+ switch (TokenKind) {
+ case MIToken::comma:
+ return "','";
+ case MIToken::equal:
+ return "'='";
+ case MIToken::colon:
+ return "':'";
+ case MIToken::lparen:
+ return "'('";
+ case MIToken::rparen:
+ return "')'";
+ default:
+ return "<unknown token>";
+ }
+}
+
+bool MIParser::expectAndConsume(MIToken::TokenKind TokenKind) {
+ if (Token.isNot(TokenKind))
+ return error(Twine("expected ") + toString(TokenKind));
+ lex();
+ return false;
+}
+
+bool MIParser::consumeIfPresent(MIToken::TokenKind TokenKind) {
+ if (Token.isNot(TokenKind))
+ return false;
+ lex();
+ return true;
+}
+
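+/// A sketch of the block definition syntax handled below, inferred from the
+/// parser logic (the block id, the IR block name 'entry', and the attribute
+/// values are illustrative):
+///   bb.0.entry (address-taken, landing-pad, align 4):
+/// Both the IR block name in the label and the parenthesized attribute list
+/// are optional.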
+bool MIParser::parseBasicBlockDefinition(
+ DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ unsigned ID = 0;
+ if (getUnsigned(ID))
+ return true;
+ auto Loc = Token.location();
+ auto Name = Token.stringValue();
+ lex();
+ bool HasAddressTaken = false;
+ bool IsLandingPad = false;
+ unsigned Alignment = 0;
+ BasicBlock *BB = nullptr;
+ if (consumeIfPresent(MIToken::lparen)) {
+ do {
+ // TODO: Report an error when the same attribute is specified multiple times.
+ switch (Token.kind()) {
+ case MIToken::kw_address_taken:
+ HasAddressTaken = true;
+ lex();
+ break;
+ case MIToken::kw_landing_pad:
+ IsLandingPad = true;
+ lex();
+ break;
+ case MIToken::kw_align:
+ if (parseAlignment(Alignment))
+ return true;
+ break;
+ case MIToken::IRBlock:
+ // TODO: Report an error when both a name and an IR block are specified.
+ if (parseIRBlock(BB, *MF.getFunction()))
+ return true;
+ lex();
+ break;
+ default:
+ break;
+ }
+ } while (consumeIfPresent(MIToken::comma));
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ if (expectAndConsume(MIToken::colon))
+ return true;
+
+ if (!Name.empty()) {
+ BB = dyn_cast_or_null<BasicBlock>(
+ MF.getFunction()->getValueSymbolTable().lookup(Name));
+ if (!BB)
+ return error(Loc, Twine("basic block '") + Name +
+ "' is not defined in the function '" +
+ MF.getName() + "'");
+ }
+ auto *MBB = MF.CreateMachineBasicBlock(BB);
+ MF.insert(MF.end(), MBB);
+ bool WasInserted = MBBSlots.insert(std::make_pair(ID, MBB)).second;
+ if (!WasInserted)
+ return error(Loc, Twine("redefinition of machine basic block with id #") +
+ Twine(ID));
+ if (Alignment)
+ MBB->setAlignment(Alignment);
+ if (HasAddressTaken)
+ MBB->setHasAddressTaken();
+ MBB->setIsEHPad(IsLandingPad);
+ return false;
+}
+
+bool MIParser::parseBasicBlockDefinitions(
+ DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
+ lex();
+ // Skip until the first machine basic block.
+ while (Token.is(MIToken::Newline))
+ lex();
+ if (Token.isErrorOrEOF())
+ return Token.isError();
+ if (Token.isNot(MIToken::MachineBasicBlockLabel))
+ return error("expected a basic block definition before instructions");
+ unsigned BraceDepth = 0;
+ do {
+ if (parseBasicBlockDefinition(MBBSlots))
+ return true;
+ bool IsAfterNewline = false;
+ // Skip until the next machine basic block.
+ while (true) {
+ if ((Token.is(MIToken::MachineBasicBlockLabel) && IsAfterNewline) ||
+ Token.isErrorOrEOF())
+ break;
+ else if (Token.is(MIToken::MachineBasicBlockLabel))
+ return error("basic block definition should be located at the start of "
+ "the line");
+ else if (consumeIfPresent(MIToken::Newline)) {
+ IsAfterNewline = true;
+ continue;
+ }
+ IsAfterNewline = false;
+ if (Token.is(MIToken::lbrace))
+ ++BraceDepth;
+ if (Token.is(MIToken::rbrace)) {
+ if (!BraceDepth)
+ return error("extraneous closing brace ('}')");
+ --BraceDepth;
+ }
+ lex();
+ }
+ // Verify that every '{' was closed by the end of the file or the next block.
+ if (!Token.isError() && BraceDepth)
+ return error("expected '}'"); // FIXME: Report a note that shows '{'.
+ } while (!Token.isErrorOrEOF());
+ return Token.isError();
+}
+
+bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) {
+ assert(Token.is(MIToken::kw_liveins));
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNewlineOrEOF()) // Allow an empty list of liveins.
+ return false;
+ do {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ unsigned Reg = 0;
+ if (parseRegister(Reg))
+ return true;
+ MBB.addLiveIn(Reg);
+ lex();
+ } while (consumeIfPresent(MIToken::comma));
+ return false;
+}
+
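+/// Illustrative syntax for the successor list parsed below, inferred from the
+/// parser logic (block ids and weights are made-up values):
+///   successors: %bb.1(16), %bb.2(16)
+/// Each entry is a machine basic block reference with an optional
+/// parenthesized weight; the weights are normalized into branch probabilities
+/// once the whole list has been parsed.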
+bool MIParser::parseBasicBlockSuccessors(MachineBasicBlock &MBB) {
+ assert(Token.is(MIToken::kw_successors));
+ lex();
+ if (expectAndConsume(MIToken::colon))
+ return true;
+ if (Token.isNewlineOrEOF()) // Allow an empty list of successors.
+ return false;
+ do {
+ if (Token.isNot(MIToken::MachineBasicBlock))
+ return error("expected a machine basic block reference");
+ MachineBasicBlock *SuccMBB = nullptr;
+ if (parseMBBReference(SuccMBB))
+ return true;
+ lex();
+ unsigned Weight = 0;
+ if (consumeIfPresent(MIToken::lparen)) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after '('");
+ if (getUnsigned(Weight))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ MBB.addSuccessor(SuccMBB, BranchProbability::getRaw(Weight));
+ } while (consumeIfPresent(MIToken::comma));
+ MBB.normalizeSuccProbs();
+ return false;
+}
+
+bool MIParser::parseBasicBlock(MachineBasicBlock &MBB) {
+ // Skip the definition.
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ lex();
+ if (consumeIfPresent(MIToken::lparen)) {
+ while (Token.isNot(MIToken::rparen) && !Token.isErrorOrEOF())
+ lex();
+ consumeIfPresent(MIToken::rparen);
+ }
+ consumeIfPresent(MIToken::colon);
+
+ // Parse the liveins and successors.
+ // N.B.: Multiple lists of successors and liveins are allowed; they're
+ // merged into one.
+ // Example:
+ // liveins: %edi
+ // liveins: %esi
+ //
+ // is equivalent to
+ // liveins: %edi, %esi
+ while (true) {
+ if (Token.is(MIToken::kw_successors)) {
+ if (parseBasicBlockSuccessors(MBB))
+ return true;
+ } else if (Token.is(MIToken::kw_liveins)) {
+ if (parseBasicBlockLiveins(MBB))
+ return true;
+ } else if (consumeIfPresent(MIToken::Newline)) {
+ continue;
+ } else
+ break;
+ if (!Token.isNewlineOrEOF())
+ return error("expected line break at the end of a list");
+ lex();
+ }
+
+ // Parse the instructions.
+ bool IsInBundle = false;
+ MachineInstr *PrevMI = nullptr;
+ while (true) {
+ if (Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof))
+ return false;
+ else if (consumeIfPresent(MIToken::Newline))
+ continue;
+ if (consumeIfPresent(MIToken::rbrace)) {
+ // The first parsing pass should verify that all closing '}' have an
+ // opening '{'.
+ assert(IsInBundle);
+ IsInBundle = false;
+ continue;
+ }
+ MachineInstr *MI = nullptr;
+ if (parse(MI))
+ return true;
+ MBB.insert(MBB.end(), MI);
+ if (IsInBundle) {
+ PrevMI->setFlag(MachineInstr::BundledSucc);
+ MI->setFlag(MachineInstr::BundledPred);
+ }
+ PrevMI = MI;
+ if (Token.is(MIToken::lbrace)) {
+ if (IsInBundle)
+ return error("nested instruction bundles are not allowed");
+ lex();
+ // This instruction is the start of the bundle.
+ MI->setFlag(MachineInstr::BundledSucc);
+ IsInBundle = true;
+ if (!Token.is(MIToken::Newline))
+ // The next instruction can be on the same line.
+ continue;
+ }
+ assert(Token.isNewlineOrEOF() && "MI is not fully parsed");
+ lex();
+ }
+ return false;
+}
+
+bool MIParser::parseBasicBlocks() {
+ lex();
+ // Skip until the first machine basic block.
+ while (Token.is(MIToken::Newline))
+ lex();
+ if (Token.isErrorOrEOF())
+ return Token.isError();
+ // The first parsing pass should have verified that this token is an MBB label
+ // in the 'parseBasicBlockDefinitions' method.
+ assert(Token.is(MIToken::MachineBasicBlockLabel));
+ do {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(MBB))
+ return true;
+ if (parseBasicBlock(*MBB))
+ return true;
+ // The method 'parseBasicBlock' should parse the whole block until the next
+ // block or the end of the file.
+ assert(Token.is(MIToken::MachineBasicBlockLabel) || Token.is(MIToken::Eof));
+ } while (Token.isNot(MIToken::Eof));
+ return false;
+}
+
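+/// A sketch of the overall instruction syntax handled below, inferred from the
+/// parser logic (the opcode, registers, and metadata ids are illustrative):
+///   %eax = MOV32rm %rdi, 1, _, 0, _, debug-location !12 :: (load 4 from %ir.p)
+/// i.e. optional register definitions before '=', the opcode, the remaining
+/// operands, an optional 'debug-location' metadata reference, and optional
+/// machine memory operands after '::'.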
+bool MIParser::parse(MachineInstr *&MI) {
+ // Parse any register operands before '='
+ MachineOperand MO = MachineOperand::CreateImm(0);
+ SmallVector<ParsedMachineOperand, 8> Operands;
+ while (Token.isRegister() || Token.isRegisterFlag()) {
+ auto Loc = Token.location();
+ Optional<unsigned> TiedDefIdx;
+ if (parseRegisterOperand(MO, TiedDefIdx, /*IsDef=*/true))
+ return true;
+ Operands.push_back(
+ ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+ if (!Operands.empty() && expectAndConsume(MIToken::equal))
+ return true;
+
+ unsigned OpCode, Flags = 0;
+ if (Token.isError() || parseInstruction(OpCode, Flags))
+ return true;
+
+ Type *Ty = nullptr;
+ if (isPreISelGenericOpcode(OpCode)) {
+ // For a generic opcode, a type is mandatory.
+ auto Loc = Token.location();
+ if (parseIRType(Loc, Ty))
+ return true;
+ }
+
+ // Parse the remaining machine operands.
+ while (!Token.isNewlineOrEOF() && Token.isNot(MIToken::kw_debug_location) &&
+ Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) {
+ auto Loc = Token.location();
+ Optional<unsigned> TiedDefIdx;
+ if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx))
+ return true;
+ Operands.push_back(
+ ParsedMachineOperand(MO, Loc, Token.location(), TiedDefIdx));
+ if (Token.isNewlineOrEOF() || Token.is(MIToken::coloncolon) ||
+ Token.is(MIToken::lbrace))
+ break;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine operand");
+ lex();
+ }
+
+ DebugLoc DebugLocation;
+ if (Token.is(MIToken::kw_debug_location)) {
+ lex();
+ if (Token.isNot(MIToken::exclaim))
+ return error("expected a metadata node after 'debug-location'");
+ MDNode *Node = nullptr;
+ if (parseMDNode(Node))
+ return true;
+ DebugLocation = DebugLoc(Node);
+ }
+
+ // Parse the machine memory operands.
+ SmallVector<MachineMemOperand *, 2> MemOperands;
+ if (Token.is(MIToken::coloncolon)) {
+ lex();
+ while (!Token.isNewlineOrEOF()) {
+ MachineMemOperand *MemOp = nullptr;
+ if (parseMachineMemoryOperand(MemOp))
+ return true;
+ MemOperands.push_back(MemOp);
+ if (Token.isNewlineOrEOF())
+ break;
+ if (Token.isNot(MIToken::comma))
+ return error("expected ',' before the next machine memory operand");
+ lex();
+ }
+ }
+
+ const auto &MCID = MF.getSubtarget().getInstrInfo()->get(OpCode);
+ if (!MCID.isVariadic()) {
+ // FIXME: Move the implicit operand verification to the machine verifier.
+ if (verifyImplicitOperands(Operands, MCID))
+ return true;
+ }
+
+ // TODO: Check for extraneous machine operands.
+ MI = MF.CreateMachineInstr(MCID, DebugLocation, /*NoImplicit=*/true);
+ MI->setFlags(Flags);
+ if (Ty)
+ MI->setType(Ty);
+ for (const auto &Operand : Operands)
+ MI->addOperand(MF, Operand.Operand);
+ if (assignRegisterTies(*MI, Operands))
+ return true;
+ if (MemOperands.empty())
+ return false;
+ MachineInstr::mmo_iterator MemRefs =
+ MF.allocateMemRefsArray(MemOperands.size());
+ std::copy(MemOperands.begin(), MemOperands.end(), MemRefs);
+ MI->setMemRefs(MemRefs, MemRefs + MemOperands.size());
+ return false;
+}
+
+bool MIParser::parseStandaloneMBB(MachineBasicBlock *&MBB) {
+ lex();
+ if (Token.isNot(MIToken::MachineBasicBlock))
+ return error("expected a machine basic block reference");
+ if (parseMBBReference(MBB))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error(
+ "expected end of string after the machine basic block reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneNamedRegister(unsigned &Reg) {
+ lex();
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ if (parseRegister(Reg))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneVirtualRegister(unsigned &Reg) {
+ lex();
+ if (Token.isNot(MIToken::VirtualRegister))
+ return error("expected a virtual register");
+ if (parseRegister(Reg))
+ return true;
+ lex();
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the register reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneStackObject(int &FI) {
+ lex();
+ if (Token.isNot(MIToken::StackObject))
+ return error("expected a stack object");
+ if (parseStackFrameIndex(FI))
+ return true;
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the stack object reference");
+ return false;
+}
+
+bool MIParser::parseStandaloneMDNode(MDNode *&Node) {
+ lex();
+ if (Token.isNot(MIToken::exclaim))
+ return error("expected a metadata node");
+ if (parseMDNode(Node))
+ return true;
+ if (Token.isNot(MIToken::Eof))
+ return error("expected end of string after the metadata node");
+ return false;
+}
+
+static const char *printImplicitRegisterFlag(const MachineOperand &MO) {
+ assert(MO.isImplicit());
+ return MO.isDef() ? "implicit-def" : "implicit";
+}
+
+static std::string getRegisterName(const TargetRegisterInfo *TRI,
+ unsigned Reg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) && "expected phys reg");
+ return StringRef(TRI->getName(Reg)).lower();
+}
+
+/// Return true if the parsed machine operands contain a given machine operand.
+static bool isImplicitOperandIn(const MachineOperand &ImplicitOperand,
+ ArrayRef<ParsedMachineOperand> Operands) {
+ for (const auto &I : Operands) {
+ if (ImplicitOperand.isIdenticalTo(I.Operand))
+ return true;
+ }
+ return false;
+}
+
+bool MIParser::verifyImplicitOperands(ArrayRef<ParsedMachineOperand> Operands,
+ const MCInstrDesc &MCID) {
+ if (MCID.isCall())
+ // We can't verify call instructions as they can contain arbitrary implicit
+ // register and register mask operands.
+ return false;
+
+ // Gather all the expected implicit operands.
+ SmallVector<MachineOperand, 4> ImplicitOperands;
+ if (MCID.ImplicitDefs)
+ for (const MCPhysReg *ImpDefs = MCID.getImplicitDefs(); *ImpDefs; ++ImpDefs)
+ ImplicitOperands.push_back(
+ MachineOperand::CreateReg(*ImpDefs, true, true));
+ if (MCID.ImplicitUses)
+ for (const MCPhysReg *ImpUses = MCID.getImplicitUses(); *ImpUses; ++ImpUses)
+ ImplicitOperands.push_back(
+ MachineOperand::CreateReg(*ImpUses, false, true));
+
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ for (const auto &I : ImplicitOperands) {
+ if (isImplicitOperandIn(I, Operands))
+ continue;
+ return error(Operands.empty() ? Token.location() : Operands.back().End,
+ Twine("missing implicit register operand '") +
+ printImplicitRegisterFlag(I) + " %" +
+ getRegisterName(TRI, I.getReg()) + "'");
+ }
+ return false;
+}
+
+bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
+ if (Token.is(MIToken::kw_frame_setup)) {
+ Flags |= MachineInstr::FrameSetup;
+ lex();
+ }
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected a machine instruction");
+ StringRef InstrName = Token.stringValue();
+ if (parseInstrName(InstrName, OpCode))
+ return error(Twine("unknown machine instruction name '") + InstrName + "'");
+ lex();
+ return false;
+}
+
+bool MIParser::parseRegister(unsigned &Reg) {
+ switch (Token.kind()) {
+ case MIToken::underscore:
+ Reg = 0;
+ break;
+ case MIToken::NamedRegister: {
+ StringRef Name = Token.stringValue();
+ if (getRegisterByName(Name, Reg))
+ return error(Twine("unknown register name '") + Name + "'");
+ break;
+ }
+ case MIToken::VirtualRegister: {
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ const auto RegInfo = PFS.VirtualRegisterSlots.find(ID);
+ if (RegInfo == PFS.VirtualRegisterSlots.end())
+ return error(Twine("use of undefined virtual register '%") + Twine(ID) +
+ "'");
+ Reg = RegInfo->second;
+ break;
+ }
+ // TODO: Parse other register kinds.
+ default:
+ llvm_unreachable("The current token should be a register");
+ }
+ return false;
+}
+
+bool MIParser::parseRegisterFlag(unsigned &Flags) {
+ const unsigned OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ Flags |= RegState::Implicit;
+ break;
+ case MIToken::kw_implicit_define:
+ Flags |= RegState::ImplicitDefine;
+ break;
+ case MIToken::kw_def:
+ Flags |= RegState::Define;
+ break;
+ case MIToken::kw_dead:
+ Flags |= RegState::Dead;
+ break;
+ case MIToken::kw_killed:
+ Flags |= RegState::Kill;
+ break;
+ case MIToken::kw_undef:
+ Flags |= RegState::Undef;
+ break;
+ case MIToken::kw_internal:
+ Flags |= RegState::InternalRead;
+ break;
+ case MIToken::kw_early_clobber:
+ Flags |= RegState::EarlyClobber;
+ break;
+ case MIToken::kw_debug_use:
+ Flags |= RegState::Debug;
+ break;
+ default:
+ llvm_unreachable("The current token should be a register flag");
+ }
+ if (OldFlags == Flags)
+ // If the flags weren't modified, the same flag must have been specified
+ // more than once.
+ return error("duplicate '" + Token.stringValue() + "' register flag");
+ lex();
+ return false;
+}
+
+bool MIParser::parseSubRegisterIndex(unsigned &SubReg) {
+ assert(Token.is(MIToken::colon));
+ lex();
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected a subregister index after ':'");
+ auto Name = Token.stringValue();
+ SubReg = getSubRegIndex(Name);
+ if (!SubReg)
+ return error(Twine("use of unknown subregister index '") + Name + "'");
+ lex();
+ return false;
+}
+
+bool MIParser::parseRegisterTiedDefIndex(unsigned &TiedDefIdx) {
+ if (!consumeIfPresent(MIToken::kw_tied_def))
+ return error("expected 'tied-def' after '('");
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after 'tied-def'");
+ if (getUnsigned(TiedDefIdx))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ return false;
+}
+
+bool MIParser::parseSize(unsigned &Size) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal for the size");
+ if (getUnsigned(Size))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ return false;
+}
+
+bool MIParser::assignRegisterTies(MachineInstr &MI,
+ ArrayRef<ParsedMachineOperand> Operands) {
+ SmallVector<std::pair<unsigned, unsigned>, 4> TiedRegisterPairs;
+ for (unsigned I = 0, E = Operands.size(); I != E; ++I) {
+ if (!Operands[I].TiedDefIdx)
+ continue;
+ // The parser ensures that this operand is a register use, so we just have
+ // to check the tied-def operand.
+ unsigned DefIdx = Operands[I].TiedDefIdx.getValue();
+ if (DefIdx >= E)
+ return error(Operands[I].Begin,
+ Twine("use of invalid tied-def operand index '" +
+ Twine(DefIdx) + "'; instruction has only ") +
+ Twine(E) + " operands");
+ const auto &DefOperand = Operands[DefIdx].Operand;
+ if (!DefOperand.isReg() || !DefOperand.isDef())
+ // FIXME: add note with the def operand.
+ return error(Operands[I].Begin,
+ Twine("use of invalid tied-def operand index '") +
+ Twine(DefIdx) + "'; the operand #" + Twine(DefIdx) +
+ " isn't a defined register");
+ // Check that the tied-def operand wasn't tied elsewhere.
+ for (const auto &TiedPair : TiedRegisterPairs) {
+ if (TiedPair.first == DefIdx)
+ return error(Operands[I].Begin,
+ Twine("the tied-def operand #") + Twine(DefIdx) +
+ " is already tied with another register operand");
+ }
+ TiedRegisterPairs.push_back(std::make_pair(DefIdx, I));
+ }
+ // FIXME: Verify that, for non-INLINEASM instructions, the def and use
+ // tied-operand indices are below the maximum number of tied operands.
+ for (const auto &TiedPair : TiedRegisterPairs)
+ MI.tieOperands(TiedPair.first, TiedPair.second);
+ return false;
+}
+
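+/// A few illustrative register operand forms accepted below, inferred from the
+/// parser logic (the flag spellings, register and subregister names are
+/// examples only):
+///   implicit-def dead %eflags      flags followed by a physical register
+///   %1:sub_32bit                   virtual register with a subregister index
+///   %2(tied-def 0)                 register use tied to def operand #0
+///   %3(32)                         generic virtual register with a size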
+bool MIParser::parseRegisterOperand(MachineOperand &Dest,
+ Optional<unsigned> &TiedDefIdx,
+ bool IsDef) {
+ unsigned Reg;
+ unsigned Flags = IsDef ? RegState::Define : 0;
+ while (Token.isRegisterFlag()) {
+ if (parseRegisterFlag(Flags))
+ return true;
+ }
+ if (!Token.isRegister())
+ return error("expected a register after register flags");
+ if (parseRegister(Reg))
+ return true;
+ lex();
+ unsigned SubReg = 0;
+ if (Token.is(MIToken::colon)) {
+ if (parseSubRegisterIndex(SubReg))
+ return true;
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("subregister index expects a virtual register");
+ }
+ if ((Flags & RegState::Define) == 0) {
+ if (consumeIfPresent(MIToken::lparen)) {
+ unsigned Idx;
+ if (parseRegisterTiedDefIndex(Idx))
+ return true;
+ TiedDefIdx = Idx;
+ }
+ } else if (consumeIfPresent(MIToken::lparen)) {
+ // Virtual registers may have a size with GlobalISel.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return error("unexpected size on physical register");
+ unsigned Size;
+ if (parseSize(Size))
+ return true;
+
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MRI.setSize(Reg, Size);
+ } else if (PFS.GenericVRegs.count(Reg)) {
+ // Generic virtual registers must have a size.
+ // If we end up here, it means the size hasn't been specified, which is an
+ // error.
+ return error("generic virtual registers must have a size");
+ }
+ Dest = MachineOperand::CreateReg(
+ Reg, Flags & RegState::Define, Flags & RegState::Implicit,
+ Flags & RegState::Kill, Flags & RegState::Dead, Flags & RegState::Undef,
+ Flags & RegState::EarlyClobber, SubReg, Flags & RegState::Debug,
+ Flags & RegState::InternalRead);
+ return false;
+}
+
+bool MIParser::parseImmediateOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::IntegerLiteral));
+ const APSInt &Int = Token.integerValue();
+ if (Int.getMinSignedBits() > 64)
+ return error("integer literal is too large to be an immediate operand");
+ Dest = MachineOperand::CreateImm(Int.getExtValue());
+ lex();
+ return false;
+}
+
+bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue,
+ const Constant *&C) {
+ auto Source = StringValue.str(); // The source has to be null terminated.
+ SMDiagnostic Err;
+ C = parseConstantValue(Source.c_str(), Err, *MF.getFunction()->getParent(),
+ &PFS.IRSlots);
+ if (!C)
+ return error(Loc + Err.getColumnNo(), Err.getMessage());
+ return false;
+}
+
+bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) {
+ if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C))
+ return true;
+ lex();
+ return false;
+}
+
+bool MIParser::parseIRType(StringRef::iterator Loc, StringRef StringValue,
+ unsigned &Read, Type *&Ty) {
+ auto Source = StringValue.str(); // The source has to be null terminated.
+ SMDiagnostic Err;
+ Ty = parseTypeAtBeginning(Source.c_str(), Read, Err,
+ *MF.getFunction()->getParent(), &PFS.IRSlots);
+ if (!Ty)
+ return error(Loc + Err.getColumnNo(), Err.getMessage());
+ return false;
+}
+
+bool MIParser::parseIRType(StringRef::iterator Loc, Type *&Ty,
+ bool MustBeSized) {
+ // At this point we enter the IR world, i.e., to get the correct type,
+ // we need to hand off the whole string, not just the current token.
+ // E.g., <4 x i64> would give '<' as a token and there is not much
+ // the IR parser can do with that.
+ unsigned Read = 0;
+ if (parseIRType(Loc, StringRef(Loc), Read, Ty))
+ return true;
+ // The type must be sized; otherwise there is not much the backend
+ // can do with it.
+ if (MustBeSized && !Ty->isSized())
+ return error("expected a sized type");
+ // The next token starts Read characters after Loc.
+ // However, the current location is not Loc, but Loc + the length of Token.
+ // Therefore, subtract the length of Token (range().end() - Loc) from the
+ // number of characters to skip before the next token.
+ lex(Read - (Token.range().end() - Loc));
+ return false;
+}
+
+bool MIParser::parseTypedImmediateOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::IntegerType));
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal");
+ const Constant *C = nullptr;
+ if (parseIRConstant(Loc, C))
+ return true;
+ Dest = MachineOperand::CreateCImm(cast<ConstantInt>(C));
+ return false;
+}
+
+bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) {
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::FloatingPointLiteral))
+ return error("expected a floating point literal");
+ const Constant *C = nullptr;
+ if (parseIRConstant(Loc, C))
+ return true;
+ Dest = MachineOperand::CreateFPImm(cast<ConstantFP>(C));
+ return false;
+}
+
+bool MIParser::getUnsigned(unsigned &Result) {
+ assert(Token.hasIntegerValue() && "Expected a token with an integer value");
+ const uint64_t Limit = uint64_t(std::numeric_limits<unsigned>::max()) + 1;
+ uint64_t Val64 = Token.integerValue().getLimitedValue(Limit);
+ if (Val64 == Limit)
+ return error("expected 32-bit integer (too large)");
+ Result = Val64;
+ return false;
+}
+
+bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) {
+ assert(Token.is(MIToken::MachineBasicBlock) ||
+ Token.is(MIToken::MachineBasicBlockLabel));
+ unsigned Number;
+ if (getUnsigned(Number))
+ return true;
+ auto MBBInfo = PFS.MBBSlots.find(Number);
+ if (MBBInfo == PFS.MBBSlots.end())
+ return error(Twine("use of undefined machine basic block #") +
+ Twine(Number));
+ MBB = MBBInfo->second;
+ if (!Token.stringValue().empty() && Token.stringValue() != MBB->getName())
+ return error(Twine("the name of machine basic block #") + Twine(Number) +
+ " isn't '" + Token.stringValue() + "'");
+ return false;
+}
+
+bool MIParser::parseMBBOperand(MachineOperand &Dest) {
+ MachineBasicBlock *MBB;
+ if (parseMBBReference(MBB))
+ return true;
+ Dest = MachineOperand::CreateMBB(MBB);
+ lex();
+ return false;
+}
+
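+/// A sketch of the stack object reference syntax handled below, inferred from
+/// the parser logic ('%stack.0.x' refers to a hypothetical alloca named 'x'):
+///   %stack.0.x
+/// The trailing name, when present, has to match the name of the allocation
+/// associated with the frame index.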
+bool MIParser::parseStackFrameIndex(int &FI) {
+ assert(Token.is(MIToken::StackObject));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ObjectInfo = PFS.StackObjectSlots.find(ID);
+ if (ObjectInfo == PFS.StackObjectSlots.end())
+ return error(Twine("use of undefined stack object '%stack.") + Twine(ID) +
+ "'");
+ StringRef Name;
+ if (const auto *Alloca =
+ MF.getFrameInfo()->getObjectAllocation(ObjectInfo->second))
+ Name = Alloca->getName();
+ if (!Token.stringValue().empty() && Token.stringValue() != Name)
+ return error(Twine("the name of the stack object '%stack.") + Twine(ID) +
+ "' isn't '" + Token.stringValue() + "'");
+ lex();
+ FI = ObjectInfo->second;
+ return false;
+}
+
+bool MIParser::parseStackObjectOperand(MachineOperand &Dest) {
+ int FI;
+ if (parseStackFrameIndex(FI))
+ return true;
+ Dest = MachineOperand::CreateFI(FI);
+ return false;
+}
+
+bool MIParser::parseFixedStackFrameIndex(int &FI) {
+ assert(Token.is(MIToken::FixedStackObject));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ObjectInfo = PFS.FixedStackObjectSlots.find(ID);
+ if (ObjectInfo == PFS.FixedStackObjectSlots.end())
+ return error(Twine("use of undefined fixed stack object '%fixed-stack.") +
+ Twine(ID) + "'");
+ lex();
+ FI = ObjectInfo->second;
+ return false;
+}
+
+bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) {
+ int FI;
+ if (parseFixedStackFrameIndex(FI))
+ return true;
+ Dest = MachineOperand::CreateFI(FI);
+ return false;
+}
+
+bool MIParser::parseGlobalValue(GlobalValue *&GV) {
+ switch (Token.kind()) {
+ case MIToken::NamedGlobalValue: {
+ const Module *M = MF.getFunction()->getParent();
+ GV = M->getNamedValue(Token.stringValue());
+ if (!GV)
+ return error(Twine("use of undefined global value '") + Token.range() +
+ "'");
+ break;
+ }
+ case MIToken::GlobalValue: {
+ unsigned GVIdx;
+ if (getUnsigned(GVIdx))
+ return true;
+ if (GVIdx >= PFS.IRSlots.GlobalValues.size())
+ return error(Twine("use of undefined global value '@") + Twine(GVIdx) +
+ "'");
+ GV = PFS.IRSlots.GlobalValues[GVIdx];
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be a global value");
+ }
+ return false;
+}
+
+bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ lex();
+ Dest = MachineOperand::CreateGA(GV, /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseConstantPoolIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::ConstantPoolItem));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto ConstantInfo = PFS.ConstantPoolSlots.find(ID);
+ if (ConstantInfo == PFS.ConstantPoolSlots.end())
+ return error("use of undefined constant '%const." + Twine(ID) + "'");
+ lex();
+ Dest = MachineOperand::CreateCPI(ID, /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseJumpTableIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::JumpTableIndex));
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto JumpTableEntryInfo = PFS.JumpTableSlots.find(ID);
+ if (JumpTableEntryInfo == PFS.JumpTableSlots.end())
+ return error("use of undefined jump table '%jump-table." + Twine(ID) + "'");
+ lex();
+ Dest = MachineOperand::CreateJTI(JumpTableEntryInfo->second);
+ return false;
+}
+
+bool MIParser::parseExternalSymbolOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::ExternalSymbol));
+ const char *Symbol = MF.createExternalSymbolName(Token.stringValue());
+ lex();
+ Dest = MachineOperand::CreateES(Symbol);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
+bool MIParser::parseSubRegisterIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::SubRegisterIndex));
+ StringRef Name = Token.stringValue();
+ unsigned SubRegIndex = getSubRegIndex(Token.stringValue());
+ if (SubRegIndex == 0)
+ return error(Twine("unknown subregister index '") + Name + "'");
+ lex();
+ Dest = MachineOperand::CreateImm(SubRegIndex);
+ return false;
+}
+
+bool MIParser::parseMDNode(MDNode *&Node) {
+ assert(Token.is(MIToken::exclaim));
+ auto Loc = Token.location();
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected metadata id after '!'");
+ unsigned ID;
+ if (getUnsigned(ID))
+ return true;
+ auto NodeInfo = PFS.IRSlots.MetadataNodes.find(ID);
+ if (NodeInfo == PFS.IRSlots.MetadataNodes.end())
+ return error(Loc, "use of undefined metadata '!" + Twine(ID) + "'");
+ lex();
+ Node = NodeInfo->second.get();
+ return false;
+}
+
+bool MIParser::parseMetadataOperand(MachineOperand &Dest) {
+ MDNode *Node = nullptr;
+ if (parseMDNode(Node))
+ return true;
+ Dest = MachineOperand::CreateMetadata(Node);
+ return false;
+}
+
+bool MIParser::parseCFIOffset(int &Offset) {
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected a cfi offset");
+ if (Token.integerValue().getMinSignedBits() > 32)
+ return error("expected a 32 bit integer (the cfi offset is too large)");
+ Offset = (int)Token.integerValue().getExtValue();
+ lex();
+ return false;
+}
+
+bool MIParser::parseCFIRegister(unsigned &Reg) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a cfi register");
+ unsigned LLVMReg;
+ if (parseRegister(LLVMReg))
+ return true;
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ int DwarfReg = TRI->getDwarfRegNum(LLVMReg, true);
+ if (DwarfReg < 0)
+ return error("invalid DWARF register");
+ Reg = (unsigned)DwarfReg;
+ lex();
+ return false;
+}
+
+bool MIParser::parseCFIOperand(MachineOperand &Dest) {
+ auto Kind = Token.kind();
+ lex();
+ auto &MMI = MF.getMMI();
+ int Offset;
+ unsigned Reg;
+ unsigned CFIIndex;
+ switch (Kind) {
+ case MIToken::kw_cfi_same_value:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createSameValue(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_offset:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset))
+ return true;
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createOffset(nullptr, Reg, Offset));
+ break;
+ case MIToken::kw_cfi_def_cfa_register:
+ if (parseCFIRegister(Reg))
+ return true;
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, Reg));
+ break;
+ case MIToken::kw_cfi_def_cfa_offset:
+ if (parseCFIOffset(Offset))
+ return true;
+ // NB: MCCFIInstruction::createDefCfaOffset negates the offset.
+ CFIIndex = MMI.addFrameInst(
+ MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+ break;
+ case MIToken::kw_cfi_def_cfa:
+ if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
+ parseCFIOffset(Offset))
+ return true;
+ // NB: MCCFIInstruction::createDefCfa negates the offset.
+ CFIIndex =
+ MMI.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset));
+ break;
+ default:
+ // TODO: Parse the other CFI operands.
+ llvm_unreachable("The current token should be a cfi operand");
+ }
+ Dest = MachineOperand::CreateCFIIndex(CFIIndex);
+ return false;
+}
+
+bool MIParser::parseIRBlock(BasicBlock *&BB, const Function &F) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRBlock: {
+ BB = dyn_cast_or_null<BasicBlock>(
+ F.getValueSymbolTable().lookup(Token.stringValue()));
+ if (!BB)
+ return error(Twine("use of undefined IR block '") + Token.range() + "'");
+ break;
+ }
+ case MIToken::IRBlock: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(SlotNumber))
+ return true;
+ BB = const_cast<BasicBlock *>(getIRBlock(SlotNumber, F));
+ if (!BB)
+ return error(Twine("use of undefined IR block '%ir-block.") +
+ Twine(SlotNumber) + "'");
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be an IR block reference");
+ }
+ return false;
+}
+
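+/// Illustrative syntax for the blockaddress operand parsed below, inferred
+/// from the parser logic ('@fn' is a hypothetical IR function name):
+///   blockaddress(@fn, %ir-block.0)
+/// optionally followed by a '+' or '-' offset.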
+bool MIParser::parseBlockAddressOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_blockaddress));
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::GlobalValue) &&
+ Token.isNot(MIToken::NamedGlobalValue))
+ return error("expected a global value");
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ auto *F = dyn_cast<Function>(GV);
+ if (!F)
+ return error("expected an IR function reference");
+ lex();
+ if (expectAndConsume(MIToken::comma))
+ return true;
+ BasicBlock *BB = nullptr;
+ if (Token.isNot(MIToken::IRBlock) && Token.isNot(MIToken::NamedIRBlock))
+ return error("expected an IR block reference");
+ if (parseIRBlock(BB, *F))
+ return true;
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateBA(BlockAddress::get(F, BB), /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
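+/// Illustrative syntax for the target index operand parsed below, inferred
+/// from the parser logic ('some-index' stands for a target-defined name):
+///   target-index(some-index) + 8
+/// The '+'/'-' offset is optional.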
+bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_target_index));
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target index");
+ int Index = 0;
+ if (getTargetIndex(Token.stringValue(), Index))
+ return error("use of undefined target index '" + Token.stringValue() + "'");
+ lex();
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateTargetIndex(unsigned(Index), /*Offset=*/0);
+ if (parseOperandsOffset(Dest))
+ return true;
+ return false;
+}
+
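+/// Illustrative syntax for the liveout register mask operand parsed below,
+/// inferred from the parser logic (the register names are examples only):
+///   liveout(%eax, %edx)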
+bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
+ assert(Token.is(MIToken::kw_liveout));
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ while (true) {
+ if (Token.isNot(MIToken::NamedRegister))
+ return error("expected a named register");
+ unsigned Reg = 0;
+ if (parseRegister(Reg))
+ return true;
+ lex();
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+ // TODO: Report an error if the same register is used more than once.
+ if (Token.isNot(MIToken::comma))
+ break;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest = MachineOperand::CreateRegLiveOut(Mask);
+ return false;
+}
+
+bool MIParser::parseMachineOperand(MachineOperand &Dest,
+ Optional<unsigned> &TiedDefIdx) {
+ switch (Token.kind()) {
+ case MIToken::kw_implicit:
+ case MIToken::kw_implicit_define:
+ case MIToken::kw_def:
+ case MIToken::kw_dead:
+ case MIToken::kw_killed:
+ case MIToken::kw_undef:
+ case MIToken::kw_internal:
+ case MIToken::kw_early_clobber:
+ case MIToken::kw_debug_use:
+ case MIToken::underscore:
+ case MIToken::NamedRegister:
+ case MIToken::VirtualRegister:
+ return parseRegisterOperand(Dest, TiedDefIdx);
+ case MIToken::IntegerLiteral:
+ return parseImmediateOperand(Dest);
+ case MIToken::IntegerType:
+ return parseTypedImmediateOperand(Dest);
+ case MIToken::kw_half:
+ case MIToken::kw_float:
+ case MIToken::kw_double:
+ case MIToken::kw_x86_fp80:
+ case MIToken::kw_fp128:
+ case MIToken::kw_ppc_fp128:
+ return parseFPImmediateOperand(Dest);
+ case MIToken::MachineBasicBlock:
+ return parseMBBOperand(Dest);
+ case MIToken::StackObject:
+ return parseStackObjectOperand(Dest);
+ case MIToken::FixedStackObject:
+ return parseFixedStackObjectOperand(Dest);
+ case MIToken::GlobalValue:
+ case MIToken::NamedGlobalValue:
+ return parseGlobalAddressOperand(Dest);
+ case MIToken::ConstantPoolItem:
+ return parseConstantPoolIndexOperand(Dest);
+ case MIToken::JumpTableIndex:
+ return parseJumpTableIndexOperand(Dest);
+ case MIToken::ExternalSymbol:
+ return parseExternalSymbolOperand(Dest);
+ case MIToken::SubRegisterIndex:
+ return parseSubRegisterIndexOperand(Dest);
+ case MIToken::exclaim:
+ return parseMetadataOperand(Dest);
+ case MIToken::kw_cfi_same_value:
+ case MIToken::kw_cfi_offset:
+ case MIToken::kw_cfi_def_cfa_register:
+ case MIToken::kw_cfi_def_cfa_offset:
+ case MIToken::kw_cfi_def_cfa:
+ return parseCFIOperand(Dest);
+ case MIToken::kw_blockaddress:
+ return parseBlockAddressOperand(Dest);
+ case MIToken::kw_target_index:
+ return parseTargetIndexOperand(Dest);
+ case MIToken::kw_liveout:
+ return parseLiveoutRegisterMaskOperand(Dest);
+ case MIToken::Error:
+ return true;
+ case MIToken::Identifier:
+ if (const auto *RegMask = getRegMask(Token.stringValue())) {
+ Dest = MachineOperand::CreateRegMask(RegMask);
+ lex();
+ break;
+ }
+ // fallthrough
+ default:
+ // FIXME: Parse the MCSymbol machine operand.
+ return error("expected a machine operand");
+ }
+ return false;
+}
+
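+/// A sketch of the target flags wrapper syntax handled below, inferred from
+/// the parser logic ('flag-a' and 'flag-b' stand for target-defined flag
+/// names):
+///   target-flags(flag-a, flag-b) @symbol
+/// A direct flag may come first, followed by any number of bitmask flags; the
+/// wrapped operand must not be a register operand.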
+bool MIParser::parseMachineOperandAndTargetFlags(
+ MachineOperand &Dest, Optional<unsigned> &TiedDefIdx) {
+ unsigned TF = 0;
+ bool HasTargetFlags = false;
+ if (Token.is(MIToken::kw_target_flags)) {
+ HasTargetFlags = true;
+ lex();
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target flag");
+ if (getDirectTargetFlag(Token.stringValue(), TF)) {
+ if (getBitmaskTargetFlag(Token.stringValue(), TF))
+ return error("use of undefined target flag '" + Token.stringValue() +
+ "'");
+ }
+ lex();
+ while (Token.is(MIToken::comma)) {
+ lex();
+ if (Token.isNot(MIToken::Identifier))
+ return error("expected the name of the target flag");
+ unsigned BitFlag = 0;
+ if (getBitmaskTargetFlag(Token.stringValue(), BitFlag))
+ return error("use of undefined target flag '" + Token.stringValue() +
+ "'");
+ // TODO: Report an error when using a duplicate bit target flag.
+ TF |= BitFlag;
+ lex();
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ }
+ auto Loc = Token.location();
+ if (parseMachineOperand(Dest, TiedDefIdx))
+ return true;
+ if (!HasTargetFlags)
+ return false;
+ if (Dest.isReg())
+ return error(Loc, "register operands can't have target flags");
+ Dest.setTargetFlags(TF);
+ return false;
+}
+
+bool MIParser::parseOffset(int64_t &Offset) {
+ if (Token.isNot(MIToken::plus) && Token.isNot(MIToken::minus))
+ return false;
+ StringRef Sign = Token.range();
+ bool IsNegative = Token.is(MIToken::minus);
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected an integer literal after '" + Sign + "'");
+ if (Token.integerValue().getMinSignedBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Offset = Token.integerValue().getExtValue();
+ if (IsNegative)
+ Offset = -Offset;
+ lex();
+ return false;
+}
+
+bool MIParser::parseAlignment(unsigned &Alignment) {
+ assert(Token.is(MIToken::kw_align));
+ lex();
+ if (Token.isNot(MIToken::IntegerLiteral) || Token.integerValue().isSigned())
+ return error("expected an integer literal after 'align'");
+ if (getUnsigned(Alignment))
+ return true;
+ lex();
+ return false;
+}
+
+bool MIParser::parseOperandsOffset(MachineOperand &Op) {
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Op.setOffset(Offset);
+ return false;
+}
+
+bool MIParser::parseIRValue(const Value *&V) {
+ switch (Token.kind()) {
+ case MIToken::NamedIRValue: {
+ V = MF.getFunction()->getValueSymbolTable().lookup(Token.stringValue());
+ break;
+ }
+ case MIToken::IRValue: {
+ unsigned SlotNumber = 0;
+ if (getUnsigned(SlotNumber))
+ return true;
+ V = getIRValue(SlotNumber);
+ break;
+ }
+ case MIToken::NamedGlobalValue:
+ case MIToken::GlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ V = GV;
+ break;
+ }
+ case MIToken::QuotedIRValue: {
+ const Constant *C = nullptr;
+ if (parseIRConstant(Token.location(), Token.stringValue(), C))
+ return true;
+ V = C;
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be an IR block reference");
+ }
+ if (!V)
+ return error(Twine("use of undefined IR value '") + Token.range() + "'");
+ return false;
+}
+
+bool MIParser::getUint64(uint64_t &Result) {
+ assert(Token.hasIntegerValue());
+ if (Token.integerValue().getActiveBits() > 64)
+ return error("expected 64-bit integer (too large)");
+ Result = Token.integerValue().getZExtValue();
+ return false;
+}
+
+bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) {
+ const auto OldFlags = Flags;
+ switch (Token.kind()) {
+ case MIToken::kw_volatile:
+ Flags |= MachineMemOperand::MOVolatile;
+ break;
+ case MIToken::kw_non_temporal:
+ Flags |= MachineMemOperand::MONonTemporal;
+ break;
+ case MIToken::kw_invariant:
+ Flags |= MachineMemOperand::MOInvariant;
+ break;
+ // TODO: Parse the target-specific memory operand flags.
+ default:
+ llvm_unreachable("The current token should be a memory operand flag");
+ }
+ if (OldFlags == Flags)
+ // If the flags weren't modified, the same flag must have been specified
+ // more than once.
+ return error("duplicate '" + Token.stringValue() + "' memory operand flag");
+ lex();
+ return false;
+}
+
+bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) {
+ switch (Token.kind()) {
+ case MIToken::kw_stack:
+ PSV = MF.getPSVManager().getStack();
+ break;
+ case MIToken::kw_got:
+ PSV = MF.getPSVManager().getGOT();
+ break;
+ case MIToken::kw_jump_table:
+ PSV = MF.getPSVManager().getJumpTable();
+ break;
+ case MIToken::kw_constant_pool:
+ PSV = MF.getPSVManager().getConstantPool();
+ break;
+ case MIToken::FixedStackObject: {
+ int FI;
+ if (parseFixedStackFrameIndex(FI))
+ return true;
+ PSV = MF.getPSVManager().getFixedStack(FI);
+ // The token was already consumed, so use return here instead of break.
+ return false;
+ }
+ case MIToken::StackObject: {
+ int FI;
+ if (parseStackFrameIndex(FI))
+ return true;
+ PSV = MF.getPSVManager().getFixedStack(FI);
+ // The token was already consumed, so use return here instead of break.
+ return false;
+ }
+ case MIToken::kw_call_entry: {
+ lex();
+ switch (Token.kind()) {
+ case MIToken::GlobalValue:
+ case MIToken::NamedGlobalValue: {
+ GlobalValue *GV = nullptr;
+ if (parseGlobalValue(GV))
+ return true;
+ PSV = MF.getPSVManager().getGlobalValueCallEntry(GV);
+ break;
+ }
+ case MIToken::ExternalSymbol:
+ PSV = MF.getPSVManager().getExternalSymbolCallEntry(
+ MF.createExternalSymbolName(Token.stringValue()));
+ break;
+ default:
+ return error(
+ "expected a global value or an external symbol after 'call-entry'");
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("The current token should be pseudo source value");
+ }
+ lex();
+ return false;
+}
+
+bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
+ if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) ||
+ Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) ||
+ Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) ||
+ Token.is(MIToken::kw_call_entry)) {
+ const PseudoSourceValue *PSV = nullptr;
+ if (parseMemoryPseudoSourceValue(PSV))
+ return true;
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Dest = MachinePointerInfo(PSV, Offset);
+ return false;
+ }
+ if (Token.isNot(MIToken::NamedIRValue) && Token.isNot(MIToken::IRValue) &&
+ Token.isNot(MIToken::GlobalValue) &&
+ Token.isNot(MIToken::NamedGlobalValue) &&
+ Token.isNot(MIToken::QuotedIRValue))
+ return error("expected an IR value reference");
+ const Value *V = nullptr;
+ if (parseIRValue(V))
+ return true;
+ if (!V->getType()->isPointerTy())
+ return error("expected a pointer IR value");
+ lex();
+ int64_t Offset = 0;
+ if (parseOffset(Offset))
+ return true;
+ Dest = MachinePointerInfo(V, Offset);
+ return false;
+}
+
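+/// A sketch of the machine memory operand syntax handled below, inferred from
+/// the parser logic (the IR value and metadata ids are illustrative):
+///   (volatile load 4 from %ir.ptr, align 8, !tbaa !2)
+/// i.e. optional flags, 'load' or 'store', the access size in bytes, an
+/// optional 'from'/'into' pointer description, and optional alignment and
+/// metadata attachments.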
+bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
+ if (expectAndConsume(MIToken::lparen))
+ return true;
+ MachineMemOperand::Flags Flags = MachineMemOperand::MONone;
+ while (Token.isMemoryOperandFlag()) {
+ if (parseMemoryOperandFlag(Flags))
+ return true;
+ }
+ if (Token.isNot(MIToken::Identifier) ||
+ (Token.stringValue() != "load" && Token.stringValue() != "store"))
+ return error("expected 'load' or 'store' memory operation");
+ if (Token.stringValue() == "load")
+ Flags |= MachineMemOperand::MOLoad;
+ else
+ Flags |= MachineMemOperand::MOStore;
+ lex();
+
+ if (Token.isNot(MIToken::IntegerLiteral))
+ return error("expected the size integer literal after memory operation");
+ uint64_t Size;
+ if (getUint64(Size))
+ return true;
+ lex();
+
+ MachinePointerInfo Ptr = MachinePointerInfo();
+ if (Token.is(MIToken::Identifier)) {
+ const char *Word = Flags & MachineMemOperand::MOLoad ? "from" : "into";
+ if (Token.stringValue() != Word)
+ return error(Twine("expected '") + Word + "'");
+ lex();
+
+ if (parseMachinePointerInfo(Ptr))
+ return true;
+ }
+ unsigned BaseAlignment = Size;
+ AAMDNodes AAInfo;
+ MDNode *Range = nullptr;
+ while (consumeIfPresent(MIToken::comma)) {
+ switch (Token.kind()) {
+ case MIToken::kw_align:
+ if (parseAlignment(BaseAlignment))
+ return true;
+ break;
+ case MIToken::md_tbaa:
+ lex();
+ if (parseMDNode(AAInfo.TBAA))
+ return true;
+ break;
+ case MIToken::md_alias_scope:
+ lex();
+ if (parseMDNode(AAInfo.Scope))
+ return true;
+ break;
+ case MIToken::md_noalias:
+ lex();
+ if (parseMDNode(AAInfo.NoAlias))
+ return true;
+ break;
+ case MIToken::md_range:
+ lex();
+ if (parseMDNode(Range))
+ return true;
+ break;
+ // TODO: Report an error on duplicate metadata nodes.
+ default:
+ return error("expected 'align' or '!tbaa' or '!alias.scope' or "
+ "'!noalias' or '!range'");
+ }
+ }
+ if (expectAndConsume(MIToken::rparen))
+ return true;
+ Dest =
+ MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range);
+ return false;
+}
+
+void MIParser::initNames2InstrOpCodes() {
+ if (!Names2InstrOpCodes.empty())
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ for (unsigned I = 0, E = TII->getNumOpcodes(); I < E; ++I)
+ Names2InstrOpCodes.insert(std::make_pair(StringRef(TII->getName(I)), I));
+}
+
+bool MIParser::parseInstrName(StringRef InstrName, unsigned &OpCode) {
+ initNames2InstrOpCodes();
+ auto InstrInfo = Names2InstrOpCodes.find(InstrName);
+ if (InstrInfo == Names2InstrOpCodes.end())
+ return true;
+ OpCode = InstrInfo->getValue();
+ return false;
+}
+
+void MIParser::initNames2Regs() {
+ if (!Names2Regs.empty())
+ return;
+ // The '%noreg' register is register 0.
+ Names2Regs.insert(std::make_pair("noreg", 0));
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ for (unsigned I = 0, E = TRI->getNumRegs(); I < E; ++I) {
+ bool WasInserted =
+ Names2Regs.insert(std::make_pair(StringRef(TRI->getName(I)).lower(), I))
+ .second;
+ (void)WasInserted;
+ assert(WasInserted && "Expected registers to be unique case-insensitively");
+ }
+}
+
+bool MIParser::getRegisterByName(StringRef RegName, unsigned &Reg) {
+ initNames2Regs();
+ auto RegInfo = Names2Regs.find(RegName);
+ if (RegInfo == Names2Regs.end())
+ return true;
+ Reg = RegInfo->getValue();
+ return false;
+}
+
+void MIParser::initNames2RegMasks() {
+ if (!Names2RegMasks.empty())
+ return;
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ ArrayRef<const uint32_t *> RegMasks = TRI->getRegMasks();
+ ArrayRef<const char *> RegMaskNames = TRI->getRegMaskNames();
+ assert(RegMasks.size() == RegMaskNames.size());
+ for (size_t I = 0, E = RegMasks.size(); I < E; ++I)
+ Names2RegMasks.insert(
+ std::make_pair(StringRef(RegMaskNames[I]).lower(), RegMasks[I]));
+}
+
+const uint32_t *MIParser::getRegMask(StringRef Identifier) {
+ initNames2RegMasks();
+ auto RegMaskInfo = Names2RegMasks.find(Identifier);
+ if (RegMaskInfo == Names2RegMasks.end())
+ return nullptr;
+ return RegMaskInfo->getValue();
+}
+
+void MIParser::initNames2SubRegIndices() {
+ if (!Names2SubRegIndices.empty())
+ return;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (unsigned I = 1, E = TRI->getNumSubRegIndices(); I < E; ++I)
+ Names2SubRegIndices.insert(
+ std::make_pair(StringRef(TRI->getSubRegIndexName(I)).lower(), I));
+}
+
+unsigned MIParser::getSubRegIndex(StringRef Name) {
+ initNames2SubRegIndices();
+ auto SubRegInfo = Names2SubRegIndices.find(Name);
+ if (SubRegInfo == Names2SubRegIndices.end())
+ return 0;
+ return SubRegInfo->getValue();
+}
+
+static void initSlots2BasicBlocks(
+ const Function &F,
+ DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) {
+ ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false);
+ MST.incorporateFunction(F);
+ for (auto &BB : F) {
+ if (BB.hasName())
+ continue;
+ int Slot = MST.getLocalSlot(&BB);
+ if (Slot == -1)
+ continue;
+ Slots2BasicBlocks.insert(std::make_pair(unsigned(Slot), &BB));
+ }
+}
+
+static const BasicBlock *getIRBlockFromSlot(
+ unsigned Slot,
+ const DenseMap<unsigned, const BasicBlock *> &Slots2BasicBlocks) {
+ auto BlockInfo = Slots2BasicBlocks.find(Slot);
+ if (BlockInfo == Slots2BasicBlocks.end())
+ return nullptr;
+ return BlockInfo->second;
+}
+
+const BasicBlock *MIParser::getIRBlock(unsigned Slot) {
+ if (Slots2BasicBlocks.empty())
+ initSlots2BasicBlocks(*MF.getFunction(), Slots2BasicBlocks);
+ return getIRBlockFromSlot(Slot, Slots2BasicBlocks);
+}
+
+const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) {
+ if (&F == MF.getFunction())
+ return getIRBlock(Slot);
+ DenseMap<unsigned, const BasicBlock *> CustomSlots2BasicBlocks;
+ initSlots2BasicBlocks(F, CustomSlots2BasicBlocks);
+ return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks);
+}
+
+static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST,
+ DenseMap<unsigned, const Value *> &Slots2Values) {
+ int Slot = MST.getLocalSlot(V);
+ if (Slot == -1)
+ return;
+ Slots2Values.insert(std::make_pair(unsigned(Slot), V));
+}
+
+/// Creates the mapping from slot numbers to the function's unnamed IR values.
+static void initSlots2Values(const Function &F,
+ DenseMap<unsigned, const Value *> &Slots2Values) {
+ ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false);
+ MST.incorporateFunction(F);
+ for (const auto &Arg : F.args())
+ mapValueToSlot(&Arg, MST, Slots2Values);
+ for (const auto &BB : F) {
+ mapValueToSlot(&BB, MST, Slots2Values);
+ for (const auto &I : BB)
+ mapValueToSlot(&I, MST, Slots2Values);
+ }
+}
+
+const Value *MIParser::getIRValue(unsigned Slot) {
+ if (Slots2Values.empty())
+ initSlots2Values(*MF.getFunction(), Slots2Values);
+ auto ValueInfo = Slots2Values.find(Slot);
+ if (ValueInfo == Slots2Values.end())
+ return nullptr;
+ return ValueInfo->second;
+}
+
+void MIParser::initNames2TargetIndices() {
+ if (!Names2TargetIndices.empty())
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Indices = TII->getSerializableTargetIndices();
+ for (const auto &I : Indices)
+ Names2TargetIndices.insert(std::make_pair(StringRef(I.second), I.first));
+}
+
+bool MIParser::getTargetIndex(StringRef Name, int &Index) {
+ initNames2TargetIndices();
+ auto IndexInfo = Names2TargetIndices.find(Name);
+ if (IndexInfo == Names2TargetIndices.end())
+ return true;
+ Index = IndexInfo->second;
+ return false;
+}
+
+void MIParser::initNames2DirectTargetFlags() {
+ if (!Names2DirectTargetFlags.empty())
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableDirectMachineOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2DirectTargetFlags.insert(
+ std::make_pair(StringRef(I.second), I.first));
+}
+
+bool MIParser::getDirectTargetFlag(StringRef Name, unsigned &Flag) {
+ initNames2DirectTargetFlags();
+ auto FlagInfo = Names2DirectTargetFlags.find(Name);
+ if (FlagInfo == Names2DirectTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+void MIParser::initNames2BitmaskTargetFlags() {
+ if (!Names2BitmaskTargetFlags.empty())
+ return;
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ auto Flags = TII->getSerializableBitmaskMachineOperandTargetFlags();
+ for (const auto &I : Flags)
+ Names2BitmaskTargetFlags.insert(
+ std::make_pair(StringRef(I.second), I.first));
+}
+
+bool MIParser::getBitmaskTargetFlag(StringRef Name, unsigned &Flag) {
+ initNames2BitmaskTargetFlags();
+ auto FlagInfo = Names2BitmaskTargetFlags.find(Name);
+ if (FlagInfo == Names2BitmaskTargetFlags.end())
+ return true;
+ Flag = FlagInfo->second;
+ return false;
+}
+
+bool llvm::parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
+ StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseBasicBlockDefinitions(PFS.MBBSlots);
+}
+
+bool llvm::parseMachineInstructions(const PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseBasicBlocks();
+}
+
+bool llvm::parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMBB(MBB);
+}
+
+bool llvm::parseNamedRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneNamedRegister(Reg);
+}
+
+bool llvm::parseVirtualRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneVirtualRegister(Reg);
+}
+
+bool llvm::parseStackObjectReference(const PerFunctionMIParsingState &PFS,
+ int &FI, StringRef Src,
+ SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneStackObject(FI);
+}
+
+bool llvm::parseMDNode(const PerFunctionMIParsingState &PFS,
+ MDNode *&Node, StringRef Src, SMDiagnostic &Error) {
+ return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node);
+}
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
new file mode 100644
index 000000000000..18895b9e54eb
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIParser.h
@@ -0,0 +1,99 @@
+//===- MIParser.h - Machine Instructions Parser ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the functions that parse the machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H
+#define LLVM_LIB_CODEGEN_MIRPARSER_MIPARSER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+
+namespace llvm {
+
+class StringRef;
+class BasicBlock;
+class MachineBasicBlock;
+class MachineFunction;
+class MachineInstr;
+class MachineRegisterInfo;
+class MDNode;
+struct SlotMapping;
+class SMDiagnostic;
+class SourceMgr;
+
+struct PerFunctionMIParsingState {
+ MachineFunction &MF;
+ SourceMgr *SM;
+ const SlotMapping &IRSlots;
+
+ DenseMap<unsigned, MachineBasicBlock *> MBBSlots;
+ DenseMap<unsigned, unsigned> VirtualRegisterSlots;
+ DenseMap<unsigned, int> FixedStackObjectSlots;
+ DenseMap<unsigned, int> StackObjectSlots;
+ DenseMap<unsigned, unsigned> ConstantPoolSlots;
+ DenseMap<unsigned, unsigned> JumpTableSlots;
+ /// Hold the generic virtual registers.
+ SmallSet<unsigned, 8> GenericVRegs;
+
+ PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM,
+ const SlotMapping &IRSlots);
+};
+
+/// Parse the machine basic block definitions, and skip the machine
+/// instructions.
+///
+/// This function runs the first parsing pass on the machine function's body.
+/// It parses only the machine basic block definitions and creates the machine
+/// basic blocks in the given machine function.
+///
+/// The machine instructions aren't parsed during the first pass because not
+/// all of the machine basic blocks are defined yet, which makes it impossible
+/// to resolve the machine basic block references.
+///
+/// Return true if an error occurred.
+bool parseMachineBasicBlockDefinitions(PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error);
+
+/// Parse the machine instructions.
+///
+/// This function runs the second parsing pass on the machine function's body.
+/// It skips the machine basic block definitions and parses only the machine
+/// instructions and basic block attributes like liveins and successors.
+///
+/// The second parsing pass assumes that the first parsing pass already ran
+/// on the given source string.
+///
+/// Return true if an error occurred.
+bool parseMachineInstructions(const PerFunctionMIParsingState &PFS,
+ StringRef Src, SMDiagnostic &Error);
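+
+// An illustrative two-pass usage sketch (editorial example, not part of the
+// original header); error reporting by the caller is elided:
+//
+//   PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+//   SMDiagnostic Error;
+//   if (parseMachineBasicBlockDefinitions(PFS, Src, Error) ||
+//       parseMachineInstructions(PFS, Src, Error))
+//     /* report Error through the surrounding diagnostic machinery */;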
+
+bool parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB, StringRef Src,
+ SMDiagnostic &Error);
+
+bool parseNamedRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
+ SMDiagnostic &Error);
+
+bool parseVirtualRegisterReference(const PerFunctionMIParsingState &PFS,
+ unsigned &Reg, StringRef Src,
+ SMDiagnostic &Error);
+
+bool parseStackObjectReference(const PerFunctionMIParsingState &PFS,
+ int &FI, StringRef Src, SMDiagnostic &Error);
+
+bool parseMDNode(const PerFunctionMIParsingState &PFS, MDNode *&Node,
+ StringRef Src, SMDiagnostic &Error);
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
new file mode 100644
index 000000000000..4aa3df6326e9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -0,0 +1,789 @@
+//===- MIRParser.cpp - MIR serialization format parser implementation -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that parses the optional LLVM IR and machine
+// functions that are stored in MIR files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MIRParser/MIRParser.h"
+#include "MIParser.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/AsmParser/SlotMapping.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ValueSymbolTable.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <memory>
+
+using namespace llvm;
+
+namespace llvm {
+
+/// This class implements the parsing of LLVM IR that's embedded inside a MIR
+/// file.
+class MIRParserImpl {
+ SourceMgr SM;
+ StringRef Filename;
+ LLVMContext &Context;
+ StringMap<std::unique_ptr<yaml::MachineFunction>> Functions;
+ SlotMapping IRSlots;
+ /// Maps from register class names to register classes.
+ StringMap<const TargetRegisterClass *> Names2RegClasses;
+ /// Maps from register bank names to register banks.
+ StringMap<const RegisterBank *> Names2RegBanks;
+
+public:
+ MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents, StringRef Filename,
+ LLVMContext &Context);
+
+ void reportDiagnostic(const SMDiagnostic &Diag);
+
+ /// Report an error with the given message at an unknown location.
+ ///
+ /// Always returns true.
+ bool error(const Twine &Message);
+
+ /// Report an error with the given message at the given location.
+ ///
+ /// Always returns true.
+ bool error(SMLoc Loc, const Twine &Message);
+
+ /// Report a given error with the location translated from the location in an
+ /// embedded string literal to a location in the MIR file.
+ ///
+ /// Always returns true.
+ bool error(const SMDiagnostic &Error, SMRange SourceRange);
+
+ /// Try to parse the optional LLVM module and the machine functions in the MIR
+ /// file.
+ ///
+ /// Return null if an error occurred.
+ std::unique_ptr<Module> parse();
+
+ /// Parse the machine function in the current YAML document.
+ ///
+ /// \param NoLLVMIR - set to true when the MIR file doesn't have LLVM IR.
+ /// A dummy IR function is created and inserted into the given module when
+ /// this parameter is true.
+ ///
+ /// Return true if an error occurred.
+ bool parseMachineFunction(yaml::Input &In, Module &M, bool NoLLVMIR);
+
+ /// Initialize the machine function to the state that's described in the MIR
+ /// file.
+ ///
+ /// Return true if an error occurred.
+ bool initializeMachineFunction(MachineFunction &MF);
+
+ bool initializeRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ void inferRegisterInfo(const PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF);
+
+ bool parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
+ std::vector<CalleeSavedInfo> &CSIInfo,
+ const yaml::StringValue &RegisterSource,
+ int FrameIdx);
+
+ bool parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineStackObject &Object,
+ int FrameIdx);
+
+ bool initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool,
+ const yaml::MachineFunction &YamlMF);
+
+ bool initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI);
+
+private:
+ bool parseMDNode(const PerFunctionMIParsingState &PFS, MDNode *&Node,
+ const yaml::StringValue &Source);
+
+ bool parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source);
+
+ /// Return a MIR diagnostic converted from an MI string diagnostic.
+ SMDiagnostic diagFromMIStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange);
+
+ /// Return a MIR diagnostic converted from a diagnostic located in a YAML
+ /// block scalar string.
+ SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange);
+
+ /// Create an empty function with the given name.
+ void createDummyFunction(StringRef Name, Module &M);
+
+ void initNames2RegClasses(const MachineFunction &MF);
+ void initNames2RegBanks(const MachineFunction &MF);
+
+ /// Check if the given identifier is a name of a register class.
+ ///
+ /// Return null if the name isn't a register class.
+ const TargetRegisterClass *getRegClass(const MachineFunction &MF,
+ StringRef Name);
+
+ /// Check if the given identifier is a name of a register bank.
+ ///
+ /// Return null if the name isn't a register bank.
+ const RegisterBank *getRegBank(const MachineFunction &MF, StringRef Name);
+};
+
+} // end namespace llvm
+
+MIRParserImpl::MIRParserImpl(std::unique_ptr<MemoryBuffer> Contents,
+ StringRef Filename, LLVMContext &Context)
+ : SM(), Filename(Filename), Context(Context) {
+ SM.AddNewSourceBuffer(std::move(Contents), SMLoc());
+}
+
+bool MIRParserImpl::error(const Twine &Message) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error, SMDiagnostic(Filename, SourceMgr::DK_Error, Message.str())));
+ return true;
+}
+
+bool MIRParserImpl::error(SMLoc Loc, const Twine &Message) {
+ Context.diagnose(DiagnosticInfoMIRParser(
+ DS_Error, SM.GetMessage(Loc, SourceMgr::DK_Error, Message)));
+ return true;
+}
+
+bool MIRParserImpl::error(const SMDiagnostic &Error, SMRange SourceRange) {
+ assert(Error.getKind() == SourceMgr::DK_Error && "Expected an error");
+ reportDiagnostic(diagFromMIStringDiag(Error, SourceRange));
+ return true;
+}
+
+void MIRParserImpl::reportDiagnostic(const SMDiagnostic &Diag) {
+ DiagnosticSeverity Kind;
+ switch (Diag.getKind()) {
+ case SourceMgr::DK_Error:
+ Kind = DS_Error;
+ break;
+ case SourceMgr::DK_Warning:
+ Kind = DS_Warning;
+ break;
+ case SourceMgr::DK_Note:
+ Kind = DS_Note;
+ break;
+ }
+ Context.diagnose(DiagnosticInfoMIRParser(Kind, Diag));
+}
+
+static void handleYAMLDiag(const SMDiagnostic &Diag, void *Context) {
+ reinterpret_cast<MIRParserImpl *>(Context)->reportDiagnostic(Diag);
+}
+
+std::unique_ptr<Module> MIRParserImpl::parse() {
+ yaml::Input In(SM.getMemoryBuffer(SM.getMainFileID())->getBuffer(),
+ /*Ctxt=*/nullptr, handleYAMLDiag, this);
+ In.setContext(&In);
+
+ if (!In.setCurrentDocument()) {
+ if (In.error())
+ return nullptr;
+ // Create an empty module when the MIR file is empty.
+ return llvm::make_unique<Module>(Filename, Context);
+ }
+
+ std::unique_ptr<Module> M;
+ bool NoLLVMIR = false;
+ // Parse the block scalar manually so that we can return a unique pointer
+ // without having to go through YAML traits.
+ if (const auto *BSN =
+ dyn_cast_or_null<yaml::BlockScalarNode>(In.getCurrentNode())) {
+ SMDiagnostic Error;
+ M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error,
+ Context, &IRSlots);
+ if (!M) {
+ reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange()));
+ return nullptr;
+ }
+ In.nextDocument();
+ if (!In.setCurrentDocument())
+ return M;
+ } else {
+ // Create a new, empty module.
+ M = llvm::make_unique<Module>(Filename, Context);
+ NoLLVMIR = true;
+ }
+
+ // Parse the machine functions.
+ do {
+ if (parseMachineFunction(In, *M, NoLLVMIR))
+ return nullptr;
+ In.nextDocument();
+ } while (In.setCurrentDocument());
+
+ return M;
+}
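+
+// For reference, a minimal MIR input accepted by parse() has this rough shape
+// (illustrative sketch; field names follow the YAML mapping, and the machine
+// instruction is a placeholder):
+//
+//   --- |
+//     define i32 @foo() {
+//       ret i32 0
+//     }
+//   ...
+//   ---
+//   name: foo
+//   body: |
+//     bb.0:
+//       <machine instructions>
+//   ...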
+
+bool MIRParserImpl::parseMachineFunction(yaml::Input &In, Module &M,
+ bool NoLLVMIR) {
+ auto MF = llvm::make_unique<yaml::MachineFunction>();
+ yaml::yamlize(In, *MF, false);
+ if (In.error())
+ return true;
+ auto FunctionName = MF->Name;
+ if (Functions.find(FunctionName) != Functions.end())
+ return error(Twine("redefinition of machine function '") + FunctionName +
+ "'");
+ Functions.insert(std::make_pair(FunctionName, std::move(MF)));
+ if (NoLLVMIR)
+ createDummyFunction(FunctionName, M);
+ else if (!M.getFunction(FunctionName))
+ return error(Twine("function '") + FunctionName +
+ "' isn't defined in the provided LLVM IR");
+ return false;
+}
+
+void MIRParserImpl::createDummyFunction(StringRef Name, Module &M) {
+ auto &Context = M.getContext();
+ Function *F = cast<Function>(M.getOrInsertFunction(
+ Name, FunctionType::get(Type::getVoidTy(Context), false)));
+ BasicBlock *BB = BasicBlock::Create(Context, "entry", F);
+ new UnreachableInst(Context, BB);
+}
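+
+// The dummy function created above corresponds to the following LLVM IR, with
+// the requested name substituted for <name>:
+//
+//   define void @<name>() {
+//   entry:
+//     unreachable
+//   }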
+
+bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
+ auto It = Functions.find(MF.getName());
+ if (It == Functions.end())
+ return error(Twine("no machine function information for function '") +
+ MF.getName() + "' in the MIR file");
+ // TODO: Recreate the machine function.
+ const yaml::MachineFunction &YamlMF = *It->getValue();
+ if (YamlMF.Alignment)
+ MF.setAlignment(YamlMF.Alignment);
+ MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
+ MF.setHasInlineAsm(YamlMF.HasInlineAsm);
+ if (YamlMF.AllVRegsAllocated)
+ MF.getProperties().set(MachineFunctionProperties::Property::AllVRegsAllocated);
+ PerFunctionMIParsingState PFS(MF, SM, IRSlots);
+ if (initializeRegisterInfo(PFS, YamlMF))
+ return true;
+ if (!YamlMF.Constants.empty()) {
+ auto *ConstantPool = MF.getConstantPool();
+ assert(ConstantPool && "Constant pool must be created");
+ if (initializeConstantPool(PFS, *ConstantPool, YamlMF))
+ return true;
+ }
+
+ StringRef BlockStr = YamlMF.Body.Value.Value;
+ SMDiagnostic Error;
+ SourceMgr BlockSM;
+ BlockSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(BlockStr, "",/*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &BlockSM;
+ if (parseMachineBasicBlockDefinitions(PFS, BlockStr, Error)) {
+ reportDiagnostic(
+ diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
+ return true;
+ }
+ PFS.SM = &SM;
+
+ if (MF.empty())
+ return error(Twine("machine function '") + Twine(MF.getName()) +
+ "' requires at least one machine basic block in its body");
+ // Initialize the frame information after creating all the MBBs so that the
+ // MBB references in the frame information can be resolved.
+ if (initializeFrameInfo(PFS, YamlMF))
+ return true;
+ // Initialize the jump table after creating all the MBBs so that the MBB
+ // references can be resolved.
+ if (!YamlMF.JumpTableInfo.Entries.empty() &&
+ initializeJumpTableInfo(PFS, YamlMF.JumpTableInfo))
+ return true;
+ // Parse the machine instructions after creating all of the MBBs so that the
+ // parser can resolve the MBB references.
+ StringRef InsnStr = YamlMF.Body.Value.Value;
+ SourceMgr InsnSM;
+ InsnSM.AddNewSourceBuffer(
+ MemoryBuffer::getMemBuffer(InsnStr, "", /*RequiresNullTerminator=*/false),
+ SMLoc());
+ PFS.SM = &InsnSM;
+ if (parseMachineInstructions(PFS, InsnStr, Error)) {
+ reportDiagnostic(
+ diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
+ return true;
+ }
+ PFS.SM = &SM;
+
+ inferRegisterInfo(PFS, YamlMF);
+ // FIXME: This is a temporary workaround until the reserved registers can be
+ // serialized.
+ MF.getRegInfo().freezeReservedRegs(MF);
+ MF.verify();
+ return false;
+}
+
+bool MIRParserImpl::initializeRegisterInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ assert(RegInfo.isSSA());
+ if (!YamlMF.IsSSA)
+ RegInfo.leaveSSA();
+ assert(RegInfo.tracksLiveness());
+ if (!YamlMF.TracksRegLiveness)
+ RegInfo.invalidateLiveness();
+ RegInfo.enableSubRegLiveness(YamlMF.TracksSubRegLiveness);
+
+ SMDiagnostic Error;
+ // Parse the virtual register information.
+ for (const auto &VReg : YamlMF.VirtualRegisters) {
+ unsigned Reg;
+ if (StringRef(VReg.Class.Value).equals("_")) {
+ // This is a generic virtual register.
+ // The size will be set appropriately when we reach the definition.
+ Reg = RegInfo.createGenericVirtualRegister(/*Size*/ 1);
+ PFS.GenericVRegs.insert(Reg);
+ } else {
+ const auto *RC = getRegClass(MF, VReg.Class.Value);
+ if (RC) {
+ Reg = RegInfo.createVirtualRegister(RC);
+ } else {
+ const auto *RegBank = getRegBank(MF, VReg.Class.Value);
+ if (!RegBank)
+ return error(
+ VReg.Class.SourceRange.Start,
+ Twine("use of undefined register class or register bank '") +
+ VReg.Class.Value + "'");
+ Reg = RegInfo.createGenericVirtualRegister(/*Size*/ 1);
+ RegInfo.setRegBank(Reg, *RegBank);
+ PFS.GenericVRegs.insert(Reg);
+ }
+ }
+ if (!PFS.VirtualRegisterSlots.insert(std::make_pair(VReg.ID.Value, Reg))
+ .second)
+ return error(VReg.ID.SourceRange.Start,
+ Twine("redefinition of virtual register '%") +
+ Twine(VReg.ID.Value) + "'");
+ if (!VReg.PreferredRegister.Value.empty()) {
+ unsigned PreferredReg = 0;
+ if (parseNamedRegisterReference(PFS, PreferredReg,
+ VReg.PreferredRegister.Value, Error))
+ return error(Error, VReg.PreferredRegister.SourceRange);
+ RegInfo.setSimpleHint(Reg, PreferredReg);
+ }
+ }
+
+ // Parse the liveins.
+ for (const auto &LiveIn : YamlMF.LiveIns) {
+ unsigned Reg = 0;
+ if (parseNamedRegisterReference(PFS, Reg, LiveIn.Register.Value, Error))
+ return error(Error, LiveIn.Register.SourceRange);
+ unsigned VReg = 0;
+ if (!LiveIn.VirtualRegister.Value.empty()) {
+ if (parseVirtualRegisterReference(PFS, VReg, LiveIn.VirtualRegister.Value,
+ Error))
+ return error(Error, LiveIn.VirtualRegister.SourceRange);
+ }
+ RegInfo.addLiveIn(Reg, VReg);
+ }
+
+ // Parse the callee saved register mask.
+ BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size());
+ if (!YamlMF.CalleeSavedRegisters)
+ return false;
+ for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
+ unsigned Reg = 0;
+ if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
+ return error(Error, RegSource.SourceRange);
+ CalleeSavedRegisterMask[Reg] = true;
+ }
+ RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip());
+ return false;
+}
+
+void MIRParserImpl::inferRegisterInfo(const PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ if (YamlMF.CalleeSavedRegisters)
+ return;
+ MachineRegisterInfo &MRI = PFS.MF.getRegInfo();
+ for (const MachineBasicBlock &MBB : PFS.MF) {
+ for (const MachineInstr &MI : MBB) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isRegMask())
+ continue;
+ MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
+ }
+ }
+ }
+}
+
+bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineFunction &YamlMF) {
+ MachineFunction &MF = PFS.MF;
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+ const Function &F = *MF.getFunction();
+ const yaml::MachineFrameInfo &YamlMFI = YamlMF.FrameInfo;
+ MFI.setFrameAddressIsTaken(YamlMFI.IsFrameAddressTaken);
+ MFI.setReturnAddressIsTaken(YamlMFI.IsReturnAddressTaken);
+ MFI.setHasStackMap(YamlMFI.HasStackMap);
+ MFI.setHasPatchPoint(YamlMFI.HasPatchPoint);
+ MFI.setStackSize(YamlMFI.StackSize);
+ MFI.setOffsetAdjustment(YamlMFI.OffsetAdjustment);
+ if (YamlMFI.MaxAlignment)
+ MFI.ensureMaxAlignment(YamlMFI.MaxAlignment);
+ MFI.setAdjustsStack(YamlMFI.AdjustsStack);
+ MFI.setHasCalls(YamlMFI.HasCalls);
+ MFI.setMaxCallFrameSize(YamlMFI.MaxCallFrameSize);
+ MFI.setHasOpaqueSPAdjustment(YamlMFI.HasOpaqueSPAdjustment);
+ MFI.setHasVAStart(YamlMFI.HasVAStart);
+ MFI.setHasMustTailInVarArgFunc(YamlMFI.HasMustTailInVarArgFunc);
+ if (!YamlMFI.SavePoint.Value.empty()) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, YamlMFI.SavePoint))
+ return true;
+ MFI.setSavePoint(MBB);
+ }
+ if (!YamlMFI.RestorePoint.Value.empty()) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, YamlMFI.RestorePoint))
+ return true;
+ MFI.setRestorePoint(MBB);
+ }
+
+ std::vector<CalleeSavedInfo> CSIInfo;
+ // Initialize the fixed frame objects.
+ for (const auto &Object : YamlMF.FixedStackObjects) {
+ int ObjectIdx;
+ if (Object.Type != yaml::FixedMachineStackObject::SpillSlot)
+ ObjectIdx = MFI.CreateFixedObject(Object.Size, Object.Offset,
+ Object.IsImmutable, Object.IsAliased);
+ else
+ ObjectIdx = MFI.CreateFixedSpillStackObject(Object.Size, Object.Offset);
+ MFI.setObjectAlignment(ObjectIdx, Object.Alignment);
+ if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value,
+ ObjectIdx))
+ .second)
+ return error(Object.ID.SourceRange.Start,
+ Twine("redefinition of fixed stack object '%fixed-stack.") +
+ Twine(Object.ID.Value) + "'");
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
+ ObjectIdx))
+ return true;
+ }
+
+ // Initialize the ordinary frame objects.
+ for (const auto &Object : YamlMF.StackObjects) {
+ int ObjectIdx;
+ const AllocaInst *Alloca = nullptr;
+ const yaml::StringValue &Name = Object.Name;
+ if (!Name.Value.empty()) {
+ Alloca = dyn_cast_or_null<AllocaInst>(
+ F.getValueSymbolTable().lookup(Name.Value));
+ if (!Alloca)
+ return error(Name.SourceRange.Start,
+ "alloca instruction named '" + Name.Value +
+ "' isn't defined in the function '" + F.getName() +
+ "'");
+ }
+ if (Object.Type == yaml::MachineStackObject::VariableSized)
+ ObjectIdx = MFI.CreateVariableSizedObject(Object.Alignment, Alloca);
+ else
+ ObjectIdx = MFI.CreateStackObject(
+ Object.Size, Object.Alignment,
+ Object.Type == yaml::MachineStackObject::SpillSlot, Alloca);
+ MFI.setObjectOffset(ObjectIdx, Object.Offset);
+ if (!PFS.StackObjectSlots.insert(std::make_pair(Object.ID.Value, ObjectIdx))
+ .second)
+ return error(Object.ID.SourceRange.Start,
+ Twine("redefinition of stack object '%stack.") +
+ Twine(Object.ID.Value) + "'");
+ if (parseCalleeSavedRegister(PFS, CSIInfo, Object.CalleeSavedRegister,
+ ObjectIdx))
+ return true;
+ if (Object.LocalOffset)
+ MFI.mapLocalFrameObject(ObjectIdx, Object.LocalOffset.getValue());
+ if (parseStackObjectsDebugInfo(PFS, Object, ObjectIdx))
+ return true;
+ }
+ MFI.setCalleeSavedInfo(CSIInfo);
+ if (!CSIInfo.empty())
+ MFI.setCalleeSavedInfoValid(true);
+
+ // Initialize the various stack object references after initializing the
+ // stack objects.
+ if (!YamlMFI.StackProtector.Value.empty()) {
+ SMDiagnostic Error;
+ int FI;
+ if (parseStackObjectReference(PFS, FI, YamlMFI.StackProtector.Value, Error))
+ return error(Error, YamlMFI.StackProtector.SourceRange);
+ MFI.setStackProtectorIndex(FI);
+ }
+ return false;
+}
+
+bool MIRParserImpl::parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
+ std::vector<CalleeSavedInfo> &CSIInfo,
+ const yaml::StringValue &RegisterSource, int FrameIdx) {
+ if (RegisterSource.Value.empty())
+ return false;
+ unsigned Reg = 0;
+ SMDiagnostic Error;
+ if (parseNamedRegisterReference(PFS, Reg, RegisterSource.Value, Error))
+ return error(Error, RegisterSource.SourceRange);
+ CSIInfo.push_back(CalleeSavedInfo(Reg, FrameIdx));
+ return false;
+}
+
+/// Verify that given node is of a certain type. Return true on error.
+template <typename T>
+static bool typecheckMDNode(T *&Result, MDNode *Node,
+ const yaml::StringValue &Source,
+ StringRef TypeString, MIRParserImpl &Parser) {
+ if (!Node)
+ return false;
+ Result = dyn_cast<T>(Node);
+ if (!Result)
+ return Parser.error(Source.SourceRange.Start,
+ "expected a reference to a '" + TypeString +
+ "' metadata node");
+ return false;
+}
+
+bool MIRParserImpl::parseStackObjectsDebugInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineStackObject &Object, int FrameIdx) {
+ // Debug information can only be attached to stack objects; fixed stack
+ // objects aren't supported.
+ assert(FrameIdx >= 0 && "Expected a stack object frame index");
+ MDNode *Var = nullptr, *Expr = nullptr, *Loc = nullptr;
+ if (parseMDNode(PFS, Var, Object.DebugVar) ||
+ parseMDNode(PFS, Expr, Object.DebugExpr) ||
+ parseMDNode(PFS, Loc, Object.DebugLoc))
+ return true;
+ if (!Var && !Expr && !Loc)
+ return false;
+ DILocalVariable *DIVar = nullptr;
+ DIExpression *DIExpr = nullptr;
+ DILocation *DILoc = nullptr;
+ if (typecheckMDNode(DIVar, Var, Object.DebugVar, "DILocalVariable", *this) ||
+ typecheckMDNode(DIExpr, Expr, Object.DebugExpr, "DIExpression", *this) ||
+ typecheckMDNode(DILoc, Loc, Object.DebugLoc, "DILocation", *this))
+ return true;
+ PFS.MF.getMMI().setVariableDbgInfo(DIVar, DIExpr, unsigned(FrameIdx), DILoc);
+ return false;
+}
+
+bool MIRParserImpl::parseMDNode(const PerFunctionMIParsingState &PFS,
+ MDNode *&Node, const yaml::StringValue &Source) {
+ if (Source.Value.empty())
+ return false;
+ SMDiagnostic Error;
+ if (llvm::parseMDNode(PFS, Node, Source.Value, Error))
+ return error(Error, Source.SourceRange);
+ return false;
+}
+
+bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
+ MachineConstantPool &ConstantPool, const yaml::MachineFunction &YamlMF) {
+ DenseMap<unsigned, unsigned> &ConstantPoolSlots = PFS.ConstantPoolSlots;
+ const MachineFunction &MF = PFS.MF;
+ const auto &M = *MF.getFunction()->getParent();
+ SMDiagnostic Error;
+ for (const auto &YamlConstant : YamlMF.Constants) {
+ const Constant *Value = dyn_cast_or_null<Constant>(
+ parseConstantValue(YamlConstant.Value.Value, Error, M));
+ if (!Value)
+ return error(Error, YamlConstant.Value.SourceRange);
+ unsigned Alignment =
+ YamlConstant.Alignment
+ ? YamlConstant.Alignment
+ : M.getDataLayout().getPrefTypeAlignment(Value->getType());
+ unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment);
+ if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index))
+ .second)
+ return error(YamlConstant.ID.SourceRange.Start,
+ Twine("redefinition of constant pool item '%const.") +
+ Twine(YamlConstant.ID.Value) + "'");
+ }
+ return false;
+}
+
+bool MIRParserImpl::initializeJumpTableInfo(PerFunctionMIParsingState &PFS,
+ const yaml::MachineJumpTable &YamlJTI) {
+ MachineJumpTableInfo *JTI = PFS.MF.getOrCreateJumpTableInfo(YamlJTI.Kind);
+ for (const auto &Entry : YamlJTI.Entries) {
+ std::vector<MachineBasicBlock *> Blocks;
+ for (const auto &MBBSource : Entry.Blocks) {
+ MachineBasicBlock *MBB = nullptr;
+ if (parseMBBReference(PFS, MBB, MBBSource.Value))
+ return true;
+ Blocks.push_back(MBB);
+ }
+ unsigned Index = JTI->createJumpTableIndex(Blocks);
+ if (!PFS.JumpTableSlots.insert(std::make_pair(Entry.ID.Value, Index))
+ .second)
+ return error(Entry.ID.SourceRange.Start,
+ Twine("redefinition of jump table entry '%jump-table.") +
+ Twine(Entry.ID.Value) + "'");
+ }
+ return false;
+}
+
+bool MIRParserImpl::parseMBBReference(const PerFunctionMIParsingState &PFS,
+ MachineBasicBlock *&MBB,
+ const yaml::StringValue &Source) {
+ SMDiagnostic Error;
+ if (llvm::parseMBBReference(PFS, MBB, Source.Value, Error))
+ return error(Error, Source.SourceRange);
+ return false;
+}
+
+SMDiagnostic MIRParserImpl::diagFromMIStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange) {
+ assert(SourceRange.isValid() && "Invalid source range");
+ SMLoc Loc = SourceRange.Start;
+ bool HasQuote = Loc.getPointer() < SourceRange.End.getPointer() &&
+ *Loc.getPointer() == '\'';
+ // Translate the location of the error from the location in the MI string to
+ // the corresponding location in the MIR file.
+ Loc = Loc.getFromPointer(Loc.getPointer() + Error.getColumnNo() +
+ (HasQuote ? 1 : 0));
+
+ // TODO: Translate any source ranges as well.
+ return SM.GetMessage(Loc, Error.getKind(), Error.getMessage(), None,
+ Error.getFixIts());
+}
+
+SMDiagnostic MIRParserImpl::diagFromBlockStringDiag(const SMDiagnostic &Error,
+ SMRange SourceRange) {
+ assert(SourceRange.isValid());
+
+ // Translate the location of the error from the location in the LLVM IR
+ // string to the corresponding location in the MIR file.
+ auto LineAndColumn = SM.getLineAndColumn(SourceRange.Start);
+ unsigned Line = LineAndColumn.first + Error.getLineNo() - 1;
+ unsigned Column = Error.getColumnNo();
+ StringRef LineStr = Error.getLineContents();
+ SMLoc Loc = Error.getLoc();
+
+ // Get the full line and adjust the column number by taking the indentation of
+ // LLVM IR into account.
+ for (line_iterator L(*SM.getMemoryBuffer(SM.getMainFileID()), false), E;
+ L != E; ++L) {
+ if (L.line_number() == Line) {
+ LineStr = *L;
+ Loc = SMLoc::getFromPointer(LineStr.data());
+ auto Indent = LineStr.find(Error.getLineContents());
+ if (Indent != StringRef::npos)
+ Column += Indent;
+ break;
+ }
+ }
+
+ return SMDiagnostic(SM, Loc, Filename, Line, Column, Error.getKind(),
+ Error.getMessage(), LineStr, Error.getRanges(),
+ Error.getFixIts());
+}
+
+void MIRParserImpl::initNames2RegClasses(const MachineFunction &MF) {
+ if (!Names2RegClasses.empty())
+ return;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; ++I) {
+ const auto *RC = TRI->getRegClass(I);
+ Names2RegClasses.insert(
+ std::make_pair(StringRef(TRI->getRegClassName(RC)).lower(), RC));
+ }
+}
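+
+// Example (illustrative, assuming an x86 subtarget): a virtual register
+// declared in MIR as "- { id: 0, class: gr32 }" is resolved through this map
+// by its lower-cased class name:
+//
+//   const TargetRegisterClass *RC = getRegClass(MF, "gr32");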
+
+void MIRParserImpl::initNames2RegBanks(const MachineFunction &MF) {
+ if (!Names2RegBanks.empty())
+ return;
+ const RegisterBankInfo *RBI = MF.getSubtarget().getRegBankInfo();
+ // If the target does not support GlobalISel, we may not have a
+ // register bank info.
+ if (!RBI)
+ return;
+ for (unsigned I = 0, E = RBI->getNumRegBanks(); I < E; ++I) {
+ const auto &RegBank = RBI->getRegBank(I);
+ Names2RegBanks.insert(
+ std::make_pair(StringRef(RegBank.getName()).lower(), &RegBank));
+ }
+}
+
+const TargetRegisterClass *MIRParserImpl::getRegClass(const MachineFunction &MF,
+ StringRef Name) {
+ initNames2RegClasses(MF);
+ auto RegClassInfo = Names2RegClasses.find(Name);
+ if (RegClassInfo == Names2RegClasses.end())
+ return nullptr;
+ return RegClassInfo->getValue();
+}
+
+const RegisterBank *MIRParserImpl::getRegBank(const MachineFunction &MF,
+ StringRef Name) {
+ initNames2RegBanks(MF);
+ auto RegBankInfo = Names2RegBanks.find(Name);
+ if (RegBankInfo == Names2RegBanks.end())
+ return nullptr;
+ return RegBankInfo->getValue();
+}
+
+MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
+ : Impl(std::move(Impl)) {}
+
+MIRParser::~MIRParser() {}
+
+std::unique_ptr<Module> MIRParser::parseLLVMModule() { return Impl->parse(); }
+
+bool MIRParser::initializeMachineFunction(MachineFunction &MF) {
+ return Impl->initializeMachineFunction(MF);
+}
+
+std::unique_ptr<MIRParser> llvm::createMIRParserFromFile(StringRef Filename,
+ SMDiagnostic &Error,
+ LLVMContext &Context) {
+ auto FileOrErr = MemoryBuffer::getFile(Filename);
+ if (std::error_code EC = FileOrErr.getError()) {
+ Error = SMDiagnostic(Filename, SourceMgr::DK_Error,
+ "Could not open input file: " + EC.message());
+ return nullptr;
+ }
+ return createMIRParser(std::move(FileOrErr.get()), Context);
+}
+
+std::unique_ptr<MIRParser>
+llvm::createMIRParser(std::unique_ptr<MemoryBuffer> Contents,
+ LLVMContext &Context) {
+ auto Filename = Contents->getBufferIdentifier();
+ return llvm::make_unique<MIRParser>(
+ llvm::make_unique<MIRParserImpl>(std::move(Contents), Filename, Context));
+}
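+
+// End-to-end usage sketch (editorial example, not part of the original file);
+// error handling is elided and "input.mir" is a placeholder path:
+//
+//   LLVMContext Context;
+//   SMDiagnostic Error;
+//   std::unique_ptr<MIRParser> Parser =
+//       createMIRParserFromFile("input.mir", Error, Context);
+//   std::unique_ptr<Module> M = Parser->parseLLVMModule();
+//   // Later, once a MachineFunction MF exists for a function in M:
+//   bool Failed = Parser->initializeMachineFunction(MF);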
diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.cpp b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
new file mode 100644
index 000000000000..703c99d9edd3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -0,0 +1,1005 @@
+//===- MIRPrinter.cpp - MIR serialization format printer ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the class that prints out the LLVM IR and machine
+// functions using the MIR serialization format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRPrinter.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+/// This structure describes how to print out stack object references.
+struct FrameIndexOperand {
+ std::string Name;
+ unsigned ID;
+ bool IsFixed;
+
+ FrameIndexOperand(StringRef Name, unsigned ID, bool IsFixed)
+ : Name(Name.str()), ID(ID), IsFixed(IsFixed) {}
+
+ /// Return an ordinary stack object reference.
+ static FrameIndexOperand create(StringRef Name, unsigned ID) {
+ return FrameIndexOperand(Name, ID, /*IsFixed=*/false);
+ }
+
+ /// Return a fixed stack object reference.
+ static FrameIndexOperand createFixed(unsigned ID) {
+ return FrameIndexOperand("", ID, /*IsFixed=*/true);
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+/// This class prints out the machine functions using the MIR serialization
+/// format.
+class MIRPrinter {
+ raw_ostream &OS;
+ DenseMap<const uint32_t *, unsigned> RegisterMaskIds;
+ /// Maps from stack object indices to operand indices which will be used when
+ /// printing frame index machine operands.
+ DenseMap<int, FrameIndexOperand> StackObjectOperandMapping;
+
+public:
+ MIRPrinter(raw_ostream &OS) : OS(OS) {}
+
+ void print(const MachineFunction &MF);
+
+ void convert(yaml::MachineFunction &MF, const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI);
+ void convert(ModuleSlotTracker &MST, yaml::MachineFrameInfo &YamlMFI,
+ const MachineFrameInfo &MFI);
+ void convert(yaml::MachineFunction &MF,
+ const MachineConstantPool &ConstantPool);
+ void convert(ModuleSlotTracker &MST, yaml::MachineJumpTable &YamlJTI,
+ const MachineJumpTableInfo &JTI);
+ void convertStackObjects(yaml::MachineFunction &MF,
+ const MachineFrameInfo &MFI, MachineModuleInfo &MMI,
+ ModuleSlotTracker &MST,
+ const TargetRegisterInfo *TRI);
+
+private:
+ void initRegisterMaskIds(const MachineFunction &MF);
+};
+
+/// This class prints out the machine instructions using the MIR serialization
+/// format.
+class MIPrinter {
+ raw_ostream &OS;
+ ModuleSlotTracker &MST;
+ const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds;
+ const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping;
+
+public:
+ MIPrinter(raw_ostream &OS, ModuleSlotTracker &MST,
+ const DenseMap<const uint32_t *, unsigned> &RegisterMaskIds,
+ const DenseMap<int, FrameIndexOperand> &StackObjectOperandMapping)
+ : OS(OS), MST(MST), RegisterMaskIds(RegisterMaskIds),
+ StackObjectOperandMapping(StackObjectOperandMapping) {}
+
+ void print(const MachineBasicBlock &MBB);
+
+ void print(const MachineInstr &MI);
+ void printMBBReference(const MachineBasicBlock &MBB);
+ void printIRBlockReference(const BasicBlock &BB);
+ void printIRValueReference(const Value &V);
+ void printStackObjectReference(int FrameIndex);
+ void printOffset(int64_t Offset);
+ void printTargetFlags(const MachineOperand &Op);
+ void print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
+ unsigned I, bool ShouldPrintRegisterTies,
+ const MachineRegisterInfo *MRI = nullptr, bool IsDef = false);
+ void print(const MachineMemOperand &Op);
+
+ void print(const MCCFIInstruction &CFI, const TargetRegisterInfo *TRI);
+};
+
+} // end namespace llvm
+
+namespace llvm {
+namespace yaml {
+
+/// This struct serializes the LLVM IR module.
+template <> struct BlockScalarTraits<Module> {
+ static void output(const Module &Mod, void *Ctxt, raw_ostream &OS) {
+ Mod.print(OS, nullptr);
+ }
+ static StringRef input(StringRef Str, void *Ctxt, Module &Mod) {
+ llvm_unreachable("LLVM Module is supposed to be parsed separately");
+ return "";
+ }
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+static void printReg(unsigned Reg, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ // TODO: Print Stack Slots.
+ if (!Reg)
+ OS << '_';
+ else if (TargetRegisterInfo::isVirtualRegister(Reg))
+ OS << '%' << TargetRegisterInfo::virtReg2Index(Reg);
+ else if (Reg < TRI->getNumRegs())
+ OS << '%' << StringRef(TRI->getName(Reg)).lower();
+ else
+ llvm_unreachable("Can't print this kind of register yet");
+}
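+
+// Sample outputs of the helper above (physical register names assume an x86
+// subtarget and are lower-cased, as in MIR syntax):
+//   no register        -> "_"
+//   virtual registers  -> "%0", "%1", ...
+//   physical registers -> "%eax", "%rsp", ...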
+
+static void printReg(unsigned Reg, yaml::StringValue &Dest,
+ const TargetRegisterInfo *TRI) {
+ raw_string_ostream OS(Dest.Value);
+ printReg(Reg, OS, TRI);
+}
+
+void MIRPrinter::print(const MachineFunction &MF) {
+ initRegisterMaskIds(MF);
+
+ yaml::MachineFunction YamlMF;
+ YamlMF.Name = MF.getName();
+ YamlMF.Alignment = MF.getAlignment();
+ YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
+ YamlMF.HasInlineAsm = MF.hasInlineAsm();
+ YamlMF.AllVRegsAllocated = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+
+ convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo());
+ ModuleSlotTracker MST(MF.getFunction()->getParent());
+ MST.incorporateFunction(*MF.getFunction());
+ convert(MST, YamlMF.FrameInfo, *MF.getFrameInfo());
+ convertStackObjects(YamlMF, *MF.getFrameInfo(), MF.getMMI(), MST,
+ MF.getSubtarget().getRegisterInfo());
+ if (const auto *ConstantPool = MF.getConstantPool())
+ convert(YamlMF, *ConstantPool);
+ if (const auto *JumpTableInfo = MF.getJumpTableInfo())
+ convert(MST, YamlMF.JumpTableInfo, *JumpTableInfo);
+ raw_string_ostream StrOS(YamlMF.Body.Value.Value);
+ bool IsNewlineNeeded = false;
+ for (const auto &MBB : MF) {
+ if (IsNewlineNeeded)
+ StrOS << "\n";
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .print(MBB);
+ IsNewlineNeeded = true;
+ }
+ StrOS.flush();
+ yaml::Output Out(OS);
+ Out << YamlMF;
+}
+
+void MIRPrinter::convert(yaml::MachineFunction &MF,
+ const MachineRegisterInfo &RegInfo,
+ const TargetRegisterInfo *TRI) {
+ MF.IsSSA = RegInfo.isSSA();
+ MF.TracksRegLiveness = RegInfo.tracksLiveness();
+ MF.TracksSubRegLiveness = RegInfo.subRegLivenessEnabled();
+
+ // Print the virtual register definitions.
+ for (unsigned I = 0, E = RegInfo.getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ yaml::VirtualRegisterDefinition VReg;
+ VReg.ID = I;
+ if (RegInfo.getRegClassOrNull(Reg))
+ VReg.Class =
+ StringRef(TRI->getRegClassName(RegInfo.getRegClass(Reg))).lower();
+ else if (RegInfo.getRegBankOrNull(Reg))
+ VReg.Class = StringRef(RegInfo.getRegBankOrNull(Reg)->getName()).lower();
+ else {
+ VReg.Class = std::string("_");
+ assert(RegInfo.getSize(Reg) && "Generic registers must have a size");
+ }
+ unsigned PreferredReg = RegInfo.getSimpleHint(Reg);
+ if (PreferredReg)
+ printReg(PreferredReg, VReg.PreferredRegister, TRI);
+ MF.VirtualRegisters.push_back(VReg);
+ }
+
+ // Print the live ins.
+ for (auto I = RegInfo.livein_begin(), E = RegInfo.livein_end(); I != E; ++I) {
+ yaml::MachineFunctionLiveIn LiveIn;
+ printReg(I->first, LiveIn.Register, TRI);
+ if (I->second)
+ printReg(I->second, LiveIn.VirtualRegister, TRI);
+ MF.LiveIns.push_back(LiveIn);
+ }
+ // The used physical register mask is printed as an inverted callee saved
+ // register mask.
+ const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask();
+ if (UsedPhysRegMask.none())
+ return;
+ std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
+ for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) {
+ if (!UsedPhysRegMask[I]) {
+ yaml::FlowStringValue Reg;
+ printReg(I, Reg, TRI);
+ CalleeSavedRegisters.push_back(Reg);
+ }
+ }
+ MF.CalleeSavedRegisters = CalleeSavedRegisters;
+}
+
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::MachineFrameInfo &YamlMFI,
+ const MachineFrameInfo &MFI) {
+ YamlMFI.IsFrameAddressTaken = MFI.isFrameAddressTaken();
+ YamlMFI.IsReturnAddressTaken = MFI.isReturnAddressTaken();
+ YamlMFI.HasStackMap = MFI.hasStackMap();
+ YamlMFI.HasPatchPoint = MFI.hasPatchPoint();
+ YamlMFI.StackSize = MFI.getStackSize();
+ YamlMFI.OffsetAdjustment = MFI.getOffsetAdjustment();
+ YamlMFI.MaxAlignment = MFI.getMaxAlignment();
+ YamlMFI.AdjustsStack = MFI.adjustsStack();
+ YamlMFI.HasCalls = MFI.hasCalls();
+ YamlMFI.MaxCallFrameSize = MFI.getMaxCallFrameSize();
+ YamlMFI.HasOpaqueSPAdjustment = MFI.hasOpaqueSPAdjustment();
+ YamlMFI.HasVAStart = MFI.hasVAStart();
+ YamlMFI.HasMustTailInVarArgFunc = MFI.hasMustTailInVarArgFunc();
+ if (MFI.getSavePoint()) {
+ raw_string_ostream StrOS(YamlMFI.SavePoint.Value);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printMBBReference(*MFI.getSavePoint());
+ }
+ if (MFI.getRestorePoint()) {
+ raw_string_ostream StrOS(YamlMFI.RestorePoint.Value);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printMBBReference(*MFI.getRestorePoint());
+ }
+}
+
+void MIRPrinter::convertStackObjects(yaml::MachineFunction &MF,
+ const MachineFrameInfo &MFI,
+ MachineModuleInfo &MMI,
+ ModuleSlotTracker &MST,
+ const TargetRegisterInfo *TRI) {
+ // Process fixed stack objects.
+ unsigned ID = 0;
+ for (int I = MFI.getObjectIndexBegin(); I < 0; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::FixedMachineStackObject YamlObject;
+ YamlObject.ID = ID;
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::FixedMachineStackObject::SpillSlot
+ : yaml::FixedMachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlignment(I);
+ YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I);
+ YamlObject.IsAliased = MFI.isAliasedObjectIndex(I);
+ MF.FixedStackObjects.push_back(YamlObject);
+ StackObjectOperandMapping.insert(
+ std::make_pair(I, FrameIndexOperand::createFixed(ID++)));
+ }
+
+ // Process ordinary stack objects.
+ ID = 0;
+ for (int I = 0, E = MFI.getObjectIndexEnd(); I < E; ++I) {
+ if (MFI.isDeadObjectIndex(I))
+ continue;
+
+ yaml::MachineStackObject YamlObject;
+ YamlObject.ID = ID;
+ if (const auto *Alloca = MFI.getObjectAllocation(I))
+ YamlObject.Name.Value =
+ Alloca->hasName() ? Alloca->getName() : "<unnamed alloca>";
+ YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
+ ? yaml::MachineStackObject::SpillSlot
+ : MFI.isVariableSizedObjectIndex(I)
+ ? yaml::MachineStackObject::VariableSized
+ : yaml::MachineStackObject::DefaultType;
+ YamlObject.Offset = MFI.getObjectOffset(I);
+ YamlObject.Size = MFI.getObjectSize(I);
+ YamlObject.Alignment = MFI.getObjectAlignment(I);
+
+ MF.StackObjects.push_back(YamlObject);
+ StackObjectOperandMapping.insert(std::make_pair(
+ I, FrameIndexOperand::create(YamlObject.Name.Value, ID++)));
+ }
+
+ for (const auto &CSInfo : MFI.getCalleeSavedInfo()) {
+ yaml::StringValue Reg;
+ printReg(CSInfo.getReg(), Reg, TRI);
+ auto StackObjectInfo = StackObjectOperandMapping.find(CSInfo.getFrameIdx());
+ assert(StackObjectInfo != StackObjectOperandMapping.end() &&
+ "Invalid stack object index");
+ const FrameIndexOperand &StackObject = StackObjectInfo->second;
+ if (StackObject.IsFixed)
+ MF.FixedStackObjects[StackObject.ID].CalleeSavedRegister = Reg;
+ else
+ MF.StackObjects[StackObject.ID].CalleeSavedRegister = Reg;
+ }
+ for (unsigned I = 0, E = MFI.getLocalFrameObjectCount(); I < E; ++I) {
+ auto LocalObject = MFI.getLocalFrameObjectMap(I);
+ auto StackObjectInfo = StackObjectOperandMapping.find(LocalObject.first);
+ assert(StackObjectInfo != StackObjectOperandMapping.end() &&
+ "Invalid stack object index");
+ const FrameIndexOperand &StackObject = StackObjectInfo->second;
+ assert(!StackObject.IsFixed && "Expected a locally mapped stack object");
+ MF.StackObjects[StackObject.ID].LocalOffset = LocalObject.second;
+ }
+
+ // Print the stack object references in the frame information class after
+ // converting the stack objects.
+ if (MFI.hasStackProtectorIndex()) {
+ raw_string_ostream StrOS(MF.FrameInfo.StackProtector.Value);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printStackObjectReference(MFI.getStackProtectorIndex());
+ }
+
+ // Print the debug variable information.
+ for (MachineModuleInfo::VariableDbgInfo &DebugVar :
+ MMI.getVariableDbgInfo()) {
+ auto StackObjectInfo = StackObjectOperandMapping.find(DebugVar.Slot);
+ assert(StackObjectInfo != StackObjectOperandMapping.end() &&
+ "Invalid stack object index");
+ const FrameIndexOperand &StackObject = StackObjectInfo->second;
+ assert(!StackObject.IsFixed && "Expected a non-fixed stack object");
+ auto &Object = MF.StackObjects[StackObject.ID];
+ {
+ raw_string_ostream StrOS(Object.DebugVar.Value);
+ DebugVar.Var->printAsOperand(StrOS, MST);
+ }
+ {
+ raw_string_ostream StrOS(Object.DebugExpr.Value);
+ DebugVar.Expr->printAsOperand(StrOS, MST);
+ }
+ {
+ raw_string_ostream StrOS(Object.DebugLoc.Value);
+ DebugVar.Loc->printAsOperand(StrOS, MST);
+ }
+ }
+}
+
+void MIRPrinter::convert(yaml::MachineFunction &MF,
+ const MachineConstantPool &ConstantPool) {
+ unsigned ID = 0;
+ for (const MachineConstantPoolEntry &Constant : ConstantPool.getConstants()) {
+ // TODO: Serialize target specific constant pool entries.
+ if (Constant.isMachineConstantPoolEntry())
+ llvm_unreachable("Can't print target specific constant pool entries yet");
+
+ yaml::MachineConstantPoolValue YamlConstant;
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ Constant.Val.ConstVal->printAsOperand(StrOS);
+ YamlConstant.ID = ID++;
+ YamlConstant.Value = StrOS.str();
+ YamlConstant.Alignment = Constant.getAlignment();
+ MF.Constants.push_back(YamlConstant);
+ }
+}
+
+void MIRPrinter::convert(ModuleSlotTracker &MST,
+ yaml::MachineJumpTable &YamlJTI,
+ const MachineJumpTableInfo &JTI) {
+ YamlJTI.Kind = JTI.getEntryKind();
+ unsigned ID = 0;
+ for (const auto &Table : JTI.getJumpTables()) {
+ std::string Str;
+ yaml::MachineJumpTable::Entry Entry;
+ Entry.ID = ID++;
+ for (const auto *MBB : Table.MBBs) {
+ raw_string_ostream StrOS(Str);
+ MIPrinter(StrOS, MST, RegisterMaskIds, StackObjectOperandMapping)
+ .printMBBReference(*MBB);
+ Entry.Blocks.push_back(StrOS.str());
+ Str.clear();
+ }
+ YamlJTI.Entries.push_back(Entry);
+ }
+}
+
+void MIRPrinter::initRegisterMaskIds(const MachineFunction &MF) {
+ const auto *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned I = 0;
+ for (const uint32_t *Mask : TRI->getRegMasks())
+ RegisterMaskIds.insert(std::make_pair(Mask, I++));
+}
+
+void MIPrinter::print(const MachineBasicBlock &MBB) {
+ assert(MBB.getNumber() >= 0 && "Invalid MBB number");
+ OS << "bb." << MBB.getNumber();
+ bool HasAttributes = false;
+ if (const auto *BB = MBB.getBasicBlock()) {
+ if (BB->hasName()) {
+ OS << "." << BB->getName();
+ } else {
+ HasAttributes = true;
+ OS << " (";
+ int Slot = MST.getLocalSlot(BB);
+ if (Slot == -1)
+ OS << "<ir-block badref>";
+ else
+ OS << (Twine("%ir-block.") + Twine(Slot)).str();
+ }
+ }
+ if (MBB.hasAddressTaken()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "address-taken";
+ HasAttributes = true;
+ }
+ if (MBB.isEHPad()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "landing-pad";
+ HasAttributes = true;
+ }
+ if (MBB.getAlignment()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "align " << MBB.getAlignment();
+ HasAttributes = true;
+ }
+ if (HasAttributes)
+ OS << ")";
+ OS << ":\n";
+
+ bool HasLineAttributes = false;
+ // Print the successors.
+ if (!MBB.succ_empty()) {
+ OS.indent(2) << "successors: ";
+ for (auto I = MBB.succ_begin(), E = MBB.succ_end(); I != E; ++I) {
+ if (I != MBB.succ_begin())
+ OS << ", ";
+ printMBBReference(**I);
+ if (MBB.hasSuccessorProbabilities())
+ OS << '(' << MBB.getSuccProbability(I) << ')';
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ // Print the live in registers.
+ const auto *TRI = MBB.getParent()->getSubtarget().getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ if (!MBB.livein_empty()) {
+ OS.indent(2) << "liveins: ";
+ bool First = true;
+ for (const auto &LI : MBB.liveins()) {
+ if (!First)
+ OS << ", ";
+ First = false;
+ printReg(LI.PhysReg, OS, TRI);
+ if (LI.LaneMask != ~0u)
+ OS << ':' << PrintLaneMask(LI.LaneMask);
+ }
+ OS << "\n";
+ HasLineAttributes = true;
+ }
+
+ if (HasLineAttributes)
+ OS << "\n";
+ bool IsInBundle = false;
+ for (auto I = MBB.instr_begin(), E = MBB.instr_end(); I != E; ++I) {
+ const MachineInstr &MI = *I;
+ if (IsInBundle && !MI.isInsideBundle()) {
+ OS.indent(2) << "}\n";
+ IsInBundle = false;
+ }
+ OS.indent(IsInBundle ? 4 : 2);
+ print(MI);
+ if (!IsInBundle && MI.getFlag(MachineInstr::BundledSucc)) {
+ OS << " {";
+ IsInBundle = true;
+ }
+ OS << "\n";
+ }
+ if (IsInBundle)
+ OS.indent(2) << "}\n";
+}
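+
+// A printed block produced by the code above looks roughly like this
+// (illustrative; register names assume an x86 subtarget, and successor
+// probabilities appear only when they are available):
+//
+//   bb.1.if.then:
+//     successors: %bb.2, %bb.3
+//     liveins: %edi
+//
+//     <machine instructions>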
+
+/// Return true when an instruction has a tied register that can't be
+/// determined by the instruction's descriptor.
+static bool hasComplexRegisterTies(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ for (unsigned I = 0, E = MI.getNumOperands(); I < E; ++I) {
+ const auto &Operand = MI.getOperand(I);
+ if (!Operand.isReg() || Operand.isDef())
+ // Ignore the defined registers as MCID marks only the uses as tied.
+ continue;
+ int ExpectedTiedIdx = MCID.getOperandConstraint(I, MCOI::TIED_TO);
+ int TiedIdx = Operand.isTied() ? int(MI.findTiedOperandIdx(I)) : -1;
+ if (ExpectedTiedIdx != TiedIdx)
+ return true;
+ }
+ return false;
+}
+
+void MIPrinter::print(const MachineInstr &MI) {
+ const auto *MF = MI.getParent()->getParent();
+ const auto &MRI = MF->getRegInfo();
+ const auto &SubTarget = MF->getSubtarget();
+ const auto *TRI = SubTarget.getRegisterInfo();
+ assert(TRI && "Expected target register info");
+ const auto *TII = SubTarget.getInstrInfo();
+ assert(TII && "Expected target instruction info");
+ if (MI.isCFIInstruction())
+ assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction");
+
+ bool ShouldPrintRegisterTies = hasComplexRegisterTies(MI);
+ unsigned I = 0, E = MI.getNumOperands();
+ for (; I < E && MI.getOperand(I).isReg() && MI.getOperand(I).isDef() &&
+ !MI.getOperand(I).isImplicit();
+ ++I) {
+ if (I)
+ OS << ", ";
+ print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies, &MRI,
+ /*IsDef=*/true);
+ }
+
+ if (I)
+ OS << " = ";
+ if (MI.getFlag(MachineInstr::FrameSetup))
+ OS << "frame-setup ";
+ OS << TII->getName(MI.getOpcode());
+ if (isPreISelGenericOpcode(MI.getOpcode())) {
+ assert(MI.getType() && "Generic instructions must have a type");
+ OS << ' ';
+ MI.getType()->print(OS, /*IsForDebug*/ false, /*NoDetails*/ true);
+ }
+ if (I < E)
+ OS << ' ';
+
+ bool NeedComma = false;
+ for (; I < E; ++I) {
+ if (NeedComma)
+ OS << ", ";
+ print(MI.getOperand(I), TRI, I, ShouldPrintRegisterTies);
+ NeedComma = true;
+ }
+
+ if (MI.getDebugLoc()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " debug-location ";
+ MI.getDebugLoc()->printAsOperand(OS, MST);
+ }
+
+ if (!MI.memoperands_empty()) {
+ OS << " :: ";
+ bool NeedComma = false;
+ for (const auto *Op : MI.memoperands()) {
+ if (NeedComma)
+ OS << ", ";
+ print(*Op);
+ NeedComma = true;
+ }
+ }
+}
+
+void MIPrinter::printMBBReference(const MachineBasicBlock &MBB) {
+ OS << "%bb." << MBB.getNumber();
+ if (const auto *BB = MBB.getBasicBlock()) {
+ if (BB->hasName())
+ OS << '.' << BB->getName();
+ }
+}
+
+static void printIRSlotNumber(raw_ostream &OS, int Slot) {
+ if (Slot == -1)
+ OS << "<badref>";
+ else
+ OS << Slot;
+}
+
+void MIPrinter::printIRBlockReference(const BasicBlock &BB) {
+ OS << "%ir-block.";
+ if (BB.hasName()) {
+ printLLVMNameWithoutPrefix(OS, BB.getName());
+ return;
+ }
+ const Function *F = BB.getParent();
+ int Slot;
+ if (F == MST.getCurrentFunction()) {
+ Slot = MST.getLocalSlot(&BB);
+ } else {
+ ModuleSlotTracker CustomMST(F->getParent(),
+ /*ShouldInitializeAllMetadata=*/false);
+ CustomMST.incorporateFunction(*F);
+ Slot = CustomMST.getLocalSlot(&BB);
+ }
+ printIRSlotNumber(OS, Slot);
+}
+
+void MIPrinter::printIRValueReference(const Value &V) {
+ if (isa<GlobalValue>(V)) {
+ V.printAsOperand(OS, /*PrintType=*/false, MST);
+ return;
+ }
+ if (isa<Constant>(V)) {
+ // Machine memory operands can load/store to/from constant value pointers.
+ OS << '`';
+ V.printAsOperand(OS, /*PrintType=*/true, MST);
+ OS << '`';
+ return;
+ }
+ OS << "%ir.";
+ if (V.hasName()) {
+ printLLVMNameWithoutPrefix(OS, V.getName());
+ return;
+ }
+ printIRSlotNumber(OS, MST.getLocalSlot(&V));
+}
+
+void MIPrinter::printStackObjectReference(int FrameIndex) {
+ auto ObjectInfo = StackObjectOperandMapping.find(FrameIndex);
+ assert(ObjectInfo != StackObjectOperandMapping.end() &&
+ "Invalid frame index");
+ const FrameIndexOperand &Operand = ObjectInfo->second;
+ if (Operand.IsFixed) {
+ OS << "%fixed-stack." << Operand.ID;
+ return;
+ }
+ OS << "%stack." << Operand.ID;
+ if (!Operand.Name.empty())
+ OS << '.' << Operand.Name;
+}
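+
+// Example results of the mapping above: a fixed object prints as
+// "%fixed-stack.0", while an ordinary object prints as "%stack.0" or, when it
+// has a name, "%stack.0.buf" (the name "buf" is illustrative).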
+
+void MIPrinter::printOffset(int64_t Offset) {
+ if (Offset == 0)
+ return;
+ if (Offset < 0) {
+ OS << " - " << -Offset;
+ return;
+ }
+ OS << " + " << Offset;
+}
+
+static const char *getTargetFlagName(const TargetInstrInfo *TII, unsigned TF) {
+ auto Flags = TII->getSerializableDirectMachineOperandTargetFlags();
+ for (const auto &I : Flags) {
+ if (I.first == TF) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+void MIPrinter::printTargetFlags(const MachineOperand &Op) {
+ if (!Op.getTargetFlags())
+ return;
+ const auto *TII =
+ Op.getParent()->getParent()->getParent()->getSubtarget().getInstrInfo();
+ assert(TII && "expected instruction info");
+ auto Flags = TII->decomposeMachineOperandsTargetFlags(Op.getTargetFlags());
+ OS << "target-flags(";
+ const bool HasDirectFlags = Flags.first;
+ const bool HasBitmaskFlags = Flags.second;
+ if (!HasDirectFlags && !HasBitmaskFlags) {
+ OS << "<unknown>) ";
+ return;
+ }
+ if (HasDirectFlags) {
+ if (const auto *Name = getTargetFlagName(TII, Flags.first))
+ OS << Name;
+ else
+ OS << "<unknown target flag>";
+ }
+ if (!HasBitmaskFlags) {
+ OS << ") ";
+ return;
+ }
+ bool IsCommaNeeded = HasDirectFlags;
+ unsigned BitMask = Flags.second;
+ auto BitMasks = TII->getSerializableBitmaskMachineOperandTargetFlags();
+ for (const auto &Mask : BitMasks) {
+ // Check if the flag's bitmask has the bits of the current mask set.
+ if ((BitMask & Mask.first) == Mask.first) {
+ if (IsCommaNeeded)
+ OS << ", ";
+ IsCommaNeeded = true;
+ OS << Mask.second;
+ // Clear the bits which were serialized from the flag's bitmask.
+ BitMask &= ~(Mask.first);
+ }
+ }
+ if (BitMask) {
+ // When the resulting flag's bitmask isn't zero, we know that we didn't
+ // serialize all of the bit flags.
+ if (IsCommaNeeded)
+ OS << ", ";
+ OS << "<unknown bitmask target flag>";
+ }
+ OS << ") ";
+}
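+
+// The resulting operand prefix looks like "target-flags(<name>) " for a direct
+// flag, "target-flags(<name>, <mask-name>) " when bitmask flags are also set,
+// and "target-flags(<unknown>) " when the target provides no serializable
+// names (the bracketed names are placeholders; real names are target
+// specific).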
+
+static const char *getTargetIndexName(const MachineFunction &MF, int Index) {
+ const auto *TII = MF.getSubtarget().getInstrInfo();
+ assert(TII && "expected instruction info");
+ auto Indices = TII->getSerializableTargetIndices();
+ for (const auto &I : Indices) {
+ if (I.first == Index) {
+ return I.second;
+ }
+ }
+ return nullptr;
+}
+
+void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
+ unsigned I, bool ShouldPrintRegisterTies,
+ const MachineRegisterInfo *MRI, bool IsDef) {
+ printTargetFlags(Op);
+ switch (Op.getType()) {
+ case MachineOperand::MO_Register:
+ if (Op.isImplicit())
+ OS << (Op.isDef() ? "implicit-def " : "implicit ");
+ else if (!IsDef && Op.isDef())
+ // Print the 'def' flag only when the operand is defined after '='.
+ OS << "def ";
+ if (Op.isInternalRead())
+ OS << "internal ";
+ if (Op.isDead())
+ OS << "dead ";
+ if (Op.isKill())
+ OS << "killed ";
+ if (Op.isUndef())
+ OS << "undef ";
+ if (Op.isEarlyClobber())
+ OS << "early-clobber ";
+ if (Op.isDebug())
+ OS << "debug-use ";
+ printReg(Op.getReg(), OS, TRI);
+ // Print the sub register.
+ if (Op.getSubReg() != 0)
+ OS << ':' << TRI->getSubRegIndexName(Op.getSubReg());
+ if (ShouldPrintRegisterTies && Op.isTied() && !Op.isDef())
+ OS << "(tied-def " << Op.getParent()->findTiedOperandIdx(I) << ")";
+ assert((!IsDef || MRI) && "for IsDef, MRI must be provided");
+ if (IsDef && MRI->getSize(Op.getReg()))
+ OS << '(' << MRI->getSize(Op.getReg()) << ')';
+ break;
+ case MachineOperand::MO_Immediate:
+ OS << Op.getImm();
+ break;
+ case MachineOperand::MO_CImmediate:
+ Op.getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
+ break;
+ case MachineOperand::MO_FPImmediate:
+ Op.getFPImm()->printAsOperand(OS, /*PrintType=*/true, MST);
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ printMBBReference(*Op.getMBB());
+ break;
+ case MachineOperand::MO_FrameIndex:
+ printStackObjectReference(Op.getIndex());
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ OS << "%const." << Op.getIndex();
+ printOffset(Op.getOffset());
+ break;
+ case MachineOperand::MO_TargetIndex: {
+ OS << "target-index(";
+ if (const auto *Name = getTargetIndexName(
+ *Op.getParent()->getParent()->getParent(), Op.getIndex()))
+ OS << Name;
+ else
+ OS << "<unknown>";
+ OS << ')';
+ printOffset(Op.getOffset());
+ break;
+ }
+ case MachineOperand::MO_JumpTableIndex:
+ OS << "%jump-table." << Op.getIndex();
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ OS << '$';
+ printLLVMNameWithoutPrefix(OS, Op.getSymbolName());
+ printOffset(Op.getOffset());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ Op.getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
+ printOffset(Op.getOffset());
+ break;
+ case MachineOperand::MO_BlockAddress:
+ OS << "blockaddress(";
+ Op.getBlockAddress()->getFunction()->printAsOperand(OS, /*PrintType=*/false,
+ MST);
+ OS << ", ";
+ printIRBlockReference(*Op.getBlockAddress()->getBasicBlock());
+ OS << ')';
+ printOffset(Op.getOffset());
+ break;
+ case MachineOperand::MO_RegisterMask: {
+ auto RegMaskInfo = RegisterMaskIds.find(Op.getRegMask());
+ if (RegMaskInfo != RegisterMaskIds.end())
+ OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();
+ else
+ llvm_unreachable("Can't print this machine register mask yet.");
+ break;
+ }
+ case MachineOperand::MO_RegisterLiveOut: {
+ const uint32_t *RegMask = Op.getRegLiveOut();
+ OS << "liveout(";
+ bool IsCommaNeeded = false;
+ for (unsigned Reg = 0, E = TRI->getNumRegs(); Reg < E; ++Reg) {
+ if (RegMask[Reg / 32] & (1U << (Reg % 32))) {
+ if (IsCommaNeeded)
+ OS << ", ";
+ printReg(Reg, OS, TRI);
+ IsCommaNeeded = true;
+ }
+ }
+ OS << ")";
+ break;
+ }
+ case MachineOperand::MO_Metadata:
+ Op.getMetadata()->printAsOperand(OS, MST);
+ break;
+ case MachineOperand::MO_MCSymbol:
+ OS << "<mcsymbol " << *Op.getMCSymbol() << ">";
+ break;
+ case MachineOperand::MO_CFIIndex: {
+ const auto &MMI = Op.getParent()->getParent()->getParent()->getMMI();
+ print(MMI.getFrameInstructions()[Op.getCFIIndex()], TRI);
+ break;
+ }
+ }
+}
+
+void MIPrinter::print(const MachineMemOperand &Op) {
+ OS << '(';
+ // TODO: Print the operand's target-specific flags.
+ if (Op.isVolatile())
+ OS << "volatile ";
+ if (Op.isNonTemporal())
+ OS << "non-temporal ";
+ if (Op.isInvariant())
+ OS << "invariant ";
+ if (Op.isLoad())
+ OS << "load ";
+ else {
+ assert(Op.isStore() && "Non load machine operand must be a store");
+ OS << "store ";
+ }
+ OS << Op.getSize();
+ if (const Value *Val = Op.getValue()) {
+ OS << (Op.isLoad() ? " from " : " into ");
+ printIRValueReference(*Val);
+ } else if (const PseudoSourceValue *PVal = Op.getPseudoValue()) {
+ OS << (Op.isLoad() ? " from " : " into ");
+ assert(PVal && "Expected a pseudo source value");
+ switch (PVal->kind()) {
+ case PseudoSourceValue::Stack:
+ OS << "stack";
+ break;
+ case PseudoSourceValue::GOT:
+ OS << "got";
+ break;
+ case PseudoSourceValue::JumpTable:
+ OS << "jump-table";
+ break;
+ case PseudoSourceValue::ConstantPool:
+ OS << "constant-pool";
+ break;
+ case PseudoSourceValue::FixedStack:
+ printStackObjectReference(
+ cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex());
+ break;
+ case PseudoSourceValue::GlobalValueCallEntry:
+ OS << "call-entry ";
+ cast<GlobalValuePseudoSourceValue>(PVal)->getValue()->printAsOperand(
+ OS, /*PrintType=*/false, MST);
+ break;
+ case PseudoSourceValue::ExternalSymbolCallEntry:
+ OS << "call-entry $";
+ printLLVMNameWithoutPrefix(
+ OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol());
+ break;
+ }
+ }
+ printOffset(Op.getOffset());
+ if (Op.getBaseAlignment() != Op.getSize())
+ OS << ", align " << Op.getBaseAlignment();
+ auto AAInfo = Op.getAAInfo();
+ if (AAInfo.TBAA) {
+ OS << ", !tbaa ";
+ AAInfo.TBAA->printAsOperand(OS, MST);
+ }
+ if (AAInfo.Scope) {
+ OS << ", !alias.scope ";
+ AAInfo.Scope->printAsOperand(OS, MST);
+ }
+ if (AAInfo.NoAlias) {
+ OS << ", !noalias ";
+ AAInfo.NoAlias->printAsOperand(OS, MST);
+ }
+ if (Op.getRanges()) {
+ OS << ", !range ";
+ Op.getRanges()->printAsOperand(OS, MST);
+ }
+ OS << ')';
+}
+
+static void printCFIRegister(unsigned DwarfReg, raw_ostream &OS,
+ const TargetRegisterInfo *TRI) {
+ int Reg = TRI->getLLVMRegNum(DwarfReg, true);
+ if (Reg == -1) {
+ OS << "<badreg>";
+ return;
+ }
+ printReg(Reg, OS, TRI);
+}
+
+void MIPrinter::print(const MCCFIInstruction &CFI,
+ const TargetRegisterInfo *TRI) {
+ switch (CFI.getOperation()) {
+ case MCCFIInstruction::OpSameValue:
+ OS << ".cfi_same_value ";
+ if (CFI.getLabel())
+ OS << "<mcsymbol> ";
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpOffset:
+ OS << ".cfi_offset ";
+ if (CFI.getLabel())
+ OS << "<mcsymbol> ";
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfaRegister:
+ OS << ".cfi_def_cfa_register ";
+ if (CFI.getLabel())
+ OS << "<mcsymbol> ";
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ break;
+ case MCCFIInstruction::OpDefCfaOffset:
+ OS << ".cfi_def_cfa_offset ";
+ if (CFI.getLabel())
+ OS << "<mcsymbol> ";
+ OS << CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpDefCfa:
+ OS << ".cfi_def_cfa ";
+ if (CFI.getLabel())
+ OS << "<mcsymbol> ";
+ printCFIRegister(CFI.getRegister(), OS, TRI);
+ OS << ", " << CFI.getOffset();
+ break;
+ default:
+ // TODO: Print the other CFI Operations.
+ OS << "<unserializable cfi operation>";
+ break;
+ }
+}
+
+void llvm::printMIR(raw_ostream &OS, const Module &M) {
+ yaml::Output Out(OS);
+ Out << const_cast<Module &>(M);
+}
+
+void llvm::printMIR(raw_ostream &OS, const MachineFunction &MF) {
+ MIRPrinter Printer(OS);
+ Printer.print(MF);
+}
diff --git a/contrib/llvm/lib/CodeGen/MIRPrinter.h b/contrib/llvm/lib/CodeGen/MIRPrinter.h
new file mode 100644
index 000000000000..16aa9038b6b2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRPrinter.h
@@ -0,0 +1,33 @@
+//===- MIRPrinter.h - MIR serialization format printer --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the functions that print out the LLVM IR and the machine
+// functions using the MIR serialization format.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_MIRPRINTER_H
+#define LLVM_LIB_CODEGEN_MIRPRINTER_H
+
+namespace llvm {
+
+class MachineFunction;
+class Module;
+class raw_ostream;
+
+/// Print LLVM IR using the MIR serialization format to the given output stream.
+void printMIR(raw_ostream &OS, const Module &M);
+
+/// Print a machine function using the MIR serialization format to the given
+/// output stream.
+void printMIR(raw_ostream &OS, const MachineFunction &MF);
+
+} // end namespace llvm
+
+#endif
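As context for the header above, a minimal sketch of how the two entry points might be called from other in-tree CodeGen code; the helper function and its caller are hypothetical, and only the two printMIR overloads come from this patch.

// Illustrative sketch only; dumpAsMIR is a hypothetical helper.
#include "MIRPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"

static void dumpAsMIR(const llvm::Module &M, const llvm::MachineFunction &MF) {
  // First YAML document: the LLVM IR of the module.
  llvm::printMIR(llvm::outs(), M);
  // Following document: the serialized state of one machine function.
  llvm::printMIR(llvm::outs(), MF);
}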
diff --git a/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp
new file mode 100644
index 000000000000..8e7566a4e46b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MIRPrintingPass.cpp
@@ -0,0 +1,71 @@
+//===- MIRPrintingPass.cpp - Pass that prints out using the MIR format ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a pass that prints out the LLVM module using the MIR
+// serialization format.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MIRPrinter.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+
+/// This pass prints out the LLVM IR to an output stream using the MIR
+/// serialization format.
+struct MIRPrintingPass : public MachineFunctionPass {
+ static char ID;
+ raw_ostream &OS;
+ std::string MachineFunctions;
+
+ MIRPrintingPass() : MachineFunctionPass(ID), OS(dbgs()) {}
+ MIRPrintingPass(raw_ostream &OS) : MachineFunctionPass(ID), OS(OS) {}
+
+ const char *getPassName() const override { return "MIR Printing Pass"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ std::string Str;
+ raw_string_ostream StrOS(Str);
+ printMIR(StrOS, MF);
+ MachineFunctions.append(StrOS.str());
+ return false;
+ }
+
+ bool doFinalization(Module &M) override {
+ printMIR(OS, M);
+ OS << MachineFunctions;
+ return false;
+ }
+};
+
+char MIRPrintingPass::ID = 0;
+
+} // end anonymous namespace
+
+char &llvm::MIRPrintingPassID = MIRPrintingPass::ID;
+INITIALIZE_PASS(MIRPrintingPass, "mir-printer", "MIR Printer", false, false)
+
+namespace llvm {
+
+MachineFunctionPass *createPrintMIRPass(raw_ostream &OS) {
+ return new MIRPrintingPass(OS);
+}
+
+} // end namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
new file mode 100644
index 000000000000..689dd0764ce0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -0,0 +1,1289 @@
+//===-- llvm/CodeGen/MachineBasicBlock.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect the sequence of machine instructions for a basic block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen"
+
+MachineBasicBlock::MachineBasicBlock(MachineFunction &MF, const BasicBlock *B)
+ : BB(B), Number(-1), xParent(&MF) {
+ Insts.Parent = this;
+}
+
+MachineBasicBlock::~MachineBasicBlock() {
+}
+
+/// Return the MCSymbol for this basic block.
+MCSymbol *MachineBasicBlock::getSymbol() const {
+ if (!CachedMCSymbol) {
+ const MachineFunction *MF = getParent();
+ MCContext &Ctx = MF->getContext();
+ const char *Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
+ assert(getNumber() >= 0 && "cannot get label for unreachable MBB");
+ CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" +
+ Twine(MF->getFunctionNumber()) +
+ "_" + Twine(getNumber()));
+ }
+
+ return CachedMCSymbol;
+}
+
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const MachineBasicBlock &MBB) {
+ MBB.print(OS);
+ return OS;
+}
+
+/// When an MBB is added to an MF, we need to update the parent pointer of the
+/// MBB, the MBB numbering, and ensure that the instructions in the MBB have
+/// their register operands on the correct use/def lists.
+///
+/// MBBs start out as #-1. When an MBB is added to a MachineFunction, it
+/// gets the next available unique MBB number. If it is removed from a
+/// MachineFunction, it goes back to being #-1.
+void ilist_traits<MachineBasicBlock>::addNodeToList(MachineBasicBlock *N) {
+ MachineFunction &MF = *N->getParent();
+ N->Number = MF.addToMBBNumbering(N);
+
+ // Make sure the instructions have their operands in the reginfo lists.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ for (MachineBasicBlock::instr_iterator
+ I = N->instr_begin(), E = N->instr_end(); I != E; ++I)
+ I->AddRegOperandsToUseLists(RegInfo);
+}
+
+void ilist_traits<MachineBasicBlock>::removeNodeFromList(MachineBasicBlock *N) {
+ N->getParent()->removeFromMBBNumbering(N->Number);
+ N->Number = -1;
+}
+
+/// When we add an instruction to a basic block list, we update its parent
+/// pointer and add its operands from reg use/def lists if appropriate.
+void ilist_traits<MachineInstr>::addNodeToList(MachineInstr *N) {
+ assert(!N->getParent() && "machine instruction already in a basic block");
+ N->setParent(Parent);
+
+ // Add the instruction's register operands to their corresponding
+ // use/def lists.
+ MachineFunction *MF = Parent->getParent();
+ N->AddRegOperandsToUseLists(MF->getRegInfo());
+}
+
+/// When we remove an instruction from a basic block list, we update its parent
+/// pointer and remove its operands from reg use/def lists if appropriate.
+void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) {
+ assert(N->getParent() && "machine instruction not in a basic block");
+
+ // Remove from the use/def lists.
+ if (MachineFunction *MF = N->getParent()->getParent())
+ N->RemoveRegOperandsFromUseLists(MF->getRegInfo());
+
+ N->setParent(nullptr);
+}
+
+/// When moving a range of instructions from one MBB list to another, we need to
+/// update the parent pointers and the use/def lists.
+void ilist_traits<MachineInstr>::
+transferNodesFromList(ilist_traits<MachineInstr> &FromList,
+ ilist_iterator<MachineInstr> First,
+ ilist_iterator<MachineInstr> Last) {
+ assert(Parent->getParent() == FromList.Parent->getParent() &&
+ "MachineInstr parent mismatch!");
+
+ // Splice within the same MBB -> no change.
+ if (Parent == FromList.Parent) return;
+
+ // If splicing between two blocks within the same function, just update the
+ // parent pointers.
+ for (; First != Last; ++First)
+ First->setParent(Parent);
+}
+
+void ilist_traits<MachineInstr>::deleteNode(MachineInstr* MI) {
+ assert(!MI->getParent() && "MI is still in a block!");
+ Parent->getParent()->DeleteMachineInstr(MI);
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstNonPHI() {
+ instr_iterator I = instr_begin(), E = instr_end();
+ while (I != E && I->isPHI())
+ ++I;
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi MI cannot be inside a bundle!");
+ return I;
+}
+
+MachineBasicBlock::iterator
+MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) {
+ iterator E = end();
+ while (I != E && (I->isPHI() || I->isPosition() || I->isDebugValue()))
+ ++I;
+ // FIXME: This needs to change if we wish to bundle labels / dbg_values
+ // inside the bundle.
+ assert((I == E || !I->isInsideBundle()) &&
+ "First non-phi / non-label instruction is inside a bundle!");
+ return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() {
+ iterator B = begin(), E = end(), I = E;
+ while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+ ; /*noop */
+ while (I != E && !I->isTerminator())
+ ++I;
+ return I;
+}
+
+MachineBasicBlock::instr_iterator MachineBasicBlock::getFirstInstrTerminator() {
+ instr_iterator B = instr_begin(), E = instr_end(), I = E;
+ while (I != B && ((--I)->isTerminator() || I->isDebugValue()))
+ ; /*noop */
+ while (I != E && !I->isTerminator())
+ ++I;
+ return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getFirstNonDebugInstr() {
+ // Skip over begin-of-block dbg_value instructions.
+ iterator I = begin(), E = end();
+ while (I != E && I->isDebugValue())
+ ++I;
+ return I;
+}
+
+MachineBasicBlock::iterator MachineBasicBlock::getLastNonDebugInstr() {
+ // Skip over end-of-block dbg_value instructions.
+ instr_iterator B = instr_begin(), I = instr_end();
+ while (I != B) {
+ --I;
+ // Return instruction that starts a bundle.
+ if (I->isDebugValue() || I->isInsideBundle())
+ continue;
+ return I;
+ }
+ // The block is all debug values.
+ return end();
+}
+
+bool MachineBasicBlock::hasEHPadSuccessor() const {
+ for (const_succ_iterator I = succ_begin(), E = succ_end(); I != E; ++I)
+ if ((*I)->isEHPad())
+ return true;
+ return false;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
+ print(dbgs());
+}
+#endif
+
+StringRef MachineBasicBlock::getName() const {
+ if (const BasicBlock *LBB = getBasicBlock())
+ return LBB->getName();
+ else
+ return "(null)";
+}
+
+/// Return a hopefully unique identifier for this block.
+std::string MachineBasicBlock::getFullName() const {
+ std::string Name;
+ if (getParent())
+ Name = (getParent()->getName() + ":").str();
+ if (getBasicBlock())
+ Name += getBasicBlock()->getName();
+ else
+ Name += ("BB" + Twine(getNumber())).str();
+ return Name;
+}
+
+void MachineBasicBlock::print(raw_ostream &OS, const SlotIndexes *Indexes)
+ const {
+ const MachineFunction *MF = getParent();
+ if (!MF) {
+ OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+ << " is null\n";
+ return;
+ }
+ const Function *F = MF->getFunction();
+ const Module *M = F ? F->getParent() : nullptr;
+ ModuleSlotTracker MST(M);
+ print(OS, MST, Indexes);
+}
+
+void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ const SlotIndexes *Indexes) const {
+ const MachineFunction *MF = getParent();
+ if (!MF) {
+ OS << "Can't print out MachineBasicBlock because parent MachineFunction"
+ << " is null\n";
+ return;
+ }
+
+ if (Indexes)
+ OS << Indexes->getMBBStartIdx(this) << '\t';
+
+ OS << "BB#" << getNumber() << ": ";
+
+ const char *Comma = "";
+ if (const BasicBlock *LBB = getBasicBlock()) {
+ OS << Comma << "derived from LLVM BB ";
+ LBB->printAsOperand(OS, /*PrintType=*/false, MST);
+ Comma = ", ";
+ }
+ if (isEHPad()) { OS << Comma << "EH LANDING PAD"; Comma = ", "; }
+ if (hasAddressTaken()) { OS << Comma << "ADDRESS TAKEN"; Comma = ", "; }
+ if (Alignment)
+ OS << Comma << "Align " << Alignment << " (" << (1u << Alignment)
+ << " bytes)";
+
+ OS << '\n';
+
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ if (!livein_empty()) {
+ if (Indexes) OS << '\t';
+ OS << " Live Ins:";
+ for (const auto &LI : make_range(livein_begin(), livein_end())) {
+ OS << ' ' << PrintReg(LI.PhysReg, TRI);
+ if (LI.LaneMask != ~0u)
+ OS << ':' << PrintLaneMask(LI.LaneMask);
+ }
+ OS << '\n';
+ }
+ // Print the preds of this block according to the CFG.
+ if (!pred_empty()) {
+ if (Indexes) OS << '\t';
+ OS << " Predecessors according to CFG:";
+ for (const_pred_iterator PI = pred_begin(), E = pred_end(); PI != E; ++PI)
+ OS << " BB#" << (*PI)->getNumber();
+ OS << '\n';
+ }
+
+ for (auto &I : instrs()) {
+ if (Indexes) {
+ if (Indexes->hasIndex(I))
+ OS << Indexes->getInstructionIndex(I);
+ OS << '\t';
+ }
+ OS << '\t';
+ if (I.isInsideBundle())
+ OS << " * ";
+ I.print(OS, MST);
+ }
+
+ // Print the successors of this block according to the CFG.
+ if (!succ_empty()) {
+ if (Indexes) OS << '\t';
+ OS << " Successors according to CFG:";
+ for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) {
+ OS << " BB#" << (*SI)->getNumber();
+ if (!Probs.empty())
+ OS << '(' << *getProbabilityIterator(SI) << ')';
+ }
+ OS << '\n';
+ }
+}
+
+void MachineBasicBlock::printAsOperand(raw_ostream &OS,
+ bool /*PrintType*/) const {
+ OS << "BB#" << getNumber();
+}
+
+void MachineBasicBlock::removeLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) {
+ LiveInVector::iterator I = std::find_if(
+ LiveIns.begin(), LiveIns.end(),
+ [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == Reg; });
+ if (I == LiveIns.end())
+ return;
+
+ I->LaneMask &= ~LaneMask;
+ if (I->LaneMask == 0)
+ LiveIns.erase(I);
+}
+
+bool MachineBasicBlock::isLiveIn(MCPhysReg Reg, LaneBitmask LaneMask) const {
+ livein_iterator I = std::find_if(
+ LiveIns.begin(), LiveIns.end(),
+ [Reg] (const RegisterMaskPair &LI) { return LI.PhysReg == Reg; });
+ return I != livein_end() && (I->LaneMask & LaneMask) != 0;
+}
+
+void MachineBasicBlock::sortUniqueLiveIns() {
+ std::sort(LiveIns.begin(), LiveIns.end(),
+ [](const RegisterMaskPair &LI0, const RegisterMaskPair &LI1) {
+ return LI0.PhysReg < LI1.PhysReg;
+ });
+ // Live-ins are now sorted by physreg; merge their lane masks.
+ LiveInVector::const_iterator I = LiveIns.begin();
+ LiveInVector::const_iterator J;
+ LiveInVector::iterator Out = LiveIns.begin();
+ for (; I != LiveIns.end(); ++Out, I = J) {
+ unsigned PhysReg = I->PhysReg;
+ LaneBitmask LaneMask = I->LaneMask;
+ for (J = std::next(I); J != LiveIns.end() && J->PhysReg == PhysReg; ++J)
+ LaneMask |= J->LaneMask;
+ Out->PhysReg = PhysReg;
+ Out->LaneMask = LaneMask;
+ }
+ LiveIns.erase(Out, LiveIns.end());
+}
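A small worked example of the sort-and-merge above; the register names and lane masks are made up for illustration and are not taken from this patch.

// Hypothetical input:  LiveIns = {(R1, 0x2), (R0, 0x1), (R0, 0x4)}
// After std::sort:     {(R0, 0x1), (R0, 0x4), (R1, 0x2)}
// After the merge loop (duplicate physregs OR their lane masks):
//                      {(R0, 0x5), (R1, 0x2)}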
+
+unsigned
+MachineBasicBlock::addLiveIn(MCPhysReg PhysReg, const TargetRegisterClass *RC) {
+ assert(getParent() && "MBB must be inserted in function");
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) && "Expected physreg");
+ assert(RC && "Register class is required");
+ assert((isEHPad() || this == &getParent()->front()) &&
+ "Only the entry block and landing pads can have physreg live ins");
+
+ bool LiveIn = isLiveIn(PhysReg);
+ iterator I = SkipPHIsAndLabels(begin()), E = end();
+ MachineRegisterInfo &MRI = getParent()->getRegInfo();
+ const TargetInstrInfo &TII = *getParent()->getSubtarget().getInstrInfo();
+
+ // Look for an existing copy.
+ if (LiveIn)
+ for (;I != E && I->isCopy(); ++I)
+ if (I->getOperand(1).getReg() == PhysReg) {
+ unsigned VirtReg = I->getOperand(0).getReg();
+ if (!MRI.constrainRegClass(VirtReg, RC))
+ llvm_unreachable("Incompatible live-in register class.");
+ return VirtReg;
+ }
+
+ // No luck, create a virtual register.
+ unsigned VirtReg = MRI.createVirtualRegister(RC);
+ BuildMI(*this, I, DebugLoc(), TII.get(TargetOpcode::COPY), VirtReg)
+ .addReg(PhysReg, RegState::Kill);
+ if (!LiveIn)
+ addLiveIn(PhysReg);
+ return VirtReg;
+}
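A hedged sketch of how a backend's argument lowering might use the helper above; PhysArgReg and RC are placeholder names, and the surrounding calling-convention logic is omitted.

// Illustrative sketch; not part of this patch.
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/Target/TargetRegisterInfo.h"

static unsigned copyIncomingArg(llvm::MachineFunction &MF, unsigned PhysArgReg,
                                const llvm::TargetRegisterClass *RC) {
  // Reuses an existing COPY from PhysArgReg if the entry block already has
  // one; otherwise emits "%vreg = COPY PhysArgReg" and records the live-in.
  return MF.front().addLiveIn(PhysArgReg, RC);
}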
+
+void MachineBasicBlock::moveBefore(MachineBasicBlock *NewAfter) {
+ getParent()->splice(NewAfter->getIterator(), getIterator());
+}
+
+void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
+ getParent()->splice(++NewBefore->getIterator(), getIterator());
+}
+
+void MachineBasicBlock::updateTerminator() {
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ // A block with no successors has no concerns with fall-through edges.
+ if (this->succ_empty())
+ return;
+
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ DebugLoc DL; // FIXME: this is nowhere
+ bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
+ (void) B;
+ assert(!B && "UpdateTerminators requires analyzable predecessors!");
+ if (Cond.empty()) {
+ if (TBB) {
+ // The block has an unconditional branch. If its successor is now its
+ // layout successor, delete the branch.
+ if (isLayoutSuccessor(TBB))
+ TII->RemoveBranch(*this);
+ } else {
+ // The block has an unconditional fallthrough. If its successor is not its
+ // layout successor, insert a branch. First we have to locate the only
+ // non-landing-pad successor, as that is the fallthrough block.
+ for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
+ if ((*SI)->isEHPad())
+ continue;
+ assert(!TBB && "Found more than one non-landing-pad successor!");
+ TBB = *SI;
+ }
+
+ // If there is no non-landing-pad successor, the block has no fall-through
+ // edges to be concerned with.
+ if (!TBB)
+ return;
+
+ // Finally update the unconditional successor to be reached via a branch
+ // if it would not be reached by fallthrough.
+ if (!isLayoutSuccessor(TBB))
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ }
+ return;
+ }
+
+ if (FBB) {
+ // The block has a non-fallthrough conditional branch. If one of its
+ // successors is its layout successor, rewrite it to a fallthrough
+ // conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->ReverseBranchCondition(Cond))
+ return;
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, FBB, nullptr, Cond, DL);
+ } else if (isLayoutSuccessor(FBB)) {
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ }
+ return;
+ }
+
+ // Walk through the successors and find the successor which is not a landing
+ // pad and is not the conditional branch destination (in TBB) as the
+ // fallthrough successor.
+ MachineBasicBlock *FallthroughBB = nullptr;
+ for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
+ if ((*SI)->isEHPad() || *SI == TBB)
+ continue;
+ assert(!FallthroughBB && "Found more than one fallthrough successor.");
+ FallthroughBB = *SI;
+ }
+
+ if (!FallthroughBB) {
+ if (canFallThrough()) {
+ // We fall through to the same basic block that the conditional jump targets.
+ // Remove the conditional jump, leaving unconditional fallthrough.
+ // FIXME: This does not seem like a reasonable pattern to support, but it
+ // has been seen in the wild coming out of degenerate ARM test cases.
+ TII->RemoveBranch(*this);
+
+ // Finally update the unconditional successor to be reached via a branch if
+ // it would not be reached by fallthrough.
+ if (!isLayoutSuccessor(TBB))
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ return;
+ }
+
+ // We reach here only when exactly one successor is TBB, which cannot fall
+ // through, and any remaining successors are EH pads. In this case, we need
+ // to change the conditional branch into an unconditional branch.
+ TII->RemoveBranch(*this);
+ Cond.clear();
+ TII->InsertBranch(*this, TBB, nullptr, Cond, DL);
+ return;
+ }
+
+ // The block has a fallthrough conditional branch.
+ if (isLayoutSuccessor(TBB)) {
+ if (TII->ReverseBranchCondition(Cond)) {
+ // We can't reverse the condition, add an unconditional branch.
+ Cond.clear();
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
+ return;
+ }
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, FallthroughBB, nullptr, Cond, DL);
+ } else if (!isLayoutSuccessor(FallthroughBB)) {
+ TII->RemoveBranch(*this);
+ TII->InsertBranch(*this, TBB, FallthroughBB, Cond, DL);
+ }
+}
+
+void MachineBasicBlock::validateSuccProbs() const {
+#ifndef NDEBUG
+ int64_t Sum = 0;
+ for (auto Prob : Probs)
+ Sum += Prob.getNumerator();
+ // Due to precision issues, we assume that the sum of probabilities is one if
+ // the difference between the sum of their numerators and the denominator is
+ // no greater than the number of successors.
+ assert((uint64_t)std::abs(Sum - BranchProbability::getDenominator()) <=
+ Probs.size() &&
+ "The sum of successors's probabilities exceeds one.");
+#endif // NDEBUG
+}
+
+void MachineBasicBlock::addSuccessor(MachineBasicBlock *Succ,
+ BranchProbability Prob) {
+ // The probability list is either empty (if the successor list isn't empty,
+ // this means the optimization is disabled) or has the same size as the
+ // successor list.
+ if (!(Probs.empty() && !Successors.empty()))
+ Probs.push_back(Prob);
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::addSuccessorWithoutProb(MachineBasicBlock *Succ) {
+ // We need to make sure the probability list is either empty or has the same
+ // size as the successor list. When this function is called, we can safely
+ // delete all probabilities in the list.
+ Probs.clear();
+ Successors.push_back(Succ);
+ Succ->addPredecessor(this);
+}
+
+void MachineBasicBlock::removeSuccessor(MachineBasicBlock *Succ,
+ bool NormalizeSuccProbs) {
+ succ_iterator I = std::find(Successors.begin(), Successors.end(), Succ);
+ removeSuccessor(I, NormalizeSuccProbs);
+}
+
+MachineBasicBlock::succ_iterator
+MachineBasicBlock::removeSuccessor(succ_iterator I, bool NormalizeSuccProbs) {
+ assert(I != Successors.end() && "Not a current successor!");
+
+ // If the probability list is empty, it means we don't use it (disabled
+ // optimization).
+ if (!Probs.empty()) {
+ probability_iterator WI = getProbabilityIterator(I);
+ Probs.erase(WI);
+ if (NormalizeSuccProbs)
+ normalizeSuccProbs();
+ }
+
+ (*I)->removePredecessor(this);
+ return Successors.erase(I);
+}
+
+void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ if (Old == New)
+ return;
+
+ succ_iterator E = succ_end();
+ succ_iterator NewI = E;
+ succ_iterator OldI = E;
+ for (succ_iterator I = succ_begin(); I != E; ++I) {
+ if (*I == Old) {
+ OldI = I;
+ if (NewI != E)
+ break;
+ }
+ if (*I == New) {
+ NewI = I;
+ if (OldI != E)
+ break;
+ }
+ }
+ assert(OldI != E && "Old is not a successor of this block");
+
+ // If New isn't already a successor, let it take Old's place.
+ if (NewI == E) {
+ Old->removePredecessor(this);
+ New->addPredecessor(this);
+ *OldI = New;
+ return;
+ }
+
+ // New is already a successor.
+ // Update its probability instead of adding a duplicate edge.
+ if (!Probs.empty()) {
+ auto ProbIter = getProbabilityIterator(NewI);
+ if (!ProbIter->isUnknown())
+ *ProbIter += *getProbabilityIterator(OldI);
+ }
+ removeSuccessor(OldI);
+}
+
+void MachineBasicBlock::addPredecessor(MachineBasicBlock *Pred) {
+ Predecessors.push_back(Pred);
+}
+
+void MachineBasicBlock::removePredecessor(MachineBasicBlock *Pred) {
+ pred_iterator I = std::find(Predecessors.begin(), Predecessors.end(), Pred);
+ assert(I != Predecessors.end() && "Pred is not a predecessor of this block!");
+ Predecessors.erase(I);
+}
+
+void MachineBasicBlock::transferSuccessors(MachineBasicBlock *FromMBB) {
+ if (this == FromMBB)
+ return;
+
+ while (!FromMBB->succ_empty()) {
+ MachineBasicBlock *Succ = *FromMBB->succ_begin();
+
+ // If the probability list is empty, it means we don't use it (disabled optimization).
+ if (!FromMBB->Probs.empty()) {
+ auto Prob = *FromMBB->Probs.begin();
+ addSuccessor(Succ, Prob);
+ } else
+ addSuccessorWithoutProb(Succ);
+
+ FromMBB->removeSuccessor(Succ);
+ }
+}
+
+void
+MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *FromMBB) {
+ if (this == FromMBB)
+ return;
+
+ while (!FromMBB->succ_empty()) {
+ MachineBasicBlock *Succ = *FromMBB->succ_begin();
+ if (!FromMBB->Probs.empty()) {
+ auto Prob = *FromMBB->Probs.begin();
+ addSuccessor(Succ, Prob);
+ } else
+ addSuccessorWithoutProb(Succ);
+ FromMBB->removeSuccessor(Succ);
+
+ // Fix up any PHI nodes in the successor.
+ for (MachineBasicBlock::instr_iterator MI = Succ->instr_begin(),
+ ME = Succ->instr_end(); MI != ME && MI->isPHI(); ++MI)
+ for (unsigned i = 2, e = MI->getNumOperands()+1; i != e; i += 2) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (MO.getMBB() == FromMBB)
+ MO.setMBB(this);
+ }
+ }
+ normalizeSuccProbs();
+}
+
+bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const {
+ return std::find(pred_begin(), pred_end(), MBB) != pred_end();
+}
+
+bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const {
+ return std::find(succ_begin(), succ_end(), MBB) != succ_end();
+}
+
+bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
+ MachineFunction::const_iterator I(this);
+ return std::next(I) == MachineFunction::const_iterator(MBB);
+}
+
+bool MachineBasicBlock::canFallThrough() {
+ MachineFunction::iterator Fallthrough = getIterator();
+ ++Fallthrough;
+ // If FallthroughBlock is off the end of the function, it can't fall through.
+ if (Fallthrough == getParent()->end())
+ return false;
+
+ // If FallthroughBlock isn't a successor, no fallthrough is possible.
+ if (!isSuccessor(&*Fallthrough))
+ return false;
+
+ // Analyze the branches, if any, at the end of the block.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ if (TII->analyzeBranch(*this, TBB, FBB, Cond)) {
+ // If we couldn't analyze the branch, examine the last instruction.
+ // If the block doesn't end in a known control barrier, assume fallthrough
+ // is possible. The isPredicated check is needed because this code can be
+ // called during IfConversion, where an instruction which is normally a
+ // Barrier is predicated and thus no longer an actual control barrier.
+ return empty() || !back().isBarrier() || TII->isPredicated(back());
+ }
+
+ // If there is no branch, control always falls through.
+ if (!TBB) return true;
+
+ // If there is some explicit branch to the fallthrough block, it can obviously
+ // reach, even though the branch should get folded to fall through implicitly.
+ if (MachineFunction::iterator(TBB) == Fallthrough ||
+ MachineFunction::iterator(FBB) == Fallthrough)
+ return true;
+
+ // If it's an unconditional branch to some block not the fall through, it
+ // doesn't fall through.
+ if (Cond.empty()) return false;
+
+ // Otherwise, if it is conditional and has no explicit false block, it falls
+ // through.
+ return FBB == nullptr;
+}
+
+MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
+ Pass &P) {
+ if (!canSplitCriticalEdge(Succ))
+ return nullptr;
+
+ MachineFunction *MF = getParent();
+ DebugLoc DL; // FIXME: this is nowhere
+
+ MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
+ MF->insert(std::next(MachineFunction::iterator(this)), NMBB);
+ DEBUG(dbgs() << "Splitting critical edge:"
+ " BB#" << getNumber()
+ << " -- BB#" << NMBB->getNumber()
+ << " -- BB#" << Succ->getNumber() << '\n');
+
+ LiveIntervals *LIS = P.getAnalysisIfAvailable<LiveIntervals>();
+ SlotIndexes *Indexes = P.getAnalysisIfAvailable<SlotIndexes>();
+ if (LIS)
+ LIS->insertMBBInMaps(NMBB);
+ else if (Indexes)
+ Indexes->insertMBBInMaps(NMBB);
+
+ // On some targets like Mips, branches may kill virtual registers. Make sure
+ // that LiveVariables is properly updated after updateTerminator replaces the
+ // terminators.
+ LiveVariables *LV = P.getAnalysisIfAvailable<LiveVariables>();
+
+ // Collect a list of virtual registers killed by the terminators.
+ SmallVector<unsigned, 4> KilledRegs;
+ if (LV)
+ for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+ I != E; ++I) {
+ MachineInstr *MI = &*I;
+ for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+ OE = MI->operands_end(); OI != OE; ++OI) {
+ if (!OI->isReg() || OI->getReg() == 0 ||
+ !OI->isUse() || !OI->isKill() || OI->isUndef())
+ continue;
+ unsigned Reg = OI->getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+ LV->getVarInfo(Reg).removeKill(*MI)) {
+ KilledRegs.push_back(Reg);
+ DEBUG(dbgs() << "Removing terminator kill: " << *MI);
+ OI->setIsKill(false);
+ }
+ }
+ }
+
+ SmallVector<unsigned, 4> UsedRegs;
+ if (LIS) {
+ for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+ I != E; ++I) {
+ MachineInstr *MI = &*I;
+
+ for (MachineInstr::mop_iterator OI = MI->operands_begin(),
+ OE = MI->operands_end(); OI != OE; ++OI) {
+ if (!OI->isReg() || OI->getReg() == 0)
+ continue;
+
+ unsigned Reg = OI->getReg();
+ if (std::find(UsedRegs.begin(), UsedRegs.end(), Reg) == UsedRegs.end())
+ UsedRegs.push_back(Reg);
+ }
+ }
+ }
+
+ ReplaceUsesOfBlockWith(Succ, NMBB);
+
+ // If updateTerminator() removes instructions, we need to remove them from
+ // SlotIndexes.
+ SmallVector<MachineInstr*, 4> Terminators;
+ if (Indexes) {
+ for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+ I != E; ++I)
+ Terminators.push_back(&*I);
+ }
+
+ updateTerminator();
+
+ if (Indexes) {
+ SmallVector<MachineInstr*, 4> NewTerminators;
+ for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
+ I != E; ++I)
+ NewTerminators.push_back(&*I);
+
+ for (SmallVectorImpl<MachineInstr*>::iterator I = Terminators.begin(),
+ E = Terminators.end(); I != E; ++I) {
+ if (std::find(NewTerminators.begin(), NewTerminators.end(), *I) ==
+ NewTerminators.end())
+ Indexes->removeMachineInstrFromMaps(**I);
+ }
+ }
+
+ // Insert unconditional "jump Succ" instruction in NMBB if necessary.
+ NMBB->addSuccessor(Succ);
+ if (!NMBB->isLayoutSuccessor(Succ)) {
+ SmallVector<MachineOperand, 4> Cond;
+ const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
+ TII->InsertBranch(*NMBB, Succ, nullptr, Cond, DL);
+
+ if (Indexes) {
+ for (MachineInstr &MI : NMBB->instrs()) {
+ // Some instructions may have been moved to NMBB by updateTerminator(),
+ // so we first remove any instruction that already has an index.
+ if (Indexes->hasIndex(MI))
+ Indexes->removeMachineInstrFromMaps(MI);
+ Indexes->insertMachineInstrInMaps(MI);
+ }
+ }
+ }
+
+ // Fix PHI nodes in Succ so they refer to NMBB instead of this
+ for (MachineBasicBlock::instr_iterator
+ i = Succ->instr_begin(),e = Succ->instr_end();
+ i != e && i->isPHI(); ++i)
+ for (unsigned ni = 1, ne = i->getNumOperands(); ni != ne; ni += 2)
+ if (i->getOperand(ni+1).getMBB() == this)
+ i->getOperand(ni+1).setMBB(NMBB);
+
+ // Inherit live-ins from the successor
+ for (const auto &LI : Succ->liveins())
+ NMBB->addLiveIn(LI);
+
+ // Update LiveVariables.
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ if (LV) {
+ // Restore kills of virtual registers that were killed by the terminators.
+ while (!KilledRegs.empty()) {
+ unsigned Reg = KilledRegs.pop_back_val();
+ for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) {
+ if (!(--I)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ LV->getVarInfo(Reg).Kills.push_back(&*I);
+ DEBUG(dbgs() << "Restored terminator kill: " << *I);
+ break;
+ }
+ }
+ // Update relevant live-through information.
+ LV->addNewBlock(NMBB, this, Succ);
+ }
+
+ if (LIS) {
+ // After splitting the edge and updating SlotIndexes, live intervals may be
+ // in one of two situations, depending on whether this block was the last in
+ // the function. If the original block was the last in the function, all
+ // live intervals will end prior to the beginning of the new split block. If
+ // the original block was not at the end of the function, all live intervals
+ // will extend to the end of the new split block.
+
+ bool isLastMBB =
+ std::next(MachineFunction::iterator(NMBB)) == getParent()->end();
+
+ SlotIndex StartIndex = Indexes->getMBBEndIdx(this);
+ SlotIndex PrevIndex = StartIndex.getPrevSlot();
+ SlotIndex EndIndex = Indexes->getMBBEndIdx(NMBB);
+
+ // Find the registers used from NMBB in PHIs in Succ.
+ SmallSet<unsigned, 8> PHISrcRegs;
+ for (MachineBasicBlock::instr_iterator
+ I = Succ->instr_begin(), E = Succ->instr_end();
+ I != E && I->isPHI(); ++I) {
+ for (unsigned ni = 1, ne = I->getNumOperands(); ni != ne; ni += 2) {
+ if (I->getOperand(ni+1).getMBB() == NMBB) {
+ MachineOperand &MO = I->getOperand(ni);
+ unsigned Reg = MO.getReg();
+ PHISrcRegs.insert(Reg);
+ if (MO.isUndef())
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+ assert(VNI &&
+ "PHI sources should be live out of their predecessors.");
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
+ }
+ }
+ }
+
+ MachineRegisterInfo *MRI = &getParent()->getRegInfo();
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
+ continue;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (!LI.liveAt(PrevIndex))
+ continue;
+
+ bool isLiveOut = LI.liveAt(LIS->getMBBStartIdx(Succ));
+ if (isLiveOut && isLastMBB) {
+ VNInfo *VNI = LI.getVNInfoAt(PrevIndex);
+ assert(VNI && "LiveInterval should have VNInfo where it is live.");
+ LI.addSegment(LiveInterval::Segment(StartIndex, EndIndex, VNI));
+ } else if (!isLiveOut && !isLastMBB) {
+ LI.removeSegment(StartIndex, EndIndex);
+ }
+ }
+
+ // Update all intervals for registers whose uses may have been modified by
+ // updateTerminator().
+ LIS->repairIntervalsInRange(this, getFirstTerminator(), end(), UsedRegs);
+ }
+
+ if (MachineDominatorTree *MDT =
+ P.getAnalysisIfAvailable<MachineDominatorTree>())
+ MDT->recordSplitCriticalEdge(this, Succ, NMBB);
+
+ if (MachineLoopInfo *MLI = P.getAnalysisIfAvailable<MachineLoopInfo>())
+ if (MachineLoop *TIL = MLI->getLoopFor(this)) {
+ // If one or the other blocks were not in a loop, the new block is not
+ // either, and thus LI doesn't need to be updated.
+ if (MachineLoop *DestLoop = MLI->getLoopFor(Succ)) {
+ if (TIL == DestLoop) {
+ // Both in the same loop, the NMBB joins loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (TIL->contains(DestLoop)) {
+ // Edge from an outer loop to an inner loop. Add to the outer loop.
+ TIL->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else if (DestLoop->contains(TIL)) {
+ // Edge from an inner loop to an outer loop. Add to the outer loop.
+ DestLoop->addBasicBlockToLoop(NMBB, MLI->getBase());
+ } else {
+ // Edge from two loops with no containment relation. Because these
+ // are natural loops, we know that the destination block must be the
+ // header of its loop (adding a branch into a loop elsewhere would
+ // create an irreducible loop).
+ assert(DestLoop->getHeader() == Succ &&
+ "Should not create irreducible loops!");
+ if (MachineLoop *P = DestLoop->getParentLoop())
+ P->addBasicBlockToLoop(NMBB, MLI->getBase());
+ }
+ }
+ }
+
+ return NMBB;
+}
+
+bool MachineBasicBlock::canSplitCriticalEdge(
+ const MachineBasicBlock *Succ) const {
+ // Splitting the critical edge to a landing pad block is non-trivial. Don't do
+ // it in this generic function.
+ if (Succ->isEHPad())
+ return false;
+
+ const MachineFunction *MF = getParent();
+
+ // Performance might be harmed on HW that implements branching using an exec
+ // mask, where both sides of a branch are always executed.
+ if (MF->getTarget().requiresStructuredCFG())
+ return false;
+
+ // We may need to update this block's terminator, but we can't do that if
+ // analyzeBranch fails. If this block uses a jump table, we won't touch it.
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ // analyzeBranch should not modify this block, since we did not allow
+ // modification.
+ if (TII->analyzeBranch(*const_cast<MachineBasicBlock *>(this), TBB, FBB, Cond,
+ /*AllowModify*/ false))
+ return false;
+
+ // Avoid bugpoint weirdness: a block may end with a conditional branch that
+ // jumps to the same MBB in either case. We have duplicate CFG edges in that
+ // case that we can't handle. Since this never happens in properly optimized
+ // code, just skip those edges.
+ if (TBB && TBB == FBB) {
+ DEBUG(dbgs() << "Won't split critical edge after degenerate BB#"
+ << getNumber() << '\n');
+ return false;
+ }
+ return true;
+}
+
+/// Prepare MI to be removed from its bundle. This fixes bundle flags on MI's
+/// neighboring instructions so the bundle won't be broken by removing MI.
+static void unbundleSingleMI(MachineInstr *MI) {
+ // Removing the first instruction in a bundle.
+ if (MI->isBundledWithSucc() && !MI->isBundledWithPred())
+ MI->unbundleFromSucc();
+ // Removing the last instruction in a bundle.
+ if (MI->isBundledWithPred() && !MI->isBundledWithSucc())
+ MI->unbundleFromPred();
+ // If MI is not bundled, or if it is internal to a bundle, the neighbor flags
+ // are already fine.
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) {
+ unbundleSingleMI(&*I);
+ return Insts.erase(I);
+}
+
+MachineInstr *MachineBasicBlock::remove_instr(MachineInstr *MI) {
+ unbundleSingleMI(MI);
+ MI->clearFlag(MachineInstr::BundledPred);
+ MI->clearFlag(MachineInstr::BundledSucc);
+ return Insts.remove(MI);
+}
+
+MachineBasicBlock::instr_iterator
+MachineBasicBlock::insert(instr_iterator I, MachineInstr *MI) {
+ assert(!MI->isBundledWithPred() && !MI->isBundledWithSucc() &&
+ "Cannot insert instruction with bundle flags");
+ // Set the bundle flags when inserting inside a bundle.
+ if (I != instr_end() && I->isBundledWithPred()) {
+ MI->setFlag(MachineInstr::BundledPred);
+ MI->setFlag(MachineInstr::BundledSucc);
+ }
+ return Insts.insert(I, MI);
+}
+
+/// This method unlinks 'this' from the containing function, and returns it, but
+/// does not delete it.
+MachineBasicBlock *MachineBasicBlock::removeFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->remove(this);
+ return this;
+}
+
+/// This method unlinks 'this' from the containing function, and deletes it.
+void MachineBasicBlock::eraseFromParent() {
+ assert(getParent() && "Not embedded in a function!");
+ getParent()->erase(this);
+}
+
+/// Given a machine basic block that branched to 'Old', change the code and CFG
+/// so that it branches to 'New' instead.
+void MachineBasicBlock::ReplaceUsesOfBlockWith(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Cannot replace self with self!");
+
+ MachineBasicBlock::instr_iterator I = instr_end();
+ while (I != instr_begin()) {
+ --I;
+ if (!I->isTerminator()) break;
+
+ // Scan the operands of this machine instruction, replacing any uses of Old
+ // with New.
+ for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+ if (I->getOperand(i).isMBB() &&
+ I->getOperand(i).getMBB() == Old)
+ I->getOperand(i).setMBB(New);
+ }
+
+ // Update the successor information.
+ replaceSuccessor(Old, New);
+}
+
+/// Various pieces of code can cause excess edges in the CFG to be inserted. If
+/// we have proven that MBB can only branch to DestA and DestB, remove any other
+/// MBB successors from the CFG. DestA and DestB can be null.
+///
+/// Besides DestA and DestB, retain other edges leading to LandingPads
+/// (currently there can be only one; we don't check or require that here).
+/// Note it is possible that DestA and/or DestB are LandingPads.
+bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
+ MachineBasicBlock *DestB,
+ bool IsCond) {
+ // The values of DestA and DestB frequently come from a call to the
+ // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial
+ // values from there.
+ //
+ // 1. If both DestA and DestB are null, then the block ends with no branches
+ // (it falls through to its successor).
+ // 2. If DestA is set, DestB is null, and IsCond is false, then the block ends
+ // with only an unconditional branch.
+ // 3. If DestA is set, DestB is null, and IsCond is true, then the block ends
+ // with a conditional branch that falls through to a successor (DestB).
+ // 4. If DestA and DestB are set and IsCond is true, then the block ends with a
+ // conditional branch followed by an unconditional branch. DestA is the
+ // 'true' destination and DestB is the 'false' destination.
+
+ bool Changed = false;
+
+ MachineFunction::iterator FallThru = std::next(getIterator());
+
+ if (!DestA && !DestB) {
+ // Block falls through to successor.
+ DestA = &*FallThru;
+ DestB = &*FallThru;
+ } else if (DestA && !DestB) {
+ if (IsCond)
+ // Block ends in conditional jump that falls through to successor.
+ DestB = &*FallThru;
+ } else {
+ assert(DestA && DestB && IsCond &&
+ "CFG in a bad state. Cannot correct CFG edges");
+ }
+
+ // Remove superfluous edges. I.e., those which aren't destinations of this
+ // basic block, duplicate edges, or landing pads.
+ SmallPtrSet<const MachineBasicBlock*, 8> SeenMBBs;
+ MachineBasicBlock::succ_iterator SI = succ_begin();
+ while (SI != succ_end()) {
+ const MachineBasicBlock *MBB = *SI;
+ if (!SeenMBBs.insert(MBB).second ||
+ (MBB != DestA && MBB != DestB && !MBB->isEHPad())) {
+ // This is a superfluous edge, remove it.
+ SI = removeSuccessor(SI);
+ Changed = true;
+ } else {
+ ++SI;
+ }
+ }
+
+ if (Changed)
+ normalizeSuccProbs();
+ return Changed;
+}
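A hypothetical illustration of the four analyzeBranch cases listed in the comment at the top of this function; the branch mnemonics and block numbers are invented for the example.

// Case 1: fallthrough only             -> DestA = null, DestB = null, IsCond = false
// Case 2: "B BB#3"                     -> DestA = BB#3, DestB = null, IsCond = false
// Case 3: "Bcc BB#2", then fallthrough -> DestA = BB#2, DestB = null, IsCond = true
// Case 4: "Bcc BB#2", then "B BB#3"    -> DestA = BB#2, DestB = BB#3, IsCond = true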
+
+/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE
+/// instructions. Return UnknownLoc if there is none.
+DebugLoc
+MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
+ DebugLoc DL;
+ instr_iterator E = instr_end();
+ if (MBBI == E)
+ return DL;
+
+ // Skip debug values; we don't want a DebugLoc from them.
+ while (MBBI != E && MBBI->isDebugValue())
+ MBBI++;
+ if (MBBI != E)
+ DL = MBBI->getDebugLoc();
+ return DL;
+}
+
+/// Return probability of the edge from this block to MBB.
+BranchProbability
+MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
+ if (Probs.empty())
+ return BranchProbability(1, succ_size());
+
+ const auto &Prob = *getProbabilityIterator(Succ);
+ if (Prob.isUnknown()) {
+ // For unknown probabilities, collect the sum of all known ones, and evenly
+ // distribute the complement of the sum to each unknown probability.
+ unsigned KnownProbNum = 0;
+ auto Sum = BranchProbability::getZero();
+ for (auto &P : Probs) {
+ if (!P.isUnknown()) {
+ Sum += P;
+ KnownProbNum++;
+ }
+ }
+ return Sum.getCompl() / (Probs.size() - KnownProbNum);
+ } else
+ return Prob;
+}
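A short worked example of the unknown-probability path above, with numbers chosen purely for illustration.

// Three successors with recorded probabilities {1/2, unknown, unknown}:
// the known sum is 1/2, so each unknown edge reports
// (1 - 1/2) / 2 = 1/4, and the three edges still sum to one.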
+
+/// Set successor probability of a given iterator.
+void MachineBasicBlock::setSuccProbability(succ_iterator I,
+ BranchProbability Prob) {
+ assert(!Prob.isUnknown());
+ if (Probs.empty())
+ return;
+ *getProbabilityIterator(I) = Prob;
+}
+
+/// Return the probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::const_probability_iterator
+MachineBasicBlock::getProbabilityIterator(
+ MachineBasicBlock::const_succ_iterator I) const {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
+}
+
+/// Return the probability iterator corresponding to the I successor iterator.
+MachineBasicBlock::probability_iterator
+MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
+ assert(Probs.size() == Successors.size() && "Async probability list!");
+ const size_t index = std::distance(Successors.begin(), I);
+ assert(index < Probs.size() && "Not a current successor!");
+ return Probs.begin() + index;
+}
+
+/// Return whether (physical) register "Reg" has been <def>ined and not
+/// <kill>ed as of just before "Before".
+///
+/// The search is localized to a neighborhood of Neighborhood instructions
+/// before (searching for defs or kills) and Neighborhood instructions after
+/// (searching just for defs) Before.
+MachineBasicBlock::LivenessQueryResult
+MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ unsigned Reg, const_iterator Before,
+ unsigned Neighborhood) const {
+ unsigned N = Neighborhood;
+
+ // Start by searching backwards from Before, looking for kills, reads or defs.
+ const_iterator I(Before);
+ // If this is the first insn in the block, don't search backwards.
+ if (I != begin()) {
+ do {
+ --I;
+
+ MachineOperandIteratorBase::PhysRegInfo Info =
+ ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
+
+ // Defs happen after uses so they take precedence if both are present.
+
+ // Register is dead after a dead def of the full register.
+ if (Info.DeadDef)
+ return LQR_Dead;
+ // Register is (at least partially) live after a def.
+ if (Info.Defined) {
+ if (!Info.PartialDeadDef)
+ return LQR_Live;
+ // As soon as we saw a partial definition (dead or not),
+ // we cannot tell if the value is partial live without
+ // tracking the lanemasks. We are not going to do this,
+ // so fall back on the remaining of the analysis.
+ break;
+ }
+ // Register is dead after a full kill or clobber and no def.
+ if (Info.Killed || Info.Clobbered)
+ return LQR_Dead;
+ // Register must be live if we read it.
+ if (Info.Read)
+ return LQR_Live;
+ } while (I != begin() && --N > 0);
+ }
+
+ // Did we get to the start of the block?
+ if (I == begin()) {
+ // If so, the register's state is definitely defined by the live-in state.
+ for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true); RAI.isValid();
+ ++RAI)
+ if (isLiveIn(*RAI))
+ return LQR_Live;
+
+ return LQR_Dead;
+ }
+
+ N = Neighborhood;
+
+ // Try searching forwards from Before, looking for reads or defs.
+ I = const_iterator(Before);
+ // If this is the last insn in the block, don't search forwards.
+ if (I != end()) {
+ for (++I; I != end() && N > 0; ++I, --N) {
+ MachineOperandIteratorBase::PhysRegInfo Info =
+ ConstMIOperands(*I).analyzePhysReg(Reg, TRI);
+
+ // Register is live when we read it here.
+ if (Info.Read)
+ return LQR_Live;
+ // Register is dead if we can fully overwrite or clobber it here.
+ if (Info.FullyDefined || Info.Clobbered)
+ return LQR_Dead;
+ }
+ }
+
+ // At this point we have no idea of the liveness of the register.
+ return LQR_Unknown;
+}
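A hedged sketch of a typical caller of the liveness query above, for example a peephole-style pass probing whether a scratch register can be clobbered; ScratchReg and the helper name are placeholders.

// Illustrative sketch; not part of this patch.
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/Target/TargetRegisterInfo.h"

static bool isDeadHere(const llvm::MachineBasicBlock &MBB,
                       llvm::MachineBasicBlock::const_iterator Before,
                       unsigned ScratchReg,
                       const llvm::TargetRegisterInfo *TRI) {
  // Only LQR_Dead proves the register is free; LQR_Unknown means the local
  // scan of the surrounding instructions was inconclusive.
  return MBB.computeRegisterLiveness(TRI, ScratchReg, Before,
                                     /*Neighborhood=*/10) ==
         llvm::MachineBasicBlock::LQR_Dead;
}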
+
+const uint32_t *
+MachineBasicBlock::getBeginClobberMask(const TargetRegisterInfo *TRI) const {
+ // EH funclet entry does not preserve any registers.
+ return isEHFuncletEntry() ? TRI->getNoPreservedMask() : nullptr;
+}
+
+const uint32_t *
+MachineBasicBlock::getEndClobberMask(const TargetRegisterInfo *TRI) const {
+ // If we see a return block with successors, this must be a funclet return,
+ // which does not preserve any registers. If there are no successors, we don't
+ // care what kind of return it is, putting a mask after it is a no-op.
+ return isReturnBlock() && !succ_empty() ? TRI->getNoPreservedMask() : nullptr;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
new file mode 100644
index 000000000000..6c0f99fa111e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -0,0 +1,200 @@
+//===- MachineBlockFrequencyInfo.cpp - MBB Frequency Analysis -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Loops should be simplified before this analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/Analysis/BlockFrequencyInfoImpl.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "block-freq"
+
+#ifndef NDEBUG
+
+static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
+ "view-machine-block-freq-propagation-dags", cl::Hidden,
+ cl::desc("Pop up a window to show a dag displaying how machine block "
+ "frequencies propagate through the CFG."),
+ cl::values(clEnumValN(GVDT_None, "none", "do not display graphs."),
+ clEnumValN(GVDT_Fraction, "fraction",
+ "display a graph using the "
+ "fractional block frequency representation."),
+ clEnumValN(GVDT_Integer, "integer",
+ "display a graph using the raw "
+ "integer fractional block frequency representation."),
+ clEnumValN(GVDT_Count, "count", "display a graph using the real "
+ "profile count if available."),
+
+ clEnumValEnd));
+
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+extern cl::opt<unsigned> ViewHotFreqPercent;
+
+namespace llvm {
+
+template <> struct GraphTraits<MachineBlockFrequencyInfo *> {
+ typedef const MachineBasicBlock NodeType;
+ typedef MachineBasicBlock::const_succ_iterator ChildIteratorType;
+ typedef MachineFunction::const_iterator nodes_iterator;
+
+ static inline const NodeType *
+ getEntryNode(const MachineBlockFrequencyInfo *G) {
+ return &G->getFunction()->front();
+ }
+
+ static ChildIteratorType child_begin(const NodeType *N) {
+ return N->succ_begin();
+ }
+
+ static ChildIteratorType child_end(const NodeType *N) {
+ return N->succ_end();
+ }
+
+ static nodes_iterator nodes_begin(const MachineBlockFrequencyInfo *G) {
+ return G->getFunction()->begin();
+ }
+
+ static nodes_iterator nodes_end(const MachineBlockFrequencyInfo *G) {
+ return G->getFunction()->end();
+ }
+};
+
+typedef BFIDOTGraphTraitsBase<MachineBlockFrequencyInfo,
+ MachineBranchProbabilityInfo>
+ MBFIDOTGraphTraitsBase;
+template <>
+struct DOTGraphTraits<MachineBlockFrequencyInfo *>
+ : public MBFIDOTGraphTraitsBase {
+ explicit DOTGraphTraits(bool isSimple = false)
+ : MBFIDOTGraphTraitsBase(isSimple) {}
+
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineBlockFrequencyInfo *Graph) {
+ return MBFIDOTGraphTraitsBase::getNodeLabel(
+ Node, Graph, ViewMachineBlockFreqPropagationDAG);
+ }
+
+ std::string getNodeAttributes(const MachineBasicBlock *Node,
+ const MachineBlockFrequencyInfo *Graph) {
+ return MBFIDOTGraphTraitsBase::getNodeAttributes(Node, Graph,
+ ViewHotFreqPercent);
+ }
+
+ std::string getEdgeAttributes(const MachineBasicBlock *Node, EdgeIter EI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return MBFIDOTGraphTraitsBase::getEdgeAttributes(
+ Node, EI, MBFI, MBFI->getMBPI(), ViewHotFreqPercent);
+ }
+};
+
+} // end namespace llvm
+#endif
+
+INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
+ "Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineBlockFrequencyInfo, "machine-block-freq",
+ "Machine Block Frequency Analysis", true, true)
+
+char MachineBlockFrequencyInfo::ID = 0;
+
+MachineBlockFrequencyInfo::MachineBlockFrequencyInfo()
+ : MachineFunctionPass(ID) {
+ initializeMachineBlockFrequencyInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineBlockFrequencyInfo::~MachineBlockFrequencyInfo() {}
+
+void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+ MachineBranchProbabilityInfo &MBPI =
+ getAnalysis<MachineBranchProbabilityInfo>();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ if (!MBFI)
+ MBFI.reset(new ImplType);
+ MBFI->calculate(F, MBPI, MLI);
+#ifndef NDEBUG
+ if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
+ (ViewBlockFreqFuncName.empty() ||
+ F.getName().equals(ViewBlockFreqFuncName))) {
+ view();
+ }
+#endif
+ return false;
+}
+
+void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }
+
+/// Pop up a ghostview window with the current block frequency propagation
+/// rendered using dot.
+void MachineBlockFrequencyInfo::view() const {
+// This code is only for debugging.
+#ifndef NDEBUG
+ ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this),
+ "MachineBlockFrequencyDAGs");
+#else
+ errs() << "MachineBlockFrequencyInfo::view is only available in debug builds "
+ "on systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+BlockFrequency
+MachineBlockFrequencyInfo::getBlockFreq(const MachineBasicBlock *MBB) const {
+ return MBFI ? MBFI->getBlockFreq(MBB) : 0;
+}
+
+Optional<uint64_t> MachineBlockFrequencyInfo::getBlockProfileCount(
+ const MachineBasicBlock *MBB) const {
+  if (!MBFI)
+    return None;
+  const Function *F = MBFI->getFunction()->getFunction();
+  return MBFI->getBlockProfileCount(*F, MBB);
+}
+
+const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
+ return MBFI ? MBFI->getFunction() : nullptr;
+}
+
+const MachineBranchProbabilityInfo *MachineBlockFrequencyInfo::getMBPI() const {
+ return MBFI ? &MBFI->getBPI() : nullptr;
+}
+
+raw_ostream &
+MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI ? MBFI->printBlockFreq(OS, Freq) : OS;
+}
+
+raw_ostream &
+MachineBlockFrequencyInfo::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI ? MBFI->printBlockFreq(OS, MBB) : OS;
+}
+
+uint64_t MachineBlockFrequencyInfo::getEntryFreq() const {
+ return MBFI ? MBFI->getEntryFreq() : 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
new file mode 100644
index 000000000000..03dda8b36a71
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -0,0 +1,1796 @@
+//===-- MachineBlockPlacement.cpp - Basic Block Code Layout optimization --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements basic block placement transformations using the CFG
+// structure and branch probability estimates.
+//
+// The pass strives to preserve the structure of the CFG (that is, retain
+// a topological ordering of basic blocks) in the absence of a *strong* signal
+// to the contrary from probabilities. However, within the CFG structure, it
+// attempts to choose an ordering which favors placing more likely sequences of
+// blocks adjacent to each other.
+//
+// The algorithm works from the inner-most loop within a function outward, and
+// at each stage walks through the basic blocks, trying to coalesce them into
+// sequential chains where allowed by the CFG (or demanded by heavy
+// probabilities). Finally, it walks the blocks in topological order, and the
+// first time it reaches a chain of basic blocks, it schedules them in the
+// function in-order.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "BranchFolding.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "block-placement"
+
+STATISTIC(NumCondBranches, "Number of conditional branches");
+STATISTIC(NumUncondBranches, "Number of unconditional branches");
+STATISTIC(CondBranchTakenFreq,
+ "Potential frequency of taking conditional branches");
+STATISTIC(UncondBranchTakenFreq,
+ "Potential frequency of taking unconditional branches");
+
+static cl::opt<unsigned> AlignAllBlock("align-all-blocks",
+ cl::desc("Force the alignment of all "
+ "blocks in the function."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> AlignAllNonFallThruBlocks(
+ "align-all-nofallthru-blocks",
+ cl::desc("Force the alignment of all "
+ "blocks that have no fall-through predecessors (i.e. don't add "
+ "nops that are executed)."),
+ cl::init(0), cl::Hidden);
+
+// FIXME: Find a good default for this flag and remove the flag.
+static cl::opt<unsigned> ExitBlockBias(
+ "block-placement-exit-block-bias",
+ cl::desc("Block frequency percentage a loop exit block needs "
+ "over the original exit to be considered the new exit."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<bool> OutlineOptionalBranches(
+ "outline-optional-branches",
+ cl::desc("Put completely optional branches, i.e. branches with a common "
+ "post dominator, out of line."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> OutlineOptionalThreshold(
+ "outline-optional-threshold",
+ cl::desc("Don't outline optional branches that are a single block with an "
+ "instruction count below this threshold"),
+ cl::init(4), cl::Hidden);
+
+static cl::opt<unsigned> LoopToColdBlockRatio(
+ "loop-to-cold-block-ratio",
+ cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
+ "(frequency of block) is greater than this ratio"),
+ cl::init(5), cl::Hidden);
+
+static cl::opt<bool>
+ PreciseRotationCost("precise-rotation-cost",
+ cl::desc("Model the cost of loop rotation more "
+ "precisely by using profile data."),
+ cl::init(false), cl::Hidden);
+static cl::opt<bool>
+ ForcePreciseRotationCost("force-precise-rotation-cost",
+ cl::desc("Force the use of precise cost "
+ "loop rotation strategy."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> MisfetchCost(
+ "misfetch-cost",
+ cl::desc("Cost that models the probabilistic risk of an instruction "
+ "misfetch due to a jump comparing to falling through, whose cost "
+ "is zero."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<unsigned> JumpInstCost("jump-inst-cost",
+ cl::desc("Cost of jump instructions."),
+ cl::init(1), cl::Hidden);
+
+static cl::opt<bool>
+BranchFoldPlacement("branch-fold-placement",
+ cl::desc("Perform branch folding during placement. "
+ "Reduces code size."),
+ cl::init(true), cl::Hidden);
+
+extern cl::opt<unsigned> StaticLikelyProb;
+extern cl::opt<unsigned> ProfileLikelyProb;
+
+namespace {
+class BlockChain;
+/// \brief Type for our function-wide basic block -> block chain mapping.
+typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+}
+
+namespace {
+/// \brief A chain of blocks which will be laid out contiguously.
+///
+/// This is the datastructure representing a chain of consecutive blocks that
+/// are profitable to layout together in order to maximize fallthrough
+/// probabilities and code locality. We also can use a block chain to represent
+/// a sequence of basic blocks which have some external (correctness)
+/// requirement for sequential layout.
+///
+/// Chains can be built around a single basic block and can be merged to grow
+/// them. They participate in a block-to-chain mapping, which is updated
+/// automatically as chains are merged together.
+class BlockChain {
+ /// \brief The sequence of blocks belonging to this chain.
+ ///
+ /// This is the sequence of blocks for a particular chain. These will be laid
+ /// out in-order within the function.
+ SmallVector<MachineBasicBlock *, 4> Blocks;
+
+ /// \brief A handle to the function-wide basic block to block chain mapping.
+ ///
+ /// This is retained in each block chain to simplify the computation of child
+ /// block chains for SCC-formation and iteration. We store the edges to child
+ /// basic blocks, and map them back to their associated chains using this
+ /// structure.
+ BlockToChainMapType &BlockToChain;
+
+public:
+ /// \brief Construct a new BlockChain.
+ ///
+ /// This builds a new block chain representing a single basic block in the
+ /// function. It also registers itself as the chain that block participates
+ /// in with the BlockToChain mapping.
+ BlockChain(BlockToChainMapType &BlockToChain, MachineBasicBlock *BB)
+ : Blocks(1, BB), BlockToChain(BlockToChain), UnscheduledPredecessors(0) {
+ assert(BB && "Cannot create a chain with a null basic block");
+ BlockToChain[BB] = this;
+ }
+
+ /// \brief Iterator over blocks within the chain.
+ typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+
+ /// \brief Beginning of blocks within the chain.
+ iterator begin() { return Blocks.begin(); }
+
+ /// \brief End of blocks within the chain.
+ iterator end() { return Blocks.end(); }
+
+ /// \brief Merge a block chain into this one.
+ ///
+ /// This routine merges a block chain into this one. It takes care of forming
+ /// a contiguous sequence of basic blocks, updating the edge list, and
+ /// updating the block -> chain mapping. It does not free or tear down the
+ /// old chain, but the old chain's block list is no longer valid.
+ void merge(MachineBasicBlock *BB, BlockChain *Chain) {
+ assert(BB);
+ assert(!Blocks.empty());
+
+ // Fast path in case we don't have a chain already.
+ if (!Chain) {
+ assert(!BlockToChain[BB]);
+ Blocks.push_back(BB);
+ BlockToChain[BB] = this;
+ return;
+ }
+
+ assert(BB == *Chain->begin());
+ assert(Chain->begin() != Chain->end());
+
+ // Update the incoming blocks to point to this chain, and add them to the
+ // chain structure.
+ for (MachineBasicBlock *ChainBB : *Chain) {
+ Blocks.push_back(ChainBB);
+ assert(BlockToChain[ChainBB] == Chain && "Incoming blocks not in chain");
+ BlockToChain[ChainBB] = this;
+ }
+ }
+
+#ifndef NDEBUG
+ /// \brief Dump the blocks in this chain.
+ LLVM_DUMP_METHOD void dump() {
+ for (MachineBasicBlock *MBB : *this)
+ MBB->dump();
+ }
+#endif // NDEBUG
+
+ /// \brief Count of predecessors of any block within the chain which have not
+ /// yet been scheduled. In general, we will delay scheduling this chain
+ /// until those predecessors are scheduled (or we find a sufficiently good
+ /// reason to override this heuristic.) Note that when forming loop chains,
+ /// blocks outside the loop are ignored and treated as if they were already
+ /// scheduled.
+ ///
+ /// Note: This field is reinitialized multiple times - once for each loop,
+ /// and then once for the function as a whole.
+ unsigned UnscheduledPredecessors;
+};
+}
+
+namespace {
+class MachineBlockPlacement : public MachineFunctionPass {
+ /// \brief A typedef for a block filter set.
+ typedef SmallPtrSet<MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// \brief Work lists of blocks that are ready to be laid out.
+ SmallVector<MachineBasicBlock *, 16> BlockWorkList;
+ SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
+
+ /// \brief Machine Function
+ MachineFunction *F;
+
+ /// \brief A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI;
+
+ /// \brief A handle to the function-wide block frequency pass.
+ std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
+
+ /// \brief A handle to the loop info.
+ MachineLoopInfo *MLI;
+
+ /// \brief A handle to the target's instruction info.
+ const TargetInstrInfo *TII;
+
+ /// \brief A handle to the target's lowering info.
+ const TargetLoweringBase *TLI;
+
+  /// \brief A handle to the dominator tree.
+ MachineDominatorTree *MDT;
+
+  /// \brief A set of blocks that are unavoidably executed, i.e. they
+  /// dominate all terminators of the MachineFunction.
+ SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
+
+ /// \brief Allocator and owner of BlockChain structures.
+ ///
+ /// We build BlockChains lazily while processing the loop structure of
+ /// a function. To reduce malloc traffic, we allocate them using this
+ /// slab-like allocator, and destroy them after the pass completes. An
+ /// important guarantee is that this allocator produces stable pointers to
+ /// the chains.
+ SpecificBumpPtrAllocator<BlockChain> ChainAllocator;
+
+ /// \brief Function wide BasicBlock to BlockChain mapping.
+ ///
+ /// This mapping allows efficiently moving from any given basic block to the
+ /// BlockChain it participates in, if any. We use it to, among other things,
+ /// allow implicitly defining edges between chains as the existing edges
+ /// between basic blocks.
+ DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+
+ void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter = nullptr);
+ BranchProbability
+ collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors);
+ bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter,
+ BranchProbability SuccProb,
+ BranchProbability HotProb);
+ bool
+ hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
+ BlockChain &SuccChain, BranchProbability SuccProb,
+ BranchProbability RealSuccProb, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
+ MachineBasicBlock *
+ selectBestCandidateBlock(BlockChain &Chain,
+ SmallVectorImpl<MachineBasicBlock *> &WorkList);
+ MachineBasicBlock *
+ getFirstUnplacedBlock(const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter);
+
+ /// \brief Add a basic block to the work list if it is appropriate.
+ ///
+  /// If the optional parameter BlockFilter is provided, only MBBs
+ /// present in the set will be added to the worklist. If nullptr
+ /// is provided, no filtering occurs.
+ void fillWorkLists(MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter);
+ void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter = nullptr);
+ MachineBasicBlock *findBestLoopTop(MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ BlockFilterSet collectLoopBlockSet(MachineLoop &L);
+ void buildLoopChains(MachineLoop &L);
+ void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
+ const BlockFilterSet &LoopBlockSet);
+ void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
+ void collectMustExecuteBBs();
+ void buildCFGChains();
+ void optimizeBranches();
+ void alignBlocks();
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MachineBlockPlacement() : MachineFunctionPass(ID) {
+ initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char MachineBlockPlacement::ID = 0;
+char &llvm::MachineBlockPlacementID = MachineBlockPlacement::ID;
+INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
+ "Branch Probability Basic Block Placement", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
+ "Branch Probability Basic Block Placement", false, false)
+
+#ifndef NDEBUG
+/// \brief Helper to print the name of a MBB.
+///
+/// Only used by debug logging.
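+/// Produces strings like "BB#2 ('if.then')".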
+static std::string getBlockName(MachineBasicBlock *BB) {
+ std::string Result;
+ raw_string_ostream OS(Result);
+ OS << "BB#" << BB->getNumber();
+ OS << " ('" << BB->getName() << "')";
+ OS.flush();
+ return Result;
+}
+#endif
+
+/// \brief Mark a chain's successors as having one fewer preds.
+///
+/// When a chain is being merged into the "placed" chain, this routine will
+/// quickly walk the successors of each block in the chain and mark them as
+/// having one fewer active predecessor. It also adds any successors of this
+/// chain which reach the zero-predecessor state to the worklist passed in.
+void MachineBlockPlacement::markChainSuccessors(
+ BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+ const BlockFilterSet *BlockFilter) {
+ // Walk all the blocks in this chain, marking their successors as having
+ // a predecessor placed.
+ for (MachineBasicBlock *MBB : Chain) {
+ // Add any successors for which this is the only un-placed in-loop
+ // predecessor to the worklist as a viable candidate for CFG-neutral
+ // placement. No subsequent placement of this block will violate the CFG
+ // shape, so we get to use heuristics to choose a favorable placement.
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (BlockFilter && !BlockFilter->count(Succ))
+ continue;
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Disregard edges within a fixed chain, or edges to the loop header.
+ if (&Chain == &SuccChain || Succ == LoopHeaderBB)
+ continue;
+
+ // This is a cross-chain edge that is within the loop, so decrement the
+ // loop predecessor count of the destination chain.
+ if (SuccChain.UnscheduledPredecessors == 0 ||
+ --SuccChain.UnscheduledPredecessors > 0)
+ continue;
+
+      auto *NewBB = *SuccChain.begin();
+      if (NewBB->isEHPad())
+        EHPadWorkList.push_back(NewBB);
+      else
+        BlockWorkList.push_back(NewBB);
+ }
+ }
+}
+
+/// This helper function collects the set of successors of block
+/// \p BB that are allowed to be its layout successors, and return
+/// the total branch probability of edges from \p BB to those
+/// blocks.
+BranchProbability MachineBlockPlacement::collectViableSuccessors(
+ MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+ SmallVector<MachineBasicBlock *, 4> &Successors) {
+  // Adjust edge probabilities by excluding edges pointing to blocks that are
+  // either not in BlockFilter or are already in the current chain. Consider
+  // the following CFG:
+ //
+ // --->A
+ // | / \
+ // | B C
+ // | \ / \
+ // ----D E
+ //
+ // Assume A->C is very hot (>90%), and C->D has a 50% probability, then after
+ // A->C is chosen as a fall-through, D won't be selected as a successor of C
+ // due to CFG constraint (the probability of C->D is not greater than
+  // HotProb to break topo-order). If we exclude E, which is not in
+  // BlockFilter, when calculating the probability of C->D, D will be selected
+  // and we will get A C D B as the layout of this loop.
+ auto AdjustedSumProb = BranchProbability::getOne();
+ for (MachineBasicBlock *Succ : BB->successors()) {
+ bool SkipSucc = false;
+ if (Succ->isEHPad() || (BlockFilter && !BlockFilter->count(Succ))) {
+ SkipSucc = true;
+ } else {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (SuccChain == &Chain) {
+ SkipSucc = true;
+ } else if (Succ != *SuccChain->begin()) {
+ DEBUG(dbgs() << " " << getBlockName(Succ) << " -> Mid chain!\n");
+ continue;
+ }
+ }
+ if (SkipSucc)
+ AdjustedSumProb -= MBPI->getEdgeProbability(BB, Succ);
+ else
+ Successors.push_back(Succ);
+ }
+
+ return AdjustedSumProb;
+}
+
+/// The helper function returns the branch probability that is adjusted
+/// or normalized over the new total \p AdjustedSumProb.
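+/// For example, if the original edge probability is 30% but the viable
+/// successors together account for only 60% of the outgoing probability,
+/// the adjusted probability becomes 30% / 60% = 50%.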
+static BranchProbability
+getAdjustedProbability(BranchProbability OrigProb,
+ BranchProbability AdjustedSumProb) {
+ BranchProbability SuccProb;
+ uint32_t SuccProbN = OrigProb.getNumerator();
+ uint32_t SuccProbD = AdjustedSumProb.getNumerator();
+ if (SuccProbN >= SuccProbD)
+ SuccProb = BranchProbability::getOne();
+ else
+ SuccProb = BranchProbability(SuccProbN, SuccProbD);
+
+ return SuccProb;
+}
+
+/// When the option OutlineOptionalBranches is on, this method
+/// checks if the fallthrough candidate block \p Succ (of block
+/// \p BB) also has other unscheduled predecessor blocks which
+/// are also successors of \p BB (forming triangular shape CFG).
+/// If none of those predecessors is small, it returns true.
+/// The caller can choose to select \p Succ as the layout successor
+/// so that \p Succ's predecessors (optional branches) can be
+/// outlined.
+/// FIXME: fold this with more general layout cost analysis.
+bool MachineBlockPlacement::shouldPredBlockBeOutlined(
+ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
+ BranchProbability HotProb) {
+ if (!OutlineOptionalBranches)
+ return false;
+ // If we outline optional branches, look whether Succ is unavoidable, i.e.
+ // dominates all terminators of the MachineFunction. If it does, other
+ // successors must be optional. Don't do this for cold branches.
+ if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // Check whether there is an unplaced optional branch.
+ if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
+ BlockToChain[Pred] == &Chain)
+ continue;
+ // Check whether the optional branch has exactly one BB.
+ if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+ continue;
+ // Check whether the optional branch is small.
+ if (Pred->size() < OutlineOptionalThreshold)
+ return false;
+ }
+ return true;
+ } else
+ return false;
+}
+
+// When profile is not present, return the StaticLikelyProb.
+// When profile is available, we need to handle the triangle-shape CFG.
+static BranchProbability getLayoutSuccessorProbThreshold(
+ MachineBasicBlock *BB) {
+ if (!BB->getParent()->getFunction()->getEntryCount())
+ return BranchProbability(StaticLikelyProb, 100);
+ if (BB->succ_size() == 2) {
+ const MachineBasicBlock *Succ1 = *BB->succ_begin();
+ const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
+ if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
+      /* See case 1 below for the cost analysis. For BB->Succ to
+       * be taken with smaller cost, the following needs to hold:
+       *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
+       * Since Prob(BB->Succ) + Prob(BB->Pred) == 1, the threshold T on
+       * Prob(BB->Succ) satisfies T = 2 * (1 - T), i.e. T = 2/3. Scaling by
+       * the user-specified branch bias ProfileLikelyProb (in percent, where
+       * 50 means no bias), we have
+       *   T = (2/3) * (ProfileLikelyProb / 50)
+       *     = (2 * ProfileLikelyProb) / 150
+       */
+ return BranchProbability(2 * ProfileLikelyProb, 150);
+ }
+ }
+ return BranchProbability(ProfileLikelyProb, 100);
+}
+
+/// Checks to see if the layout candidate block \p Succ has a better layout
+/// predecessor than \c BB. If yes, returns true.
+bool MachineBlockPlacement::hasBetterLayoutPredecessor(
+ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
+ BranchProbability SuccProb, BranchProbability RealSuccProb,
+ BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+
+ // There isn't a better layout when there are no unscheduled predecessors.
+ if (SuccChain.UnscheduledPredecessors == 0)
+ return false;
+
+ // There are two basic scenarios here:
+ // -------------------------------------
+ // Case 1: triangular shape CFG (if-then):
+ // BB
+ // | \
+ // | \
+ // | Pred
+ // | /
+ // Succ
+  // In this case, we are evaluating whether to select edge BB->Succ, i.e.
+  // set Succ as the layout successor of BB. Picking Succ as BB's layout
+  // successor breaks the CFG constraints (FIXME: define these constraints).
+  // With this layout, block Pred is forced to be outlined, so the overall
+  // cost will be the cost of the taken branch from BB to Pred, plus the cost
+  // of the taken back branch from Pred to Succ, as well as the additional
+  // cost associated with the needed unconditional jump instruction from Pred
+  // to Succ.
+
+ // The cost of the topological order layout is the taken branch cost
+ // from BB to Succ, so to make BB->Succ a viable candidate, the following
+ // must hold:
+ // 2 * freq(BB->Pred) * taken_branch_cost + unconditional_jump_cost
+ // < freq(BB->Succ) * taken_branch_cost.
+ // Ignoring unconditional jump cost, we get
+ // freq(BB->Succ) > 2 * freq(BB->Pred), i.e.,
+ // prob(BB->Succ) > 2 * prob(BB->Pred)
+ //
+ // When real profile data is available, we can precisely compute the
+ // probability threshold that is needed for edge BB->Succ to be considered.
+ // Without profile data, the heuristic requires the branch bias to be
+ // a lot larger to make sure the signal is very strong (e.g. 80% default).
+ // -----------------------------------------------------------------
+ // Case 2: diamond like CFG (if-then-else):
+ // S
+ // / \
+ // | \
+ // BB Pred
+ // \ /
+ // Succ
+ // ..
+ //
+ // The current block is BB and edge BB->Succ is now being evaluated.
+ // Note that edge S->BB was previously already selected because
+ // prob(S->BB) > prob(S->Pred).
+ // At this point, 2 blocks can be placed after BB: Pred or Succ. If we
+ // choose Pred, we will have a topological ordering as shown on the left
+ // in the picture below. If we choose Succ, we have the solution as shown
+ // on the right:
+ //
+ // topo-order:
+ //
+ // S----- ---S
+ // | | | |
+ // ---BB | | BB
+ // | | | |
+ // | pred-- | Succ--
+ // | | | |
+ // ---succ ---pred--
+ //
+ // cost = freq(S->Pred) + freq(BB->Succ) cost = 2 * freq (S->Pred)
+ // = freq(S->Pred) + freq(S->BB)
+ //
+  // If we have profile data (i.e., branch probabilities can be trusted), the
+ // cost (number of taken branches) with layout S->BB->Succ->Pred is 2 *
+ // freq(S->Pred) while the cost of topo order is freq(S->Pred) + freq(S->BB).
+ // We know Prob(S->BB) > Prob(S->Pred), so freq(S->BB) > freq(S->Pred), which
+ // means the cost of topological order is greater.
+  // When profile data is not available, however, we need to be more
+  // conservative. If the branch prediction is wrong, breaking the topo-order
+  // will actually yield a layout with large cost. For this reason, we need a
+  // strongly biased branch at block S with Prob(S->BB) in order to select
+  // BB->Succ. This is equivalent to looking at the CFG backward along the
+  // backward edge: Prob(Succ->BB) needs to be >= HotProb in order to be
+  // selected (without profile data).
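+  //
+  // For example, without profile data and with the default 80% static
+  // threshold, BB->Succ in case 1 is only allowed to break the topological
+  // order when its adjusted probability is at least 80%, i.e. when the
+  // adjusted prob(BB->Pred) is at most 20%.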
+
+ BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
+
+ // Forward checking. For case 2, SuccProb will be 1.
+ if (SuccProb < HotProb) {
+ DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
+ << " (prob) (CFG conflict)\n");
+ return true;
+ }
+
+ // Make sure that a hot successor doesn't have a globally more
+ // important predecessor.
+ BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
+ bool BadCFGConflict = false;
+
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
+ (BlockFilter && !BlockFilter->count(Pred)) ||
+ BlockToChain[Pred] == &Chain)
+ continue;
+    // Do backward checking. For case 1, it is actually a redundant check. For
+    // case 2 above, we need backward checking to filter out edges that are
+ // not 'strongly' biased. With profile data available, the check is mostly
+ // redundant too (when threshold prob is set at 50%) unless S has more than
+ // two successors.
+ // BB Pred
+ // \ /
+ // Succ
+ // We select edge BB->Succ if
+ // freq(BB->Succ) > freq(Succ) * HotProb
+ // i.e. freq(BB->Succ) > freq(BB->Succ) * HotProb + freq(Pred->Succ) *
+ // HotProb
+    // i.e. freq(BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
+ BlockFrequency PredEdgeFreq =
+ MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+ if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
+ BadCFGConflict = true;
+ break;
+ }
+ }
+
+ if (BadCFGConflict) {
+ DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
+ << " (prob) (non-cold CFG conflict)\n");
+ return true;
+ }
+
+ return false;
+}
+
+/// \brief Select the best successor for a block.
+///
+/// This looks across all successors of a particular block and attempts to
+/// select the "best" one to be the layout successor. It only considers direct
+/// successors which also pass the block filter. It will attempt to avoid
+/// breaking CFG structure, but cave and break such structures in the case of
+/// very hot successor edges.
+///
+/// \returns The best successor block found, or null if none are viable.
+MachineBasicBlock *
+MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+ const BranchProbability HotProb(StaticLikelyProb, 100);
+
+ MachineBasicBlock *BestSucc = nullptr;
+ auto BestProb = BranchProbability::getZero();
+
+ SmallVector<MachineBasicBlock *, 4> Successors;
+ auto AdjustedSumProb =
+ collectViableSuccessors(BB, Chain, BlockFilter, Successors);
+
+ DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n");
+ for (MachineBasicBlock *Succ : Successors) {
+ auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
+ BranchProbability SuccProb =
+ getAdjustedProbability(RealSuccProb, AdjustedSumProb);
+
+ // This heuristic is off by default.
+ if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
+ HotProb))
+ return Succ;
+
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Skip the edge \c BB->Succ if block \c Succ has a better layout
+ // predecessor that yields lower global cost.
+ if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
+ Chain, BlockFilter))
+ continue;
+
+ DEBUG(
+ dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb
+ << " (prob)"
+ << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
+ << "\n");
+ if (BestSucc && BestProb >= SuccProb)
+ continue;
+ BestSucc = Succ;
+ BestProb = SuccProb;
+ }
+ return BestSucc;
+}
+
+/// \brief Select the best block from a worklist.
+///
+/// This looks through the provided worklist as a list of candidate basic
+/// blocks and selects the most profitable one to place. The definition of
+/// profitable only really makes sense in the context of a loop. This returns
+/// the most frequently visited block in the worklist, which in the case of
+/// a loop, is the one most desirable to be physically close to the rest of the
+/// loop body in order to improve i-cache behavior.
+///
+/// \returns The best block found, or null if none are viable.
+MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
+ BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+ // Once we need to walk the worklist looking for a candidate, cleanup the
+ // worklist of already placed entries.
+ // FIXME: If this shows up on profiles, it could be folded (at the cost of
+ // some code complexity) into the loop below.
+ WorkList.erase(std::remove_if(WorkList.begin(), WorkList.end(),
+ [&](MachineBasicBlock *BB) {
+ return BlockToChain.lookup(BB) == &Chain;
+ }),
+ WorkList.end());
+
+ if (WorkList.empty())
+ return nullptr;
+
+ bool IsEHPad = WorkList[0]->isEHPad();
+
+ MachineBasicBlock *BestBlock = nullptr;
+ BlockFrequency BestFreq;
+ for (MachineBasicBlock *MBB : WorkList) {
+ assert(MBB->isEHPad() == IsEHPad);
+
+ BlockChain &SuccChain = *BlockToChain[MBB];
+ if (&SuccChain == &Chain)
+ continue;
+
+    assert(SuccChain.UnscheduledPredecessors == 0 &&
+           "Found CFG-violating block");
+
+ BlockFrequency CandidateFreq = MBFI->getBlockFreq(MBB);
+ DEBUG(dbgs() << " " << getBlockName(MBB) << " -> ";
+ MBFI->printBlockFreq(dbgs(), CandidateFreq) << " (freq)\n");
+
+    // For EH pads, we lay out the least probable ones first so as to avoid
+    // jumping back from less probable landing pads to more probable ones.
+ //
+ // FIXME: Using probability is probably (!) not the best way to achieve
+ // this. We should probably have a more principled approach to layout
+ // cleanup code.
+ //
+ // The goal is to get:
+ //
+ // +--------------------------+
+ // | V
+ // InnerLp -> InnerCleanup OuterLp -> OuterCleanup -> Resume
+ //
+ // Rather than:
+ //
+ // +-------------------------------------+
+ // V |
+ // OuterLp -> OuterCleanup -> Resume InnerLp -> InnerCleanup
+ if (BestBlock && (IsEHPad ^ (BestFreq >= CandidateFreq)))
+ continue;
+
+ BestBlock = MBB;
+ BestFreq = CandidateFreq;
+ }
+
+ return BestBlock;
+}
+
+/// \brief Retrieve the first unplaced basic block.
+///
+/// This routine is called when we are unable to use the CFG to walk through
+/// all of the basic blocks and form a chain due to unnatural loops in the CFG.
+/// We walk through the function's blocks in order, starting from the
+/// PrevUnplacedBlockIt. We update this iterator on each call to avoid
+/// re-scanning the entire sequence on repeated calls to this routine.
+MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
+ const BlockChain &PlacedChain,
+ MachineFunction::iterator &PrevUnplacedBlockIt,
+ const BlockFilterSet *BlockFilter) {
+ for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F->end(); I != E;
+ ++I) {
+ if (BlockFilter && !BlockFilter->count(&*I))
+ continue;
+ if (BlockToChain[&*I] != &PlacedChain) {
+ PrevUnplacedBlockIt = I;
+ // Now select the head of the chain to which the unplaced block belongs
+ // as the block to place. This will force the entire chain to be placed,
+ // and satisfies the requirements of merging chains.
+ return *BlockToChain[&*I]->begin();
+ }
+ }
+ return nullptr;
+}
+
+void MachineBlockPlacement::fillWorkLists(
+ MachineBasicBlock *MBB,
+ SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
+ const BlockFilterSet *BlockFilter = nullptr) {
+ BlockChain &Chain = *BlockToChain[MBB];
+ if (!UpdatedPreds.insert(&Chain).second)
+ return;
+
+ assert(Chain.UnscheduledPredecessors == 0);
+ for (MachineBasicBlock *ChainBB : Chain) {
+ assert(BlockToChain[ChainBB] == &Chain);
+ for (MachineBasicBlock *Pred : ChainBB->predecessors()) {
+ if (BlockFilter && !BlockFilter->count(Pred))
+ continue;
+ if (BlockToChain[Pred] == &Chain)
+ continue;
+ ++Chain.UnscheduledPredecessors;
+ }
+ }
+
+ if (Chain.UnscheduledPredecessors != 0)
+ return;
+
+ MBB = *Chain.begin();
+ if (MBB->isEHPad())
+ EHPadWorkList.push_back(MBB);
+ else
+ BlockWorkList.push_back(MBB);
+}
+
+void MachineBlockPlacement::buildChain(
+ MachineBasicBlock *BB, BlockChain &Chain,
+ const BlockFilterSet *BlockFilter) {
+ assert(BB && "BB must not be null.\n");
+ assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+ MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
+
+ MachineBasicBlock *LoopHeaderBB = BB;
+ markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
+ BB = *std::prev(Chain.end());
+ for (;;) {
+ assert(BB && "null block found at end of chain in loop.");
+ assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
+ assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
+
+ // Look for the best viable successor if there is one to place immediately
+ // after this block.
+ MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+
+ // If an immediate successor isn't available, look for the best viable
+ // block among those we've identified as not violating the loop's CFG at
+ // this point. This won't be a fallthrough, but it will increase locality.
+ if (!BestSucc)
+ BestSucc = selectBestCandidateBlock(Chain, BlockWorkList);
+ if (!BestSucc)
+ BestSucc = selectBestCandidateBlock(Chain, EHPadWorkList);
+
+ if (!BestSucc) {
+ BestSucc = getFirstUnplacedBlock(Chain, PrevUnplacedBlockIt, BlockFilter);
+ if (!BestSucc)
+ break;
+
+ DEBUG(dbgs() << "Unnatural loop CFG detected, forcibly merging the "
+ "layout successor until the CFG reduces\n");
+ }
+
+ // Place this block, updating the datastructures to reflect its placement.
+ BlockChain &SuccChain = *BlockToChain[BestSucc];
+    // Zero out UnscheduledPredecessors for the successor we're about to merge
+    // in case we selected a successor that didn't fit naturally into the CFG.
+ SuccChain.UnscheduledPredecessors = 0;
+ DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
+ << getBlockName(BestSucc) << "\n");
+ markChainSuccessors(SuccChain, LoopHeaderBB, BlockFilter);
+ Chain.merge(BestSucc, &SuccChain);
+ BB = *std::prev(Chain.end());
+ }
+
+ DEBUG(dbgs() << "Finished forming chain for header block "
+ << getBlockName(*Chain.begin()) << "\n");
+}
+
+/// \brief Find the best loop top block for layout.
+///
+/// Look for a block which is strictly better than the loop header for laying
+/// out at the top of the loop. This looks for one and only one pattern:
+/// a latch block with no conditional exit. This block will cause a conditional
+/// jump around it or will be the bottom of the loop if we lay it out in place,
+/// but if it doesn't end up at the bottom of the loop for any reason,
+/// rotation alone won't fix it. Because such a block will always result in an
+/// unconditional jump (for the backedge), rotating it in front of the loop
+/// header is always profitable.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
+ // Check that the header hasn't been fused with a preheader block due to
+ // crazy branches. If it has, we need to start with the header at the top to
+ // prevent pulling the preheader into the loop body.
+ BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+ if (!LoopBlockSet.count(*HeaderChain.begin()))
+ return L.getHeader();
+
+ DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
+ << "\n");
+
+ BlockFrequency BestPredFreq;
+ MachineBasicBlock *BestPred = nullptr;
+ for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
+ if (!LoopBlockSet.count(Pred))
+ continue;
+ DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", "
+ << Pred->succ_size() << " successors, ";
+ MBFI->printBlockFreq(dbgs(), Pred) << " freq\n");
+ if (Pred->succ_size() > 1)
+ continue;
+
+ BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+ if (!BestPred || PredFreq > BestPredFreq ||
+ (!(PredFreq < BestPredFreq) &&
+ Pred->isLayoutSuccessor(L.getHeader()))) {
+ BestPred = Pred;
+ BestPredFreq = PredFreq;
+ }
+ }
+
+  // If no suitable direct predecessor was found, just use the loop header.
+ if (!BestPred) {
+ DEBUG(dbgs() << " final top unchanged\n");
+ return L.getHeader();
+ }
+
+ // Walk backwards through any straight line of predecessors.
+ while (BestPred->pred_size() == 1 &&
+ (*BestPred->pred_begin())->succ_size() == 1 &&
+ *BestPred->pred_begin() != L.getHeader())
+ BestPred = *BestPred->pred_begin();
+
+ DEBUG(dbgs() << " final top: " << getBlockName(BestPred) << "\n");
+ return BestPred;
+}
+
+/// \brief Find the best loop exiting block for layout.
+///
+/// This routine implements the logic to analyze the loop looking for the best
+/// block to use as the exiting block so that it can be rotated to the bottom
+/// of the loop. Typically this is done to maximize fallthrough opportunities
+/// out of the loop.
+MachineBasicBlock *
+MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
+ // We don't want to layout the loop linearly in all cases. If the loop header
+ // is just a normal basic block in the loop, we want to look for what block
+ // within the loop is the best one to layout at the top. However, if the loop
+  // header has been pre-merged into a chain due to predecessors not having
+ // analyzable branches, *and* the predecessor it is merged with is *not* part
+ // of the loop, rotating the header into the middle of the loop will create
+ // a non-contiguous range of blocks which is Very Bad. So start with the
+ // header and only rotate if safe.
+ BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
+ if (!LoopBlockSet.count(*HeaderChain.begin()))
+ return nullptr;
+
+ BlockFrequency BestExitEdgeFreq;
+ unsigned BestExitLoopDepth = 0;
+ MachineBasicBlock *ExitingBB = nullptr;
+ // If there are exits to outer loops, loop rotation can severely limit
+ // fallthrough opportunities unless it selects such an exit. Keep a set of
+ // blocks where rotating to exit with that block will reach an outer loop.
+ SmallPtrSet<MachineBasicBlock *, 4> BlocksExitingToOuterLoop;
+
+ DEBUG(dbgs() << "Finding best loop exit for: " << getBlockName(L.getHeader())
+ << "\n");
+ for (MachineBasicBlock *MBB : L.getBlocks()) {
+ BlockChain &Chain = *BlockToChain[MBB];
+ // Ensure that this block is at the end of a chain; otherwise it could be
+ // mid-way through an inner loop or a successor of an unanalyzable branch.
+ if (MBB != *std::prev(Chain.end()))
+ continue;
+
+ // Now walk the successors. We need to establish whether this has a viable
+ // exiting successor and whether it has a viable non-exiting successor.
+ // We store the old exiting state and restore it if a viable looping
+ // successor isn't found.
+ MachineBasicBlock *OldExitingBB = ExitingBB;
+ BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
+ bool HasLoopingSucc = false;
+ for (MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ->isEHPad())
+ continue;
+ if (Succ == MBB)
+ continue;
+ BlockChain &SuccChain = *BlockToChain[Succ];
+ // Don't split chains, either this chain or the successor's chain.
+ if (&Chain == &SuccChain) {
+ DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (chain conflict)\n");
+ continue;
+ }
+
+ auto SuccProb = MBPI->getEdgeProbability(MBB, Succ);
+ if (LoopBlockSet.count(Succ)) {
+ DEBUG(dbgs() << " looping: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " (" << SuccProb << ")\n");
+ HasLoopingSucc = true;
+ continue;
+ }
+
+ unsigned SuccLoopDepth = 0;
+ if (MachineLoop *ExitLoop = MLI->getLoopFor(Succ)) {
+ SuccLoopDepth = ExitLoop->getLoopDepth();
+ if (ExitLoop->contains(&L))
+ BlocksExitingToOuterLoop.insert(MBB);
+ }
+
+ BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(MBB) * SuccProb;
+ DEBUG(dbgs() << " exiting: " << getBlockName(MBB) << " -> "
+ << getBlockName(Succ) << " [L:" << SuccLoopDepth << "] (";
+ MBFI->printBlockFreq(dbgs(), ExitEdgeFreq) << ")\n");
+ // Note that we bias this toward an existing layout successor to retain
+ // incoming order in the absence of better information. The exit must have
+ // a frequency higher than the current exit before we consider breaking
+ // the layout.
+ BranchProbability Bias(100 - ExitBlockBias, 100);
+ if (!ExitingBB || SuccLoopDepth > BestExitLoopDepth ||
+ ExitEdgeFreq > BestExitEdgeFreq ||
+ (MBB->isLayoutSuccessor(Succ) &&
+ !(ExitEdgeFreq < BestExitEdgeFreq * Bias))) {
+ BestExitEdgeFreq = ExitEdgeFreq;
+ ExitingBB = MBB;
+ }
+ }
+
+ if (!HasLoopingSucc) {
+ // Restore the old exiting state, no viable looping successor was found.
+ ExitingBB = OldExitingBB;
+ BestExitEdgeFreq = OldBestExitEdgeFreq;
+ }
+ }
+ // Without a candidate exiting block or with only a single block in the
+ // loop, just use the loop header to layout the loop.
+ if (!ExitingBB || L.getNumBlocks() == 1)
+ return nullptr;
+
+ // Also, if we have exit blocks which lead to outer loops but didn't select
+ // one of them as the exiting block we are rotating toward, disable loop
+ // rotation altogether.
+ if (!BlocksExitingToOuterLoop.empty() &&
+ !BlocksExitingToOuterLoop.count(ExitingBB))
+ return nullptr;
+
+ DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n");
+ return ExitingBB;
+}
+
+/// \brief Attempt to rotate an exiting block to the bottom of the loop.
+///
+/// Once we have built a chain, try to rotate it to line up the hot exit block
+/// with fallthrough out of the loop if doing so doesn't introduce unnecessary
+/// branches. For example, if the loop has fallthrough into its header and out
+/// of its bottom already, don't rotate it.
+void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
+ MachineBasicBlock *ExitingBB,
+ const BlockFilterSet &LoopBlockSet) {
+ if (!ExitingBB)
+ return;
+
+ MachineBasicBlock *Top = *LoopChain.begin();
+ bool ViableTopFallthrough = false;
+ for (MachineBasicBlock *Pred : Top->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!LoopBlockSet.count(Pred) &&
+ (!PredChain || Pred == *std::prev(PredChain->end()))) {
+ ViableTopFallthrough = true;
+ break;
+ }
+ }
+
+ // If the header has viable fallthrough, check whether the current loop
+ // bottom is a viable exiting block. If so, bail out as rotating will
+ // introduce an unnecessary branch.
+ if (ViableTopFallthrough) {
+ MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
+ for (MachineBasicBlock *Succ : Bottom->successors()) {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (!LoopBlockSet.count(Succ) &&
+ (!SuccChain || Succ == *SuccChain->begin()))
+ return;
+ }
+ }
+
+ BlockChain::iterator ExitIt =
+ std::find(LoopChain.begin(), LoopChain.end(), ExitingBB);
+ if (ExitIt == LoopChain.end())
+ return;
+
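+  // Rotate the chain so that ExitingBB ends up at the bottom. For example,
+  // rotating the chain [Header, A, Exit, B] around the block following Exit
+  // yields [B, Header, A, Exit].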
+ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
+}
+
+/// \brief Attempt to rotate a loop based on profile data to reduce branch cost.
+///
+/// With profile data, we can determine the cost in terms of missed fall through
+/// opportunities when rotating a loop chain and select the best rotation.
+/// Basically, there are three kinds of cost to consider for each rotation:
+/// 1. The possibly missed fall through edge (if it exists) from BB out of
+/// the loop to the loop header.
+/// 2. The possibly missed fall through edges (if they exist) from the loop
+/// exits to BB out of the loop.
+/// 3. The missed fall through edge (if it exists) from the last BB to the
+/// first BB in the loop chain.
+/// Therefore, the cost for a given rotation is the sum of costs listed above.
+/// We select the best rotation with the smallest cost.
+void MachineBlockPlacement::rotateLoopWithProfile(
+ BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
+ auto HeaderBB = L.getHeader();
+ auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB);
+ auto RotationPos = LoopChain.end();
+
+ BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency();
+
+ // A utility lambda that scales up a block frequency by dividing it by a
+ // branch probability which is the reciprocal of the scale.
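+  // For example, ScaleBlockFrequency(Freq, MisfetchCost) yields
+  // Freq * MisfetchCost, saturating at the maximum block frequency.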
+ auto ScaleBlockFrequency = [](BlockFrequency Freq,
+ unsigned Scale) -> BlockFrequency {
+ if (Scale == 0)
+ return 0;
+ // Use operator / between BlockFrequency and BranchProbability to implement
+ // saturating multiplication.
+ return Freq / BranchProbability(1, Scale);
+ };
+
+ // Compute the cost of the missed fall-through edge to the loop header if the
+  // chain head is not the loop header. As we only consider natural loops with
+  // a single header, this computation can be done only once.
+ BlockFrequency HeaderFallThroughCost(0);
+ for (auto *Pred : HeaderBB->predecessors()) {
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (!LoopBlockSet.count(Pred) &&
+ (!PredChain || Pred == *std::prev(PredChain->end()))) {
+ auto EdgeFreq =
+ MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB);
+ auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost);
+ // If the predecessor has only an unconditional jump to the header, we
+ // need to consider the cost of this jump.
+ if (Pred->succ_size() == 1)
+ FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost);
+ HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost);
+ }
+ }
+
+ // Here we collect all exit blocks in the loop, and for each exit we find out
+ // its hottest exit edge. For each loop rotation, we define the loop exit cost
+ // as the sum of frequencies of exit edges we collect here, excluding the exit
+ // edge from the tail of the loop chain.
+ SmallVector<std::pair<MachineBasicBlock *, BlockFrequency>, 4> ExitsWithFreq;
+ for (auto BB : LoopChain) {
+ auto LargestExitEdgeProb = BranchProbability::getZero();
+ for (auto *Succ : BB->successors()) {
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (!LoopBlockSet.count(Succ) &&
+ (!SuccChain || Succ == *SuccChain->begin())) {
+ auto SuccProb = MBPI->getEdgeProbability(BB, Succ);
+ LargestExitEdgeProb = std::max(LargestExitEdgeProb, SuccProb);
+ }
+ }
+ if (LargestExitEdgeProb > BranchProbability::getZero()) {
+ auto ExitFreq = MBFI->getBlockFreq(BB) * LargestExitEdgeProb;
+ ExitsWithFreq.emplace_back(BB, ExitFreq);
+ }
+ }
+
+  // In this loop we iterate over every block in the loop chain and calculate
+  // the cost assuming the block is the head of the loop chain. When the loop
+  // ends, we should have found the best candidate as the loop chain's head.
+ for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()),
+ EndIter = LoopChain.end();
+ Iter != EndIter; Iter++, TailIter++) {
+ // TailIter is used to track the tail of the loop chain if the block we are
+ // checking (pointed by Iter) is the head of the chain.
+ if (TailIter == LoopChain.end())
+ TailIter = LoopChain.begin();
+
+ auto TailBB = *TailIter;
+
+ // Calculate the cost by putting this BB to the top.
+ BlockFrequency Cost = 0;
+
+ // If the current BB is the loop header, we need to take into account the
+ // cost of the missed fall through edge from outside of the loop to the
+ // header.
+ if (Iter != HeaderIter)
+ Cost += HeaderFallThroughCost;
+
+ // Collect the loop exit cost by summing up frequencies of all exit edges
+ // except the one from the chain tail.
+ for (auto &ExitWithFreq : ExitsWithFreq)
+ if (TailBB != ExitWithFreq.first)
+ Cost += ExitWithFreq.second;
+
+ // The cost of breaking the once fall-through edge from the tail to the top
+ // of the loop chain. Here we need to consider three cases:
+ // 1. If the tail node has only one successor, then we will get an
+ // additional jmp instruction. So the cost here is (MisfetchCost +
+ // JumpInstCost) * tail node frequency.
+ // 2. If the tail node has two successors, then we may still get an
+ // additional jmp instruction if the layout successor after the loop
+ // chain is not its CFG successor. Note that the more frequently executed
+    //    jmp instruction will be put ahead of the other one. Assume the
+    //    probabilities of those two branches are x and y, where x is the
+    //    probability of the edge to the chain head; then the cost will be
+    //    (x * MisfetchCost + min(x, y) * JumpInstCost) * tail node frequency.
+ // 3. If the tail node has more than two successors (this rarely happens),
+ // we won't consider any additional cost.
+ if (TailBB->isSuccessor(*Iter)) {
+ auto TailBBFreq = MBFI->getBlockFreq(TailBB);
+ if (TailBB->succ_size() == 1)
+ Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(),
+ MisfetchCost + JumpInstCost);
+ else if (TailBB->succ_size() == 2) {
+ auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter);
+ auto TailToHeadFreq = TailBBFreq * TailToHeadProb;
+ auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2)
+ ? TailBBFreq * TailToHeadProb.getCompl()
+ : TailToHeadFreq;
+ Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) +
+ ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost);
+ }
+ }
+
+ DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockName(*Iter)
+ << " to the top: " << Cost.getFrequency() << "\n");
+
+ if (Cost < SmallestRotationCost) {
+ SmallestRotationCost = Cost;
+ RotationPos = Iter;
+ }
+ }
+
+ if (RotationPos != LoopChain.end()) {
+ DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos)
+ << " to the top\n");
+ std::rotate(LoopChain.begin(), RotationPos, LoopChain.end());
+ }
+}
+
+/// \brief Collect blocks in the given loop that are to be placed.
+///
+/// When profile data is available, exclude cold blocks from the returned set;
+/// otherwise, collect all blocks in the loop.
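+/// For example, with profile data and the default LoopToColdBlockRatio of 5,
+/// a block is left out of the loop chain when its frequency is (roughly) less
+/// than one fifth of the loop's total entry frequency.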
+MachineBlockPlacement::BlockFilterSet
+MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
+ BlockFilterSet LoopBlockSet;
+
+ // Filter cold blocks off from LoopBlockSet when profile data is available.
+ // Collect the sum of frequencies of incoming edges to the loop header from
+ // outside. If we treat the loop as a super block, this is the frequency of
+  // the loop. Then for each block in the loop, we calculate the ratio between
+  // its frequency and the frequency of the loop. When this ratio is too
+  // small, don't add the block to the loop chain. If there are outer loops,
+  // then this block
+ // will be merged into the first outer loop chain for which this block is not
+ // cold anymore. This needs precise profile data and we only do this when
+ // profile data is available.
+ if (F->getFunction()->getEntryCount()) {
+ BlockFrequency LoopFreq(0);
+ for (auto LoopPred : L.getHeader()->predecessors())
+ if (!L.contains(LoopPred))
+ LoopFreq += MBFI->getBlockFreq(LoopPred) *
+ MBPI->getEdgeProbability(LoopPred, L.getHeader());
+
+ for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+ auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
+ if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBlockRatio)
+ continue;
+ LoopBlockSet.insert(LoopBB);
+ }
+ } else
+ LoopBlockSet.insert(L.block_begin(), L.block_end());
+
+ return LoopBlockSet;
+}
+
+/// \brief Forms basic block chains from the natural loop structures.
+///
+/// These chains are designed to preserve the existing *structure* of the code
+/// as much as possible. We can then stitch the chains together in a way which
+/// both preserves the topological structure and minimizes taken conditional
+/// branches.
+void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
+ // First recurse through any nested loops, building chains for those inner
+ // loops.
+ for (MachineLoop *InnerLoop : L)
+ buildLoopChains(*InnerLoop);
+
+ assert(BlockWorkList.empty());
+ assert(EHPadWorkList.empty());
+ BlockFilterSet LoopBlockSet = collectLoopBlockSet(L);
+
+  // Check if we have profile data for this function. If yes, we will rotate
+  // this loop by modeling costs more precisely, which requires profile data
+  // for a better layout.
+ bool RotateLoopWithProfile =
+ ForcePreciseRotationCost ||
+ (PreciseRotationCost && F->getFunction()->getEntryCount());
+
+ // First check to see if there is an obviously preferable top block for the
+ // loop. This will default to the header, but may end up as one of the
+ // predecessors to the header if there is one which will result in strictly
+ // fewer branches in the loop body.
+ // When we use profile data to rotate the loop, this is unnecessary.
+ MachineBasicBlock *LoopTop =
+ RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet);
+
+ // If we selected just the header for the loop top, look for a potentially
+ // profitable exit block in the event that rotating the loop can eliminate
+ // branches by placing an exit edge at the bottom.
+ MachineBasicBlock *ExitingBB = nullptr;
+ if (!RotateLoopWithProfile && LoopTop == L.getHeader())
+ ExitingBB = findBestLoopExit(L, LoopBlockSet);
+
+ BlockChain &LoopChain = *BlockToChain[LoopTop];
+
+ // FIXME: This is a really lame way of walking the chains in the loop: we
+ // walk the blocks, and use a set to prevent visiting a particular chain
+ // twice.
+ SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+ assert(LoopChain.UnscheduledPredecessors == 0);
+ UpdatedPreds.insert(&LoopChain);
+
+ for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
+
+ buildChain(LoopTop, LoopChain, &LoopBlockSet);
+
+ if (RotateLoopWithProfile)
+ rotateLoopWithProfile(LoopChain, L, LoopBlockSet);
+ else
+ rotateLoop(LoopChain, ExitingBB, LoopBlockSet);
+
+ DEBUG({
+ // Crash at the end so we get all of the debugging output first.
+ bool BadLoop = false;
+ if (LoopChain.UnscheduledPredecessors) {
+ BadLoop = true;
+ dbgs() << "Loop chain contains a block without its preds placed!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
+ }
+ for (MachineBasicBlock *ChainBB : LoopChain) {
+ dbgs() << " ... " << getBlockName(ChainBB) << "\n";
+ if (!LoopBlockSet.erase(ChainBB)) {
+ // We don't mark the loop as bad here because there are real situations
+ // where this can occur. For example, with an unanalyzable fallthrough
+ // from a loop block to a non-loop block or vice versa.
+ dbgs() << "Loop chain contains a block not contained by the loop!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+ << " Bad block: " << getBlockName(ChainBB) << "\n";
+ }
+ }
+
+ if (!LoopBlockSet.empty()) {
+ BadLoop = true;
+ for (MachineBasicBlock *LoopBB : LoopBlockSet)
+ dbgs() << "Loop contains blocks never placed into a chain!\n"
+ << " Loop header: " << getBlockName(*L.block_begin()) << "\n"
+ << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
+ << " Bad block: " << getBlockName(LoopBB) << "\n";
+ }
+ assert(!BadLoop && "Detected problems with the placement of this loop.");
+ });
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+/// When OutlineOptionalBranches is on, this method collects BBs that
+/// dominate all terminator blocks of the function \p F.
+void MachineBlockPlacement::collectMustExecuteBBs() {
+ if (OutlineOptionalBranches) {
+ // Find the nearest common dominator of all of F's terminators.
+ MachineBasicBlock *Terminator = nullptr;
+ for (MachineBasicBlock &MBB : *F) {
+ if (MBB.succ_size() == 0) {
+ if (Terminator == nullptr)
+ Terminator = &MBB;
+ else
+ Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
+ }
+ }
+
+ // MBBs dominating this common dominator are unavoidable.
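+ // For instance, in a hypothetical diamond CFG entry -> {then, else} ->
+ // join -> ret, the only terminator block is ret; entry, join and ret all
+ // dominate it and are therefore unavoidable, while then and else are not.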
+ UnavoidableBlocks.clear();
+ for (MachineBasicBlock &MBB : *F) {
+ if (MDT->dominates(&MBB, Terminator)) {
+ UnavoidableBlocks.insert(&MBB);
+ }
+ }
+ }
+}
+
+void MachineBlockPlacement::buildCFGChains() {
+ // Ensure that every BB in the function has an associated chain to simplify
+ // the assumptions of the remaining algorithm.
+ SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+ for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
+ ++FI) {
+ MachineBasicBlock *BB = &*FI;
+ BlockChain *Chain =
+ new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB);
+ // Also, merge any blocks which we cannot reason about and must preserve
+ // the exact fallthrough behavior for.
+ for (;;) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
+ break;
+
+ MachineFunction::iterator NextFI = std::next(FI);
+ MachineBasicBlock *NextBB = &*NextFI;
+ // Ensure that the layout successor is a viable block, as we know that
+ // fallthrough is a possibility.
+ assert(NextFI != FE && "Can't fallthrough past the last block.");
+ DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: "
+ << getBlockName(BB) << " -> " << getBlockName(NextBB)
+ << "\n");
+ Chain->merge(NextBB, nullptr);
+ FI = NextFI;
+ BB = NextBB;
+ }
+ }
+
+ // Turned on with the OutlineOptionalBranches option.
+ collectMustExecuteBBs();
+
+ // Build any loop-based chains.
+ for (MachineLoop *L : *MLI)
+ buildLoopChains(*L);
+
+ assert(BlockWorkList.empty());
+ assert(EHPadWorkList.empty());
+
+ SmallPtrSet<BlockChain *, 4> UpdatedPreds;
+ for (MachineBasicBlock &MBB : *F)
+ fillWorkLists(&MBB, UpdatedPreds);
+
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ buildChain(&F->front(), FunctionChain);
+
+#ifndef NDEBUG
+ typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType;
+#endif
+ DEBUG({
+ // Crash at the end so we get all of the debugging output first.
+ bool BadFunc = false;
+ FunctionBlockSetType FunctionBlockSet;
+ for (MachineBasicBlock &MBB : *F)
+ FunctionBlockSet.insert(&MBB);
+
+ for (MachineBasicBlock *ChainBB : FunctionChain)
+ if (!FunctionBlockSet.erase(ChainBB)) {
+ BadFunc = true;
+ dbgs() << "Function chain contains a block not in the function!\n"
+ << " Bad block: " << getBlockName(ChainBB) << "\n";
+ }
+
+ if (!FunctionBlockSet.empty()) {
+ BadFunc = true;
+ for (MachineBasicBlock *RemainingBB : FunctionBlockSet)
+ dbgs() << "Function contains blocks never placed into a chain!\n"
+ << " Bad block: " << getBlockName(RemainingBB) << "\n";
+ }
+ assert(!BadFunc && "Detected problems with the block placement.");
+ });
+
+ // Splice the blocks into place.
+ MachineFunction::iterator InsertPos = F->begin();
+ DEBUG(dbgs() << "[MBP] Function: "<< F->getName() << "\n");
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ DEBUG(dbgs() << (ChainBB == *FunctionChain.begin() ? "Placing chain "
+ : " ... ")
+ << getBlockName(ChainBB) << "\n");
+ if (InsertPos != MachineFunction::iterator(ChainBB))
+ F->splice(InsertPos, ChainBB);
+ else
+ ++InsertPos;
+
+ // Update the terminator of the previous block.
+ if (ChainBB == *FunctionChain.begin())
+ continue;
+ MachineBasicBlock *PrevBB = &*std::prev(MachineFunction::iterator(ChainBB));
+
+ // FIXME: It would be awesome if updateTerminator would just return rather
+ // than assert when the branch cannot be analyzed, in order to remove this
+ // boilerplate.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+
+ // The "PrevBB" is not yet updated to reflect current code layout, so,
+ // o. it may fall-through to a block without explicit "goto" instruction
+ // before layout, and no longer fall-through it after layout; or
+ // o. just opposite.
+ //
+ // analyzeBranch() may return erroneous value for FBB when these two
+ // situations take place. For the first scenario FBB is mistakenly set NULL;
+ // for the 2nd scenario, the FBB, which is expected to be NULL, is
+ // mistakenly pointing to "*BI".
+ // Thus, if the future change needs to use FBB before the layout is set, it
+ // has to correct FBB first by using the code similar to the following:
+ //
+ // if (!Cond.empty() && (!FBB || FBB == ChainBB)) {
+ // PrevBB->updateTerminator();
+ // Cond.clear();
+ // TBB = FBB = nullptr;
+ // if (TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ // // FIXME: This should never take place.
+ // TBB = FBB = nullptr;
+ // }
+ // }
+ if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
+ PrevBB->updateTerminator();
+ }
+
+ // Fixup the last block.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
+ F->back().updateTerminator();
+
+ BlockWorkList.clear();
+ EHPadWorkList.clear();
+}
+
+void MachineBlockPlacement::optimizeBranches() {
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+
+ // Now that all the basic blocks in the chain have the proper layout,
+ // make a final call to AnalyzeBranch with AllowModify set.
+ // Indeed, the target may be able to optimize the branches in a way we
+ // cannot because all branches may not be analyzable.
+ // E.g., the target may be able to remove an unconditional branch to
+ // a fallthrough when it occurs after predicated terminators.
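+ // As a conceptual sketch (block names and mnemonics are made up), a layout
+ // such as
+ //   ChainBB:  beq %cond, ColdBB    ; TBB = ColdBB, probability 0.1
+ //             b   HotBB            ; FBB = HotBB,  probability 0.9
+ // is rewritten, when the condition can be reversed, into
+ //   ChainBB:  bne %cond, HotBB
+ //             b   ColdBB
+ // so that the more probable successor is branched to first.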
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
+ // If ChainBB has a two-way branch, try to re-order the branches
+ // such that we branch to the successor with higher probability first.
+ if (TBB && !Cond.empty() && FBB &&
+ MBPI->getEdgeProbability(ChainBB, FBB) >
+ MBPI->getEdgeProbability(ChainBB, TBB) &&
+ !TII->ReverseBranchCondition(Cond)) {
+ DEBUG(dbgs() << "Reverse order of the two branches: "
+ << getBlockName(ChainBB) << "\n");
+ DEBUG(dbgs() << " Edge probability: "
+ << MBPI->getEdgeProbability(ChainBB, FBB) << " vs "
+ << MBPI->getEdgeProbability(ChainBB, TBB) << "\n");
+ DebugLoc dl; // FIXME: this is nowhere
+ TII->RemoveBranch(*ChainBB);
+ TII->InsertBranch(*ChainBB, FBB, TBB, Cond, dl);
+ ChainBB->updateTerminator();
+ }
+ }
+ }
+}
+
+void MachineBlockPlacement::alignBlocks() {
+ // Walk through the backedges of the function now that we have fully laid out
+ // the basic blocks and align the destination of each backedge. We don't rely
+ // exclusively on the loop info here so that we can align backedges in
+ // unnatural CFGs and backedges that were introduced purely because of the
+ // loop rotations done during this layout pass.
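+ // As a rough worked example of the checks below (numbers are made up): with
+ // an entry frequency of 1000, only blocks at least as hot as 200 (20% of
+ // entry) are considered; a loop block must also be at least 20% as hot as
+ // its loop header, and it is aligned either when its layout predecessor
+ // cannot fall through to it or when that edge carries at most 20% of the
+ // block's own frequency.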
+ if (F->getFunction()->optForSize())
+ return;
+ BlockChain &FunctionChain = *BlockToChain[&F->front()];
+ if (FunctionChain.begin() == FunctionChain.end())
+ return; // Empty chain.
+
+ const BranchProbability ColdProb(1, 5); // 20%
+ BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front());
+ BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb;
+ for (MachineBasicBlock *ChainBB : FunctionChain) {
+ if (ChainBB == *FunctionChain.begin())
+ continue;
+
+ // Don't align non-looping basic blocks. These are unlikely to execute
+ // enough times to matter in practice. Note that we'll still handle
+ // unnatural CFGs inside of a natural outer loop (the common case) and
+ // rotated loops.
+ MachineLoop *L = MLI->getLoopFor(ChainBB);
+ if (!L)
+ continue;
+
+ unsigned Align = TLI->getPrefLoopAlignment(L);
+ if (!Align)
+ continue; // Don't care about loop alignment.
+
+ // If the block is cold relative to the function entry don't waste space
+ // aligning it.
+ BlockFrequency Freq = MBFI->getBlockFreq(ChainBB);
+ if (Freq < WeightedEntryFreq)
+ continue;
+
+ // If the block is cold relative to its loop header, don't align it
+ // regardless of what edges into the block exist.
+ MachineBasicBlock *LoopHeader = L->getHeader();
+ BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader);
+ if (Freq < (LoopHeaderFreq * ColdProb))
+ continue;
+
+ // Check for the existence of a non-layout predecessor which would benefit
+ // from aligning this block.
+ MachineBasicBlock *LayoutPred =
+ &*std::prev(MachineFunction::iterator(ChainBB));
+
+ // Force alignment if all the predecessors are jumps. We already checked
+ // that the block isn't cold above.
+ if (!LayoutPred->isSuccessor(ChainBB)) {
+ ChainBB->setAlignment(Align);
+ continue;
+ }
+
+ // Align this block if the layout predecessor's edge into this block is
+ // cold relative to the block. When this is true, other predecessors make up
+ // all of the hot entries into the block and thus alignment is likely to be
+ // important.
+ BranchProbability LayoutProb =
+ MBPI->getEdgeProbability(LayoutPred, ChainBB);
+ BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb;
+ if (LayoutEdgeFreq <= (Freq * ColdProb))
+ ChainBB->setAlignment(Align);
+ }
+}
+
+bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ // Check for single-block functions and skip them.
+ if (std::next(MF.begin()) == MF.end())
+ return false;
+
+ F = &MF;
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MBFI = llvm::make_unique<BranchFolder::MBFIWrapper>(
+ getAnalysis<MachineBlockFrequencyInfo>());
+ MLI = &getAnalysis<MachineLoopInfo>();
+ TII = MF.getSubtarget().getInstrInfo();
+ TLI = MF.getSubtarget().getTargetLowering();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ assert(BlockToChain.empty());
+
+ buildCFGChains();
+
+ // Changing the layout can create new tail merging opportunities.
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+ // TailMerge can create jumps into if-branches that make the CFG irreducible
+ // for HW that requires a structured CFG.
+ bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
+ PassConfig->getEnableTailMerge() &&
+ BranchFoldPlacement;
+ // No tail merging opportunities if the number of blocks is less than four.
+ if (MF.size() > 3 && EnableTailMerge) {
+ BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
+ *MBPI);
+
+ if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
+ getAnalysisIfAvailable<MachineModuleInfo>(), MLI,
+ /*AfterBlockPlacement=*/true)) {
+ // Redo the layout if tail merging creates/removes/moves blocks.
+ BlockToChain.clear();
+ ChainAllocator.DestroyAll();
+ buildCFGChains();
+ }
+ }
+
+ optimizeBranches();
+ alignBlocks();
+
+ BlockToChain.clear();
+ ChainAllocator.DestroyAll();
+
+ if (AlignAllBlock)
+ // Align all of the blocks in the function to a specific alignment.
+ for (MachineBasicBlock &MBB : MF)
+ MBB.setAlignment(AlignAllBlock);
+ else if (AlignAllNonFallThruBlocks) {
+ // Align all of the blocks that have no fall-through predecessors to a
+ // specific alignment.
+ for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) {
+ auto LayoutPred = std::prev(MBI);
+ if (!LayoutPred->isSuccessor(&*MBI))
+ MBI->setAlignment(AlignAllNonFallThruBlocks);
+ }
+ }
+
+ // We always return true as we have no way to track whether the final order
+ // differs from the original order.
+ return true;
+}
+
+namespace {
+/// \brief A pass to compute block placement statistics.
+///
+/// A separate pass to compute interesting statistics for evaluating block
+/// placement. This is separate from the actual placement pass so that they can
+/// be computed in the absence of any placement transformations or when using
+/// alternative placement strategies.
+class MachineBlockPlacementStats : public MachineFunctionPass {
+ /// \brief A handle to the branch probability pass.
+ const MachineBranchProbabilityInfo *MBPI;
+
+ /// \brief A handle to the function-wide block frequency pass.
+ const MachineBlockFrequencyInfo *MBFI;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ MachineBlockPlacementStats() : MachineFunctionPass(ID) {
+ initializeMachineBlockPlacementStatsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+};
+}
+
+char MachineBlockPlacementStats::ID = 0;
+char &llvm::MachineBlockPlacementStatsID = MachineBlockPlacementStats::ID;
+INITIALIZE_PASS_BEGIN(MachineBlockPlacementStats, "block-placement-stats",
+ "Basic Block Placement Stats", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_END(MachineBlockPlacementStats, "block-placement-stats",
+ "Basic Block Placement Stats", false, false)
+
+bool MachineBlockPlacementStats::runOnMachineFunction(MachineFunction &F) {
+ // Check for single-block functions and skip them.
+ if (std::next(F.begin()) == F.end())
+ return false;
+
+ MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+
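+ // Every non-fallthrough successor edge counts as one taken branch below;
+ // e.g. (made-up numbers) a block with frequency 100 and two successors, one
+ // of which is the layout successor, adds 100 * P(other edge) to the
+ // conditional-branch taken frequency and bumps NumCondBranches by one.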
+ for (MachineBasicBlock &MBB : F) {
+ BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
+ Statistic &NumBranches =
+ (MBB.succ_size() > 1) ? NumCondBranches : NumUncondBranches;
+ Statistic &BranchTakenFreq =
+ (MBB.succ_size() > 1) ? CondBranchTakenFreq : UncondBranchTakenFreq;
+ for (MachineBasicBlock *Succ : MBB.successors()) {
+ // Skip if this successor is a fallthrough.
+ if (MBB.isLayoutSuccessor(Succ))
+ continue;
+
+ BlockFrequency EdgeFreq =
+ BlockFreq * MBPI->getEdgeProbability(&MBB, Succ);
+ ++NumBranches;
+ BranchTakenFreq += EdgeFreq.getFrequency();
+ }
+ }
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
new file mode 100644
index 000000000000..fe7340618374
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineBranchProbabilityInfo.cpp
@@ -0,0 +1,93 @@
+//===- MachineBranchProbabilityInfo.cpp - Machine Branch Probability Info -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis uses probability info stored in Machine Basic Blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+INITIALIZE_PASS_BEGIN(MachineBranchProbabilityInfo, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
+INITIALIZE_PASS_END(MachineBranchProbabilityInfo, "machine-branch-prob",
+ "Machine Branch Probability Analysis", false, true)
+
+cl::opt<unsigned>
+ StaticLikelyProb("static-likely-prob",
+ cl::desc("branch probability threshold in percentage"
+ "to be considered very likely"),
+ cl::init(80), cl::Hidden);
+
+cl::opt<unsigned> ProfileLikelyProb(
+ "profile-likely-prob",
+ cl::desc("branch probability threshold in percentage to be considered"
+ " very likely when profile is available"),
+ cl::init(51), cl::Hidden);
+
+char MachineBranchProbabilityInfo::ID = 0;
+
+void MachineBranchProbabilityInfo::anchor() {}
+
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+ const MachineBasicBlock *Src,
+ MachineBasicBlock::const_succ_iterator Dst) const {
+ return Src->getSuccProbability(Dst);
+}
+
+BranchProbability MachineBranchProbabilityInfo::getEdgeProbability(
+ const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+ // This is a linear search. Try to use the const_succ_iterator version when
+ // possible.
+ return getEdgeProbability(Src,
+ std::find(Src->succ_begin(), Src->succ_end(), Dst));
+}
+
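+/// An edge is considered hot when its probability exceeds StaticLikelyProb
+/// percent (80 by default above); for example, an edge taken with probability
+/// 0.9 is reported as hot, while one taken with probability 0.75 is not.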
+bool MachineBranchProbabilityInfo::isEdgeHot(
+ const MachineBasicBlock *Src, const MachineBasicBlock *Dst) const {
+ BranchProbability HotProb(StaticLikelyProb, 100);
+ return getEdgeProbability(Src, Dst) > HotProb;
+}
+
+MachineBasicBlock *
+MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const {
+ auto MaxProb = BranchProbability::getZero();
+ MachineBasicBlock *MaxSucc = nullptr;
+ for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I) {
+ auto Prob = getEdgeProbability(MBB, I);
+ if (Prob > MaxProb) {
+ MaxProb = Prob;
+ MaxSucc = *I;
+ }
+ }
+
+ BranchProbability HotProb(StaticLikelyProb, 100);
+ if (getEdgeProbability(MBB, MaxSucc) >= HotProb)
+ return MaxSucc;
+
+ return nullptr;
+}
+
+raw_ostream &MachineBranchProbabilityInfo::printEdgeProbability(
+ raw_ostream &OS, const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+
+ const BranchProbability Prob = getEdgeProbability(Src, Dst);
+ OS << "edge MBB#" << Src->getNumber() << " -> MBB#" << Dst->getNumber()
+ << " probability is " << Prob
+ << (isEdgeHot(Src, Dst) ? " [HOT edge]\n" : "\n");
+
+ return OS;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineCSE.cpp b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
new file mode 100644
index 000000000000..1209f73d9601
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCSE.cpp
@@ -0,0 +1,716 @@
+//===-- MachineCSE.cpp - Machine Common Subexpression Elimination Pass ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs global common subexpression elimination on machine
+// instructions using a scoped hash table based value numbering scheme. It
+// must be run while the machine function is still in SSA form.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/RecyclingAllocator.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-cse"
+
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+STATISTIC(NumCSEs, "Number of common subexpression eliminated");
+STATISTIC(NumPhysCSEs,
+ "Number of physreg referencing common subexpr eliminated");
+STATISTIC(NumCrossBBCSEs,
+ "Number of cross-MBB physreg referencing CS eliminated");
+STATISTIC(NumCommutes, "Number of copies coalesced after commuting");
+
+namespace {
+ class MachineCSE : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ AliasAnalysis *AA;
+ MachineDominatorTree *DT;
+ MachineRegisterInfo *MRI;
+ public:
+ static char ID; // Pass identification
+ MachineCSE() : MachineFunctionPass(ID), LookAheadLimit(0), CurrVN(0) {
+ initializeMachineCSEPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+
+ void releaseMemory() override {
+ ScopeMap.clear();
+ Exps.clear();
+ }
+
+ private:
+ unsigned LookAheadLimit;
+ typedef RecyclingAllocator<BumpPtrAllocator,
+ ScopedHashTableVal<MachineInstr*, unsigned> > AllocatorTy;
+ typedef ScopedHashTable<MachineInstr*, unsigned,
+ MachineInstrExpressionTrait, AllocatorTy> ScopedHTType;
+ typedef ScopedHTType::ScopeTy ScopeType;
+ DenseMap<MachineBasicBlock*, ScopeType*> ScopeMap;
+ ScopedHTType VNT;
+ SmallVector<MachineInstr*, 64> Exps;
+ unsigned CurrVN;
+
+ bool PerformTrivialCopyPropagation(MachineInstr *MI,
+ MachineBasicBlock *MBB);
+ bool isPhysDefTriviallyDead(unsigned Reg,
+ MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator E) const;
+ bool hasLivePhysRegDefUses(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ SmallSet<unsigned,8> &PhysRefs,
+ SmallVectorImpl<unsigned> &PhysDefs,
+ bool &PhysUseDef) const;
+ bool PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+ SmallSet<unsigned,8> &PhysRefs,
+ SmallVectorImpl<unsigned> &PhysDefs,
+ bool &NonLocal) const;
+ bool isCSECandidate(MachineInstr *MI);
+ bool isProfitableToCSE(unsigned CSReg, unsigned Reg,
+ MachineInstr *CSMI, MachineInstr *MI);
+ void EnterScope(MachineBasicBlock *MBB);
+ void ExitScope(MachineBasicBlock *MBB);
+ bool ProcessBlock(MachineBasicBlock *MBB);
+ void ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren);
+ bool PerformCSE(MachineDomTreeNode *Node);
+ };
+} // end anonymous namespace
+
+char MachineCSE::ID = 0;
+char &llvm::MachineCSEID = MachineCSE::ID;
+INITIALIZE_PASS_BEGIN(MachineCSE, "machine-cse",
+ "Machine Common Subexpression Elimination", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineCSE, "machine-cse",
+ "Machine Common Subexpression Elimination", false, false)
+
+/// The source register of a COPY machine instruction can be propagated to all
+/// its users, and this propagation could increase the probability of finding
+/// common subexpressions. If the COPY has only one user, the COPY itself can
+/// be removed.
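+///
+/// As a sketch in pseudo machine IR (register numbers are made up):
+///   %1 = COPY %0
+///   %2 = ADD %1, %3
+/// becomes
+///   %2 = ADD %0, %3
+/// and the COPY is erased, since %1 had a single non-debug use.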
+bool MachineCSE::PerformTrivialCopyPropagation(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ bool Changed = false;
+ for (MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ bool OnlyOneUse = MRI->hasOneNonDBGUse(Reg);
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (!DefMI->isCopy())
+ continue;
+ unsigned SrcReg = DefMI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ continue;
+ if (DefMI->getOperand(0).getSubReg())
+ continue;
+ // FIXME: We should trivially coalesce subregister copies to expose CSE
+ // opportunities on instructions with truncated operands (see
+ // cse-add-with-overflow.ll). This can be done here as follows:
+ // if (SrcSubReg)
+ // RC = TRI->getMatchingSuperRegClass(MRI->getRegClass(SrcReg), RC,
+ // SrcSubReg);
+ // MO.substVirtReg(SrcReg, SrcSubReg, *TRI);
+ //
+ // The 2-addr pass has been updated to handle coalesced subregs. However,
+ // some machine-specific code still can't handle it.
+ // To handle it properly we also need a way to find a constrained subregister
+ // class given a super-reg class and subreg index.
+ if (DefMI->getOperand(1).getSubReg())
+ continue;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ if (!MRI->constrainRegClass(SrcReg, RC))
+ continue;
+ DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ DEBUG(dbgs() << "*** to: " << *MI);
+ // Propagate SrcReg of copies to MI.
+ MO.setReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ // Coalesce single use copies.
+ if (OnlyOneUse) {
+ DefMI->eraseFromParent();
+ ++NumCoalesces;
+ }
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool
+MachineCSE::isPhysDefTriviallyDead(unsigned Reg,
+ MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator E) const {
+ unsigned LookAheadLeft = LookAheadLimit;
+ while (LookAheadLeft) {
+ // Skip over dbg_value's.
+ while (I != E && I->isDebugValue())
+ ++I;
+
+ if (I == E)
+ // Reached end of block, register is obviously dead.
+ return true;
+
+ bool SeenDef = false;
+ for (const MachineOperand &MO : I->operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ SeenDef = true;
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (!TRI->regsOverlap(MO.getReg(), Reg))
+ continue;
+ if (MO.isUse())
+ // Found a use!
+ return false;
+ SeenDef = true;
+ }
+ if (SeenDef)
+ // Saw a def of Reg (or an alias) before encountering any use; it's
+ // trivially dead.
+ return true;
+
+ --LookAheadLeft;
+ ++I;
+ }
+ return false;
+}
+
+/// hasLivePhysRegDefUses - Return true if the specified instruction reads or
+/// writes physical registers (except for dead defs of physical registers). It
+/// also returns the physical register def by reference if it's the only one
+/// and the instruction does not use a physical register.
+bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI,
+ const MachineBasicBlock *MBB,
+ SmallSet<unsigned,8> &PhysRefs,
+ SmallVectorImpl<unsigned> &PhysDefs,
+ bool &PhysUseDef) const {
+ // First, add all uses to PhysRefs.
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ // Reading constant physregs is ok.
+ if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRefs.insert(*AI);
+ }
+
+ // Next, collect all defs into PhysDefs. If any is already in PhysRefs
+ // (which currently contains only uses), set the PhysUseDef flag.
+ PhysUseDef = false;
+ MachineBasicBlock::const_iterator I = MI; I = std::next(I);
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ // Check against PhysRefs even if the def is "dead".
+ if (PhysRefs.count(Reg))
+ PhysUseDef = true;
+ // If the def is dead, it's ok. But the def may not be marked "dead". That's
+ // common since this pass is run before livevariables. We can scan
+ // forward a few instructions and check if it is obviously dead.
+ if (!MO.isDead() && !isPhysDefTriviallyDead(Reg, I, MBB->end()))
+ PhysDefs.push_back(Reg);
+ }
+
+ // Finally, add all defs to PhysRefs as well.
+ for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i)
+ for (MCRegAliasIterator AI(PhysDefs[i], TRI, true); AI.isValid(); ++AI)
+ PhysRefs.insert(*AI);
+
+ return !PhysRefs.empty();
+}
+
+bool MachineCSE::PhysRegDefsReach(MachineInstr *CSMI, MachineInstr *MI,
+ SmallSet<unsigned,8> &PhysRefs,
+ SmallVectorImpl<unsigned> &PhysDefs,
+ bool &NonLocal) const {
+ // For now conservatively returns false if the common subexpression is
+ // not in the same basic block as the given instruction. The only exception
+ // is if the common subexpression is in the sole predecessor block.
+ const MachineBasicBlock *MBB = MI->getParent();
+ const MachineBasicBlock *CSMBB = CSMI->getParent();
+
+ bool CrossMBB = false;
+ if (CSMBB != MBB) {
+ if (MBB->pred_size() != 1 || *MBB->pred_begin() != CSMBB)
+ return false;
+
+ for (unsigned i = 0, e = PhysDefs.size(); i != e; ++i) {
+ if (MRI->isAllocatable(PhysDefs[i]) || MRI->isReserved(PhysDefs[i]))
+ // Avoid extending the live range of physical registers if they are
+ // allocatable or reserved.
+ return false;
+ }
+ CrossMBB = true;
+ }
+ MachineBasicBlock::const_iterator I = CSMI; I = std::next(I);
+ MachineBasicBlock::const_iterator E = MI;
+ MachineBasicBlock::const_iterator EE = CSMBB->end();
+ unsigned LookAheadLeft = LookAheadLimit;
+ while (LookAheadLeft) {
+ // Skip over dbg_value's.
+ while (I != E && I != EE && I->isDebugValue())
+ ++I;
+
+ if (I == EE) {
+ assert(CrossMBB && "Reaching end-of-MBB without finding MI?");
+ (void)CrossMBB;
+ CrossMBB = false;
+ NonLocal = true;
+ I = MBB->begin();
+ EE = MBB->end();
+ continue;
+ }
+
+ if (I == E)
+ return true;
+
+ for (const MachineOperand &MO : I->operands()) {
+ // RegMasks go on instructions like calls that clobber lots of physregs.
+ // Don't attempt to CSE across such an instruction.
+ if (MO.isRegMask())
+ return false;
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(MOReg))
+ continue;
+ if (PhysRefs.count(MOReg))
+ return false;
+ }
+
+ --LookAheadLeft;
+ ++I;
+ }
+
+ return false;
+}
+
+bool MachineCSE::isCSECandidate(MachineInstr *MI) {
+ if (MI->isPosition() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() ||
+ MI->isInlineAsm() || MI->isDebugValue())
+ return false;
+
+ // Ignore copies.
+ if (MI->isCopyLike())
+ return false;
+
+ // Ignore stuff that we obviously can't move.
+ if (MI->mayStore() || MI->isCall() || MI->isTerminator() ||
+ MI->hasUnmodeledSideEffects())
+ return false;
+
+ if (MI->mayLoad()) {
+ // Okay, this instruction does a load. As a refinement, we allow the target
+ // to decide whether the loaded value is actually a constant. If so, we can
+ // actually use it as a load.
+ if (!MI->isInvariantLoad(AA))
+ // FIXME: we should be able to hoist loads with no other side effects if
+ // there are no other instructions which can change memory in this loop.
+ // This is a trivial form of alias analysis.
+ return false;
+ }
+
+ // Ignore stack guard loads, otherwise the register that holds the CSEed
+ // value may be spilled and get loaded back with corrupted data.
+ if (MI->getOpcode() == TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
+ return true;
+}
+
+/// isProfitableToCSE - Return true if it's profitable to eliminate MI with a
+/// common expression that defines Reg.
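+/// A sketch of heuristic #1 below: a computation that is as cheap as a move
+/// (e.g. an immediate add), whose existing copy lives in a block that neither
+/// contains MI nor is an immediate predecessor of MI's block, is left alone,
+/// since reusing it would mainly extend a live range.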
+bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg,
+ MachineInstr *CSMI, MachineInstr *MI) {
+ // FIXME: Heuristics that work around the lack of live range splitting.
+
+ // If CSReg is used at all uses of Reg, CSE should not increase register
+ // pressure of CSReg.
+ bool MayIncreasePressure = true;
+ if (TargetRegisterInfo::isVirtualRegister(CSReg) &&
+ TargetRegisterInfo::isVirtualRegister(Reg)) {
+ MayIncreasePressure = false;
+ SmallPtrSet<MachineInstr*, 8> CSUses;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
+ CSUses.insert(&MI);
+ }
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ if (!CSUses.count(&MI)) {
+ MayIncreasePressure = true;
+ break;
+ }
+ }
+ }
+ if (!MayIncreasePressure) return true;
+
+ // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in
+ // an immediate predecessor. We don't want to increase register pressure and
+ // end up causing other computation to be spilled.
+ if (TII->isAsCheapAsAMove(*MI)) {
+ MachineBasicBlock *CSBB = CSMI->getParent();
+ MachineBasicBlock *BB = MI->getParent();
+ if (CSBB != BB && !CSBB->isSuccessor(BB))
+ return false;
+ }
+
+ // Heuristics #2: If the expression doesn't use a virtual register and the
+ // only uses of the redundant computation are copies, do not CSE.
+ bool HasVRegUse = false;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.isUse() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ HasVRegUse = true;
+ break;
+ }
+ }
+ if (!HasVRegUse) {
+ bool HasNonCopyUse = false;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) {
+ // Ignore copies.
+ if (!MI.isCopyLike()) {
+ HasNonCopyUse = true;
+ break;
+ }
+ }
+ if (!HasNonCopyUse)
+ return false;
+ }
+
+ // Heuristics #3: If the common subexpression is used by PHIs, do not reuse
+ // it unless the defined value is already used in the BB of the new use.
+ bool HasPHI = false;
+ SmallPtrSet<MachineBasicBlock*, 4> CSBBs;
+ for (MachineInstr &MI : MRI->use_nodbg_instructions(CSReg)) {
+ HasPHI |= MI.isPHI();
+ CSBBs.insert(MI.getParent());
+ }
+
+ if (!HasPHI)
+ return true;
+ return CSBBs.count(MI->getParent());
+}
+
+void MachineCSE::EnterScope(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Entering: " << MBB->getName() << '\n');
+ ScopeType *Scope = new ScopeType(VNT);
+ ScopeMap[MBB] = Scope;
+}
+
+void MachineCSE::ExitScope(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Exiting: " << MBB->getName() << '\n');
+ DenseMap<MachineBasicBlock*, ScopeType*>::iterator SI = ScopeMap.find(MBB);
+ assert(SI != ScopeMap.end());
+ delete SI->second;
+ ScopeMap.erase(SI);
+}
+
+bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) {
+ bool Changed = false;
+
+ SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs;
+ SmallVector<unsigned, 2> ImplicitDefsToUpdate;
+ SmallVector<unsigned, 2> ImplicitDefs;
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) {
+ MachineInstr *MI = &*I;
+ ++I;
+
+ if (!isCSECandidate(MI))
+ continue;
+
+ bool FoundCSE = VNT.count(MI);
+ if (!FoundCSE) {
+ // Using trivial copy propagation to find more CSE opportunities.
+ if (PerformTrivialCopyPropagation(MI, MBB)) {
+ Changed = true;
+
+ // After coalescing MI itself may become a copy.
+ if (MI->isCopyLike())
+ continue;
+
+ // Try again to see if CSE is possible.
+ FoundCSE = VNT.count(MI);
+ }
+ }
+
+ // Commute commutable instructions.
+ bool Commuted = false;
+ if (!FoundCSE && MI->isCommutable()) {
+ if (MachineInstr *NewMI = TII->commuteInstruction(*MI)) {
+ Commuted = true;
+ FoundCSE = VNT.count(NewMI);
+ if (NewMI != MI) {
+ // New instruction. It doesn't need to be kept.
+ NewMI->eraseFromParent();
+ Changed = true;
+ } else if (!FoundCSE)
+ // MI was changed but it didn't help, commute it back!
+ (void)TII->commuteInstruction(*MI);
+ }
+ }
+
+ // If the instruction defines physical registers and the values *may* be
+ // used, then it's not safe to replace it with a common subexpression.
+ // It's also not safe if the instruction uses physical registers.
+ bool CrossMBBPhysDef = false;
+ SmallSet<unsigned, 8> PhysRefs;
+ SmallVector<unsigned, 2> PhysDefs;
+ bool PhysUseDef = false;
+ if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs,
+ PhysDefs, PhysUseDef)) {
+ FoundCSE = false;
+
+ // ... Unless the CS is local or is in the sole predecessor block,
+ // and it also defines the physical register, which is not clobbered
+ // in between, and the physical register uses were not clobbered either.
+ // This can never be the case if the instruction both uses and
+ // defines the same physical register, which was detected above.
+ if (!PhysUseDef) {
+ unsigned CSVN = VNT.lookup(MI);
+ MachineInstr *CSMI = Exps[CSVN];
+ if (PhysRegDefsReach(CSMI, MI, PhysRefs, PhysDefs, CrossMBBPhysDef))
+ FoundCSE = true;
+ }
+ }
+
+ if (!FoundCSE) {
+ VNT.insert(MI, CurrVN++);
+ Exps.push_back(MI);
+ continue;
+ }
+
+ // Found a common subexpression, eliminate it.
+ unsigned CSVN = VNT.lookup(MI);
+ MachineInstr *CSMI = Exps[CSVN];
+ DEBUG(dbgs() << "Examining: " << *MI);
+ DEBUG(dbgs() << "*** Found a common subexpression: " << *CSMI);
+
+ // Check if it's profitable to perform this CSE.
+ bool DoCSE = true;
+ unsigned NumDefs = MI->getDesc().getNumDefs() +
+ MI->getDesc().getNumImplicitDefs();
+
+ for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned OldReg = MO.getReg();
+ unsigned NewReg = CSMI->getOperand(i).getReg();
+
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead())
+ ImplicitDefsToUpdate.push_back(i);
+
+ // Keep track of implicit defs of CSMI and MI, to clear possibly
+ // made-redundant kill flags.
+ if (MO.isImplicit() && !MO.isDead() && OldReg == NewReg)
+ ImplicitDefs.push_back(OldReg);
+
+ if (OldReg == NewReg) {
+ --NumDefs;
+ continue;
+ }
+
+ assert(TargetRegisterInfo::isVirtualRegister(OldReg) &&
+ TargetRegisterInfo::isVirtualRegister(NewReg) &&
+ "Do not CSE physical register defs!");
+
+ if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) {
+ DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n");
+ DoCSE = false;
+ break;
+ }
+
+ // Don't perform CSE if the result of the old instruction cannot exist
+ // within the register class of the new instruction.
+ const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg);
+ if (!MRI->constrainRegClass(NewReg, OldRC)) {
+ DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n");
+ DoCSE = false;
+ break;
+ }
+
+ CSEPairs.push_back(std::make_pair(OldReg, NewReg));
+ --NumDefs;
+ }
+
+ // Actually perform the elimination.
+ if (DoCSE) {
+ for (std::pair<unsigned, unsigned> &CSEPair : CSEPairs) {
+ unsigned OldReg = CSEPair.first;
+ unsigned NewReg = CSEPair.second;
+ // OldReg may have been unused but is used now, clear the Dead flag
+ MachineInstr *Def = MRI->getUniqueVRegDef(NewReg);
+ assert(Def != nullptr && "CSEd register has no unique definition?");
+ Def->clearRegisterDeads(NewReg);
+ // Replace with NewReg and clear kill flags which may be wrong now.
+ MRI->replaceRegWith(OldReg, NewReg);
+ MRI->clearKillFlags(NewReg);
+ }
+
+ // Go through implicit defs of CSMI and MI, if a def is not dead at MI,
+ // we should make sure it is not dead at CSMI.
+ for (unsigned ImplicitDefToUpdate : ImplicitDefsToUpdate)
+ CSMI->getOperand(ImplicitDefToUpdate).setIsDead(false);
+
+ // Go through implicit defs of CSMI and MI, and clear the kill flags on
+ // their uses in all the instructions between CSMI and MI.
+ // We might have made some of the kill flags redundant, consider:
+ // subs ... %NZCV<imp-def> <- CSMI
+ // csinc ... %NZCV<imp-use,kill> <- this kill flag isn't valid anymore
+ // subs ... %NZCV<imp-def> <- MI, to be eliminated
+ // csinc ... %NZCV<imp-use,kill>
+ // Since we eliminated MI, and reused a register imp-def'd by CSMI
+ // (here %NZCV), that register, if it was killed before MI, should have
+ // that kill flag removed, because its lifetime was extended.
+ if (CSMI->getParent() == MI->getParent()) {
+ for (MachineBasicBlock::iterator II = CSMI, IE = MI; II != IE; ++II)
+ for (auto ImplicitDef : ImplicitDefs)
+ if (MachineOperand *MO = II->findRegisterUseOperand(
+ ImplicitDef, /*isKill=*/true, TRI))
+ MO->setIsKill(false);
+ } else {
+ // If the instructions aren't in the same BB, bail out and clear the
+ // kill flag on all uses of the imp-def'd register.
+ for (auto ImplicitDef : ImplicitDefs)
+ MRI->clearKillFlags(ImplicitDef);
+ }
+
+ if (CrossMBBPhysDef) {
+ // Add physical register defs now coming in from a predecessor to MBB
+ // livein list.
+ while (!PhysDefs.empty()) {
+ unsigned LiveIn = PhysDefs.pop_back_val();
+ if (!MBB->isLiveIn(LiveIn))
+ MBB->addLiveIn(LiveIn);
+ }
+ ++NumCrossBBCSEs;
+ }
+
+ MI->eraseFromParent();
+ ++NumCSEs;
+ if (!PhysRefs.empty())
+ ++NumPhysCSEs;
+ if (Commuted)
+ ++NumCommutes;
+ Changed = true;
+ } else {
+ VNT.insert(MI, CurrVN++);
+ Exps.push_back(MI);
+ }
+ CSEPairs.clear();
+ ImplicitDefsToUpdate.clear();
+ ImplicitDefs.clear();
+ }
+
+ return Changed;
+}
+
+/// ExitScopeIfDone - Destroy scope for the MBB that corresponds to the given
+/// dominator tree node if it's a leaf or all of its children are done. Walk
+/// up the dominator tree to destroy ancestors which are now done.
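+/// For example, in a dominator tree A -> {B, C} (a hypothetical shape), B's
+/// scope is popped as soon as B has been processed, but A's scope stays open
+/// until C is done too, because OpenChildren[A] only then drops to zero.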
+void
+MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) {
+ if (OpenChildren[Node])
+ return;
+
+ // Pop scope.
+ ExitScope(Node->getBlock());
+
+ // Now traverse upwards to pop ancestors whose offspring are all done.
+ while (MachineDomTreeNode *Parent = Node->getIDom()) {
+ unsigned Left = --OpenChildren[Parent];
+ if (Left != 0)
+ break;
+ ExitScope(Parent->getBlock());
+ Node = Parent;
+ }
+}
+
+bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
+ SmallVector<MachineDomTreeNode*, 32> Scopes;
+ SmallVector<MachineDomTreeNode*, 8> WorkList;
+ DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+ CurrVN = 0;
+
+ // Perform a DFS walk to determine the order of visit.
+ WorkList.push_back(Node);
+ do {
+ Node = WorkList.pop_back_val();
+ Scopes.push_back(Node);
+ const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+ OpenChildren[Node] = Children.size();
+ for (MachineDomTreeNode *Child : Children)
+ WorkList.push_back(Child);
+ } while (!WorkList.empty());
+
+ // Now perform CSE.
+ bool Changed = false;
+ for (MachineDomTreeNode *Node : Scopes) {
+ MachineBasicBlock *MBB = Node->getBlock();
+ EnterScope(MBB);
+ Changed |= ProcessBlock(MBB);
+ // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+ ExitScopeIfDone(Node, OpenChildren);
+ }
+
+ return Changed;
+}
+
+bool MachineCSE::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DT = &getAnalysis<MachineDominatorTree>();
+ LookAheadLimit = TII->getMachineCSELookAheadLimit();
+ return PerformCSE(DT->getRootNode());
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineCombiner.cpp b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
new file mode 100644
index 000000000000..6b5c6ba82506
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -0,0 +1,477 @@
+//===---- MachineCombiner.cpp - Instcombining on SSA form machine code ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// The machine combiner pass uses machine trace metrics to ensure the combined
+// instructions do not lengthen the critical path or the resource depth.
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "machine-combiner"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+STATISTIC(NumInstCombined, "Number of machineinst combined");
+
+namespace {
+class MachineCombiner : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MCSchedModel SchedModel;
+ MachineRegisterInfo *MRI;
+ MachineLoopInfo *MLI; // Current MachineLoopInfo
+ MachineTraceMetrics *Traces;
+ MachineTraceMetrics::Ensemble *MinInstr;
+
+ TargetSchedModel TSchedModel;
+
+ /// True if optimizing for code size.
+ bool OptSize;
+
+public:
+ static char ID;
+ MachineCombiner() : MachineFunctionPass(ID) {
+ initializeMachineCombinerPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ const char *getPassName() const override { return "Machine InstCombiner"; }
+
+private:
+ bool doSubstitute(unsigned NewSize, unsigned OldSize);
+ bool combineInstructions(MachineBasicBlock *);
+ MachineInstr *getOperandDef(const MachineOperand &MO);
+ unsigned getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineTraceMetrics::Trace BlockTrace);
+ unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+ MachineTraceMetrics::Trace BlockTrace);
+ bool
+ improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineCombinerPattern Pattern);
+ bool preservesResourceLen(MachineBasicBlock *MBB,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs);
+ void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
+ SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+};
+}
+
+char MachineCombiner::ID = 0;
+char &llvm::MachineCombinerID = MachineCombiner::ID;
+
+INITIALIZE_PASS_BEGIN(MachineCombiner, "machine-combiner",
+ "Machine InstCombiner", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics)
+INITIALIZE_PASS_END(MachineCombiner, "machine-combiner", "Machine InstCombiner",
+ false, false)
+
+void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<MachineTraceMetrics>();
+ AU.addPreserved<MachineTraceMetrics>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineInstr *MachineCombiner::getOperandDef(const MachineOperand &MO) {
+ MachineInstr *DefInstr = nullptr;
+ // We need a virtual register definition.
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ DefInstr = MRI->getUniqueVRegDef(MO.getReg());
+ // PHI's have no depth etc.
+ if (DefInstr && DefInstr->isPHI())
+ DefInstr = nullptr;
+ return DefInstr;
+}
+
+/// Computes depth of instructions in vector \p InsInstrs.
+///
+/// \param InsInstrs is a vector of machine instructions
+/// \param InstrIdxForVirtReg is a dense map of virtual register to index
+/// of defining machine instruction in \p InsInstrs
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Depth of last instruction in \p InsInstrs ("NewRoot")
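+///
+/// For example (illustrative numbers only): if NewRoot reads one operand
+/// defined at depth 3 in the trace with an operand latency of 2, and another
+/// defined at depth 4 with latency 1, its depth is max(3 + 2, 4 + 1) = 5.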
+unsigned
+MachineCombiner::getDepth(SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineTraceMetrics::Trace BlockTrace) {
+ SmallVector<unsigned, 16> InstrDepth;
+ assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+ "Missing machine model\n");
+
+ // For each instruction in the new sequence compute the depth based on the
+ // operands. Use the trace information when possible. For new operands, which
+ // are tracked in the InstrIdxForVirtReg map, the depth is looked up in
+ // InstrDepth.
+ for (auto *InstrPtr : InsInstrs) { // for each Use
+ unsigned IDepth = 0;
+ DEBUG(dbgs() << "NEW INSTR "; InstrPtr->dump(); dbgs() << "\n";);
+ for (const MachineOperand &MO : InstrPtr->operands()) {
+ // Check for virtual register operand.
+ if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+ continue;
+ if (!MO.isUse())
+ continue;
+ unsigned DepthOp = 0;
+ unsigned LatencyOp = 0;
+ DenseMap<unsigned, unsigned>::iterator II =
+ InstrIdxForVirtReg.find(MO.getReg());
+ if (II != InstrIdxForVirtReg.end()) {
+ // Operand is new virtual register not in trace
+ assert(II->second < InstrDepth.size() && "Bad Index");
+ MachineInstr *DefInstr = InsInstrs[II->second];
+ assert(DefInstr &&
+ "There must be a definition for a new virtual register");
+ DepthOp = InstrDepth[II->second];
+ LatencyOp = TSchedModel.computeOperandLatency(
+ DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+ InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+ } else {
+ MachineInstr *DefInstr = getOperandDef(MO);
+ if (DefInstr) {
+ DepthOp = BlockTrace.getInstrCycles(*DefInstr).Depth;
+ LatencyOp = TSchedModel.computeOperandLatency(
+ DefInstr, DefInstr->findRegisterDefOperandIdx(MO.getReg()),
+ InstrPtr, InstrPtr->findRegisterUseOperandIdx(MO.getReg()));
+ }
+ }
+ IDepth = std::max(IDepth, DepthOp + LatencyOp);
+ }
+ InstrDepth.push_back(IDepth);
+ }
+ unsigned NewRootIdx = InsInstrs.size() - 1;
+ return InstrDepth[NewRootIdx];
+}
+
+/// Computes instruction latency as max of latency of defined operands.
+///
+/// \param Root is a machine instruction that could be replaced by NewRoot.
+/// It is used to compute a more accurate latency information for NewRoot in
+/// case there is a dependent instruction in the same trace (\p BlockTrace)
+/// \param NewRoot is the instruction for which the latency is computed
+/// \param BlockTrace is a trace of machine instructions
+///
+/// \returns Latency of \p NewRoot
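+///
+/// For example (made-up latencies): if NewRoot defines two registers whose
+/// first in-trace users see operand latencies of 4 and 2, the reported
+/// latency is max(4, 2) = 4; a definition with no dependent instruction in
+/// the trace falls back to the plain instruction latency instead.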
+unsigned MachineCombiner::getLatency(MachineInstr *Root, MachineInstr *NewRoot,
+ MachineTraceMetrics::Trace BlockTrace) {
+ assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+ "Missing machine model\n");
+
+ // Check each definition in NewRoot and compute the latency
+ unsigned NewRootLatency = 0;
+
+ for (const MachineOperand &MO : NewRoot->operands()) {
+ // Check for virtual register operand.
+ if (!(MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())))
+ continue;
+ if (!MO.isDef())
+ continue;
+ // Get the first instruction that uses MO
+ MachineRegisterInfo::reg_iterator RI = MRI->reg_begin(MO.getReg());
+ RI++;
+ MachineInstr *UseMO = RI->getParent();
+ unsigned LatencyOp = 0;
+ if (UseMO && BlockTrace.isDepInTrace(*Root, *UseMO)) {
+ LatencyOp = TSchedModel.computeOperandLatency(
+ NewRoot, NewRoot->findRegisterDefOperandIdx(MO.getReg()), UseMO,
+ UseMO->findRegisterUseOperandIdx(MO.getReg()));
+ } else {
+ LatencyOp = TSchedModel.computeInstrLatency(NewRoot);
+ }
+ NewRootLatency = std::max(NewRootLatency, LatencyOp);
+ }
+ return NewRootLatency;
+}
+
+/// The combiner's goal may differ based on which pattern it is attempting
+/// to optimize.
+enum class CombinerObjective {
+ MustReduceDepth, // The data dependency chain must be improved.
+ Default // The critical path must not be lengthened.
+};
+
+static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
+ // TODO: If C++ ever gets a real enum class, make this part of the
+ // MachineCombinerPattern class.
+ switch (P) {
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ return CombinerObjective::MustReduceDepth;
+ default:
+ return CombinerObjective::Default;
+ }
+}
+
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. The definition of "improve" may be restricted by specifying that the
+/// new path improves the data dependency chain (MustReduceDepth).
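+///
+/// For instance (made-up cycle counts): with NewRootDepth = 6 and
+/// NewRootLatency = 3 the new sequence costs 9 cycles, while RootDepth = 5,
+/// RootLatency = 3 and RootSlack = 2 price the old one at 10 cycles; the
+/// transform is then allowed under the Default objective (9 <= 10) but
+/// rejected under MustReduceDepth (6 is not less than 5).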
+bool MachineCombiner::improvesCriticalPathLen(
+ MachineBasicBlock *MBB, MachineInstr *Root,
+ MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+ MachineCombinerPattern Pattern) {
+ assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
+ "Missing machine model\n");
+ // NewRoot is the last instruction in the \p InsInstrs vector.
+ unsigned NewRootIdx = InsInstrs.size() - 1;
+ MachineInstr *NewRoot = InsInstrs[NewRootIdx];
+
+ // Get depth and latency of NewRoot and Root.
+ unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
+ unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
+
+ DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+ dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
+ dbgs() << " RootDepth: " << RootDepth << "\n");
+
+ // For a transform such as reassociation, the cost equation is
+ // conservatively calculated so that we must improve the depth (data
+ // dependency cycles) in the critical path to proceed with the transform.
+ // Being conservative also protects against inaccuracies in the underlying
+ // machine trace metrics and CPU models.
+ if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
+ return NewRootDepth < RootDepth;
+
+ // A more flexible cost calculation for the critical path includes the slack
+ // of the original code sequence. This may allow the transform to proceed
+ // even if the instruction depths (data dependency cycles) become worse.
+ unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
+ unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+ unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+
+ DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
+ dbgs() << " RootLatency: " << RootLatency << "\n";
+ dbgs() << " RootSlack: " << RootSlack << "\n";
+ dbgs() << " NewRootDepth + NewRootLatency = "
+ << NewRootDepth + NewRootLatency << "\n";
+ dbgs() << " RootDepth + RootLatency + RootSlack = "
+ << RootDepth + RootLatency + RootSlack << "\n";);
+
+ unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+ unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+
+ return NewCycleCount <= OldCycleCount;
+}
+
+/// Helper routine to convert instructions into their scheduling classes (SC).
+void MachineCombiner::instr2instrSC(
+ SmallVectorImpl<MachineInstr *> &Instrs,
+ SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC) {
+ for (auto *InstrPtr : Instrs) {
+ unsigned Opc = InstrPtr->getOpcode();
+ unsigned Idx = TII->get(Opc).getSchedClass();
+ const MCSchedClassDesc *SC = SchedModel.getSchedClassDesc(Idx);
+ InstrsSC.push_back(SC);
+ }
+}
+
+/// True when the new instructions do not increase resource length
+bool MachineCombiner::preservesResourceLen(
+ MachineBasicBlock *MBB, MachineTraceMetrics::Trace BlockTrace,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs) {
+ if (!TSchedModel.hasInstrSchedModel())
+ return true;
+
+ // Compute current resource length
+
+ //ArrayRef<const MachineBasicBlock *> MBBarr(MBB);
+ SmallVector <const MachineBasicBlock *, 1> MBBarr;
+ MBBarr.push_back(MBB);
+ unsigned ResLenBeforeCombine = BlockTrace.getResourceLength(MBBarr);
+
+ // Deal with SC rather than Instructions.
+ SmallVector<const MCSchedClassDesc *, 16> InsInstrsSC;
+ SmallVector<const MCSchedClassDesc *, 16> DelInstrsSC;
+
+ instr2instrSC(InsInstrs, InsInstrsSC);
+ instr2instrSC(DelInstrs, DelInstrsSC);
+
+ ArrayRef<const MCSchedClassDesc *> MSCInsArr = makeArrayRef(InsInstrsSC);
+ ArrayRef<const MCSchedClassDesc *> MSCDelArr = makeArrayRef(DelInstrsSC);
+
+ // Compute new resource length.
+ unsigned ResLenAfterCombine =
+ BlockTrace.getResourceLength(MBBarr, MSCInsArr, MSCDelArr);
+
+ DEBUG(dbgs() << "RESOURCE DATA: \n";
+ dbgs() << " resource len before: " << ResLenBeforeCombine
+ << " after: " << ResLenAfterCombine << "\n";);
+
+ return ResLenAfterCombine <= ResLenBeforeCombine;
+}
+
+/// \returns true when the new instruction sequence should be generated
+/// regardless of whether it lengthens the critical path or not.
+bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
+ if (OptSize && (NewSize < OldSize))
+ return true;
+ if (!TSchedModel.hasInstrSchedModelOrItineraries())
+ return true;
+ return false;
+}
+
+/// Substitute a slow code sequence with a faster one by
+/// evaluating an instruction combining pattern.
+/// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
+/// combining based on machine trace metrics. Only combine a sequence of
+/// instructions when this neither lengthens the critical path nor increases
+/// resource pressure. When optimizing for code size, always combine when the
+/// new sequence is shorter.
+bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
+ bool Changed = false;
+ DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
+
+ auto BlockIter = MBB->begin();
+ // Check if the block is in a loop.
+ const MachineLoop *ML = MLI->getLoopFor(MBB);
+
+ while (BlockIter != MBB->end()) {
+ auto &MI = *BlockIter++;
+
+ DEBUG(dbgs() << "INSTR "; MI.dump(); dbgs() << "\n";);
+ SmallVector<MachineCombinerPattern, 16> Patterns;
+ // The motivating example is:
+ //
+ // MUL Other MUL_op1 MUL_op2 Other
+ // \ / \ | /
+ // ADD/SUB => MADD/MSUB
+ // (=Root) (=NewRoot)
+
+ // The DAGCombine code always replaced MUL + ADD/SUB by MADD. While this is
+ // usually beneficial for code size it unfortunately can hurt performance
+ // when the ADD is on the critical path, but the MUL is not. With the
+ // substitution the MUL becomes part of the critical path (in form of the
+ // MADD) and can lengthen it on architectures where the MADD latency is
+ // longer than the ADD latency.
+ //
+ // For each instruction we check if it can be the root of a combiner
+ // pattern. Then for each pattern the new code sequence in form of MI is
+ // generated and evaluated. When the efficiency criteria (don't lengthen
+    // critical path, don't use more resources) are met, the new sequence gets
+ // hooked up into the basic block before the old sequence is removed.
+ //
+ // The algorithm does not try to evaluate all patterns and pick the best.
+ // This is only an artificial restriction though. In practice there is
+ // mostly one pattern, and getMachineCombinerPatterns() can order patterns
+ // based on an internal cost heuristic.
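+    //
+    // Illustrative latencies only: if ADD takes 1 cycle and MADD takes 4, and
+    // only the ADD sits on the critical path, folding MUL + ADD into MADD
+    // stretches that path by 3 cycles and is rejected; if the MUL already
+    // feeds the critical path, the fold removes an instruction without
+    // lengthening the path and is accepted.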
+
+ if (!TII->getMachineCombinerPatterns(MI, Patterns))
+ continue;
+
+ for (auto P : Patterns) {
+ SmallVector<MachineInstr *, 16> InsInstrs;
+ SmallVector<MachineInstr *, 16> DelInstrs;
+ DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+ if (!MinInstr)
+ MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+ MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+ Traces->verifyAnalysis();
+ TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
+ InstrIdxForVirtReg);
+ unsigned NewInstCount = InsInstrs.size();
+ unsigned OldInstCount = DelInstrs.size();
+ // Found pattern, but did not generate alternative sequence.
+ // This can happen e.g. when an immediate could not be materialized
+ // in a single instruction.
+ if (!NewInstCount)
+ continue;
+
+ bool SubstituteAlways = false;
+ if (ML && TII->isThroughputPattern(P))
+ SubstituteAlways = true;
+
+ // Substitute when we optimize for codesize and the new sequence has
+ // fewer instructions OR
+ // the new sequence neither lengthens the critical path nor increases
+ // resource pressure.
+ if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
+ (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
+ InstrIdxForVirtReg, P) &&
+ preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
+ for (auto *InstrPtr : InsInstrs)
+ MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
+ for (auto *InstrPtr : DelInstrs)
+ InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+
+ Changed = true;
+ ++NumInstCombined;
+
+ Traces->invalidate(MBB);
+ Traces->verifyAnalysis();
+ // Eagerly stop after the first pattern fires.
+ break;
+ } else {
+ // Cleanup instructions of the alternative code sequence. There is no
+ // use for them.
+ MachineFunction *MF = MBB->getParent();
+ for (auto *InstrPtr : InsInstrs)
+ MF->DeleteMachineInstr(InstrPtr);
+ }
+ InstrIdxForVirtReg.clear();
+ }
+ }
+
+ return Changed;
+}
+
+bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) {
+ const TargetSubtargetInfo &STI = MF.getSubtarget();
+ TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
+ SchedModel = STI.getSchedModel();
+ TSchedModel.init(SchedModel, &STI, TII);
+ MRI = &MF.getRegInfo();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ Traces = &getAnalysis<MachineTraceMetrics>();
+ MinInstr = nullptr;
+ OptSize = MF.getFunction()->optForSize();
+
+ DEBUG(dbgs() << getPassName() << ": " << MF.getName() << '\n');
+ if (!TII->useMachineCombiner()) {
+ DEBUG(dbgs() << " Skipping pass: Target does not support machine combiner\n");
+ return false;
+ }
+
+ bool Changed = false;
+
+ // Try to combine instructions.
+ for (auto &MBB : MF)
+ Changed |= combineInstructions(&MBB);
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
new file mode 100644
index 000000000000..8fdf39d54bd0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -0,0 +1,371 @@
+//===- MachineCopyPropagation.cpp - Machine Copy Propagation Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is an extremely simple MachineInstr-level copy propagation pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen-cp"
+
+STATISTIC(NumDeletes, "Number of dead copies deleted");
+
+namespace {
+ typedef SmallVector<unsigned, 4> RegList;
+ typedef DenseMap<unsigned, RegList> SourceMap;
+ typedef DenseMap<unsigned, MachineInstr*> Reg2MIMap;
+
+ class MachineCopyPropagation : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ const MachineRegisterInfo *MRI;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ MachineCopyPropagation() : MachineFunctionPass(ID) {
+ initializeMachineCopyPropagationPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ private:
+ void ClobberRegister(unsigned Reg);
+ void CopyPropagateBlock(MachineBasicBlock &MBB);
+ bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
+
+ /// Candidates for deletion.
+ SmallSetVector<MachineInstr*, 8> MaybeDeadCopies;
+ /// Def -> available copies map.
+ Reg2MIMap AvailCopyMap;
+ /// Def -> copies map.
+ Reg2MIMap CopyMap;
+ /// Src -> Def map
+ SourceMap SrcMap;
+ bool Changed;
+ };
+}
+char MachineCopyPropagation::ID = 0;
+char &llvm::MachineCopyPropagationID = MachineCopyPropagation::ID;
+
+INITIALIZE_PASS(MachineCopyPropagation, "machine-cp",
+ "Machine Copy Propagation Pass", false, false)
+
+/// Remove any entry in \p Map where the register is a subregister or equal to
+/// a register contained in \p Regs.
+static void removeRegsFromMap(Reg2MIMap &Map, const RegList &Regs,
+ const TargetRegisterInfo &TRI) {
+ for (unsigned Reg : Regs) {
+ // Source of copy is no longer available for propagation.
+ for (MCSubRegIterator SR(Reg, &TRI, true); SR.isValid(); ++SR)
+ Map.erase(*SR);
+ }
+}
+
+/// Remove any entry in \p Map that is marked clobbered in \p RegMask.
+/// The map will typically have a lot fewer entries than the regmask clobbers,
+/// so this is more efficient than iterating the clobbered registers and calling
+/// ClobberRegister() on them.
+static void removeClobberedRegsFromMap(Reg2MIMap &Map,
+ const MachineOperand &RegMask) {
+ for (Reg2MIMap::iterator I = Map.begin(), E = Map.end(), Next; I != E;
+ I = Next) {
+ Next = std::next(I);
+ unsigned Reg = I->first;
+ if (RegMask.clobbersPhysReg(Reg))
+ Map.erase(I);
+ }
+}
+
+void MachineCopyPropagation::ClobberRegister(unsigned Reg) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ CopyMap.erase(*AI);
+ AvailCopyMap.erase(*AI);
+
+ SourceMap::iterator SI = SrcMap.find(*AI);
+ if (SI != SrcMap.end()) {
+ removeRegsFromMap(AvailCopyMap, SI->second, *TRI);
+ SrcMap.erase(SI);
+ }
+ }
+}
+
+/// Return true if \p PreviousCopy did copy register \p Src to register \p Def.
+/// This fact may have been obscured by sub register usage or may not be true at
+/// all even though Src and Def are subregisters of the registers used in
+/// PreviousCopy. e.g.
+/// isNopCopy("ecx = COPY eax", AX, CX) == true
+/// isNopCopy("ecx = COPY eax", AH, CL) == false
+static bool isNopCopy(const MachineInstr &PreviousCopy, unsigned Src,
+ unsigned Def, const TargetRegisterInfo *TRI) {
+ unsigned PreviousSrc = PreviousCopy.getOperand(1).getReg();
+ unsigned PreviousDef = PreviousCopy.getOperand(0).getReg();
+ if (Src == PreviousSrc) {
+ assert(Def == PreviousDef);
+ return true;
+ }
+ if (!TRI->isSubRegister(PreviousSrc, Src))
+ return false;
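+  // Src is a strict subregister of PreviousSrc; the earlier copy covers this
+  // one only if Def occupies the same subregister slot inside PreviousDef
+  // (for instance, AX inside EAX and CX inside ECX on x86).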
+ unsigned SubIdx = TRI->getSubRegIndex(PreviousSrc, Src);
+ return SubIdx == TRI->getSubRegIndex(PreviousDef, Def);
+}
+
+/// Remove instruction \p Copy if there exists a previous copy that copies the
+/// register \p Src to the register \p Def; this may happen indirectly by
+/// copying the super registers.
+bool MachineCopyPropagation::eraseIfRedundant(MachineInstr &Copy, unsigned Src,
+ unsigned Def) {
+  // Avoid eliminating a copy from/to reserved registers as we cannot predict
+ // the value (Example: The sparc zero register is writable but stays zero).
+ if (MRI->isReserved(Src) || MRI->isReserved(Def))
+ return false;
+
+ // Search for an existing copy.
+ Reg2MIMap::iterator CI = AvailCopyMap.find(Def);
+ if (CI == AvailCopyMap.end())
+ return false;
+
+ // Check that the existing copy uses the correct sub registers.
+ MachineInstr &PrevCopy = *CI->second;
+ if (!isNopCopy(PrevCopy, Src, Def, TRI))
+ return false;
+
+ DEBUG(dbgs() << "MCP: copy is a NOP, removing: "; Copy.dump());
+
+ // Copy was redundantly redefining either Src or Def. Remove earlier kill
+ // flags between Copy and PrevCopy because the value will be reused now.
+ assert(Copy.isCopy());
+ unsigned CopyDef = Copy.getOperand(0).getReg();
+ assert(CopyDef == Src || CopyDef == Def);
+ for (MachineInstr &MI :
+ make_range(PrevCopy.getIterator(), Copy.getIterator()))
+ MI.clearRegisterKills(CopyDef, TRI);
+
+ Copy.eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ return true;
+}
+
+void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
+ DEBUG(dbgs() << "MCP: CopyPropagateBlock " << MBB.getName() << "\n");
+
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E; ) {
+ MachineInstr *MI = &*I;
+ ++I;
+
+ if (MI->isCopy()) {
+ unsigned Def = MI->getOperand(0).getReg();
+ unsigned Src = MI->getOperand(1).getReg();
+
+ assert(!TargetRegisterInfo::isVirtualRegister(Def) &&
+ !TargetRegisterInfo::isVirtualRegister(Src) &&
+ "MachineCopyPropagation should be run after register allocation!");
+
+      // If the two copies cancel out and the source of the first copy
+ // hasn't been overridden, eliminate the second one. e.g.
+ // %ECX<def> = COPY %EAX
+ // ... nothing clobbered EAX.
+ // %EAX<def> = COPY %ECX
+ // =>
+ // %ECX<def> = COPY %EAX
+ //
+ // or
+ //
+ // %ECX<def> = COPY %EAX
+ // ... nothing clobbered EAX.
+ // %ECX<def> = COPY %EAX
+ // =>
+ // %ECX<def> = COPY %EAX
+ if (eraseIfRedundant(*MI, Def, Src) || eraseIfRedundant(*MI, Src, Def))
+ continue;
+
+ // If Src is defined by a previous copy, the previous copy cannot be
+ // eliminated.
+ for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
+ if (CI != CopyMap.end()) {
+ DEBUG(dbgs() << "MCP: Copy is no longer dead: "; CI->second->dump());
+ MaybeDeadCopies.remove(CI->second);
+ }
+ }
+
+ DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
+
+ // Copy is now a candidate for deletion.
+ if (!MRI->isReserved(Def))
+ MaybeDeadCopies.insert(MI);
+
+      // If 'Def' was previously the source of another copy, then that earlier
+      // copy's source is no longer available. e.g.
+ // %xmm9<def> = copy %xmm2
+ // ...
+ // %xmm2<def> = copy %xmm0
+ // ...
+ // %xmm2<def> = copy %xmm9
+ ClobberRegister(Def);
+
+ // Remember Def is defined by the copy.
+ for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
+ ++SR) {
+ CopyMap[*SR] = MI;
+ AvailCopyMap[*SR] = MI;
+ }
+
+      // Remember the source that's copied to Def. Once it's clobbered, it's
+      // no longer available for copy propagation.
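+      // For example, after "%ECX<def> = COPY %EAX", SrcMap[EAX] records ECX;
+      // when EAX is later clobbered, ClobberRegister(EAX) drops the ECX entry
+      // from AvailCopyMap via removeRegsFromMap().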
+ RegList &DestList = SrcMap[Src];
+ if (std::find(DestList.begin(), DestList.end(), Def) == DestList.end())
+ DestList.push_back(Def);
+
+ continue;
+ }
+
+ // Not a copy.
+ SmallVector<unsigned, 2> Defs;
+ const MachineOperand *RegMask = nullptr;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isRegMask())
+ RegMask = &MO;
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ assert(!TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "MachineCopyPropagation should be run after register allocation!");
+
+ if (MO.isDef()) {
+ Defs.push_back(Reg);
+ continue;
+ }
+
+ // If 'Reg' is defined by a copy, the copy is no longer a candidate
+ // for elimination.
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ Reg2MIMap::iterator CI = CopyMap.find(*AI);
+ if (CI != CopyMap.end()) {
+ DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
+ MaybeDeadCopies.remove(CI->second);
+ }
+ }
+      // Treat undef uses like defs for copy propagation, but not for dead
+      // copy elimination: we would need to do a liveness check to be sure
+      // the copy is dead for undef uses.
+      // Backends are allowed to do whatever they want with undef values,
+      // and we cannot be sure this register will not be rewritten to break
+      // some false dependencies for the hardware, for instance.
+ if (MO.isUndef())
+ Defs.push_back(Reg);
+ }
+
+ // The instruction has a register mask operand which means that it clobbers
+ // a large set of registers. Treat clobbered registers the same way as
+ // defined registers.
+ if (RegMask) {
+ // Erase any MaybeDeadCopies whose destination register is clobbered.
+ for (SmallSetVector<MachineInstr *, 8>::iterator DI =
+ MaybeDeadCopies.begin();
+ DI != MaybeDeadCopies.end();) {
+ MachineInstr *MaybeDead = *DI;
+ unsigned Reg = MaybeDead->getOperand(0).getReg();
+ assert(!MRI->isReserved(Reg));
+
+ if (!RegMask->clobbersPhysReg(Reg)) {
+ ++DI;
+ continue;
+ }
+
+ DEBUG(dbgs() << "MCP: Removing copy due to regmask clobbering: ";
+ MaybeDead->dump());
+
+ // erase() will return the next valid iterator pointing to the next
+ // element after the erased one.
+ DI = MaybeDeadCopies.erase(DI);
+ MaybeDead->eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ }
+
+ removeClobberedRegsFromMap(AvailCopyMap, *RegMask);
+ removeClobberedRegsFromMap(CopyMap, *RegMask);
+ for (SourceMap::iterator I = SrcMap.begin(), E = SrcMap.end(), Next;
+ I != E; I = Next) {
+ Next = std::next(I);
+ if (RegMask->clobbersPhysReg(I->first)) {
+ removeRegsFromMap(AvailCopyMap, I->second, *TRI);
+ SrcMap.erase(I);
+ }
+ }
+ }
+
+    // Any previous copy that defines or reads one of the Defs is no longer available.
+ for (unsigned Reg : Defs)
+ ClobberRegister(Reg);
+ }
+
+ // If MBB doesn't have successors, delete the copies whose defs are not used.
+  // If MBB does have successors, then conservatively assume the defs are live-out
+ // since we don't want to trust live-in lists.
+ if (MBB.succ_empty()) {
+ for (MachineInstr *MaybeDead : MaybeDeadCopies) {
+ assert(!MRI->isReserved(MaybeDead->getOperand(0).getReg()));
+ MaybeDead->eraseFromParent();
+ Changed = true;
+ ++NumDeletes;
+ }
+ }
+
+ MaybeDeadCopies.clear();
+ AvailCopyMap.clear();
+ CopyMap.clear();
+ SrcMap.clear();
+}
+
+bool MachineCopyPropagation::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ Changed = false;
+
+ TRI = MF.getSubtarget().getRegisterInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+
+ for (MachineBasicBlock &MBB : MF)
+ CopyPropagateBlock(MBB);
+
+ return Changed;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
new file mode 100644
index 000000000000..acb7c4810b16
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineDominanceFrontier.cpp
@@ -0,0 +1,54 @@
+//===- MachineDominanceFrontier.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/Analysis/DominanceFrontierImpl.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+
+
+using namespace llvm;
+
+namespace llvm {
+template class DominanceFrontierBase<MachineBasicBlock>;
+template class ForwardDominanceFrontierBase<MachineBasicBlock>;
+}
+
+
+char MachineDominanceFrontier::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineDominanceFrontier, "machine-domfrontier",
+ "Machine Dominance Frontier Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineDominanceFrontier, "machine-domfrontier",
+ "Machine Dominance Frontier Construction", true, true)
+
+MachineDominanceFrontier::MachineDominanceFrontier()
+ : MachineFunctionPass(ID),
+ Base() {
+ initializeMachineDominanceFrontierPass(*PassRegistry::getPassRegistry());
+}
+
+char &llvm::MachineDominanceFrontierID = MachineDominanceFrontier::ID;
+
+bool MachineDominanceFrontier::runOnMachineFunction(MachineFunction &) {
+ releaseMemory();
+ Base.analyze(getAnalysis<MachineDominatorTree>().getBase());
+ return false;
+}
+
+void MachineDominanceFrontier::releaseMemory() {
+ Base.releaseMemory();
+}
+
+void MachineDominanceFrontier::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineDominators.cpp b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
new file mode 100644
index 000000000000..303a6a9263be
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineDominators.cpp
@@ -0,0 +1,157 @@
+//===- MachineDominators.cpp - Machine Dominator Calculation --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// forward dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+// Always verify dominfo if expensive checking is enabled.
+#ifdef EXPENSIVE_CHECKS
+static bool VerifyMachineDomInfo = true;
+#else
+static bool VerifyMachineDomInfo = false;
+#endif
+static cl::opt<bool, true> VerifyMachineDomInfoX(
+ "verify-machine-dom-info", cl::location(VerifyMachineDomInfo),
+ cl::desc("Verify machine dominator info (time consuming)"));
+
+namespace llvm {
+template class DomTreeNodeBase<MachineBasicBlock>;
+template class DominatorTreeBase<MachineBasicBlock>;
+}
+
+char MachineDominatorTree::ID = 0;
+
+INITIALIZE_PASS(MachineDominatorTree, "machinedomtree",
+ "MachineDominator Tree Construction", true, true)
+
+char &llvm::MachineDominatorsID = MachineDominatorTree::ID;
+
+void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ CriticalEdgesToSplit.clear();
+ NewBBs.clear();
+ DT->recalculate(F);
+
+ return false;
+}
+
+MachineDominatorTree::MachineDominatorTree()
+ : MachineFunctionPass(ID) {
+ initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+ DT = new DominatorTreeBase<MachineBasicBlock>(false);
+}
+
+MachineDominatorTree::~MachineDominatorTree() {
+ delete DT;
+}
+
+void MachineDominatorTree::releaseMemory() {
+ DT->releaseMemory();
+}
+
+void MachineDominatorTree::verifyAnalysis() const {
+ if (VerifyMachineDomInfo)
+ verifyDomTree();
+}
+
+void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
+ DT->print(OS);
+}
+
+void MachineDominatorTree::applySplitCriticalEdges() const {
+ // Bail out early if there is nothing to do.
+ if (CriticalEdgesToSplit.empty())
+ return;
+
+  // For each element in CriticalEdgesToSplit, remember whether or not the
+  // element is the new immediate dominator of its successor. The mapping is
+  // done by index, i.e., the information for the ith element of
+  // CriticalEdgesToSplit is the ith element of IsNewIDom.
+ SmallBitVector IsNewIDom(CriticalEdgesToSplit.size(), true);
+ size_t Idx = 0;
+
+ // Collect all the dominance properties info, before invalidating
+ // the underlying DT.
+ for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+ // Update dominator information.
+ MachineBasicBlock *Succ = Edge.ToBB;
+ MachineDomTreeNode *SuccDTNode = DT->getNode(Succ);
+
+ for (MachineBasicBlock *PredBB : Succ->predecessors()) {
+ if (PredBB == Edge.NewBB)
+ continue;
+ // If we are in this situation:
+ // FromBB1 FromBB2
+ // + +
+ // + + + +
+ // + + + +
+ // ... Split1 Split2 ...
+ // + +
+ // + +
+ // +
+ // Succ
+      // Instead of checking the dominance property with Split2, we check it
+      // with FromBB2, since Split2 is not yet known to the underlying DT
+      // structure.
+ if (NewBBs.count(PredBB)) {
+ assert(PredBB->pred_size() == 1 && "A basic block resulting from a "
+ "critical edge split has more "
+ "than one predecessor!");
+ PredBB = *PredBB->pred_begin();
+ }
+ if (!DT->dominates(SuccDTNode, DT->getNode(PredBB))) {
+ IsNewIDom[Idx] = false;
+ break;
+ }
+ }
+ ++Idx;
+ }
+
+ // Now, update DT with the collected dominance properties info.
+ Idx = 0;
+ for (CriticalEdge &Edge : CriticalEdgesToSplit) {
+ // We know FromBB dominates NewBB.
+ MachineDomTreeNode *NewDTNode = DT->addNewBlock(Edge.NewBB, Edge.FromBB);
+
+ // If all the other predecessors of "Succ" are dominated by "Succ" itself
+ // then the new block is the new immediate dominator of "Succ". Otherwise,
+ // the new block doesn't dominate anything.
+ if (IsNewIDom[Idx])
+ DT->changeImmediateDominator(DT->getNode(Edge.ToBB), NewDTNode);
+ ++Idx;
+ }
+ NewBBs.clear();
+ CriticalEdgesToSplit.clear();
+}
+
+void MachineDominatorTree::verifyDomTree() const {
+ MachineFunction &F = *getRoot()->getParent();
+
+ MachineDominatorTree OtherDT;
+ OtherDT.DT->recalculate(F);
+ if (compare(OtherDT)) {
+ errs() << "MachineDominatorTree is not up to date!\nComputed:\n";
+ print(errs(), nullptr);
+ errs() << "\nActual:\n";
+ OtherDT.print(errs(), nullptr);
+ abort();
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineFunction.cpp b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
new file mode 100644
index 000000000000..a7c63ef4c852
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunction.cpp
@@ -0,0 +1,1002 @@
+//===-- MachineFunction.cpp -----------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Collect native machine code information for a function. This allows
+// target-specific information about the generated code to be stored with each
+// function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionInitializer.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "codegen"
+
+static cl::opt<unsigned>
+ AlignAllFunctions("align-all-functions",
+ cl::desc("Force the alignment of all functions."),
+ cl::init(0), cl::Hidden);
+
+void MachineFunctionInitializer::anchor() {}
+
+void MachineFunctionProperties::print(raw_ostream &ROS, bool OnlySet) const {
+ // Leave this function even in NDEBUG as an out-of-line anchor.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ for (BitVector::size_type i = 0; i < Properties.size(); ++i) {
+ bool HasProperty = Properties[i];
+ if (OnlySet && !HasProperty)
+ continue;
+ switch(static_cast<Property>(i)) {
+ case Property::IsSSA:
+ ROS << (HasProperty ? "SSA, " : "Post SSA, ");
+ break;
+ case Property::TracksLiveness:
+ ROS << (HasProperty ? "" : "not ") << "tracking liveness, ";
+ break;
+ case Property::AllVRegsAllocated:
+ ROS << (HasProperty ? "AllVRegsAllocated" : "HasVRegs");
+ break;
+ default:
+ break;
+ }
+ }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// MachineFunction implementation
+//===----------------------------------------------------------------------===//
+
+// Out-of-line virtual method.
+MachineFunctionInfo::~MachineFunctionInfo() {}
+
+void ilist_traits<MachineBasicBlock>::deleteNode(MachineBasicBlock *MBB) {
+ MBB->getParent()->DeleteMachineBasicBlock(MBB);
+}
+
+static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
+ const Function *Fn) {
+ if (Fn->hasFnAttribute(Attribute::StackAlignment))
+ return Fn->getFnStackAlignment();
+ return STI->getFrameLowering()->getStackAlignment();
+}
+
+MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
+ unsigned FunctionNum, MachineModuleInfo &mmi)
+ : Fn(F), Target(TM), STI(TM.getSubtargetImpl(*F)), Ctx(mmi.getContext()),
+ MMI(mmi) {
+ // Assume the function starts in SSA form with correct liveness.
+ Properties.set(MachineFunctionProperties::Property::IsSSA);
+ Properties.set(MachineFunctionProperties::Property::TracksLiveness);
+ if (STI->getRegisterInfo())
+ RegInfo = new (Allocator) MachineRegisterInfo(this);
+ else
+ RegInfo = nullptr;
+
+ MFInfo = nullptr;
+ // We can realign the stack if the target supports it and the user hasn't
+ // explicitly asked us not to.
+ bool CanRealignSP = STI->getFrameLowering()->isStackRealignable() &&
+ !F->hasFnAttribute("no-realign-stack");
+ FrameInfo = new (Allocator) MachineFrameInfo(
+ getFnStackAlignment(STI, Fn), /*StackRealignable=*/CanRealignSP,
+ /*ForceRealign=*/CanRealignSP &&
+ F->hasFnAttribute(Attribute::StackAlignment));
+
+ if (Fn->hasFnAttribute(Attribute::StackAlignment))
+ FrameInfo->ensureMaxAlignment(Fn->getFnStackAlignment());
+
+ ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
+ Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
+
+ // FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
+ // FIXME: Use Function::optForSize().
+ if (!Fn->hasFnAttribute(Attribute::OptimizeForSize))
+ Alignment = std::max(Alignment,
+ STI->getTargetLowering()->getPrefFunctionAlignment());
+
+ if (AlignAllFunctions)
+ Alignment = AlignAllFunctions;
+
+ FunctionNumber = FunctionNum;
+ JumpTableInfo = nullptr;
+
+ if (isFuncletEHPersonality(classifyEHPersonality(
+ F->hasPersonalityFn() ? F->getPersonalityFn() : nullptr))) {
+ WinEHInfo = new (Allocator) WinEHFuncInfo();
+ }
+
+ assert(TM.isCompatibleDataLayout(getDataLayout()) &&
+ "Can't create a MachineFunction using a Module with a "
+ "Target-incompatible DataLayout attached\n");
+
+ PSVManager = llvm::make_unique<PseudoSourceValueManager>();
+}
+
+MachineFunction::~MachineFunction() {
+ // Don't call destructors on MachineInstr and MachineOperand. All of their
+ // memory comes from the BumpPtrAllocator which is about to be purged.
+ //
+  // Do call MachineBasicBlock destructors; they contain std::vectors.
+ for (iterator I = begin(), E = end(); I != E; I = BasicBlocks.erase(I))
+ I->Insts.clearAndLeakNodesUnsafely();
+
+ InstructionRecycler.clear(Allocator);
+ OperandRecycler.clear(Allocator);
+ BasicBlockRecycler.clear(Allocator);
+ if (RegInfo) {
+ RegInfo->~MachineRegisterInfo();
+ Allocator.Deallocate(RegInfo);
+ }
+ if (MFInfo) {
+ MFInfo->~MachineFunctionInfo();
+ Allocator.Deallocate(MFInfo);
+ }
+
+ FrameInfo->~MachineFrameInfo();
+ Allocator.Deallocate(FrameInfo);
+
+ ConstantPool->~MachineConstantPool();
+ Allocator.Deallocate(ConstantPool);
+
+ if (JumpTableInfo) {
+ JumpTableInfo->~MachineJumpTableInfo();
+ Allocator.Deallocate(JumpTableInfo);
+ }
+
+ if (WinEHInfo) {
+ WinEHInfo->~WinEHFuncInfo();
+ Allocator.Deallocate(WinEHInfo);
+ }
+}
+
+const DataLayout &MachineFunction::getDataLayout() const {
+ return Fn->getParent()->getDataLayout();
+}
+
+/// Get the JumpTableInfo for this function.
+/// If it does not already exist, allocate one.
+MachineJumpTableInfo *MachineFunction::
+getOrCreateJumpTableInfo(unsigned EntryKind) {
+ if (JumpTableInfo) return JumpTableInfo;
+
+ JumpTableInfo = new (Allocator)
+ MachineJumpTableInfo((MachineJumpTableInfo::JTEntryKind)EntryKind);
+ return JumpTableInfo;
+}
+
+/// Should we emit segmented stack support for the function?
+bool MachineFunction::shouldSplitStack() const {
+ return getFunction()->hasFnAttribute("split-stack");
+}
+
+/// This discards all of the MachineBasicBlock numbers and recomputes them.
+/// This guarantees that the MBB numbers are sequential, dense, and match the
+/// ordering of the blocks within the function. If a specific MachineBasicBlock
+/// is specified, only that block and those after it are renumbered.
+void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
+ if (empty()) { MBBNumbering.clear(); return; }
+ MachineFunction::iterator MBBI, E = end();
+ if (MBB == nullptr)
+ MBBI = begin();
+ else
+ MBBI = MBB->getIterator();
+
+ // Figure out the block number this should have.
+ unsigned BlockNo = 0;
+ if (MBBI != begin())
+ BlockNo = std::prev(MBBI)->getNumber() + 1;
+
+ for (; MBBI != E; ++MBBI, ++BlockNo) {
+ if (MBBI->getNumber() != (int)BlockNo) {
+ // Remove use of the old number.
+ if (MBBI->getNumber() != -1) {
+ assert(MBBNumbering[MBBI->getNumber()] == &*MBBI &&
+ "MBB number mismatch!");
+ MBBNumbering[MBBI->getNumber()] = nullptr;
+ }
+
+ // If BlockNo is already taken, set that block's number to -1.
+ if (MBBNumbering[BlockNo])
+ MBBNumbering[BlockNo]->setNumber(-1);
+
+ MBBNumbering[BlockNo] = &*MBBI;
+ MBBI->setNumber(BlockNo);
+ }
+ }
+
+ // Okay, all the blocks are renumbered. If we have compactified the block
+ // numbering, shrink MBBNumbering now.
+ assert(BlockNo <= MBBNumbering.size() && "Mismatch!");
+ MBBNumbering.resize(BlockNo);
+}
+
+/// Allocate a new MachineInstr. Use this instead of `new MachineInstr'.
+MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
+ const DebugLoc &DL,
+ bool NoImp) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(*this, MCID, DL, NoImp);
+}
+
+/// Create a new MachineInstr which is a copy of the 'Orig' instruction,
+/// identical in all ways except the instruction has no parent, prev, or next.
+MachineInstr *
+MachineFunction::CloneMachineInstr(const MachineInstr *Orig) {
+ return new (InstructionRecycler.Allocate<MachineInstr>(Allocator))
+ MachineInstr(*this, *Orig);
+}
+
+/// Delete the given MachineInstr.
+///
+/// This function also serves as the MachineInstr destructor - the real
+/// ~MachineInstr() destructor must be empty.
+void
+MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
+ // Strip it for parts. The operand array and the MI object itself are
+ // independently recyclable.
+ if (MI->Operands)
+ deallocateOperandArray(MI->CapOperands, MI->Operands);
+ // Don't call ~MachineInstr() which must be trivial anyway because
+  // ~MachineFunction drops whole lists of MachineInstrs without calling their
+ // destructors.
+ InstructionRecycler.Deallocate(Allocator, MI);
+}
+
+/// Allocate a new MachineBasicBlock. Use this instead of
+/// `new MachineBasicBlock'.
+MachineBasicBlock *
+MachineFunction::CreateMachineBasicBlock(const BasicBlock *bb) {
+ return new (BasicBlockRecycler.Allocate<MachineBasicBlock>(Allocator))
+ MachineBasicBlock(*this, bb);
+}
+
+/// Delete the given MachineBasicBlock.
+void
+MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
+ assert(MBB->getParent() == this && "MBB parent mismatch!");
+ MBB->~MachineBasicBlock();
+ BasicBlockRecycler.Deallocate(Allocator, MBB);
+}
+
+MachineMemOperand *MachineFunction::getMachineMemOperand(
+ MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
+ unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ return new (Allocator)
+ MachineMemOperand(PtrInfo, f, s, base_alignment, AAInfo, Ranges);
+}
+
+MachineMemOperand *
+MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
+ int64_t Offset, uint64_t Size) {
+ if (MMO->getValue())
+ return new (Allocator)
+ MachineMemOperand(MachinePointerInfo(MMO->getValue(),
+ MMO->getOffset()+Offset),
+ MMO->getFlags(), Size,
+ MMO->getBaseAlignment());
+ return new (Allocator)
+ MachineMemOperand(MachinePointerInfo(MMO->getPseudoValue(),
+ MMO->getOffset()+Offset),
+ MMO->getFlags(), Size,
+ MMO->getBaseAlignment());
+}
+
+MachineInstr::mmo_iterator
+MachineFunction::allocateMemRefsArray(unsigned long Num) {
+ return Allocator.Allocate<MachineMemOperand *>(Num);
+}
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractLoadMemRefs(MachineInstr::mmo_iterator Begin,
+ MachineInstr::mmo_iterator End) {
+ // Count the number of load mem refs.
+ unsigned Num = 0;
+ for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+ if ((*I)->isLoad())
+ ++Num;
+
+ // Allocate a new array and populate it with the load information.
+ MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+ unsigned Index = 0;
+ for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+ if ((*I)->isLoad()) {
+ if (!(*I)->isStore())
+ // Reuse the MMO.
+ Result[Index] = *I;
+ else {
+ // Clone the MMO and unset the store flag.
+ MachineMemOperand *JustLoad =
+ getMachineMemOperand((*I)->getPointerInfo(),
+ (*I)->getFlags() & ~MachineMemOperand::MOStore,
+ (*I)->getSize(), (*I)->getBaseAlignment(),
+ (*I)->getAAInfo());
+ Result[Index] = JustLoad;
+ }
+ ++Index;
+ }
+ }
+ return std::make_pair(Result, Result + Num);
+}
+
+std::pair<MachineInstr::mmo_iterator, MachineInstr::mmo_iterator>
+MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
+ MachineInstr::mmo_iterator End) {
+  // Count the number of store mem refs.
+ unsigned Num = 0;
+ for (MachineInstr::mmo_iterator I = Begin; I != End; ++I)
+ if ((*I)->isStore())
+ ++Num;
+
+ // Allocate a new array and populate it with the store information.
+ MachineInstr::mmo_iterator Result = allocateMemRefsArray(Num);
+ unsigned Index = 0;
+ for (MachineInstr::mmo_iterator I = Begin; I != End; ++I) {
+ if ((*I)->isStore()) {
+ if (!(*I)->isLoad())
+ // Reuse the MMO.
+ Result[Index] = *I;
+ else {
+ // Clone the MMO and unset the load flag.
+ MachineMemOperand *JustStore =
+ getMachineMemOperand((*I)->getPointerInfo(),
+ (*I)->getFlags() & ~MachineMemOperand::MOLoad,
+ (*I)->getSize(), (*I)->getBaseAlignment(),
+ (*I)->getAAInfo());
+ Result[Index] = JustStore;
+ }
+ ++Index;
+ }
+ }
+ return std::make_pair(Result, Result + Num);
+}
+
+const char *MachineFunction::createExternalSymbolName(StringRef Name) {
+ char *Dest = Allocator.Allocate<char>(Name.size() + 1);
+ std::copy(Name.begin(), Name.end(), Dest);
+ Dest[Name.size()] = 0;
+ return Dest;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineFunction::dump() const {
+ print(dbgs());
+}
+#endif
+
+StringRef MachineFunction::getName() const {
+ assert(getFunction() && "No function!");
+ return getFunction()->getName();
+}
+
+void MachineFunction::print(raw_ostream &OS, const SlotIndexes *Indexes) const {
+ OS << "# Machine code for function " << getName() << ": ";
+ OS << "Properties: <";
+ getProperties().print(OS);
+ OS << ">\n";
+
+ // Print Frame Information
+ FrameInfo->print(*this, OS);
+
+ // Print JumpTable Information
+ if (JumpTableInfo)
+ JumpTableInfo->print(OS);
+
+ // Print Constant Pool
+ ConstantPool->print(OS);
+
+ const TargetRegisterInfo *TRI = getSubtarget().getRegisterInfo();
+
+ if (RegInfo && !RegInfo->livein_empty()) {
+ OS << "Function Live Ins: ";
+ for (MachineRegisterInfo::livein_iterator
+ I = RegInfo->livein_begin(), E = RegInfo->livein_end(); I != E; ++I) {
+ OS << PrintReg(I->first, TRI);
+ if (I->second)
+ OS << " in " << PrintReg(I->second, TRI);
+ if (std::next(I) != E)
+ OS << ", ";
+ }
+ OS << '\n';
+ }
+
+ ModuleSlotTracker MST(getFunction()->getParent());
+ MST.incorporateFunction(*getFunction());
+ for (const auto &BB : *this) {
+ OS << '\n';
+ BB.print(OS, MST, Indexes);
+ }
+
+ OS << "\n# End machine code for function " << getName() << ".\n\n";
+}
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<const MachineFunction*> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const MachineFunction *F) {
+ return ("CFG for '" + F->getName() + "' function").str();
+ }
+
+ std::string getNodeLabel(const MachineBasicBlock *Node,
+ const MachineFunction *Graph) {
+ std::string OutStr;
+ {
+ raw_string_ostream OSS(OutStr);
+
+ if (isSimple()) {
+ OSS << "BB#" << Node->getNumber();
+ if (const BasicBlock *BB = Node->getBasicBlock())
+ OSS << ": " << BB->getName();
+ } else
+ Node->print(OSS);
+ }
+
+ if (OutStr[0] == '\n') OutStr.erase(OutStr.begin());
+
+ // Process string output to make it nicer...
+ for (unsigned i = 0; i != OutStr.length(); ++i)
+ if (OutStr[i] == '\n') { // Left justify
+ OutStr[i] = '\\';
+ OutStr.insert(OutStr.begin()+i+1, 'l');
+ }
+ return OutStr;
+ }
+ };
+}
+
+void MachineFunction::viewCFG() const
+{
+#ifndef NDEBUG
+ ViewGraph(this, "mf" + getName());
+#else
+ errs() << "MachineFunction::viewCFG is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+void MachineFunction::viewCFGOnly() const
+{
+#ifndef NDEBUG
+ ViewGraph(this, "mf" + getName(), true);
+#else
+ errs() << "MachineFunction::viewCFGOnly is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Add the specified physical register as a live-in value and
+/// create a corresponding virtual register for it.
+unsigned MachineFunction::addLiveIn(unsigned PReg,
+ const TargetRegisterClass *RC) {
+ MachineRegisterInfo &MRI = getRegInfo();
+ unsigned VReg = MRI.getLiveInVirtReg(PReg);
+ if (VReg) {
+ const TargetRegisterClass *VRegRC = MRI.getRegClass(VReg);
+ (void)VRegRC;
+ // A physical register can be added several times.
+ // Between two calls, the register class of the related virtual register
+ // may have been constrained to match some operation constraints.
+ // In that case, check that the current register class includes the
+ // physical register and is a sub class of the specified RC.
+ assert((VRegRC == RC || (VRegRC->contains(PReg) &&
+ RC->hasSubClassEq(VRegRC))) &&
+ "Register class mismatch!");
+ return VReg;
+ }
+ VReg = MRI.createVirtualRegister(RC);
+ MRI.addLiveIn(PReg, VReg);
+ return VReg;
+}
+
+/// Return the MCSymbol for the specified non-empty jump table.
+/// If isLinkerPrivate is specified, an 'l' label is returned, otherwise a
+/// normal 'L' label is returned.
+MCSymbol *MachineFunction::getJTISymbol(unsigned JTI, MCContext &Ctx,
+ bool isLinkerPrivate) const {
+ const DataLayout &DL = getDataLayout();
+ assert(JumpTableInfo && "No jump tables");
+ assert(JTI < JumpTableInfo->getJumpTables().size() && "Invalid JTI!");
+
+ const char *Prefix = isLinkerPrivate ? DL.getLinkerPrivateGlobalPrefix()
+ : DL.getPrivateGlobalPrefix();
+ SmallString<60> Name;
+ raw_svector_ostream(Name)
+ << Prefix << "JTI" << getFunctionNumber() << '_' << JTI;
+ return Ctx.getOrCreateSymbol(Name);
+}
+
+/// Return a function-local symbol to represent the PIC base.
+MCSymbol *MachineFunction::getPICBaseSymbol() const {
+ const DataLayout &DL = getDataLayout();
+ return Ctx.getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
+ Twine(getFunctionNumber()) + "$pb");
+}
+
+//===----------------------------------------------------------------------===//
+// MachineFrameInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// Make sure the stack frame is at least Align bytes aligned.
+void MachineFrameInfo::ensureMaxAlignment(unsigned Align) {
+ if (!StackRealignable)
+ assert(Align <= StackAlignment &&
+ "For targets without stack realignment, Align is out of limit!");
+ if (MaxAlignment < Align) MaxAlignment = Align;
+}
+
+/// Clamp the alignment if requested and emit a warning.
+static inline unsigned clampStackAlignment(bool ShouldClamp, unsigned Align,
+ unsigned StackAlign) {
+ if (!ShouldClamp || Align <= StackAlign)
+ return Align;
+ DEBUG(dbgs() << "Warning: requested alignment " << Align
+ << " exceeds the stack alignment " << StackAlign
+ << " when stack realignment is off" << '\n');
+ return StackAlign;
+}
+
+/// Create a new statically sized stack object, returning a nonnegative
+/// identifier to represent it.
+int MachineFrameInfo::CreateStackObject(uint64_t Size, unsigned Alignment,
+ bool isSS, const AllocaInst *Alloca) {
+ assert(Size != 0 && "Cannot allocate zero size stack objects!");
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(Size, Alignment, 0, false, isSS, Alloca,
+ !isSS));
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ assert(Index >= 0 && "Bad frame index!");
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+/// Create a new statically sized stack object that represents a spill slot,
+/// returning a nonnegative identifier to represent it.
+int MachineFrameInfo::CreateSpillStackObject(uint64_t Size,
+ unsigned Alignment) {
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ CreateStackObject(Size, Alignment, true);
+ int Index = (int)Objects.size() - NumFixedObjects - 1;
+ ensureMaxAlignment(Alignment);
+ return Index;
+}
+
+/// Notify the MachineFrameInfo object that a variable sized object has been
+/// created. This must be created whenever a variable sized object is created,
+/// whether or not the index returned is actually used.
+int MachineFrameInfo::CreateVariableSizedObject(unsigned Alignment,
+ const AllocaInst *Alloca) {
+ HasVarSizedObjects = true;
+ Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
+ Objects.push_back(StackObject(0, Alignment, 0, false, false, Alloca, true));
+ ensureMaxAlignment(Alignment);
+ return (int)Objects.size()-NumFixedObjects-1;
+}
+
+/// Create a new object at a fixed location on the stack.
+/// All fixed objects should be created before other objects are created for
+/// efficiency. By default, fixed objects are immutable. This returns an
+/// index with a negative value.
+int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
+ bool Immutable, bool isAliased) {
+ assert(Size != 0 && "Cannot allocate zero size fixed stack objects!");
+ // The alignment of the frame index can be determined from its offset from
+ // the incoming frame position. If the frame object is at offset 32 and
+ // the stack is guaranteed to be 16-byte aligned, then we know that the
+ // object is 16-byte aligned. Note that unlike the non-fixed case, if the
+ // stack needs realignment, we can't assume that the stack will in fact be
+ // aligned.
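+  // For example (illustrative): with a 16-byte aligned incoming stack, an
+  // object at SPOffset 32 gets MinAlign(32, 16) = 16, while an object at
+  // SPOffset 36 only gets MinAlign(36, 16) = 4.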
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
+ Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset, Immutable,
+ /*isSS*/ false,
+ /*Alloca*/ nullptr, isAliased));
+ return -++NumFixedObjects;
+}
+
+/// Create a spill slot at a fixed location on the stack.
+/// Returns an index with a negative value.
+int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
+ int64_t SPOffset) {
+ unsigned Align = MinAlign(SPOffset, ForcedRealign ? 1 : StackAlignment);
+ Align = clampStackAlignment(!StackRealignable, Align, StackAlignment);
+ Objects.insert(Objects.begin(), StackObject(Size, Align, SPOffset,
+ /*Immutable*/ true,
+ /*isSS*/ true,
+ /*Alloca*/ nullptr,
+ /*isAliased*/ false));
+ return -++NumFixedObjects;
+}
+
+BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ BitVector BV(TRI->getNumRegs());
+
+ // Before CSI is calculated, no registers are considered pristine. They can be
+ // freely used and PEI will make sure they are saved.
+ if (!isCalleeSavedInfoValid())
+ return BV;
+
+ for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+ BV.set(*CSR);
+
+ // Saved CSRs are not pristine.
+ for (auto &I : getCalleeSavedInfo())
+ for (MCSubRegIterator S(I.getReg(), TRI, true); S.isValid(); ++S)
+ BV.reset(*S);
+
+ return BV;
+}
+
+unsigned MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
+ unsigned MaxAlign = getMaxAlignment();
+ int Offset = 0;
+
+ // This code is very, very similar to PEI::calculateFrameObjectOffsets().
+ // It really should be refactored to share code. Until then, changes
+ // should keep in mind that there's tight coupling between the two.
+
+ for (int i = getObjectIndexBegin(); i != 0; ++i) {
+ int FixedOff = -getObjectOffset(i);
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+ for (unsigned i = 0, e = getObjectIndexEnd(); i != e; ++i) {
+ if (isDeadObjectIndex(i))
+ continue;
+ Offset += getObjectSize(i);
+ unsigned Align = getObjectAlignment(i);
+ // Adjust to alignment boundary
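+    // (e.g. an Offset of 20 with an 8-byte aligned object rounds up to 24).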
+ Offset = (Offset+Align-1)/Align*Align;
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ if (adjustsStack() && TFI->hasReservedCallFrame(MF))
+ Offset += getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+ // any calls or alloca's, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (adjustsStack() || hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0))
+ StackAlign = TFI->getStackAlignment();
+ else
+ StackAlign = TFI->getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ unsigned AlignMask = StackAlign - 1;
+ Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+
+ return (unsigned)Offset;
+}
+
+void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
+ if (Objects.empty()) return;
+
+ const TargetFrameLowering *FI = MF.getSubtarget().getFrameLowering();
+ int ValOffset = (FI ? FI->getOffsetOfLocalArea() : 0);
+
+ OS << "Frame Objects:\n";
+
+ for (unsigned i = 0, e = Objects.size(); i != e; ++i) {
+ const StackObject &SO = Objects[i];
+ OS << " fi#" << (int)(i-NumFixedObjects) << ": ";
+ if (SO.Size == ~0ULL) {
+ OS << "dead\n";
+ continue;
+ }
+ if (SO.Size == 0)
+ OS << "variable sized";
+ else
+ OS << "size=" << SO.Size;
+ OS << ", align=" << SO.Alignment;
+
+ if (i < NumFixedObjects)
+ OS << ", fixed";
+ if (i < NumFixedObjects || SO.SPOffset != -1) {
+ int64_t Off = SO.SPOffset - ValOffset;
+ OS << ", at location [SP";
+ if (Off > 0)
+ OS << "+" << Off;
+ else if (Off < 0)
+ OS << Off;
+ OS << "]";
+ }
+ OS << "\n";
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void MachineFrameInfo::dump(const MachineFunction &MF) const {
+ print(MF, dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// MachineJumpTableInfo implementation
+//===----------------------------------------------------------------------===//
+
+/// Return the size of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntrySize(const DataLayout &TD) const {
+ // The size of a jump table entry is 4 bytes unless the entry is just the
+ // address of a block, in which case it is the pointer size.
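+  // For instance, on a typical 64-bit target an EK_BlockAddress entry is
+  // 8 bytes wide, while an EK_LabelDifference32 entry stays at 4 bytes.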
+ switch (getEntryKind()) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ return TD.getPointerSize();
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+ return 8;
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+ case MachineJumpTableInfo::EK_LabelDifference32:
+ case MachineJumpTableInfo::EK_Custom32:
+ return 4;
+ case MachineJumpTableInfo::EK_Inline:
+ return 0;
+ }
+ llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// Return the alignment of each entry in the jump table.
+unsigned MachineJumpTableInfo::getEntryAlignment(const DataLayout &TD) const {
+ // The alignment of a jump table entry is the alignment of int32 unless the
+ // entry is just the address of a block, in which case it is the pointer
+ // alignment.
+ switch (getEntryKind()) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ return TD.getPointerABIAlignment();
+ case MachineJumpTableInfo::EK_GPRel64BlockAddress:
+ return TD.getABIIntegerTypeAlignment(64);
+ case MachineJumpTableInfo::EK_GPRel32BlockAddress:
+ case MachineJumpTableInfo::EK_LabelDifference32:
+ case MachineJumpTableInfo::EK_Custom32:
+ return TD.getABIIntegerTypeAlignment(32);
+ case MachineJumpTableInfo::EK_Inline:
+ return 1;
+ }
+ llvm_unreachable("Unknown jump table encoding!");
+}
+
+/// Create a new jump table entry in the jump table info.
+unsigned MachineJumpTableInfo::createJumpTableIndex(
+ const std::vector<MachineBasicBlock*> &DestBBs) {
+ assert(!DestBBs.empty() && "Cannot create an empty jump table!");
+ JumpTables.push_back(MachineJumpTableEntry(DestBBs));
+ return JumpTables.size()-1;
+}
+
+/// If Old is the target of any jump tables, update the jump tables to branch
+/// to New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTables(MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Not making a change?");
+ bool MadeChange = false;
+  for (size_t i = 0, e = JumpTables.size(); i != e; ++i)
+    MadeChange |= ReplaceMBBInJumpTable(i, Old, New);
+ return MadeChange;
+}
+
+/// If Old is a target of the jump tables, update the jump table to branch to
+/// New instead.
+bool MachineJumpTableInfo::ReplaceMBBInJumpTable(unsigned Idx,
+ MachineBasicBlock *Old,
+ MachineBasicBlock *New) {
+ assert(Old != New && "Not making a change?");
+ bool MadeChange = false;
+ MachineJumpTableEntry &JTE = JumpTables[Idx];
+ for (size_t j = 0, e = JTE.MBBs.size(); j != e; ++j)
+ if (JTE.MBBs[j] == Old) {
+ JTE.MBBs[j] = New;
+ MadeChange = true;
+ }
+ return MadeChange;
+}
+
+void MachineJumpTableInfo::print(raw_ostream &OS) const {
+ if (JumpTables.empty()) return;
+
+ OS << "Jump Tables:\n";
+
+ for (unsigned i = 0, e = JumpTables.size(); i != e; ++i) {
+ OS << " jt#" << i << ": ";
+ for (unsigned j = 0, f = JumpTables[i].MBBs.size(); j != f; ++j)
+ OS << " BB#" << JumpTables[i].MBBs[j]->getNumber();
+ }
+
+ OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineJumpTableInfo::dump() const { print(dbgs()); }
+#endif
+
+
+//===----------------------------------------------------------------------===//
+// MachineConstantPool implementation
+//===----------------------------------------------------------------------===//
+
+void MachineConstantPoolValue::anchor() { }
+
+Type *MachineConstantPoolEntry::getType() const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getType();
+ return Val.ConstVal->getType();
+}
+
+bool MachineConstantPoolEntry::needsRelocation() const {
+ if (isMachineConstantPoolEntry())
+ return true;
+ return Val.ConstVal->needsRelocation();
+}
+
+SectionKind
+MachineConstantPoolEntry::getSectionKind(const DataLayout *DL) const {
+ if (needsRelocation())
+ return SectionKind::getReadOnlyWithRel();
+ switch (DL->getTypeAllocSize(getType())) {
+ case 4:
+ return SectionKind::getMergeableConst4();
+ case 8:
+ return SectionKind::getMergeableConst8();
+ case 16:
+ return SectionKind::getMergeableConst16();
+ case 32:
+ return SectionKind::getMergeableConst32();
+ default:
+ return SectionKind::getReadOnly();
+ }
+}
+
+MachineConstantPool::~MachineConstantPool() {
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+ if (Constants[i].isMachineConstantPoolEntry())
+ delete Constants[i].Val.MachineCPVal;
+ for (DenseSet<MachineConstantPoolValue*>::iterator I =
+ MachineCPVsSharingEntries.begin(), E = MachineCPVsSharingEntries.end();
+ I != E; ++I)
+ delete *I;
+}
+
+/// Test whether the given two constants can be allocated the same constant pool
+/// entry.
+static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
+ const DataLayout &DL) {
+ // Handle the trivial case quickly.
+ if (A == B) return true;
+
+ // If they have the same type but weren't the same constant, quickly
+ // reject them.
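+  // (Constants are uniqued by the LLVM context, so two distinct Constant
+  // pointers of the same type hold different values; sharing is only
+  // interesting across types whose bit patterns may coincide.)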
+ if (A->getType() == B->getType()) return false;
+
+ // We can't handle structs or arrays.
+ if (isa<StructType>(A->getType()) || isa<ArrayType>(A->getType()) ||
+ isa<StructType>(B->getType()) || isa<ArrayType>(B->getType()))
+ return false;
+
+ // For now, only support constants with the same size.
+ uint64_t StoreSize = DL.getTypeStoreSize(A->getType());
+ if (StoreSize != DL.getTypeStoreSize(B->getType()) || StoreSize > 128)
+ return false;
+
+ Type *IntTy = IntegerType::get(A->getContext(), StoreSize*8);
+
+ // Try constant folding a bitcast of both instructions to an integer. If we
+ // get two identical ConstantInt's, then we are good to share them. We use
+ // the constant folding APIs to do this so that we get the benefit of
+ // DataLayout.
+ if (isa<PointerType>(A->getType()))
+ A = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(A), IntTy, DL);
+ else if (A->getType() != IntTy)
+ A = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(A),
+ IntTy, DL);
+ if (isa<PointerType>(B->getType()))
+ B = ConstantFoldCastOperand(Instruction::PtrToInt,
+ const_cast<Constant *>(B), IntTy, DL);
+ else if (B->getType() != IntTy)
+ B = ConstantFoldCastOperand(Instruction::BitCast, const_cast<Constant *>(B),
+ IntTy, DL);
+
+ return A == B;
+}
+
+/// Create a new entry in the constant pool or return an existing one.
+/// User must specify the minimum required alignment (in bytes) for the object.
+unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
+ unsigned Alignment) {
+ assert(Alignment && "Alignment must be specified!");
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i)
+ if (!Constants[i].isMachineConstantPoolEntry() &&
+ CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) {
+ if ((unsigned)Constants[i].getAlignment() < Alignment)
+ Constants[i].Alignment = Alignment;
+ return i;
+ }
+
+ Constants.push_back(MachineConstantPoolEntry(C, Alignment));
+ return Constants.size()-1;
+}
+
+unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
+ unsigned Alignment) {
+ assert(Alignment && "Alignment must be specified!");
+ if (Alignment > PoolAlignment) PoolAlignment = Alignment;
+
+ // Check to see if we already have this constant.
+ //
+ // FIXME, this could be made much more efficient for large constant pools.
+ int Idx = V->getExistingMachineCPValue(this, Alignment);
+ if (Idx != -1) {
+ MachineCPVsSharingEntries.insert(V);
+ return (unsigned)Idx;
+ }
+
+ Constants.push_back(MachineConstantPoolEntry(V, Alignment));
+ return Constants.size()-1;
+}
+
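+// Illustrative helper (not from the original file): how a backend might place
+// an IR constant into the pool. The alignment argument is in bytes; here it is
+// taken from the DataLayout's preferred alignment for the constant's type.
+static unsigned placeConstantInPool(MachineConstantPool &MCP, const Constant *C,
+                                    const DataLayout &DL) {
+  unsigned ByteAlign = DL.getPrefTypeAlignment(C->getType());
+  return MCP.getConstantPoolIndex(C, ByteAlign);
+}
+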
+void MachineConstantPool::print(raw_ostream &OS) const {
+ if (Constants.empty()) return;
+
+ OS << "Constant Pool:\n";
+ for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
+ OS << " cp#" << i << ": ";
+ if (Constants[i].isMachineConstantPoolEntry())
+ Constants[i].Val.MachineCPVal->print(OS);
+ else
+ Constants[i].Val.ConstVal->printAsOperand(OS, /*PrintType=*/false);
+ OS << ", align=" << Constants[i].getAlignment();
+ OS << "\n";
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineConstantPool::dump() const { print(dbgs()); }
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
new file mode 100644
index 000000000000..338cd1e22032
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionAnalysis.cpp
@@ -0,0 +1,60 @@
+//===-- MachineFunctionAnalysis.cpp ---------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionAnalysis members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineFunctionInitializer.h"
+using namespace llvm;
+
+char MachineFunctionAnalysis::ID = 0;
+
+MachineFunctionAnalysis::MachineFunctionAnalysis(
+ const TargetMachine &tm, MachineFunctionInitializer *MFInitializer)
+ : FunctionPass(ID), TM(tm), MF(nullptr), MFInitializer(MFInitializer) {
+ initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineFunctionAnalysis::~MachineFunctionAnalysis() {
+ releaseMemory();
+ assert(!MF && "MachineFunctionAnalysis left initialized!");
+}
+
+void MachineFunctionAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineModuleInfo>();
+}
+
+bool MachineFunctionAnalysis::doInitialization(Module &M) {
+ MachineModuleInfo *MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ assert(MMI && "MMI not around yet??");
+ MMI->setModule(&M);
+ NextFnNum = 0;
+ return false;
+}
+
+bool MachineFunctionAnalysis::runOnFunction(Function &F) {
+ assert(!MF && "MachineFunctionAnalysis already initialized!");
+ MF = new MachineFunction(&F, TM, NextFnNum++,
+ getAnalysis<MachineModuleInfo>());
+ if (MFInitializer)
+ MFInitializer->initializeMachineFunction(*MF);
+ return false;
+}
+
+void MachineFunctionAnalysis::releaseMemory() {
+ delete MF;
+ MF = nullptr;
+}
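+
+// Lifecycle note (a sketch of intended use, not a comment from the original
+// file): the MachineFunction built in runOnFunction is handed to every
+// downstream MachineFunctionPass for the same IR function via getMF(), and is
+// freed in releaseMemory() once the pass manager no longer needs this analysis.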
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
new file mode 100644
index 000000000000..228fe170ab46
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPass.cpp
@@ -0,0 +1,89 @@
+//===-- MachineFunctionPass.cpp -------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the definitions of the MachineFunctionPass members.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DominanceFrontier.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+
+using namespace llvm;
+
+Pass *MachineFunctionPass::createPrinterPass(raw_ostream &O,
+ const std::string &Banner) const {
+ return createMachineFunctionPrinterPass(O, Banner);
+}
+
+bool MachineFunctionPass::runOnFunction(Function &F) {
+ // Do not codegen any 'available_externally' functions at all, they have
+ // definitions outside the translation unit.
+ if (F.hasAvailableExternallyLinkage())
+ return false;
+
+ MachineFunction &MF = getAnalysis<MachineFunctionAnalysis>().getMF();
+ MachineFunctionProperties &MFProps = MF.getProperties();
+
+#ifndef NDEBUG
+ if (!MFProps.verifyRequiredProperties(RequiredProperties)) {
+ errs() << "MachineFunctionProperties required by " << getPassName()
+ << " pass are not met by function " << F.getName() << ".\n"
+ << "Required properties: ";
+ RequiredProperties.print(errs(), /*OnlySet=*/true);
+ errs() << "\nCurrent properties: ";
+ MFProps.print(errs());
+ errs() << "\n";
+ llvm_unreachable("MachineFunctionProperties check failed");
+ }
+#endif
+
+ bool RV = runOnMachineFunction(MF);
+
+ MFProps.set(SetProperties);
+ MFProps.clear(ClearedProperties);
+ return RV;
+}
+
+void MachineFunctionPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineFunctionAnalysis>();
+ AU.addPreserved<MachineFunctionAnalysis>();
+
+ // MachineFunctionPass preserves all LLVM IR passes, but there's no
+ // high-level way to express this. Instead, just list a bunch of
+ // passes explicitly. This does not include setPreservesCFG,
+ // because CodeGen overloads that to mean preserving the MachineBasicBlock
+ // CFG in addition to the LLVM IR CFG.
+ AU.addPreserved<BasicAAWrapperPass>();
+ AU.addPreserved<DominanceFrontierWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<IVUsersWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<SCEVAAWrapperPass>();
+ AU.addPreserved<StackProtector>();
+
+ FunctionPass::getAnalysisUsage(AU);
+}
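+
+// Minimal sketch of a client pass built on the plumbing above (illustrative;
+// the pass name and behavior are hypothetical). runOnFunction above fetches the
+// MachineFunction from MachineFunctionAnalysis and dispatches to
+// runOnMachineFunction implementations like this one.
+namespace {
+struct CountMachineInstrs : public MachineFunctionPass {
+  static char ID;
+  CountMachineInstrs() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    unsigned NumInstrs = 0;
+    for (const MachineBasicBlock &MBB : MF)
+      NumInstrs += MBB.size();
+    (void)NumInstrs; // a real pass would report or act on the count
+    return false;    // nothing was modified
+  }
+};
+char CountMachineInstrs::ID = 0;
+} // end anonymous namespace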
diff --git a/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
new file mode 100644
index 000000000000..4f424ff292cc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineFunctionPrinterPass.cpp
@@ -0,0 +1,69 @@
+//===-- MachineFunctionPrinterPass.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineFunctionPrinterPass implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+namespace {
+/// MachineFunctionPrinterPass - This is a pass to dump the IR of a
+/// MachineFunction.
+///
+struct MachineFunctionPrinterPass : public MachineFunctionPass {
+ static char ID;
+
+ raw_ostream &OS;
+ const std::string Banner;
+
+ MachineFunctionPrinterPass() : MachineFunctionPass(ID), OS(dbgs()) { }
+ MachineFunctionPrinterPass(raw_ostream &os, const std::string &banner)
+ : MachineFunctionPass(ID), OS(os), Banner(banner) {}
+
+ const char *getPassName() const override { return "MachineFunction Printer"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (!llvm::isFunctionInPrintList(MF.getName()))
+ return false;
+ OS << "# " << Banner << ":\n";
+ MF.print(OS, getAnalysisIfAvailable<SlotIndexes>());
+ return false;
+ }
+};
+
+char MachineFunctionPrinterPass::ID = 0;
+}
+
+char &llvm::MachineFunctionPrinterPassID = MachineFunctionPrinterPass::ID;
+INITIALIZE_PASS(MachineFunctionPrinterPass, "machineinstr-printer",
+ "Machine Function Printer", false, false)
+
+namespace llvm {
+/// Returns a newly-created MachineFunction Printer pass. The
+/// default banner is empty.
+///
+MachineFunctionPass *createMachineFunctionPrinterPass(raw_ostream &OS,
+ const std::string &Banner){
+ return new MachineFunctionPrinterPass(OS, Banner);
+}
+
+}
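+
+// Usage sketch (illustrative; assumes llvm/IR/LegacyPassManager.h is available
+// for legacy::PassManagerBase): dump every machine function with a banner.
+static void addMIRPrinter(legacy::PassManagerBase &PM) {
+  PM.add(createMachineFunctionPrinterPass(errs(), "After instruction selection"));
+}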
diff --git a/contrib/llvm/lib/CodeGen/MachineInstr.cpp b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
new file mode 100644
index 000000000000..3cdf8d2941d3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineInstr.cpp
@@ -0,0 +1,2236 @@
+//===-- lib/CodeGen/MachineInstr.cpp --------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Methods common to all machine instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSlotTracker.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+static cl::opt<bool> PrintWholeRegMask(
+ "print-whole-regmask",
+ cl::desc("Print the full contents of regmask operands in IR dumps"),
+ cl::init(true), cl::Hidden);
+
+//===----------------------------------------------------------------------===//
+// MachineOperand Implementation
+//===----------------------------------------------------------------------===//
+
+void MachineOperand::setReg(unsigned Reg) {
+ if (getReg() == Reg) return; // No change.
+
+ // Otherwise, we have to change the register. If this operand is embedded
+ // into a machine function, we need to update the old and new register's
+ // use/def lists.
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent()) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
+ SmallContents.RegNo = Reg;
+ MRI.addRegOperandToUseList(this);
+ return;
+ }
+
+ // Otherwise, just change the register, no problem. :)
+ SmallContents.RegNo = Reg;
+}
+
+void MachineOperand::substVirtReg(unsigned Reg, unsigned SubIdx,
+ const TargetRegisterInfo &TRI) {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ if (SubIdx && getSubReg())
+ SubIdx = TRI.composeSubRegIndices(SubIdx, getSubReg());
+ setReg(Reg);
+ if (SubIdx)
+ setSubReg(SubIdx);
+}
+
+void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ if (getSubReg()) {
+ Reg = TRI.getSubReg(Reg, getSubReg());
+ // Note that getSubReg() may return 0 if the sub-register doesn't exist.
+ // That won't happen in legal code.
+ setSubReg(0);
+ }
+ setReg(Reg);
+}
+
+/// Change a def to a use, or a use to a def.
+void MachineOperand::setIsDef(bool Val) {
+ assert(isReg() && "Wrong MachineOperand accessor");
+ assert((!Val || !isDebug()) && "Marking a debug operation as def");
+ if (IsDef == Val)
+ return;
+ // MRI may keep uses and defs in different list positions.
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent()) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MRI.removeRegOperandFromUseList(this);
+ IsDef = Val;
+ MRI.addRegOperandToUseList(this);
+ return;
+ }
+ IsDef = Val;
+}
+
+// If this operand is currently a register operand, and if this is in a
+// function, deregister the operand from the register's use/def list.
+void MachineOperand::removeRegFromUses() {
+ if (!isReg() || !isOnRegUseList())
+ return;
+
+ if (MachineInstr *MI = getParent()) {
+ if (MachineBasicBlock *MBB = MI->getParent()) {
+ if (MachineFunction *MF = MBB->getParent())
+ MF->getRegInfo().removeRegOperandFromUseList(this);
+ }
+ }
+}
+
+/// ChangeToImmediate - Replace this operand with a new immediate operand of
+/// the specified value. If an operand is known to be an immediate already,
+/// the setImm method should be used.
+void MachineOperand::ChangeToImmediate(int64_t ImmVal) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+
+ removeRegFromUses();
+
+ OpKind = MO_Immediate;
+ Contents.ImmVal = ImmVal;
+}
+
+void MachineOperand::ChangeToFPImmediate(const ConstantFP *FPImm) {
+ assert((!isReg() || !isTied()) && "Cannot change a tied operand into an imm");
+
+ removeRegFromUses();
+
+ OpKind = MO_FPImmediate;
+ Contents.CFP = FPImm;
+}
+
+void MachineOperand::ChangeToES(const char *SymName, unsigned char TargetFlags) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into an external symbol");
+
+ removeRegFromUses();
+
+ OpKind = MO_ExternalSymbol;
+ Contents.OffsetedInfo.Val.SymbolName = SymName;
+ setOffset(0); // Offset is always 0.
+ setTargetFlags(TargetFlags);
+}
+
+void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym) {
+ assert((!isReg() || !isTied()) &&
+ "Cannot change a tied operand into an MCSymbol");
+
+ removeRegFromUses();
+
+ OpKind = MO_MCSymbol;
+ Contents.Sym = Sym;
+}
+
+/// ChangeToRegister - Replace this operand with a new register operand of
+/// the specified value. If an operand is known to be a register already,
+/// the setReg method should be used.
+void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp,
+ bool isKill, bool isDead, bool isUndef,
+ bool isDebug) {
+ MachineRegisterInfo *RegInfo = nullptr;
+ if (MachineInstr *MI = getParent())
+ if (MachineBasicBlock *MBB = MI->getParent())
+ if (MachineFunction *MF = MBB->getParent())
+ RegInfo = &MF->getRegInfo();
+ // If this operand is already a register operand, remove it from the
+ // register's use/def lists.
+ bool WasReg = isReg();
+ if (RegInfo && WasReg)
+ RegInfo->removeRegOperandFromUseList(this);
+
+ // Change this to a register and set the reg#.
+ OpKind = MO_Register;
+ SmallContents.RegNo = Reg;
+ SubReg_TargetFlags = 0;
+ IsDef = isDef;
+ IsImp = isImp;
+ IsKill = isKill;
+ IsDead = isDead;
+ IsUndef = isUndef;
+ IsInternalRead = false;
+ IsEarlyClobber = false;
+ IsDebug = isDebug;
+ // Ensure isOnRegUseList() returns false.
+ Contents.Reg.Prev = nullptr;
+ // Preserve the tie when the operand was already a register.
+ if (!WasReg)
+ TiedTo = 0;
+
+ // If this operand is embedded in a function, add the operand to the
+ // register's use/def list.
+ if (RegInfo)
+ RegInfo->addRegOperandToUseList(this);
+}
+
+/// isIdenticalTo - Return true if this operand is identical to the specified
+/// operand. Note that this should stay in sync with the hash_value overload
+/// below.
+bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
+ if (getType() != Other.getType() ||
+ getTargetFlags() != Other.getTargetFlags())
+ return false;
+
+ switch (getType()) {
+ case MachineOperand::MO_Register:
+ return getReg() == Other.getReg() && isDef() == Other.isDef() &&
+ getSubReg() == Other.getSubReg();
+ case MachineOperand::MO_Immediate:
+ return getImm() == Other.getImm();
+ case MachineOperand::MO_CImmediate:
+ return getCImm() == Other.getCImm();
+ case MachineOperand::MO_FPImmediate:
+ return getFPImm() == Other.getFPImm();
+ case MachineOperand::MO_MachineBasicBlock:
+ return getMBB() == Other.getMBB();
+ case MachineOperand::MO_FrameIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ return getIndex() == Other.getIndex() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_JumpTableIndex:
+ return getIndex() == Other.getIndex();
+ case MachineOperand::MO_GlobalAddress:
+ return getGlobal() == Other.getGlobal() && getOffset() == Other.getOffset();
+ case MachineOperand::MO_ExternalSymbol:
+ return !strcmp(getSymbolName(), Other.getSymbolName()) &&
+ getOffset() == Other.getOffset();
+ case MachineOperand::MO_BlockAddress:
+ return getBlockAddress() == Other.getBlockAddress() &&
+ getOffset() == Other.getOffset();
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut:
+ return getRegMask() == Other.getRegMask();
+ case MachineOperand::MO_MCSymbol:
+ return getMCSymbol() == Other.getMCSymbol();
+ case MachineOperand::MO_CFIIndex:
+ return getCFIIndex() == Other.getCFIIndex();
+ case MachineOperand::MO_Metadata:
+ return getMetadata() == Other.getMetadata();
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
+// Note: this must stay exactly in sync with isIdenticalTo above.
+hash_code llvm::hash_value(const MachineOperand &MO) {
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ // Register operands don't have target flags.
+ return hash_combine(MO.getType(), MO.getReg(), MO.getSubReg(), MO.isDef());
+ case MachineOperand::MO_Immediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm());
+ case MachineOperand::MO_CImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm());
+ case MachineOperand::MO_FPImmediate:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm());
+ case MachineOperand::MO_MachineBasicBlock:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB());
+ case MachineOperand::MO_FrameIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_TargetIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(),
+ MO.getOffset());
+ case MachineOperand::MO_JumpTableIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex());
+ case MachineOperand::MO_ExternalSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(),
+ MO.getSymbolName());
+ case MachineOperand::MO_GlobalAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(),
+ MO.getOffset());
+ case MachineOperand::MO_BlockAddress:
+ return hash_combine(MO.getType(), MO.getTargetFlags(),
+ MO.getBlockAddress(), MO.getOffset());
+ case MachineOperand::MO_RegisterMask:
+ case MachineOperand::MO_RegisterLiveOut:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask());
+ case MachineOperand::MO_Metadata:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata());
+ case MachineOperand::MO_MCSymbol:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol());
+ case MachineOperand::MO_CFIIndex:
+ return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCFIIndex());
+ }
+ llvm_unreachable("Invalid machine operand type");
+}
+
+void MachineOperand::print(raw_ostream &OS,
+ const TargetRegisterInfo *TRI) const {
+ ModuleSlotTracker DummyMST(nullptr);
+ print(OS, DummyMST, TRI);
+}
+
+void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ const TargetRegisterInfo *TRI) const {
+ switch (getType()) {
+ case MachineOperand::MO_Register:
+ OS << PrintReg(getReg(), TRI, getSubReg());
+
+ if (isDef() || isKill() || isDead() || isImplicit() || isUndef() ||
+ isInternalRead() || isEarlyClobber() || isTied()) {
+ OS << '<';
+ bool NeedComma = false;
+ if (isDef()) {
+ if (NeedComma) OS << ',';
+ if (isEarlyClobber())
+ OS << "earlyclobber,";
+ if (isImplicit())
+ OS << "imp-";
+ OS << "def";
+ NeedComma = true;
+ // <def,read-undef> only makes sense when getSubReg() is set.
+ // Don't clutter the output otherwise.
+ if (isUndef() && getSubReg())
+ OS << ",read-undef";
+ } else if (isImplicit()) {
+ OS << "imp-use";
+ NeedComma = true;
+ }
+
+ if (isKill()) {
+ if (NeedComma) OS << ',';
+ OS << "kill";
+ NeedComma = true;
+ }
+ if (isDead()) {
+ if (NeedComma) OS << ',';
+ OS << "dead";
+ NeedComma = true;
+ }
+ if (isUndef() && isUse()) {
+ if (NeedComma) OS << ',';
+ OS << "undef";
+ NeedComma = true;
+ }
+ if (isInternalRead()) {
+ if (NeedComma) OS << ',';
+ OS << "internal";
+ NeedComma = true;
+ }
+ if (isTied()) {
+ if (NeedComma) OS << ',';
+ OS << "tied";
+ if (TiedTo != 15)
+ OS << unsigned(TiedTo - 1);
+ }
+ OS << '>';
+ }
+ break;
+ case MachineOperand::MO_Immediate:
+ OS << getImm();
+ break;
+ case MachineOperand::MO_CImmediate:
+ getCImm()->getValue().print(OS, false);
+ break;
+ case MachineOperand::MO_FPImmediate:
+ if (getFPImm()->getType()->isFloatTy()) {
+ OS << getFPImm()->getValueAPF().convertToFloat();
+ } else if (getFPImm()->getType()->isHalfTy()) {
+ APFloat APF = getFPImm()->getValueAPF();
+ bool Unused;
+ APF.convert(APFloat::IEEEsingle, APFloat::rmNearestTiesToEven, &Unused);
+ OS << "half " << APF.convertToFloat();
+ } else {
+ OS << getFPImm()->getValueAPF().convertToDouble();
+ }
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ OS << "<BB#" << getMBB()->getNumber() << ">";
+ break;
+ case MachineOperand::MO_FrameIndex:
+ OS << "<fi#" << getIndex() << '>';
+ break;
+ case MachineOperand::MO_ConstantPoolIndex:
+ OS << "<cp#" << getIndex();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
+ case MachineOperand::MO_TargetIndex:
+ OS << "<ti#" << getIndex();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
+ case MachineOperand::MO_JumpTableIndex:
+ OS << "<jt#" << getIndex() << '>';
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ OS << "<ga:";
+ getGlobal()->printAsOperand(OS, /*PrintType=*/false, MST);
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
+ case MachineOperand::MO_ExternalSymbol:
+ OS << "<es:" << getSymbolName();
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
+ case MachineOperand::MO_BlockAddress:
+ OS << '<';
+ getBlockAddress()->printAsOperand(OS, /*PrintType=*/false, MST);
+ if (getOffset()) OS << "+" << getOffset();
+ OS << '>';
+ break;
+ case MachineOperand::MO_RegisterMask: {
+ unsigned NumRegsInMask = 0;
+ unsigned NumRegsEmitted = 0;
+ OS << "<regmask";
+ for (unsigned i = 0; i < TRI->getNumRegs(); ++i) {
+ unsigned MaskWord = i / 32;
+ unsigned MaskBit = i % 32;
+ if (getRegMask()[MaskWord] & (1 << MaskBit)) {
+ if (PrintWholeRegMask || NumRegsEmitted <= 10) {
+ OS << " " << PrintReg(i, TRI);
+ NumRegsEmitted++;
+ }
+ NumRegsInMask++;
+ }
+ }
+ if (NumRegsEmitted != NumRegsInMask)
+ OS << " and " << (NumRegsInMask - NumRegsEmitted) << " more...";
+ OS << ">";
+ break;
+ }
+ case MachineOperand::MO_RegisterLiveOut:
+ OS << "<regliveout>";
+ break;
+ case MachineOperand::MO_Metadata:
+ OS << '<';
+ getMetadata()->printAsOperand(OS, MST);
+ OS << '>';
+ break;
+ case MachineOperand::MO_MCSymbol:
+ OS << "<MCSym=" << *getMCSymbol() << '>';
+ break;
+ case MachineOperand::MO_CFIIndex:
+ OS << "<call frame instruction>";
+ break;
+ }
+
+ if (unsigned TF = getTargetFlags())
+ OS << "[TF=" << TF << ']';
+}
+
+//===----------------------------------------------------------------------===//
+// MachineMemOperand Implementation
+//===----------------------------------------------------------------------===//
+
+/// getAddrSpace - Return the LLVM IR address space number that this pointer
+/// points into.
+unsigned MachinePointerInfo::getAddrSpace() const {
+ if (V.isNull() || V.is<const PseudoSourceValue*>()) return 0;
+ return cast<PointerType>(V.get<const Value*>()->getType())->getAddressSpace();
+}
+
+/// getConstantPool - Return a MachinePointerInfo record that refers to the
+/// constant pool.
+MachinePointerInfo MachinePointerInfo::getConstantPool(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getConstantPool());
+}
+
+/// getFixedStack - Return a MachinePointerInfo record that refers to the
+/// specified FrameIndex.
+MachinePointerInfo MachinePointerInfo::getFixedStack(MachineFunction &MF,
+ int FI, int64_t Offset) {
+ return MachinePointerInfo(MF.getPSVManager().getFixedStack(FI), Offset);
+}
+
+MachinePointerInfo MachinePointerInfo::getJumpTable(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getJumpTable());
+}
+
+MachinePointerInfo MachinePointerInfo::getGOT(MachineFunction &MF) {
+ return MachinePointerInfo(MF.getPSVManager().getGOT());
+}
+
+MachinePointerInfo MachinePointerInfo::getStack(MachineFunction &MF,
+ int64_t Offset) {
+ return MachinePointerInfo(MF.getPSVManager().getStack(), Offset);
+}
+
+MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
+ uint64_t s, unsigned int a,
+ const AAMDNodes &AAInfo,
+ const MDNode *Ranges)
+ : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1),
+ AAInfo(AAInfo), Ranges(Ranges) {
+ assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue*>() ||
+ isa<PointerType>(PtrInfo.V.get<const Value*>()->getType())) &&
+ "invalid pointer value");
+ assert(getBaseAlignment() == a && "Alignment is not a power of 2!");
+ assert((isLoad() || isStore()) && "Not a load/store!");
+}
+
+/// Profile - Gather unique data for the object.
+///
+void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
+ ID.AddInteger(getOffset());
+ ID.AddInteger(Size);
+ ID.AddPointer(getOpaqueValue());
+ ID.AddInteger(getFlags());
+ ID.AddInteger(getBaseAlignment());
+}
+
+void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
+ // The Value and Offset may differ due to CSE. But the flags and size
+ // should be the same.
+ assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
+ assert(MMO->getSize() == getSize() && "Size mismatch!");
+
+ if (MMO->getBaseAlignment() >= getBaseAlignment()) {
+ // Update the alignment value.
+ BaseAlignLog2 = Log2_32(MMO->getBaseAlignment()) + 1;
+ // Also update the base and offset, because the new alignment may
+ // not be applicable with the old ones.
+ PtrInfo = MMO->PtrInfo;
+ }
+}
+
+/// getAlignment - Return the minimum known alignment in bytes of the
+/// actual memory reference.
+uint64_t MachineMemOperand::getAlignment() const {
+ return MinAlign(getBaseAlignment(), getOffset());
+}
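+
+// Worked example for the rule above (illustration only): a base alignment of 8
+// with an offset of 4 only guarantees 4-byte alignment, MinAlign(8, 4) == 4,
+// while an offset of 16 preserves the full 8, MinAlign(8, 16) == 8.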
+
+void MachineMemOperand::print(raw_ostream &OS) const {
+ ModuleSlotTracker DummyMST(nullptr);
+ print(OS, DummyMST);
+}
+
+void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST) const {
+ assert((isLoad() || isStore()) &&
+ "SV has to be a load, store or both.");
+
+ if (isVolatile())
+ OS << "Volatile ";
+
+ if (isLoad())
+ OS << "LD";
+ if (isStore())
+ OS << "ST";
+ OS << getSize();
+
+ // Print the address information.
+ OS << "[";
+ if (const Value *V = getValue())
+ V->printAsOperand(OS, /*PrintType=*/false, MST);
+ else if (const PseudoSourceValue *PSV = getPseudoValue())
+ PSV->printCustom(OS);
+ else
+ OS << "<unknown>";
+
+ unsigned AS = getAddrSpace();
+ if (AS != 0)
+ OS << "(addrspace=" << AS << ')';
+
+ // If the alignment of the memory reference itself differs from the alignment
+ // of the base pointer, print the base alignment explicitly, next to the base
+ // pointer.
+ if (getBaseAlignment() != getAlignment())
+ OS << "(align=" << getBaseAlignment() << ")";
+
+ if (getOffset() != 0)
+ OS << "+" << getOffset();
+ OS << "]";
+
+ // Print the alignment of the reference.
+ if (getBaseAlignment() != getAlignment() || getBaseAlignment() != getSize())
+ OS << "(align=" << getAlignment() << ")";
+
+ // Print TBAA info.
+ if (const MDNode *TBAAInfo = getAAInfo().TBAA) {
+ OS << "(tbaa=";
+ if (TBAAInfo->getNumOperands() > 0)
+ TBAAInfo->getOperand(0)->printAsOperand(OS, MST);
+ else
+ OS << "<unknown>";
+ OS << ")";
+ }
+
+ // Print AA scope info.
+ if (const MDNode *ScopeInfo = getAAInfo().Scope) {
+ OS << "(alias.scope=";
+ if (ScopeInfo->getNumOperands() > 0)
+ for (unsigned i = 0, ie = ScopeInfo->getNumOperands(); i != ie; ++i) {
+ ScopeInfo->getOperand(i)->printAsOperand(OS, MST);
+ if (i != ie-1)
+ OS << ",";
+ }
+ else
+ OS << "<unknown>";
+ OS << ")";
+ }
+
+ // Print AA noalias scope info.
+ if (const MDNode *NoAliasInfo = getAAInfo().NoAlias) {
+ OS << "(noalias=";
+ if (NoAliasInfo->getNumOperands() > 0)
+ for (unsigned i = 0, ie = NoAliasInfo->getNumOperands(); i != ie; ++i) {
+ NoAliasInfo->getOperand(i)->printAsOperand(OS, MST);
+ if (i != ie-1)
+ OS << ",";
+ }
+ else
+ OS << "<unknown>";
+ OS << ")";
+ }
+
+ // Print nontemporal info.
+ if (isNonTemporal())
+ OS << "(nontemporal)";
+
+ if (isInvariant())
+ OS << "(invariant)";
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr Implementation
+//===----------------------------------------------------------------------===//
+
+void MachineInstr::addImplicitDefUseOperands(MachineFunction &MF) {
+ if (MCID->ImplicitDefs)
+ for (const MCPhysReg *ImpDefs = MCID->getImplicitDefs(); *ImpDefs;
+ ++ImpDefs)
+ addOperand(MF, MachineOperand::CreateReg(*ImpDefs, true, true));
+ if (MCID->ImplicitUses)
+ for (const MCPhysReg *ImpUses = MCID->getImplicitUses(); *ImpUses;
+ ++ImpUses)
+ addOperand(MF, MachineOperand::CreateReg(*ImpUses, false, true));
+}
+
+/// MachineInstr ctor - This constructor creates a MachineInstr and adds the
+/// implicit operands. It reserves space for the number of operands specified by
+/// the MCInstrDesc.
+MachineInstr::MachineInstr(MachineFunction &MF, const MCInstrDesc &tid,
+ DebugLoc dl, bool NoImp)
+ : MCID(&tid), Parent(nullptr), Operands(nullptr), NumOperands(0), Flags(0),
+ AsmPrinterFlags(0), NumMemRefs(0), MemRefs(nullptr),
+ debugLoc(std::move(dl))
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ ,
+ Ty(nullptr)
+#endif
+{
+ assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
+ // Reserve space for the expected number of operands.
+ if (unsigned NumOps = MCID->getNumOperands() +
+ MCID->getNumImplicitDefs() + MCID->getNumImplicitUses()) {
+ CapOperands = OperandCapacity::get(NumOps);
+ Operands = MF.allocateOperandArray(CapOperands);
+ }
+
+ if (!NoImp)
+ addImplicitDefUseOperands(MF);
+}
+
+/// MachineInstr ctor - Copies MachineInstr arg exactly
+///
+MachineInstr::MachineInstr(MachineFunction &MF, const MachineInstr &MI)
+ : MCID(&MI.getDesc()), Parent(nullptr), Operands(nullptr), NumOperands(0),
+ Flags(0), AsmPrinterFlags(0), NumMemRefs(MI.NumMemRefs),
+ MemRefs(MI.MemRefs), debugLoc(MI.getDebugLoc())
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+ ,
+ Ty(nullptr)
+#endif
+{
+ assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
+
+ CapOperands = OperandCapacity::get(MI.getNumOperands());
+ Operands = MF.allocateOperandArray(CapOperands);
+
+ // Copy operands.
+ for (const MachineOperand &MO : MI.operands())
+ addOperand(MF, MO);
+
+ // Copy all the sensible flags.
+ setFlags(MI.Flags);
+}
+
+/// getRegInfo - If this instruction is embedded into a MachineFunction,
+/// return the MachineRegisterInfo object for the current function, otherwise
+/// return null.
+MachineRegisterInfo *MachineInstr::getRegInfo() {
+ if (MachineBasicBlock *MBB = getParent())
+ return &MBB->getParent()->getRegInfo();
+ return nullptr;
+}
+
+// Implement dummy setter and getter for type when
+// global-isel is not built.
+// The proper implementation is WIP and is tracked here:
+// PR26576.
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+void MachineInstr::setType(Type *Ty) {}
+
+Type *MachineInstr::getType() const { return nullptr; }
+
+#else
+void MachineInstr::setType(Type *Ty) {
+ assert((!Ty || isPreISelGenericOpcode(getOpcode())) &&
+ "Non generic instructions are not supposed to be typed");
+ this->Ty = Ty;
+}
+
+Type *MachineInstr::getType() const { return Ty; }
+#endif // LLVM_BUILD_GLOBAL_ISEL
+
+/// RemoveRegOperandsFromUseLists - Unlink all of the register operands in
+/// this instruction from their respective use lists. This requires that the
+/// operands already be on their use lists.
+void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) {
+ for (MachineOperand &MO : operands())
+ if (MO.isReg())
+ MRI.removeRegOperandFromUseList(&MO);
+}
+
+/// AddRegOperandsToUseLists - Add all of the register operands in
+/// this instruction to their respective use lists. This requires that the
+/// operands not be on their use lists yet.
+void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) {
+ for (MachineOperand &MO : operands())
+ if (MO.isReg())
+ MRI.addRegOperandToUseList(&MO);
+}
+
+void MachineInstr::addOperand(const MachineOperand &Op) {
+ MachineBasicBlock *MBB = getParent();
+ assert(MBB && "Use MachineInstrBuilder to add operands to dangling instrs");
+ MachineFunction *MF = MBB->getParent();
+ assert(MF && "Use MachineInstrBuilder to add operands to dangling instrs");
+ addOperand(*MF, Op);
+}
+
+/// Move NumOps MachineOperands from Src to Dst, with support for overlapping
+/// ranges. If MRI is non-null also update use-def chains.
+static void moveOperands(MachineOperand *Dst, MachineOperand *Src,
+ unsigned NumOps, MachineRegisterInfo *MRI) {
+ if (MRI)
+ return MRI->moveOperands(Dst, Src, NumOps);
+
+ // MachineOperand is a trivially copyable type so we can just use memmove.
+ std::memmove(Dst, Src, NumOps * sizeof(MachineOperand));
+}
+
+/// addOperand - Add the specified operand to the instruction. If it is an
+/// implicit operand, it is added to the end of the operand list. If it is
+/// an explicit operand it is added at the end of the explicit operand list
+/// (before the first implicit operand).
+void MachineInstr::addOperand(MachineFunction &MF, const MachineOperand &Op) {
+ assert(MCID && "Cannot add operands before providing an instr descriptor");
+
+ // Check if we're adding one of our existing operands.
+ if (&Op >= Operands && &Op < Operands + NumOperands) {
+ // This is unusual: MI->addOperand(MI->getOperand(i)).
+ // If adding Op requires reallocating or moving existing operands around,
+ // the Op reference could go stale. Support it by copying Op.
+ MachineOperand CopyOp(Op);
+ return addOperand(MF, CopyOp);
+ }
+
+ // Find the insert location for the new operand. Implicit registers go at
+ // the end, everything else goes before the implicit regs.
+ //
+ // FIXME: Allow mixed explicit and implicit operands on inline asm.
+ // InstrEmitter::EmitSpecialNode() is marking inline asm clobbers as
+ // implicit-defs, but they must not be moved around. See the FIXME in
+ // InstrEmitter.cpp.
+ unsigned OpNo = getNumOperands();
+ bool isImpReg = Op.isReg() && Op.isImplicit();
+ if (!isImpReg && !isInlineAsm()) {
+ while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) {
+ --OpNo;
+ assert(!Operands[OpNo].isTied() && "Cannot move tied operands");
+ }
+ }
+
+#ifndef NDEBUG
+ bool isMetaDataOp = Op.getType() == MachineOperand::MO_Metadata;
+ // OpNo now points at the desired insertion point. Unless this is a variadic
+ // instruction, only implicit regs are allowed beyond MCID->getNumOperands().
+ // RegMask operands go between the explicit and implicit operands.
+ assert((isImpReg || Op.isRegMask() || MCID->isVariadic() ||
+ OpNo < MCID->getNumOperands() || isMetaDataOp) &&
+ "Trying to add an operand to a machine instr that is already done!");
+#endif
+
+ MachineRegisterInfo *MRI = getRegInfo();
+
+ // Determine if the Operands array needs to be reallocated.
+ // Save the old capacity and operand array.
+ OperandCapacity OldCap = CapOperands;
+ MachineOperand *OldOperands = Operands;
+ if (!OldOperands || OldCap.getSize() == getNumOperands()) {
+ CapOperands = OldOperands ? OldCap.getNext() : OldCap.get(1);
+ Operands = MF.allocateOperandArray(CapOperands);
+ // Move the operands before the insertion point.
+ if (OpNo)
+ moveOperands(Operands, OldOperands, OpNo, MRI);
+ }
+
+ // Move the operands following the insertion point.
+ if (OpNo != NumOperands)
+ moveOperands(Operands + OpNo + 1, OldOperands + OpNo, NumOperands - OpNo,
+ MRI);
+ ++NumOperands;
+
+ // Deallocate the old operand array.
+ if (OldOperands != Operands && OldOperands)
+ MF.deallocateOperandArray(OldCap, OldOperands);
+
+ // Copy Op into place. It still needs to be inserted into the MRI use lists.
+ MachineOperand *NewMO = new (Operands + OpNo) MachineOperand(Op);
+ NewMO->ParentMI = this;
+
+ // When adding a register operand, tell MRI about it.
+ if (NewMO->isReg()) {
+ // Ensure isOnRegUseList() returns false, regardless of Op's status.
+ NewMO->Contents.Reg.Prev = nullptr;
+ // Ignore existing ties. This is not a property that can be copied.
+ NewMO->TiedTo = 0;
+ // Add the new operand to MRI, but only for instructions in an MBB.
+ if (MRI)
+ MRI->addRegOperandToUseList(NewMO);
+ // The MCID operand information isn't accurate until we start adding
+ // explicit operands. The implicit operands are added first, then the
+ // explicits are inserted before them.
+ if (!isImpReg) {
+ // Tie uses to defs as indicated in MCInstrDesc.
+ if (NewMO->isUse()) {
+ int DefIdx = MCID->getOperandConstraint(OpNo, MCOI::TIED_TO);
+ if (DefIdx != -1)
+ tieOperands(DefIdx, OpNo);
+ }
+ // If the register operand is flagged as early-clobber, mark the operand as such.
+ if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1)
+ NewMO->setIsEarlyClobber(true);
+ }
+ }
+}
+
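+// Sketch of the usual client path (illustrative, not from the original file):
+// most code builds operands through MachineInstrBuilder, whose addReg/addImm
+// calls end up in MachineInstr::addOperand above. TargetOpcode::COPY is assumed
+// to be visible through the headers this file already includes.
+static MachineInstr *buildCopySketch(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator InsertPt,
+                                     const DebugLoc &DL,
+                                     const TargetInstrInfo &TII,
+                                     unsigned DstReg, unsigned SrcReg) {
+  return BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::COPY), DstReg)
+      .addReg(SrcReg, RegState::Kill);
+}
+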
+/// RemoveOperand - Erase an operand from an instruction, leaving it with one
+/// fewer operand than it started with.
+///
+void MachineInstr::RemoveOperand(unsigned OpNo) {
+ assert(OpNo < getNumOperands() && "Invalid operand number");
+ untieRegOperand(OpNo);
+
+#ifndef NDEBUG
+ // Moving tied operands would break the ties.
+ for (unsigned i = OpNo + 1, e = getNumOperands(); i != e; ++i)
+ if (Operands[i].isReg())
+ assert(!Operands[i].isTied() && "Cannot move tied operands");
+#endif
+
+ MachineRegisterInfo *MRI = getRegInfo();
+ if (MRI && Operands[OpNo].isReg())
+ MRI->removeRegOperandFromUseList(Operands + OpNo);
+
+ // Don't call the MachineOperand destructor. A lot of this code depends on
+ // MachineOperand having a trivial destructor anyway, and adding a call here
+ // wouldn't make it 'destructor-correct'.
+
+ if (unsigned N = NumOperands - 1 - OpNo)
+ moveOperands(Operands + OpNo, Operands + OpNo + 1, N, MRI);
+ --NumOperands;
+}
+
+/// addMemOperand - Add a MachineMemOperand to the machine instruction.
+/// This function should be used only occasionally. The setMemRefs function
+/// is the primary method for setting up a MachineInstr's MemRefs list.
+void MachineInstr::addMemOperand(MachineFunction &MF,
+ MachineMemOperand *MO) {
+ mmo_iterator OldMemRefs = MemRefs;
+ unsigned OldNumMemRefs = NumMemRefs;
+
+ unsigned NewNum = NumMemRefs + 1;
+ mmo_iterator NewMemRefs = MF.allocateMemRefsArray(NewNum);
+
+ std::copy(OldMemRefs, OldMemRefs + OldNumMemRefs, NewMemRefs);
+ NewMemRefs[NewNum - 1] = MO;
+ setMemRefs(NewMemRefs, NewMemRefs + NewNum);
+}
+
+/// Check to see if the MMOs pointed to by the two MemRefs arrays are
+/// identical.
+static bool hasIdenticalMMOs(const MachineInstr &MI1, const MachineInstr &MI2) {
+ auto I1 = MI1.memoperands_begin(), E1 = MI1.memoperands_end();
+ auto I2 = MI2.memoperands_begin(), E2 = MI2.memoperands_end();
+ if ((E1 - I1) != (E2 - I2))
+ return false;
+ for (; I1 != E1; ++I1, ++I2) {
+ if (**I1 != **I2)
+ return false;
+ }
+ return true;
+}
+
+std::pair<MachineInstr::mmo_iterator, unsigned>
+MachineInstr::mergeMemRefsWith(const MachineInstr& Other) {
+ // If either of the incoming memref lists is empty, we must be conservative and
+ // treat this as if we've exhausted our space for memrefs and dropped them.
+ if (memoperands_empty() || Other.memoperands_empty())
+ return std::make_pair(nullptr, 0);
+
+ // If both instructions have identical memrefs, we don't need to merge them.
+ // Since many instructions have a single memref, and we tend to merge things
+ // like pairs of loads from the same location, this catches a large number of
+ // cases in practice.
+ if (hasIdenticalMMOs(*this, Other))
+ return std::make_pair(MemRefs, NumMemRefs);
+
+ // TODO: consider uniquing elements within the operand lists to reduce
+ // space usage and fall back to conservative information less often.
+ size_t CombinedNumMemRefs = NumMemRefs + Other.NumMemRefs;
+
+ // If we don't have enough room to store this many memrefs, be conservative
+ // and drop them. Otherwise, we'd fail asserts when trying to add them to
+ // the new instruction.
+ if (CombinedNumMemRefs != uint8_t(CombinedNumMemRefs))
+ return std::make_pair(nullptr, 0);
+
+ MachineFunction *MF = getParent()->getParent();
+ mmo_iterator MemBegin = MF->allocateMemRefsArray(CombinedNumMemRefs);
+ mmo_iterator MemEnd = std::copy(memoperands_begin(), memoperands_end(),
+ MemBegin);
+ MemEnd = std::copy(Other.memoperands_begin(), Other.memoperands_end(),
+ MemEnd);
+ assert(MemEnd - MemBegin == (ptrdiff_t)CombinedNumMemRefs &&
+ "missing memrefs");
+
+ return std::make_pair(MemBegin, CombinedNumMemRefs);
+}
+
+bool MachineInstr::hasPropertyInBundle(unsigned Mask, QueryType Type) const {
+ assert(!isBundledWithPred() && "Must be called on bundle header");
+ for (MachineBasicBlock::const_instr_iterator MII = getIterator();; ++MII) {
+ if (MII->getDesc().getFlags() & Mask) {
+ if (Type == AnyInBundle)
+ return true;
+ } else {
+ if (Type == AllInBundle && !MII->isBundle())
+ return false;
+ }
+ // This was the last instruction in the bundle.
+ if (!MII->isBundledWithSucc())
+ return Type == AllInBundle;
+ }
+}
+
+bool MachineInstr::isIdenticalTo(const MachineInstr &Other,
+ MICheckType Check) const {
+ // If opcodes or number of operands are not the same then the two
+ // instructions are obviously not identical.
+ if (Other.getOpcode() != getOpcode() ||
+ Other.getNumOperands() != getNumOperands())
+ return false;
+
+ if (isBundle()) {
+ // Both instructions are bundles, compare MIs inside the bundle.
+ MachineBasicBlock::const_instr_iterator I1 = getIterator();
+ MachineBasicBlock::const_instr_iterator E1 = getParent()->instr_end();
+ MachineBasicBlock::const_instr_iterator I2 = Other.getIterator();
+ MachineBasicBlock::const_instr_iterator E2 = Other.getParent()->instr_end();
+ while (++I1 != E1 && I1->isInsideBundle()) {
+ ++I2;
+ if (I2 == E2 || !I2->isInsideBundle() || !I1->isIdenticalTo(*I2, Check))
+ return false;
+ }
+ }
+
+ // Check operands to make sure they match.
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ const MachineOperand &OMO = Other.getOperand(i);
+ if (!MO.isReg()) {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ continue;
+ }
+
+ // Clients may or may not want to ignore defs when testing for equality.
+ // For example, the machine CSE pass only cares about finding common
+ // subexpressions, so it's safe to ignore virtual register defs.
+ if (MO.isDef()) {
+ if (Check == IgnoreDefs)
+ continue;
+ else if (Check == IgnoreVRegDefs) {
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+ TargetRegisterInfo::isPhysicalRegister(OMO.getReg()))
+ if (MO.getReg() != OMO.getReg())
+ return false;
+ } else {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ if (Check == CheckKillDead && MO.isDead() != OMO.isDead())
+ return false;
+ }
+ } else {
+ if (!MO.isIdenticalTo(OMO))
+ return false;
+ if (Check == CheckKillDead && MO.isKill() != OMO.isKill())
+ return false;
+ }
+ }
+ // If DebugLoc does not match then two dbg.values are not identical.
+ if (isDebugValue())
+ if (getDebugLoc() && Other.getDebugLoc() &&
+ getDebugLoc() != Other.getDebugLoc())
+ return false;
+ return true;
+}
+
+MachineInstr *MachineInstr::removeFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ return getParent()->remove(this);
+}
+
+MachineInstr *MachineInstr::removeFromBundle() {
+ assert(getParent() && "Not embedded in a basic block!");
+ return getParent()->remove_instr(this);
+}
+
+void MachineInstr::eraseFromParent() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->erase(this);
+}
+
+void MachineInstr::eraseFromParentAndMarkDBGValuesForRemoval() {
+ assert(getParent() && "Not embedded in a basic block!");
+ MachineBasicBlock *MBB = getParent();
+ MachineFunction *MF = MBB->getParent();
+ assert(MF && "Not embedded in a function!");
+
+ MachineInstr *MI = (MachineInstr *)this;
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ MRI.markUsesInDebugValueAsUndef(Reg);
+ }
+ MI->eraseFromParent();
+}
+
+void MachineInstr::eraseFromBundle() {
+ assert(getParent() && "Not embedded in a basic block!");
+ getParent()->erase_instr(this);
+}
+
+/// getNumExplicitOperands - Returns the number of non-implicit operands.
+///
+unsigned MachineInstr::getNumExplicitOperands() const {
+ unsigned NumOperands = MCID->getNumOperands();
+ if (!MCID->isVariadic())
+ return NumOperands;
+
+ for (unsigned i = NumOperands, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isImplicit())
+ NumOperands++;
+ }
+ return NumOperands;
+}
+
+void MachineInstr::bundleWithPred() {
+ assert(!isBundledWithPred() && "MI is already bundled with its predecessor");
+ setFlag(BundledPred);
+ MachineBasicBlock::instr_iterator Pred = getIterator();
+ --Pred;
+ assert(!Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+ Pred->setFlag(BundledSucc);
+}
+
+void MachineInstr::bundleWithSucc() {
+ assert(!isBundledWithSucc() && "MI is already bundled with its successor");
+ setFlag(BundledSucc);
+ MachineBasicBlock::instr_iterator Succ = getIterator();
+ ++Succ;
+ assert(!Succ->isBundledWithPred() && "Inconsistent bundle flags");
+ Succ->setFlag(BundledPred);
+}
+
+void MachineInstr::unbundleFromPred() {
+ assert(isBundledWithPred() && "MI isn't bundled with its predecessor");
+ clearFlag(BundledPred);
+ MachineBasicBlock::instr_iterator Pred = getIterator();
+ --Pred;
+ assert(Pred->isBundledWithSucc() && "Inconsistent bundle flags");
+ Pred->clearFlag(BundledSucc);
+}
+
+void MachineInstr::unbundleFromSucc() {
+ assert(isBundledWithSucc() && "MI isn't bundled with its successor");
+ clearFlag(BundledSucc);
+ MachineBasicBlock::instr_iterator Succ = getIterator();
+ ++Succ;
+ assert(Succ->isBundledWithPred() && "Inconsistent bundle flags");
+ Succ->clearFlag(BundledPred);
+}
+
+bool MachineInstr::isStackAligningInlineAsm() const {
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ return true;
+ }
+ return false;
+}
+
+InlineAsm::AsmDialect MachineInstr::getInlineAsmDialect() const {
+ assert(isInlineAsm() && "getInlineAsmDialect() only works for inline asms!");
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ return InlineAsm::AsmDialect((ExtraInfo & InlineAsm::Extra_AsmDialect) != 0);
+}
+
+int MachineInstr::findInlineAsmFlagIdx(unsigned OpIdx,
+ unsigned *GroupNo) const {
+ assert(isInlineAsm() && "Expected an inline asm instruction");
+ assert(OpIdx < getNumOperands() && "OpIdx out of range");
+
+ // Ignore queries about the initial operands.
+ if (OpIdx < InlineAsm::MIOp_FirstOperand)
+ return -1;
+
+ unsigned Group = 0;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ // If we reach the implicit register operands, stop looking.
+ if (!FlagMO.isImm())
+ return -1;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ if (i + NumOps > OpIdx) {
+ if (GroupNo)
+ *GroupNo = Group;
+ return i;
+ }
+ ++Group;
+ }
+ return -1;
+}
+
+const DILocalVariable *MachineInstr::getDebugVariable() const {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return cast<DILocalVariable>(getOperand(2).getMetadata());
+}
+
+const DIExpression *MachineInstr::getDebugExpression() const {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return cast<DIExpression>(getOperand(3).getMetadata());
+}
+
+const TargetRegisterClass*
+MachineInstr::getRegClassConstraint(unsigned OpIdx,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) const {
+ assert(getParent() && "Can't have an MBB reference here!");
+ assert(getParent()->getParent() && "Can't have an MF reference here!");
+ const MachineFunction &MF = *getParent()->getParent();
+
+ // Most opcodes have fixed constraints in their MCInstrDesc.
+ if (!isInlineAsm())
+ return TII->getRegClass(getDesc(), OpIdx, TRI, MF);
+
+ if (!getOperand(OpIdx).isReg())
+ return nullptr;
+
+ // For tied uses on inline asm, get the constraint from the def.
+ unsigned DefIdx;
+ if (getOperand(OpIdx).isUse() && isRegTiedToDefOperand(OpIdx, &DefIdx))
+ OpIdx = DefIdx;
+
+ // Inline asm stores register class constraints in the flag word.
+ int FlagIdx = findInlineAsmFlagIdx(OpIdx);
+ if (FlagIdx < 0)
+ return nullptr;
+
+ unsigned Flag = getOperand(FlagIdx).getImm();
+ unsigned RCID;
+ if ((InlineAsm::getKind(Flag) == InlineAsm::Kind_RegUse ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDef ||
+ InlineAsm::getKind(Flag) == InlineAsm::Kind_RegDefEarlyClobber) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID))
+ return TRI->getRegClass(RCID);
+
+ // Assume that all registers in a memory operand are pointers.
+ if (InlineAsm::getKind(Flag) == InlineAsm::Kind_Mem)
+ return TRI->getPointerRegClass(MF);
+
+ return nullptr;
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVReg(
+ unsigned Reg, const TargetRegisterClass *CurRC, const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI, bool ExploreBundle) const {
+ // Check every operand inside the bundle if we have
+ // been asked to.
+ if (ExploreBundle)
+ for (ConstMIBundleOperands OpndIt(*this); OpndIt.isValid() && CurRC;
+ ++OpndIt)
+ CurRC = OpndIt->getParent()->getRegClassConstraintEffectForVRegImpl(
+ OpndIt.getOperandNo(), Reg, CurRC, TII, TRI);
+ else
+ // Otherwise, just check the current operands.
+ for (unsigned i = 0, e = NumOperands; i < e && CurRC; ++i)
+ CurRC = getRegClassConstraintEffectForVRegImpl(i, Reg, CurRC, TII, TRI);
+ return CurRC;
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffectForVRegImpl(
+ unsigned OpIdx, unsigned Reg, const TargetRegisterClass *CurRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
+ assert(CurRC && "Invalid initial register class");
+ // Check if Reg is constrained by some of its use/def from MI.
+ const MachineOperand &MO = getOperand(OpIdx);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ return CurRC;
+ // If yes, accumulate the constraints through the operand.
+ return getRegClassConstraintEffect(OpIdx, CurRC, TII, TRI);
+}
+
+const TargetRegisterClass *MachineInstr::getRegClassConstraintEffect(
+ unsigned OpIdx, const TargetRegisterClass *CurRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI) const {
+ const TargetRegisterClass *OpRC = getRegClassConstraint(OpIdx, TII, TRI);
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isReg() &&
+ "Cannot get register constraints for non-register operand");
+ assert(CurRC && "Invalid initial register class");
+ if (unsigned SubIdx = MO.getSubReg()) {
+ if (OpRC)
+ CurRC = TRI->getMatchingSuperRegClass(CurRC, OpRC, SubIdx);
+ else
+ CurRC = TRI->getSubClassWithSubReg(CurRC, SubIdx);
+ } else if (OpRC)
+ CurRC = TRI->getCommonSubClass(CurRC, OpRC);
+ return CurRC;
+}
+
+/// Return the number of instructions inside the MI bundle, not counting the
+/// header instruction.
+unsigned MachineInstr::getBundleSize() const {
+ MachineBasicBlock::const_instr_iterator I = getIterator();
+ unsigned Size = 0;
+ while (I->isBundledWithSucc()) {
+ ++Size;
+ ++I;
+ }
+ return Size;
+}
+
+/// Returns true if the MachineInstr has an implicit-use operand of exactly
+/// the given register (not considering sub/super-registers).
+bool MachineInstr::hasRegisterImplicitUseOperand(unsigned Reg) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isImplicit() && MO.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
+/// findRegisterUseOperandIdx() - Returns the operand index that is a use of
+/// the specified register or -1 if it is not found. It further tightens
+/// the search criteria to a use that kills the register if isKill is true.
+int MachineInstr::findRegisterUseOperandIdx(unsigned Reg, bool isKill,
+ const TargetRegisterInfo *TRI) const {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MOReg == Reg ||
+ (TRI &&
+ TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ TRI->isSubRegister(MOReg, Reg)))
+ if (!isKill || MO.isKill())
+ return i;
+ }
+ return -1;
+}
+
+/// readsWritesVirtualRegister - Return a pair of bools (reads, writes)
+/// indicating if this instruction reads or writes Reg. This also considers
+/// partial defines.
+std::pair<bool,bool>
+MachineInstr::readsWritesVirtualRegister(unsigned Reg,
+ SmallVectorImpl<unsigned> *Ops) const {
+ bool PartDef = false; // Partial redefine.
+ bool FullDef = false; // Full define.
+ bool Use = false;
+
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+ if (Ops)
+ Ops->push_back(i);
+ if (MO.isUse())
+ Use |= !MO.isUndef();
+ else if (MO.getSubReg() && !MO.isUndef())
+ // A partial <def,undef> doesn't count as reading the register.
+ PartDef = true;
+ else
+ FullDef = true;
+ }
+ // A partial redefine uses Reg unless there is also a full define.
+ return std::make_pair(Use || (PartDef && !FullDef), PartDef || FullDef);
+}
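+
+// Reading the returned pair (illustrative examples, not from the original
+// file): a plain use of Reg yields {true, false}; a full def yields
+// {false, true}; a subregister def without <undef>, e.g. %vreg1:sub_lo<def>,
+// yields {true, true}, since the untouched lanes of Reg stay live across the
+// instruction.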
+
+/// findRegisterDefOperandIdx() - Returns the operand index that is a def of
+/// the specified register or -1 if it is not found. If isDead is true, defs
+/// that are not dead are skipped. If TargetRegisterInfo is non-null, then it
+/// also checks if there is a def of a super-register.
+int
+MachineInstr::findRegisterDefOperandIdx(unsigned Reg, bool isDead, bool Overlap,
+ const TargetRegisterInfo *TRI) const {
+ bool isPhys = TargetRegisterInfo::isPhysicalRegister(Reg);
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+ // Accept regmask operands when Overlap is set.
+ // Ignore them when looking for a specific def operand (Overlap == false).
+ if (isPhys && Overlap && MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ return i;
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned MOReg = MO.getReg();
+ bool Found = (MOReg == Reg);
+ if (!Found && TRI && isPhys &&
+ TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ if (Overlap)
+ Found = TRI->regsOverlap(MOReg, Reg);
+ else
+ Found = TRI->isSubRegister(MOReg, Reg);
+ }
+ if (Found && (!isDead || MO.isDead()))
+ return i;
+ }
+ return -1;
+}
+
+/// findFirstPredOperandIdx() - Find the index of the first operand in the
+/// operand list that is used to represent the predicate. It returns -1 if
+/// none is found.
+int MachineInstr::findFirstPredOperandIdx() const {
+ // Don't call MCID.findFirstPredOperandIdx() because this variant
+ // is sometimes called on an instruction that's not yet complete, and
+ // so the number of operands is less than the MCID indicates. In
+ // particular, the PTX target does this.
+ const MCInstrDesc &MCID = getDesc();
+ if (MCID.isPredicable()) {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (MCID.OpInfo[i].isPredicate())
+ return i;
+ }
+
+ return -1;
+}
+
+// MachineOperand::TiedTo is 4 bits wide.
+const unsigned TiedMax = 15;
+
+/// tieOperands - Mark operands at DefIdx and UseIdx as tied to each other.
+///
+/// Use and def operands can be tied together, indicated by a non-zero TiedTo
+/// field. TiedTo can have these values:
+///
+/// 0: Operand is not tied to anything.
+/// 1 to TiedMax-1: Tied to getOperand(TiedTo-1).
+/// TiedMax: Tied to an operand >= TiedMax-1.
+///
+/// The tied def must be one of the first TiedMax operands on a normal
+/// instruction. INLINEASM instructions allow more tied defs.
+///
+void MachineInstr::tieOperands(unsigned DefIdx, unsigned UseIdx) {
+ MachineOperand &DefMO = getOperand(DefIdx);
+ MachineOperand &UseMO = getOperand(UseIdx);
+ assert(DefMO.isDef() && "DefIdx must be a def operand");
+ assert(UseMO.isUse() && "UseIdx must be a use operand");
+ assert(!DefMO.isTied() && "Def is already tied to another use");
+ assert(!UseMO.isTied() && "Use is already tied to another def");
+
+ if (DefIdx < TiedMax)
+ UseMO.TiedTo = DefIdx + 1;
+ else {
+ // Inline asm can use the group descriptors to find tied operands, but on
+ // normal instruction, the tied def must be within the first TiedMax
+ // operands.
+ assert(isInlineAsm() && "DefIdx out of range");
+ UseMO.TiedTo = TiedMax;
+ }
+
+ // UseIdx can be out of range, we'll search for it in findTiedOperandIdx().
+ DefMO.TiedTo = std::min(UseIdx + 1, TiedMax);
+}
+
+/// Given the index of a tied register operand, find the operand it is tied to.
+/// Defs are tied to uses and vice versa. Returns the index of the tied operand
+/// which must exist.
+unsigned MachineInstr::findTiedOperandIdx(unsigned OpIdx) const {
+ const MachineOperand &MO = getOperand(OpIdx);
+ assert(MO.isTied() && "Operand isn't tied");
+
+ // Normally TiedTo is in range.
+ if (MO.TiedTo < TiedMax)
+ return MO.TiedTo - 1;
+
+ // Uses on normal instructions can be out of range.
+ if (!isInlineAsm()) {
+ // Normal tied defs must be in the 0..TiedMax-1 range.
+ if (MO.isUse())
+ return TiedMax - 1;
+ // MO is a def. Search for the tied use.
+ for (unsigned i = TiedMax - 1, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &UseMO = getOperand(i);
+ if (UseMO.isReg() && UseMO.isUse() && UseMO.TiedTo == OpIdx + 1)
+ return i;
+ }
+ llvm_unreachable("Can't find tied use");
+ }
+
+ // Now deal with inline asm by parsing the operand group descriptor flags.
+ // Find the beginning of each operand group.
+ SmallVector<unsigned, 8> GroupIdx;
+ unsigned OpIdxGroup = ~0u;
+ unsigned NumOps;
+ for (unsigned i = InlineAsm::MIOp_FirstOperand, e = getNumOperands(); i < e;
+ i += NumOps) {
+ const MachineOperand &FlagMO = getOperand(i);
+ assert(FlagMO.isImm() && "Invalid tied operand on inline asm");
+ unsigned CurGroup = GroupIdx.size();
+ GroupIdx.push_back(i);
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(FlagMO.getImm());
+ // OpIdx belongs to this operand group.
+ if (OpIdx > i && OpIdx < i + NumOps)
+ OpIdxGroup = CurGroup;
+ unsigned TiedGroup;
+ if (!InlineAsm::isUseOperandTiedToDef(FlagMO.getImm(), TiedGroup))
+ continue;
+ // Operands in this group are tied to operands in TiedGroup which must be
+ // earlier. Find the number of operands between the two groups.
+ unsigned Delta = i - GroupIdx[TiedGroup];
+
+ // OpIdx is a use tied to TiedGroup.
+ if (OpIdxGroup == CurGroup)
+ return OpIdx - Delta;
+
+ // OpIdx is a def tied to this use group.
+ if (OpIdxGroup == TiedGroup)
+ return OpIdx + Delta;
+ }
+ llvm_unreachable("Invalid tied operand on inline asm");
+}
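+
+// Editor's note: illustrative sketch, not part of the original sources. For
+// the INLINEASM case handled above, a tied use group mirrors the layout of the
+// def group it is tied to, so the partner operand sits at a fixed distance
+// (Delta) between the two group descriptors:
+//
+//   ... [D: regdef], %vreg0, ... [U: reguse tiedto:$D], %vreg0, ...
+//   Delta = index(U) - index(D)
+//   tied def of a use at operand k  ->  k - Delta
+//   tied use of a def at operand k  ->  k + Delta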
+
+/// clearKillInfo - Clears kill flags on all operands.
+///
+void MachineInstr::clearKillInfo() {
+ for (MachineOperand &MO : operands()) {
+ if (MO.isReg() && MO.isUse())
+ MO.setIsKill(false);
+ }
+}
+
+void MachineInstr::substituteRegister(unsigned FromReg,
+ unsigned ToReg,
+ unsigned SubIdx,
+ const TargetRegisterInfo &RegInfo) {
+ if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+ if (SubIdx)
+ ToReg = RegInfo.getSubReg(ToReg, SubIdx);
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.getReg() != FromReg)
+ continue;
+ MO.substPhysReg(ToReg, RegInfo);
+ }
+ } else {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.getReg() != FromReg)
+ continue;
+ MO.substVirtReg(ToReg, SubIdx, RegInfo);
+ }
+ }
+}
+
+/// isSafeToMove - Return true if it is safe to move this instruction. If
+/// SawStore is set to true, it means that there is a store (or call) between
+/// the instruction's location and its intended destination.
+bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
+ // Ignore stuff that we obviously can't move.
+ //
+ // Treat volatile loads as stores. This is not strictly necessary for
+ // volatiles, but it is required for atomic loads. It is not allowed to move
+ // a load across an atomic load with Ordering > Monotonic.
+ if (mayStore() || isCall() ||
+ (mayLoad() && hasOrderedMemoryRef())) {
+ SawStore = true;
+ return false;
+ }
+
+ if (isPosition() || isDebugValue() || isTerminator() ||
+ hasUnmodeledSideEffects())
+ return false;
+
+ // See if this instruction does a load. If so, we have to guarantee that the
+ // loaded value doesn't change between the load and its intended
+ // destination. The check for isInvariantLoad gives the target the chance to
+ // classify the load as always returning a constant, e.g. a constant pool
+ // load.
+ if (mayLoad() && !isInvariantLoad(AA))
+ // Otherwise, this is a real load. If there is a store between the load and
+ // end of block, we can't move it.
+ return !SawStore;
+
+ return true;
+}
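+
+// Editor's note: illustrative sketch, not part of the original sources. A
+// typical caller (e.g. a sinking pass) threads one SawStore flag through a
+// scan of the block, so an earlier store or call blocks later loads (MBB and
+// AA are assumed to be in scope):
+//
+//   bool SawStore = false;
+//   for (MachineInstr &MI : MBB)
+//     if (MI.isSafeToMove(AA, SawStore))
+//       ; // MI is a candidate for being moved within or out of this block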
+
+/// hasOrderedMemoryRef - Return true if this instruction may have an ordered
+/// or volatile memory reference, or if the information describing the memory
+/// reference is not available. Return false if it is known to have no ordered
+/// memory references.
+bool MachineInstr::hasOrderedMemoryRef() const {
+ // An instruction known never to access memory won't have a volatile access.
+ if (!mayStore() &&
+ !mayLoad() &&
+ !isCall() &&
+ !hasUnmodeledSideEffects())
+ return false;
+
+ // Otherwise, if the instruction has no memory reference information,
+ // conservatively assume it wasn't preserved.
+ if (memoperands_empty())
+ return true;
+
+ // Check if any of our memory operands are ordered.
+ return any_of(memoperands(), [](const MachineMemOperand *MMO) {
+ return !MMO->isUnordered();
+ });
+}
+
+/// isInvariantLoad - Return true if this instruction is loading from a
+/// location whose value is invariant across the function. For example,
+/// loading a value from the constant pool or from the argument area
+/// of a function if it does not change. This should only return true if
+/// *all* loads the instruction does are invariant (if it does multiple loads).
+bool MachineInstr::isInvariantLoad(AliasAnalysis *AA) const {
+ // If the instruction doesn't load at all, it isn't an invariant load.
+ if (!mayLoad())
+ return false;
+
+ // If the instruction has lost its memoperands, conservatively assume that
+ // it may not be an invariant load.
+ if (memoperands_empty())
+ return false;
+
+ const MachineFrameInfo *MFI = getParent()->getParent()->getFrameInfo();
+
+ for (MachineMemOperand *MMO : memoperands()) {
+ if (MMO->isVolatile()) return false;
+ if (MMO->isStore()) return false;
+ if (MMO->isInvariant()) continue;
+
+ // A load from a constant PseudoSourceValue is invariant.
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue())
+ if (PSV->isConstant(MFI))
+ continue;
+
+ if (const Value *V = MMO->getValue()) {
+ // If we have an AliasAnalysis, ask it whether the memory is constant.
+ if (AA &&
+ AA->pointsToConstantMemory(
+ MemoryLocation(V, MMO->getSize(), MMO->getAAInfo())))
+ continue;
+ }
+
+ // Otherwise assume conservatively.
+ return false;
+ }
+
+ // Everything checks out.
+ return true;
+}
+
+/// isConstantValuePHI - If the specified instruction is a PHI that always
+/// merges together the same virtual register, return the register, otherwise
+/// return 0.
+unsigned MachineInstr::isConstantValuePHI() const {
+ if (!isPHI())
+ return 0;
+ assert(getNumOperands() >= 3 &&
+ "It's illegal to have a PHI without source operands");
+
+ unsigned Reg = getOperand(1).getReg();
+ for (unsigned i = 3, e = getNumOperands(); i < e; i += 2)
+ if (getOperand(i).getReg() != Reg)
+ return 0;
+ return Reg;
+}
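+
+// Editor's note: illustrative sketch, not part of the original sources. A
+// PHI's operand list is the def followed by (incoming-register, pred-MBB)
+// pairs, which is why the loop above reads operands 1, 3, 5, ...:
+//
+//   %vreg2<def> = PHI %vreg7, <BB#1>, %vreg7, <BB#3>   // returns %vreg7
+//   %vreg2<def> = PHI %vreg7, <BB#1>, %vreg9, <BB#3>   // returns 0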
+
+bool MachineInstr::hasUnmodeledSideEffects() const {
+ if (hasProperty(MCID::UnmodeledSideEffects))
+ return true;
+ if (isInlineAsm()) {
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ return true;
+ }
+
+ return false;
+}
+
+bool MachineInstr::isLoadFoldBarrier() const {
+ return mayStore() || isCall() || hasUnmodeledSideEffects();
+}
+
+/// allDefsAreDead - Return true if all the defs of this instruction are dead.
+///
+bool MachineInstr::allDefsAreDead() const {
+ for (const MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.isUse())
+ continue;
+ if (!MO.isDead())
+ return false;
+ }
+ return true;
+}
+
+/// copyImplicitOps - Copy implicit register operands from specified
+/// instruction to this instruction.
+void MachineInstr::copyImplicitOps(MachineFunction &MF,
+ const MachineInstr &MI) {
+ for (unsigned i = MI.getDesc().getNumOperands(), e = MI.getNumOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if ((MO.isReg() && MO.isImplicit()) || MO.isRegMask())
+ addOperand(MF, MO);
+ }
+}
+
+LLVM_DUMP_METHOD void MachineInstr::dump() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << " " << *this;
+#endif
+}
+
+void MachineInstr::print(raw_ostream &OS, bool SkipOpers) const {
+ const Module *M = nullptr;
+ if (const MachineBasicBlock *MBB = getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ M = MF->getFunction()->getParent();
+
+ ModuleSlotTracker MST(M);
+ print(OS, MST, SkipOpers);
+}
+
+void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
+ bool SkipOpers) const {
+ // We can be a bit tidier if we know the MachineFunction.
+ const MachineFunction *MF = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ if (const MachineBasicBlock *MBB = getParent()) {
+ MF = MBB->getParent();
+ if (MF) {
+ MRI = &MF->getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ }
+ }
+
+ // Save a list of virtual registers.
+ SmallVector<unsigned, 8> VirtRegs;
+
+ // Print explicitly defined operands on the left of an assignment syntax.
+ unsigned StartOp = 0, e = getNumOperands();
+ for (; StartOp < e && getOperand(StartOp).isReg() &&
+ getOperand(StartOp).isDef() &&
+ !getOperand(StartOp).isImplicit();
+ ++StartOp) {
+ if (StartOp != 0) OS << ", ";
+ getOperand(StartOp).print(OS, MST, TRI);
+ unsigned Reg = getOperand(StartOp).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ VirtRegs.push_back(Reg);
+ unsigned Size;
+ if (MRI && (Size = MRI->getSize(Reg)))
+ OS << '(' << Size << ')';
+ }
+ }
+
+ if (StartOp != 0)
+ OS << " = ";
+
+ // Print the opcode name.
+ if (TII)
+ OS << TII->getName(getOpcode());
+ else
+ OS << "UNKNOWN";
+
+ if (getType()) {
+ OS << ' ';
+ getType()->print(OS, /*IsForDebug*/ false, /*NoDetails*/ true);
+ OS << ' ';
+ }
+
+ if (SkipOpers)
+ return;
+
+ // Print the rest of the operands.
+ bool OmittedAnyCallClobbers = false;
+ bool FirstOp = true;
+ unsigned AsmDescOp = ~0u;
+ unsigned AsmOpCount = 0;
+
+ if (isInlineAsm() && e >= InlineAsm::MIOp_FirstOperand) {
+ // Print asm string.
+ OS << " ";
+ getOperand(InlineAsm::MIOp_AsmString).print(OS, MST, TRI);
+
+ // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
+ unsigned ExtraInfo = getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_HasSideEffects)
+ OS << " [sideeffect]";
+ if (ExtraInfo & InlineAsm::Extra_MayLoad)
+ OS << " [mayload]";
+ if (ExtraInfo & InlineAsm::Extra_MayStore)
+ OS << " [maystore]";
+ if (ExtraInfo & InlineAsm::Extra_IsConvergent)
+ OS << " [isconvergent]";
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ OS << " [alignstack]";
+ if (getInlineAsmDialect() == InlineAsm::AD_ATT)
+ OS << " [attdialect]";
+ if (getInlineAsmDialect() == InlineAsm::AD_Intel)
+ OS << " [inteldialect]";
+
+ StartOp = AsmDescOp = InlineAsm::MIOp_FirstOperand;
+ FirstOp = false;
+ }
+
+ for (unsigned i = StartOp, e = getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = getOperand(i);
+
+ if (MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ VirtRegs.push_back(MO.getReg());
+
+ // Omit call-clobbered registers which aren't used anywhere. This makes
+ // call instructions much less noisy on targets where calls clobber lots
+ // of registers. Don't rely on MO.isDead() because we may be called before
+ // LiveVariables is run, or we may be looking at a non-allocatable reg.
+ if (MRI && isCall() &&
+ MO.isReg() && MO.isImplicit() && MO.isDef()) {
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (MRI->use_empty(Reg)) {
+ bool HasAliasLive = false;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ unsigned AliasReg = *AI;
+ if (!MRI->use_empty(AliasReg)) {
+ HasAliasLive = true;
+ break;
+ }
+ }
+ if (!HasAliasLive) {
+ OmittedAnyCallClobbers = true;
+ continue;
+ }
+ }
+ }
+ }
+
+ if (FirstOp) FirstOp = false; else OS << ",";
+ OS << " ";
+ if (i < getDesc().NumOperands) {
+ const MCOperandInfo &MCOI = getDesc().OpInfo[i];
+ if (MCOI.isPredicate())
+ OS << "pred:";
+ if (MCOI.isOptionalDef())
+ OS << "opt:";
+ }
+ if (isDebugValue() && MO.isMetadata()) {
+ // Pretty print DBG_VALUE instructions.
+ auto *DIV = dyn_cast<DILocalVariable>(MO.getMetadata());
+ if (DIV && !DIV->getName().empty())
+ OS << "!\"" << DIV->getName() << '\"';
+ else
+ MO.print(OS, MST, TRI);
+ } else if (TRI && (isInsertSubreg() || isRegSequence()) && MO.isImm()) {
+ OS << TRI->getSubRegIndexName(MO.getImm());
+ } else if (i == AsmDescOp && MO.isImm()) {
+ // Pretty print the inline asm operand descriptor.
+ OS << '$' << AsmOpCount++;
+ unsigned Flag = MO.getImm();
+ switch (InlineAsm::getKind(Flag)) {
+ case InlineAsm::Kind_RegUse: OS << ":[reguse"; break;
+ case InlineAsm::Kind_RegDef: OS << ":[regdef"; break;
+ case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec"; break;
+ case InlineAsm::Kind_Clobber: OS << ":[clobber"; break;
+ case InlineAsm::Kind_Imm: OS << ":[imm"; break;
+ case InlineAsm::Kind_Mem: OS << ":[mem"; break;
+ default: OS << ":[??" << InlineAsm::getKind(Flag); break;
+ }
+
+ unsigned RCID = 0;
+ if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (TRI) {
+ OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
+ } else
+ OS << ":RC" << RCID;
+ }
+
+ if (InlineAsm::isMemKind(Flag)) {
+ unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
+ switch (MCID) {
+ case InlineAsm::Constraint_es: OS << ":es"; break;
+ case InlineAsm::Constraint_i: OS << ":i"; break;
+ case InlineAsm::Constraint_m: OS << ":m"; break;
+ case InlineAsm::Constraint_o: OS << ":o"; break;
+ case InlineAsm::Constraint_v: OS << ":v"; break;
+ case InlineAsm::Constraint_Q: OS << ":Q"; break;
+ case InlineAsm::Constraint_R: OS << ":R"; break;
+ case InlineAsm::Constraint_S: OS << ":S"; break;
+ case InlineAsm::Constraint_T: OS << ":T"; break;
+ case InlineAsm::Constraint_Um: OS << ":Um"; break;
+ case InlineAsm::Constraint_Un: OS << ":Un"; break;
+ case InlineAsm::Constraint_Uq: OS << ":Uq"; break;
+ case InlineAsm::Constraint_Us: OS << ":Us"; break;
+ case InlineAsm::Constraint_Ut: OS << ":Ut"; break;
+ case InlineAsm::Constraint_Uv: OS << ":Uv"; break;
+ case InlineAsm::Constraint_Uy: OS << ":Uy"; break;
+ case InlineAsm::Constraint_X: OS << ":X"; break;
+ case InlineAsm::Constraint_Z: OS << ":Z"; break;
+ case InlineAsm::Constraint_ZC: OS << ":ZC"; break;
+ case InlineAsm::Constraint_Zy: OS << ":Zy"; break;
+ default: OS << ":?"; break;
+ }
+ }
+
+ unsigned TiedTo = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
+ OS << " tiedto:$" << TiedTo;
+
+ OS << ']';
+
+ // Compute the index of the next operand descriptor.
+ AsmDescOp += 1 + InlineAsm::getNumOperandRegisters(Flag);
+ } else
+ MO.print(OS, MST, TRI);
+ }
+
+ // Briefly indicate whether any call clobbers were omitted.
+ if (OmittedAnyCallClobbers) {
+ if (!FirstOp) OS << ",";
+ OS << " ...";
+ }
+
+ bool HaveSemi = false;
+ const unsigned PrintableFlags = FrameSetup | FrameDestroy;
+ if (Flags & PrintableFlags) {
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
+ OS << " flags: ";
+
+ if (Flags & FrameSetup)
+ OS << "FrameSetup";
+
+ if (Flags & FrameDestroy)
+ OS << "FrameDestroy";
+ }
+
+ if (!memoperands_empty()) {
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
+
+ OS << " mem:";
+ for (mmo_iterator i = memoperands_begin(), e = memoperands_end();
+ i != e; ++i) {
+ (*i)->print(OS, MST);
+ if (std::next(i) != e)
+ OS << " ";
+ }
+ }
+
+ // Print the regclass of any virtual registers encountered.
+ if (MRI && !VirtRegs.empty()) {
+ if (!HaveSemi) {
+ OS << ";";
+ HaveSemi = true;
+ }
+ for (unsigned i = 0; i != VirtRegs.size(); ++i) {
+ const RegClassOrRegBank &RC = MRI->getRegClassOrRegBank(VirtRegs[i]);
+ if (!RC)
+ continue;
+ // Generic virtual registers do not have register classes.
+ if (RC.is<const RegisterBank *>())
+ OS << " " << RC.get<const RegisterBank *>()->getName();
+ else
+ OS << " "
+ << TRI->getRegClassName(RC.get<const TargetRegisterClass *>());
+ OS << ':' << PrintReg(VirtRegs[i]);
+ for (unsigned j = i+1; j != VirtRegs.size();) {
+ if (MRI->getRegClassOrRegBank(VirtRegs[j]) != RC) {
+ ++j;
+ continue;
+ }
+ if (VirtRegs[i] != VirtRegs[j])
+ OS << "," << PrintReg(VirtRegs[j]);
+ VirtRegs.erase(VirtRegs.begin()+j);
+ }
+ }
+ }
+
+ // Print debug location information.
+ if (isDebugValue() && getOperand(e - 2).isMetadata()) {
+ if (!HaveSemi)
+ OS << ";";
+ auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
+ OS << " line no:" << DV->getLine();
+ if (auto *InlinedAt = debugLoc->getInlinedAt()) {
+ DebugLoc InlinedAtDL(InlinedAt);
+ if (InlinedAtDL && MF) {
+ OS << " inlined @[ ";
+ InlinedAtDL.print(OS);
+ OS << " ]";
+ }
+ }
+ if (isIndirectDebugValue())
+ OS << " indirect";
+ } else if (debugLoc && MF) {
+ if (!HaveSemi)
+ OS << ";";
+ OS << " dbg:";
+ debugLoc.print(OS);
+ }
+
+ OS << '\n';
+}
+
+bool MachineInstr::addRegisterKilled(unsigned IncomingReg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(IncomingReg);
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(IncomingReg, RegInfo, false).isValid();
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.isUndef())
+ continue;
+
+ // DEBUG_VALUE nodes do not contribute to code generation and should
+ // always be ignored. Failure to do so may result in trying to modify
+ // KILL flags on DEBUG_VALUE nodes.
+ if (MO.isDebug())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (Reg == IncomingReg) {
+ if (!Found) {
+ if (MO.isKill())
+ // The register is already marked kill.
+ return true;
+ if (isPhysReg && isRegTiedToDefOperand(i))
+ // Two-address uses of physregs must not be marked kill.
+ return true;
+ MO.setIsKill();
+ Found = true;
+ }
+ } else if (hasAliases && MO.isKill() &&
+ TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // A super-register kill already exists.
+ if (RegInfo->isSuperRegister(IncomingReg, Reg))
+ return true;
+ if (RegInfo->isSubRegister(IncomingReg, Reg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded kill operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit())
+ RemoveOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsKill(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is killed. Add a
+ // new implicit operand if required.
+ if (!Found && AddIfNotFound) {
+ addOperand(MachineOperand::CreateReg(IncomingReg,
+ false /*IsDef*/,
+ true /*IsImp*/,
+ true /*IsKill*/));
+ return true;
+ }
+ return Found;
+}
+
+void MachineInstr::clearRegisterKills(unsigned Reg,
+ const TargetRegisterInfo *RegInfo) {
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ RegInfo = nullptr;
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isUse() || !MO.isKill())
+ continue;
+ unsigned OpReg = MO.getReg();
+ if ((RegInfo && RegInfo->regsOverlap(Reg, OpReg)) || Reg == OpReg)
+ MO.setIsKill(false);
+ }
+}
+
+bool MachineInstr::addRegisterDead(unsigned Reg,
+ const TargetRegisterInfo *RegInfo,
+ bool AddIfNotFound) {
+ bool isPhysReg = TargetRegisterInfo::isPhysicalRegister(Reg);
+ bool hasAliases = isPhysReg &&
+ MCRegAliasIterator(Reg, RegInfo, false).isValid();
+ bool Found = false;
+ SmallVector<unsigned,4> DeadOps;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = getOperand(i);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+
+ if (MOReg == Reg) {
+ MO.setIsDead();
+ Found = true;
+ } else if (hasAliases && MO.isDead() &&
+ TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ // There exists a super-register that's marked dead.
+ if (RegInfo->isSuperRegister(Reg, MOReg))
+ return true;
+ if (RegInfo->isSubRegister(Reg, MOReg))
+ DeadOps.push_back(i);
+ }
+ }
+
+ // Trim unneeded dead operands.
+ while (!DeadOps.empty()) {
+ unsigned OpIdx = DeadOps.back();
+ if (getOperand(OpIdx).isImplicit())
+ RemoveOperand(OpIdx);
+ else
+ getOperand(OpIdx).setIsDead(false);
+ DeadOps.pop_back();
+ }
+
+ // If not found, this means an alias of one of the operands is dead. Add a
+ // new implicit operand if required.
+ if (Found || !AddIfNotFound)
+ return Found;
+
+ addOperand(MachineOperand::CreateReg(Reg,
+ true /*IsDef*/,
+ true /*IsImp*/,
+ false /*IsKill*/,
+ true /*IsDead*/));
+ return true;
+}
+
+void MachineInstr::clearRegisterDeads(unsigned Reg) {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg)
+ continue;
+ MO.setIsDead(false);
+ }
+}
+
+void MachineInstr::setRegisterDefReadUndef(unsigned Reg, bool IsUndef) {
+ for (MachineOperand &MO : operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.getReg() != Reg || MO.getSubReg() == 0)
+ continue;
+ MO.setIsUndef(IsUndef);
+ }
+}
+
+void MachineInstr::addRegisterDefined(unsigned Reg,
+ const TargetRegisterInfo *RegInfo) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ MachineOperand *MO = findRegisterDefOperand(Reg, false, RegInfo);
+ if (MO)
+ return;
+ } else {
+ for (const MachineOperand &MO : operands()) {
+ if (MO.isReg() && MO.getReg() == Reg && MO.isDef() &&
+ MO.getSubReg() == 0)
+ return;
+ }
+ }
+ addOperand(MachineOperand::CreateReg(Reg,
+ true /*IsDef*/,
+ true /*IsImp*/));
+}
+
+void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs,
+ const TargetRegisterInfo &TRI) {
+ bool HasRegMask = false;
+ for (MachineOperand &MO : operands()) {
+ if (MO.isRegMask()) {
+ HasRegMask = true;
+ continue;
+ }
+ if (!MO.isReg() || !MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ // If there are no uses, including partial uses, the def is dead.
+ if (std::none_of(UsedRegs.begin(), UsedRegs.end(),
+ [&](unsigned Use) { return TRI.regsOverlap(Use, Reg); }))
+ MO.setIsDead();
+ }
+
+ // This is a call with a register mask operand.
+ // Mask clobbers are always dead, so add defs for the non-dead defines.
+ if (HasRegMask)
+ for (ArrayRef<unsigned>::iterator I = UsedRegs.begin(), E = UsedRegs.end();
+ I != E; ++I)
+ addRegisterDefined(*I, &TRI);
+}
+
+unsigned
+MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) {
+ // Build up a buffer of hash code components.
+ SmallVector<size_t, 8> HashComponents;
+ HashComponents.reserve(MI->getNumOperands() + 1);
+ HashComponents.push_back(MI->getOpcode());
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.isDef() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue; // Skip virtual register defs.
+
+ HashComponents.push_back(hash_value(MO));
+ }
+ return hash_combine_range(HashComponents.begin(), HashComponents.end());
+}
+
+void MachineInstr::emitError(StringRef Msg) const {
+ // Find the source location cookie.
+ unsigned LocCookie = 0;
+ const MDNode *LocMD = nullptr;
+ for (unsigned i = getNumOperands(); i != 0; --i) {
+ if (getOperand(i-1).isMetadata() &&
+ (LocMD = getOperand(i-1).getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI =
+ mdconst::dyn_extract<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ if (const MachineBasicBlock *MBB = getParent())
+ if (const MachineFunction *MF = MBB->getParent())
+ return MF->getMMI().getModule()->getContext().emitError(LocCookie, Msg);
+ report_fatal_error(Msg);
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineFunction &MF, const DebugLoc &DL,
+ const MCInstrDesc &MCID, bool IsIndirect,
+ unsigned Reg, unsigned Offset,
+ const MDNode *Variable, const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ if (IsIndirect)
+ return BuildMI(MF, DL, MCID)
+ .addReg(Reg, RegState::Debug)
+ .addImm(Offset)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+ else {
+ assert(Offset == 0 && "A direct address cannot have an offset.");
+ return BuildMI(MF, DL, MCID)
+ .addReg(Reg, RegState::Debug)
+ .addReg(0U, RegState::Debug)
+ .addMetadata(Variable)
+ .addMetadata(Expr);
+ }
+}
+
+MachineInstrBuilder llvm::BuildMI(MachineBasicBlock &BB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL, const MCInstrDesc &MCID,
+ bool IsIndirect, unsigned Reg,
+ unsigned Offset, const MDNode *Variable,
+ const MDNode *Expr) {
+ assert(isa<DILocalVariable>(Variable) && "not a variable");
+ assert(cast<DIExpression>(Expr)->isValid() && "not an expression");
+ MachineFunction &MF = *BB.getParent();
+ MachineInstr *MI =
+ BuildMI(MF, DL, MCID, IsIndirect, Reg, Offset, Variable, Expr);
+ BB.insert(I, MI);
+ return MachineInstrBuilder(MF, MI);
+}
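+
+// Editor's note: illustrative sketch, not part of the original sources. A
+// typical call site for the DBG_VALUE builders above, where MBB is a
+// MachineBasicBlock&, TII comes from the subtarget, and DL, Reg, Var and Expr
+// are assumed to satisfy the asserts:
+//
+//   // Direct form: the variable's value is in Reg, so Offset must be 0.
+//   BuildMI(MBB, InsertPt, DL, TII->get(TargetOpcode::DBG_VALUE),
+//           /*IsIndirect=*/false, Reg, /*Offset=*/0, Var, Expr);
+//   // Indirect form: the variable lives in memory at Reg + Offset.
+//   BuildMI(MBB, InsertPt, DL, TII->get(TargetOpcode::DBG_VALUE),
+//           /*IsIndirect=*/true, Reg, Offset, Var, Expr);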
diff --git a/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp
new file mode 100644
index 000000000000..e4686b3c5c4e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -0,0 +1,344 @@
+//===-- lib/CodeGen/MachineInstrBundle.cpp --------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <utility>
+using namespace llvm;
+
+namespace {
+ class UnpackMachineBundles : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification
+ UnpackMachineBundles(std::function<bool(const Function &)> Ftor = nullptr)
+ : MachineFunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+ initializeUnpackMachineBundlesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ std::function<bool(const Function &)> PredicateFtor;
+ };
+} // end anonymous namespace
+
+char UnpackMachineBundles::ID = 0;
+char &llvm::UnpackMachineBundlesID = UnpackMachineBundles::ID;
+INITIALIZE_PASS(UnpackMachineBundles, "unpack-mi-bundles",
+ "Unpack machine instruction bundles", false, false)
+
+bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+ if (PredicateFtor && !PredicateFtor(*MF.getFunction()))
+ return false;
+
+ bool Changed = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+
+ for (MachineBasicBlock::instr_iterator MII = MBB->instr_begin(),
+ MIE = MBB->instr_end(); MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+
+ // Remove BUNDLE instruction and the InsideBundle flags from bundled
+ // instructions.
+ if (MI->isBundle()) {
+ while (++MII != MIE && MII->isBundledWithPred()) {
+ MII->unbundleFromPred();
+ for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MII->getOperand(i);
+ if (MO.isReg() && MO.isInternalRead())
+ MO.setIsInternalRead(false);
+ }
+ }
+ MI->eraseFromParent();
+
+ Changed = true;
+ continue;
+ }
+
+ ++MII;
+ }
+ }
+
+ return Changed;
+}
+
+FunctionPass *
+llvm::createUnpackMachineBundles(std::function<bool(const Function &)> Ftor) {
+ return new UnpackMachineBundles(std::move(Ftor));
+}
+
+namespace {
+ class FinalizeMachineBundles : public MachineFunctionPass {
+ public:
+ static char ID; // Pass identification
+ FinalizeMachineBundles() : MachineFunctionPass(ID) {
+ initializeFinalizeMachineBundlesPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ };
+} // end anonymous namespace
+
+char FinalizeMachineBundles::ID = 0;
+char &llvm::FinalizeMachineBundlesID = FinalizeMachineBundles::ID;
+INITIALIZE_PASS(FinalizeMachineBundles, "finalize-mi-bundles",
+ "Finalize machine instruction bundles", false, false)
+
+bool FinalizeMachineBundles::runOnMachineFunction(MachineFunction &MF) {
+ return llvm::finalizeBundles(MF);
+}
+
+
+/// finalizeBundle - Finalize a machine instruction bundle which includes
+/// a sequence of instructions starting from FirstMI to LastMI (exclusive).
+/// This routine adds a BUNDLE instruction to represent the bundle; it adds
+/// IsInternalRead markers to MachineOperands which are defined inside the
+/// bundle, and it copies externally visible defs and uses to the BUNDLE
+/// instruction.
+void llvm::finalizeBundle(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator FirstMI,
+ MachineBasicBlock::instr_iterator LastMI) {
+ assert(FirstMI != LastMI && "Empty bundle?");
+ MIBundleBuilder Bundle(MBB, FirstMI, LastMI);
+
+ MachineFunction &MF = *MBB.getParent();
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineInstrBuilder MIB =
+ BuildMI(MF, FirstMI->getDebugLoc(), TII->get(TargetOpcode::BUNDLE));
+ Bundle.prepend(MIB);
+
+ SmallVector<unsigned, 32> LocalDefs;
+ SmallSet<unsigned, 32> LocalDefSet;
+ SmallSet<unsigned, 8> DeadDefSet;
+ SmallSet<unsigned, 16> KilledDefSet;
+ SmallVector<unsigned, 8> ExternUses;
+ SmallSet<unsigned, 8> ExternUseSet;
+ SmallSet<unsigned, 8> KilledUseSet;
+ SmallSet<unsigned, 8> UndefUseSet;
+ SmallVector<MachineOperand*, 4> Defs;
+ for (; FirstMI != LastMI; ++FirstMI) {
+ for (unsigned i = 0, e = FirstMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = FirstMI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ if (MO.isDef()) {
+ Defs.push_back(&MO);
+ continue;
+ }
+
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ if (LocalDefSet.count(Reg)) {
+ MO.setIsInternalRead();
+ if (MO.isKill())
+ // Internal def is now killed.
+ KilledDefSet.insert(Reg);
+ } else {
+ if (ExternUseSet.insert(Reg).second) {
+ ExternUses.push_back(Reg);
+ if (MO.isUndef())
+ UndefUseSet.insert(Reg);
+ }
+ if (MO.isKill())
+ // External def is now killed.
+ KilledUseSet.insert(Reg);
+ }
+ }
+
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ MachineOperand &MO = *Defs[i];
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+
+ if (LocalDefSet.insert(Reg).second) {
+ LocalDefs.push_back(Reg);
+ if (MO.isDead()) {
+ DeadDefSet.insert(Reg);
+ }
+ } else {
+ // Re-defined inside the bundle, it's no longer killed.
+ KilledDefSet.erase(Reg);
+ if (!MO.isDead())
+ // Previously defined but dead.
+ DeadDefSet.erase(Reg);
+ }
+
+ if (!MO.isDead()) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ unsigned SubReg = *SubRegs;
+ if (LocalDefSet.insert(SubReg).second)
+ LocalDefs.push_back(SubReg);
+ }
+ }
+ }
+
+ Defs.clear();
+ }
+
+ SmallSet<unsigned, 32> Added;
+ for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
+ unsigned Reg = LocalDefs[i];
+ if (Added.insert(Reg).second) {
+ // If it's not live beyond end of the bundle, mark it dead.
+ bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
+ MIB.addReg(Reg, getDefRegState(true) | getDeadRegState(isDead) |
+ getImplRegState(true));
+ }
+ }
+
+ for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) {
+ unsigned Reg = ExternUses[i];
+ bool isKill = KilledUseSet.count(Reg);
+ bool isUndef = UndefUseSet.count(Reg);
+ MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
+ getImplRegState(true));
+ }
+}
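+
+// Editor's note: illustrative sketch, not part of the original sources. A
+// target pass that wants a sequence of instructions issued as a unit can
+// bundle it and let finalizeBundle compute the BUNDLE header's externally
+// visible defs and uses (MBB is a MachineBasicBlock&, First and Last are the
+// first and last MachineInstr* of the sequence, inclusive):
+//
+//   finalizeBundle(MBB, First->getIterator(), std::next(Last->getIterator()));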
+
+/// finalizeBundle - Same functionality as the previous finalizeBundle except
+/// the last instruction in the bundle is not provided as an input. This is
+/// used in cases where bundles are pre-determined by marking instructions
+/// with 'InsideBundle' marker. It returns the MBB instruction iterator that
+/// points to the end of the bundle.
+MachineBasicBlock::instr_iterator
+llvm::finalizeBundle(MachineBasicBlock &MBB,
+ MachineBasicBlock::instr_iterator FirstMI) {
+ MachineBasicBlock::instr_iterator E = MBB.instr_end();
+ MachineBasicBlock::instr_iterator LastMI = std::next(FirstMI);
+ while (LastMI != E && LastMI->isInsideBundle())
+ ++LastMI;
+ finalizeBundle(MBB, FirstMI, LastMI);
+ return LastMI;
+}
+
+/// finalizeBundles - Finalize instruction bundles in the specified
+/// MachineFunction. Return true if any bundles are finalized.
+bool llvm::finalizeBundles(MachineFunction &MF) {
+ bool Changed = false;
+ for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock &MBB = *I;
+ MachineBasicBlock::instr_iterator MII = MBB.instr_begin();
+ MachineBasicBlock::instr_iterator MIE = MBB.instr_end();
+ if (MII == MIE)
+ continue;
+ assert(!MII->isInsideBundle() &&
+ "First instr cannot be inside bundle before finalization!");
+
+ for (++MII; MII != MIE; ) {
+ if (!MII->isInsideBundle())
+ ++MII;
+ else {
+ MII = finalizeBundle(MBB, std::prev(MII));
+ Changed = true;
+ }
+ }
+ }
+
+ return Changed;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineOperand iterator
+//===----------------------------------------------------------------------===//
+
+MachineOperandIteratorBase::VirtRegInfo
+MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
+ SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops) {
+ VirtRegInfo RI = { false, false, false };
+ for(; isValid(); ++*this) {
+ MachineOperand &MO = deref();
+ if (!MO.isReg() || MO.getReg() != Reg)
+ continue;
+
+ // Remember each (MI, OpNo) that refers to Reg.
+ if (Ops)
+ Ops->push_back(std::make_pair(MO.getParent(), getOperandNo()));
+
+ // Both defs and uses can read virtual registers.
+ if (MO.readsReg()) {
+ RI.Reads = true;
+ if (MO.isDef())
+ RI.Tied = true;
+ }
+
+ // Only defs can write.
+ if (MO.isDef())
+ RI.Writes = true;
+ else if (!RI.Tied && MO.getParent()->isRegTiedToDefOperand(getOperandNo()))
+ RI.Tied = true;
+ }
+ return RI;
+}
+
+MachineOperandIteratorBase::PhysRegInfo
+MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ bool AllDefsDead = true;
+ PhysRegInfo PRI = {false, false, false, false, false, false, false, false};
+
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "analyzePhysReg not given a physical register!");
+ for (; isValid(); ++*this) {
+ MachineOperand &MO = deref();
+
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) {
+ PRI.Clobbered = true;
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned MOReg = MO.getReg();
+ if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg))
+ continue;
+
+ if (!TRI->regsOverlap(MOReg, Reg))
+ continue;
+
+ bool Covered = TRI->isSuperRegisterEq(Reg, MOReg);
+ if (MO.readsReg()) {
+ PRI.Read = true;
+ if (Covered) {
+ PRI.FullyRead = true;
+ if (MO.isKill())
+ PRI.Killed = true;
+ }
+ } else if (MO.isDef()) {
+ PRI.Defined = true;
+ if (Covered)
+ PRI.FullyDefined = true;
+ if (!MO.isDead())
+ AllDefsDead = false;
+ }
+ }
+
+ if (AllDefsDead) {
+ if (PRI.FullyDefined || PRI.Clobbered)
+ PRI.DeadDef = true;
+ else if (PRI.Defined)
+ PRI.PartialDeadDef = true;
+ }
+
+ return PRI;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineLICM.cpp b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
new file mode 100644
index 000000000000..119751b17f56
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineLICM.cpp
@@ -0,0 +1,1388 @@
+//===-- MachineLICM.cpp - Machine Loop Invariant Code Motion Pass ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion on machine instructions. We
+// attempt to remove as much code from the body of a loop as possible.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for the LLVM-IR-level LICM pass. It is only designed to hoist simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-licm"
+
+static cl::opt<bool>
+AvoidSpeculation("avoid-speculation",
+ cl::desc("MachineLICM should avoid speculation"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+HoistCheapInsts("hoist-cheap-insts",
+ cl::desc("MachineLICM should hoist even cheap instructions"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+SinkInstsToAvoidSpills("sink-insts-to-avoid-spills",
+ cl::desc("MachineLICM should sink instructions into "
+ "loops to avoid register spills"),
+ cl::init(false), cl::Hidden);
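+
+// Editor's note: illustrative, not part of the original sources. These hidden
+// cl::opt knobs can be toggled on the llc command line of a compiler built
+// from this tree, e.g.:
+//
+//   llc -hoist-cheap-insts -sink-insts-to-avoid-spills foo.ll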
+
+STATISTIC(NumHoisted,
+ "Number of machine instructions hoisted out of loops");
+STATISTIC(NumLowRP,
+ "Number of instructions hoisted in low reg pressure situation");
+STATISTIC(NumHighLatency,
+ "Number of high latency instructions hoisted");
+STATISTIC(NumCSEed,
+ "Number of hoisted machine instructions CSEed");
+STATISTIC(NumPostRAHoisted,
+ "Number of machine instructions hoisted out of loops post regalloc");
+
+namespace {
+ class MachineLICM : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetLoweringBase *TLI;
+ const TargetRegisterInfo *TRI;
+ const MachineFrameInfo *MFI;
+ MachineRegisterInfo *MRI;
+ TargetSchedModel SchedModel;
+ bool PreRegAlloc;
+
+ // Various analyses that we use...
+ AliasAnalysis *AA; // Alias analysis info.
+ MachineLoopInfo *MLI; // Current MachineLoopInfo
+ MachineDominatorTree *DT; // Machine dominator tree for the cur loop
+
+ // State that is updated as we process loops
+ bool Changed; // True if a loop is changed.
+ bool FirstInLoop; // True if it's the first LICM in the loop.
+ MachineLoop *CurLoop; // The current loop we are working on.
+ MachineBasicBlock *CurPreheader; // The preheader for CurLoop.
+
+ // Exit blocks for CurLoop.
+ SmallVector<MachineBasicBlock*, 8> ExitBlocks;
+
+ bool isExitBlock(const MachineBasicBlock *MBB) const {
+ return std::find(ExitBlocks.begin(), ExitBlocks.end(), MBB) !=
+ ExitBlocks.end();
+ }
+
+ // Track 'estimated' register pressure.
+ SmallSet<unsigned, 32> RegSeen;
+ SmallVector<unsigned, 8> RegPressure;
+
+ // Register pressure "limit" per register pressure set. If the pressure
+ // is higher than the limit, then it's considered high.
+ SmallVector<unsigned, 8> RegLimit;
+
+ // Register pressure on path leading from loop preheader to current BB.
+ SmallVector<SmallVector<unsigned, 8>, 16> BackTrace;
+
+ // For each opcode, keep a list of potential CSE instructions.
+ DenseMap<unsigned, std::vector<const MachineInstr*> > CSEMap;
+
+ enum {
+ SpeculateFalse = 0,
+ SpeculateTrue = 1,
+ SpeculateUnknown = 2
+ };
+
+ // If an MBB does not dominate the loop's exiting blocks, then it may not
+ // be safe to hoist loads from this block.
+ // Tri-state: 0 - false, 1 - true, 2 - unknown
+ unsigned SpeculationState;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ MachineLICM() :
+ MachineFunctionPass(ID), PreRegAlloc(true) {
+ initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+
+ explicit MachineLICM(bool PreRA) :
+ MachineFunctionPass(ID), PreRegAlloc(PreRA) {
+ initializeMachineLICMPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ void releaseMemory() override {
+ RegSeen.clear();
+ RegPressure.clear();
+ RegLimit.clear();
+ BackTrace.clear();
+ CSEMap.clear();
+ }
+
+ private:
+ /// Keep track of information about hoisting candidates.
+ struct CandidateInfo {
+ MachineInstr *MI;
+ unsigned Def;
+ int FI;
+ CandidateInfo(MachineInstr *mi, unsigned def, int fi)
+ : MI(mi), Def(def), FI(fi) {}
+ };
+
+ void HoistRegionPostRA();
+
+ void HoistPostRA(MachineInstr *MI, unsigned Def);
+
+ void ProcessMI(MachineInstr *MI, BitVector &PhysRegDefs,
+ BitVector &PhysRegClobbers, SmallSet<int, 32> &StoredFIs,
+ SmallVectorImpl<CandidateInfo> &Candidates);
+
+ void AddToLiveIns(unsigned Reg);
+
+ bool IsLICMCandidate(MachineInstr &I);
+
+ bool IsLoopInvariantInst(MachineInstr &I);
+
+ bool HasLoopPHIUse(const MachineInstr *MI) const;
+
+ bool HasHighOperandLatency(MachineInstr &MI, unsigned DefIdx,
+ unsigned Reg) const;
+
+ bool IsCheapInstruction(MachineInstr &MI) const;
+
+ bool CanCauseHighRegPressure(const DenseMap<unsigned, int> &Cost,
+ bool Cheap);
+
+ void UpdateBackTraceRegPressure(const MachineInstr *MI);
+
+ bool IsProfitableToHoist(MachineInstr &MI);
+
+ bool IsGuaranteedToExecute(MachineBasicBlock *BB);
+
+ void EnterScope(MachineBasicBlock *MBB);
+
+ void ExitScope(MachineBasicBlock *MBB);
+
+ void ExitScopeIfDone(
+ MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode *, unsigned> &OpenChildren,
+ DenseMap<MachineDomTreeNode *, MachineDomTreeNode *> &ParentMap);
+
+ void HoistOutOfLoop(MachineDomTreeNode *LoopHeaderNode);
+
+ void HoistRegion(MachineDomTreeNode *N, bool IsHeader);
+
+ void SinkIntoLoop();
+
+ void InitRegPressure(MachineBasicBlock *BB);
+
+ DenseMap<unsigned, int> calcRegisterCost(const MachineInstr *MI,
+ bool ConsiderSeen,
+ bool ConsiderUnseenAsDef);
+
+ void UpdateRegPressure(const MachineInstr *MI,
+ bool ConsiderUnseenAsDef = false);
+
+ MachineInstr *ExtractHoistableLoad(MachineInstr *MI);
+
+ const MachineInstr *
+ LookForDuplicate(const MachineInstr *MI,
+ std::vector<const MachineInstr *> &PrevMIs);
+
+ bool EliminateCSE(
+ MachineInstr *MI,
+ DenseMap<unsigned, std::vector<const MachineInstr *>>::iterator &CI);
+
+ bool MayCSE(MachineInstr *MI);
+
+ bool Hoist(MachineInstr *MI, MachineBasicBlock *Preheader);
+
+ void InitCSEMap(MachineBasicBlock *BB);
+
+ MachineBasicBlock *getCurPreheader();
+ };
+} // end anonymous namespace
+
+char MachineLICM::ID = 0;
+char &llvm::MachineLICMID = MachineLICM::ID;
+INITIALIZE_PASS_BEGIN(MachineLICM, "machinelicm",
+ "Machine Loop Invariant Code Motion", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineLICM, "machinelicm",
+ "Machine Loop Invariant Code Motion", false, false)
+
+/// Test if the given loop is the outer-most loop that has a unique predecessor.
+static bool LoopIsOuterMostWithPredecessor(MachineLoop *CurLoop) {
+ // Check whether this loop even has a unique predecessor.
+ if (!CurLoop->getLoopPredecessor())
+ return false;
+ // Ok, now check to see if any of its outer loops do.
+ for (MachineLoop *L = CurLoop->getParentLoop(); L; L = L->getParentLoop())
+ if (L->getLoopPredecessor())
+ return false;
+ // None of them did, so this is the outermost with a unique predecessor.
+ return true;
+}
+
+bool MachineLICM::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ Changed = FirstInLoop = false;
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ TII = ST.getInstrInfo();
+ TLI = ST.getTargetLowering();
+ TRI = ST.getRegisterInfo();
+ MFI = MF.getFrameInfo();
+ MRI = &MF.getRegInfo();
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
+
+ PreRegAlloc = MRI->isSSA();
+
+ if (PreRegAlloc)
+ DEBUG(dbgs() << "******** Pre-regalloc Machine LICM: ");
+ else
+ DEBUG(dbgs() << "******** Post-regalloc Machine LICM: ");
+ DEBUG(dbgs() << MF.getName() << " ********\n");
+
+ if (PreRegAlloc) {
+ // Estimate register pressure during pre-regalloc pass.
+ unsigned NumRPS = TRI->getNumRegPressureSets();
+ RegPressure.resize(NumRPS);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ RegLimit.resize(NumRPS);
+ for (unsigned i = 0, e = NumRPS; i != e; ++i)
+ RegLimit[i] = TRI->getRegPressureSetLimit(MF, i);
+ }
+
+ // Get our Loop information...
+ MLI = &getAnalysis<MachineLoopInfo>();
+ DT = &getAnalysis<MachineDominatorTree>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ SmallVector<MachineLoop *, 8> Worklist(MLI->begin(), MLI->end());
+ while (!Worklist.empty()) {
+ CurLoop = Worklist.pop_back_val();
+ CurPreheader = nullptr;
+ ExitBlocks.clear();
+
+ // If this is done before regalloc, only visit outer-most preheader-sporting
+ // loops.
+ if (PreRegAlloc && !LoopIsOuterMostWithPredecessor(CurLoop)) {
+ Worklist.append(CurLoop->begin(), CurLoop->end());
+ continue;
+ }
+
+ CurLoop->getExitBlocks(ExitBlocks);
+
+ if (!PreRegAlloc)
+ HoistRegionPostRA();
+ else {
+ // CSEMap is initialized for loop header when the first instruction is
+ // being hoisted.
+ MachineDomTreeNode *N = DT->getNode(CurLoop->getHeader());
+ FirstInLoop = true;
+ HoistOutOfLoop(N);
+ CSEMap.clear();
+
+ if (SinkInstsToAvoidSpills)
+ SinkIntoLoop();
+ }
+ }
+
+ return Changed;
+}
+
+/// Return true if instruction stores to the specified frame.
+static bool InstructionStoresToFI(const MachineInstr *MI, int FI) {
+ // If we lost memory operands, conservatively assume that the instruction
+ // writes to all slots.
+ if (MI->memoperands_empty())
+ return true;
+ for (const MachineMemOperand *MemOp : MI->memoperands()) {
+ if (!MemOp->isStore() || !MemOp->getPseudoValue())
+ continue;
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast<FixedStackPseudoSourceValue>(MemOp->getPseudoValue())) {
+ if (Value->getFrameIndex() == FI)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Examine the instruction as a potential LICM candidate. Also
+/// gather register def and frame object update information.
+void MachineLICM::ProcessMI(MachineInstr *MI,
+ BitVector &PhysRegDefs,
+ BitVector &PhysRegClobbers,
+ SmallSet<int, 32> &StoredFIs,
+ SmallVectorImpl<CandidateInfo> &Candidates) {
+ bool RuledOut = false;
+ bool HasNonInvariantUse = false;
+ unsigned Def = 0;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isFI()) {
+ // Remember if the instruction stores to the frame index.
+ int FI = MO.getIndex();
+ if (!StoredFIs.count(FI) &&
+ MFI->isSpillSlotObjectIndex(FI) &&
+ InstructionStoresToFI(MI, FI))
+ StoredFIs.insert(FI);
+ HasNonInvariantUse = true;
+ continue;
+ }
+
+ // We can't hoist an instruction defining a physreg that is clobbered in
+ // the loop.
+ if (MO.isRegMask()) {
+ PhysRegClobbers.setBitsNotInMask(MO.getRegMask());
+ continue;
+ }
+
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "Not expecting virtual register!");
+
+ if (!MO.isDef()) {
+ if (Reg && (PhysRegDefs.test(Reg) || PhysRegClobbers.test(Reg)))
+ // If it's using a non-loop-invariant register, then it's obviously not
+ // safe to hoist.
+ HasNonInvariantUse = true;
+ continue;
+ }
+
+ if (MO.isImplicit()) {
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ PhysRegClobbers.set(*AI);
+ if (!MO.isDead())
+ // Non-dead implicit def? This cannot be hoisted.
+ RuledOut = true;
+ // No need to check if a dead implicit def is also defined by
+ // another instruction.
+ continue;
+ }
+
+ // FIXME: For now, avoid instructions with multiple defs, unless
+ // it's a dead implicit def.
+ if (Def)
+ RuledOut = true;
+ else
+ Def = Reg;
+
+ // If we have already seen another instruction that defines the same
+ // register, then this is not safe. Two defs are indicated by setting a
+ // PhysRegClobbers bit.
+ for (MCRegAliasIterator AS(Reg, TRI, true); AS.isValid(); ++AS) {
+ if (PhysRegDefs.test(*AS))
+ PhysRegClobbers.set(*AS);
+ PhysRegDefs.set(*AS);
+ }
+ if (PhysRegClobbers.test(Reg))
+ // The register defined by MI is also defined by another instruction in
+ // the loop, so MI cannot be a LICM candidate.
+ RuledOut = true;
+ }
+
+ // Only consider reloads for now and remats which do not have register
+ // operands. FIXME: Consider unfold load folding instructions.
+ if (Def && !RuledOut) {
+ int FI = INT_MIN;
+ if ((!HasNonInvariantUse && IsLICMCandidate(*MI)) ||
+ (TII->isLoadFromStackSlot(*MI, FI) && MFI->isSpillSlotObjectIndex(FI)))
+ Candidates.push_back(CandidateInfo(MI, Def, FI));
+ }
+}
+
+/// Walk the specified region of the CFG and hoist loop invariants out to the
+/// preheader.
+void MachineLICM::HoistRegionPostRA() {
+ MachineBasicBlock *Preheader = getCurPreheader();
+ if (!Preheader)
+ return;
+
+ unsigned NumRegs = TRI->getNumRegs();
+ BitVector PhysRegDefs(NumRegs); // Regs defined once in the loop.
+ BitVector PhysRegClobbers(NumRegs); // Regs defined more than once.
+
+ SmallVector<CandidateInfo, 32> Candidates;
+ SmallSet<int, 32> StoredFIs;
+
+ // Walk the entire region, count number of defs for each register, and
+ // collect potential LICM candidates.
+ const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
+ for (MachineBasicBlock *BB : Blocks) {
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isEHPad()) continue;
+
+ // Conservatively treat live-ins as external defs.
+ // FIXME: That means a reload that is reused in successor block(s) will not
+ // be LICM'ed.
+ for (const auto &LI : BB->liveins()) {
+ for (MCRegAliasIterator AI(LI.PhysReg, TRI, true); AI.isValid(); ++AI)
+ PhysRegDefs.set(*AI);
+ }
+
+ SpeculationState = SpeculateUnknown;
+ for (MachineInstr &MI : *BB)
+ ProcessMI(&MI, PhysRegDefs, PhysRegClobbers, StoredFIs, Candidates);
+ }
+
+ // Gather the registers read / clobbered by the terminator.
+ BitVector TermRegs(NumRegs);
+ MachineBasicBlock::iterator TI = Preheader->getFirstTerminator();
+ if (TI != Preheader->end()) {
+ for (const MachineOperand &MO : TI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ TermRegs.set(*AI);
+ }
+ }
+
+ // Now evaluate whether the potential candidates qualify.
+ // 1. Check if the candidate defined register is defined by another
+ // instruction in the loop.
+ // 2. If the candidate is a load from stack slot (always true for now),
+ // check if the slot is stored anywhere in the loop.
+ // 3. Make sure the candidate's def does not clobber
+ //    registers read by the terminator, and that the def itself is not
+ //    clobbered by the terminator.
+ for (CandidateInfo &Candidate : Candidates) {
+ if (Candidate.FI != INT_MIN &&
+ StoredFIs.count(Candidate.FI))
+ continue;
+
+ unsigned Def = Candidate.Def;
+ if (!PhysRegClobbers.test(Def) && !TermRegs.test(Def)) {
+ bool Safe = true;
+ MachineInstr *MI = Candidate.MI;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || MO.isDef() || !MO.getReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (PhysRegDefs.test(Reg) ||
+ PhysRegClobbers.test(Reg)) {
+ // If it's using a non-loop-invariant register, then it's obviously
+ // not safe to hoist.
+ Safe = false;
+ break;
+ }
+ }
+ if (Safe)
+ HoistPostRA(MI, Candidate.Def);
+ }
+ }
+}
+
+/// Add register 'Reg' to the livein sets of BBs in the current loop, and make
+/// sure it is not killed by any instructions in the loop.
+void MachineLICM::AddToLiveIns(unsigned Reg) {
+ const std::vector<MachineBasicBlock *> &Blocks = CurLoop->getBlocks();
+ for (MachineBasicBlock *BB : Blocks) {
+ if (!BB->isLiveIn(Reg))
+ BB->addLiveIn(Reg);
+ for (MachineInstr &MI : *BB) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg() || MO.isDef()) continue;
+ if (MO.getReg() == Reg || TRI->isSuperRegister(Reg, MO.getReg()))
+ MO.setIsKill(false);
+ }
+ }
+ }
+}
+
+/// When an instruction that only uses loop-invariant operands is found to be
+/// safe to hoist, this function is called to do the dirty work.
+void MachineLICM::HoistPostRA(MachineInstr *MI, unsigned Def) {
+ MachineBasicBlock *Preheader = getCurPreheader();
+
+ // Now move the instructions to the predecessor, inserting it before any
+ // terminator instructions.
+ DEBUG(dbgs() << "Hoisting to BB#" << Preheader->getNumber() << " from BB#"
+ << MI->getParent()->getNumber() << ": " << *MI);
+
+ // Splice the instruction to the preheader.
+ MachineBasicBlock *MBB = MI->getParent();
+ Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
+
+ // Add register to livein list to all the BBs in the current loop since a
+ // loop invariant must be kept live throughout the whole loop. This is
+ // important to ensure later passes do not scavenge the def register.
+ AddToLiveIns(Def);
+
+ ++NumPostRAHoisted;
+ Changed = true;
+}
+
+/// Check if this mbb is guaranteed to execute. If not then a load from this mbb
+/// may not be safe to hoist.
+bool MachineLICM::IsGuaranteedToExecute(MachineBasicBlock *BB) {
+ if (SpeculationState != SpeculateUnknown)
+ return SpeculationState == SpeculateFalse;
+
+ if (BB != CurLoop->getHeader()) {
+ // Check loop exiting blocks.
+ SmallVector<MachineBasicBlock*, 8> CurrentLoopExitingBlocks;
+ CurLoop->getExitingBlocks(CurrentLoopExitingBlocks);
+ for (MachineBasicBlock *CurrentLoopExitingBlock : CurrentLoopExitingBlocks)
+ if (!DT->dominates(BB, CurrentLoopExitingBlock)) {
+ SpeculationState = SpeculateTrue;
+ return false;
+ }
+ }
+
+ SpeculationState = SpeculateFalse;
+ return true;
+}
+
+void MachineLICM::EnterScope(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Entering BB#" << MBB->getNumber() << '\n');
+
+ // Remember livein register pressure.
+ BackTrace.push_back(RegPressure);
+}
+
+void MachineLICM::ExitScope(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Exiting BB#" << MBB->getNumber() << '\n');
+ BackTrace.pop_back();
+}
+
+/// Destroy the scope for the MBB that corresponds to the given dominator tree
+/// node if it's a leaf or all of its children are done. Walk up the dominator
+/// tree to destroy ancestors which are now done.
+void MachineLICM::ExitScopeIfDone(MachineDomTreeNode *Node,
+ DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren,
+ DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) {
+ if (OpenChildren[Node])
+ return;
+
+ // Pop scope.
+ ExitScope(Node->getBlock());
+
+  // Now traverse upwards to pop ancestors whose children are all done.
+ while (MachineDomTreeNode *Parent = ParentMap[Node]) {
+ unsigned Left = --OpenChildren[Parent];
+ if (Left != 0)
+ break;
+ ExitScope(Parent->getBlock());
+ Node = Parent;
+ }
+}
+
+/// Walk the specified loop in the CFG (defined by all blocks dominated by the
+/// specified header block, and that are in the current loop) in depth first
+/// order w.r.t the DominatorTree. This allows us to visit definitions before
+/// uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void MachineLICM::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
+ MachineBasicBlock *Preheader = getCurPreheader();
+ if (!Preheader)
+ return;
+
+ SmallVector<MachineDomTreeNode*, 32> Scopes;
+ SmallVector<MachineDomTreeNode*, 8> WorkList;
+ DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap;
+ DenseMap<MachineDomTreeNode*, unsigned> OpenChildren;
+
+ // Perform a DFS walk to determine the order of visit.
+ WorkList.push_back(HeaderN);
+ while (!WorkList.empty()) {
+ MachineDomTreeNode *Node = WorkList.pop_back_val();
+ assert(Node && "Null dominator tree node?");
+ MachineBasicBlock *BB = Node->getBlock();
+
+ // If the header of the loop containing this basic block is a landing pad,
+ // then don't try to hoist instructions out of this loop.
+ const MachineLoop *ML = MLI->getLoopFor(BB);
+ if (ML && ML->getHeader()->isEHPad())
+ continue;
+
+ // If this subregion is not in the top level loop at all, exit.
+ if (!CurLoop->contains(BB))
+ continue;
+
+ Scopes.push_back(Node);
+ const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
+ unsigned NumChildren = Children.size();
+
+ // Don't hoist things out of a large switch statement. This often causes
+ // code to be hoisted that wasn't going to be executed, and increases
+ // register pressure in a situation where it's likely to matter.
+ if (BB->succ_size() >= 25)
+ NumChildren = 0;
+
+ OpenChildren[Node] = NumChildren;
+ // Add children in reverse order as then the next popped worklist node is
+ // the first child of this node. This means we ultimately traverse the
+ // DOM tree in exactly the same order as if we'd recursed.
+ for (int i = (int)NumChildren-1; i >= 0; --i) {
+ MachineDomTreeNode *Child = Children[i];
+ ParentMap[Child] = Node;
+ WorkList.push_back(Child);
+ }
+ }
+
+ if (Scopes.size() == 0)
+ return;
+
+ // Compute registers which are livein into the loop headers.
+ RegSeen.clear();
+ BackTrace.clear();
+ InitRegPressure(Preheader);
+
+ // Now perform LICM.
+ for (MachineDomTreeNode *Node : Scopes) {
+ MachineBasicBlock *MBB = Node->getBlock();
+
+ EnterScope(MBB);
+
+ // Process the block
+ SpeculationState = SpeculateUnknown;
+ for (MachineBasicBlock::iterator
+ MII = MBB->begin(), E = MBB->end(); MII != E; ) {
+ MachineBasicBlock::iterator NextMII = MII; ++NextMII;
+ MachineInstr *MI = &*MII;
+ if (!Hoist(MI, Preheader))
+ UpdateRegPressure(MI);
+ MII = NextMII;
+ }
+
+ // If it's a leaf node, it's done. Traverse upwards to pop ancestors.
+ ExitScopeIfDone(Node, OpenChildren, ParentMap);
+ }
+}
+
+/// Sink instructions into loops if profitable. This especially tries to prevent
+/// register spills caused by register pressure if there is little to no
+/// overhead moving instructions into loops.
+void MachineLICM::SinkIntoLoop() {
+ MachineBasicBlock *Preheader = getCurPreheader();
+ if (!Preheader)
+ return;
+
+ SmallVector<MachineInstr *, 8> Candidates;
+ for (MachineBasicBlock::instr_iterator I = Preheader->instr_begin();
+ I != Preheader->instr_end(); ++I) {
+    // We need to ensure that we can safely move this instruction into the loop.
+    // As such, it must not have side-effects, e.g. the way a call does.
+ if (IsLoopInvariantInst(*I) && !HasLoopPHIUse(&*I))
+ Candidates.push_back(&*I);
+ }
+
+ for (MachineInstr *I : Candidates) {
+ const MachineOperand &MO = I->getOperand(0);
+ if (!MO.isDef() || !MO.isReg() || !MO.getReg())
+ continue;
+ if (!MRI->hasOneDef(MO.getReg()))
+ continue;
+ bool CanSink = true;
+ MachineBasicBlock *B = nullptr;
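+    // Walk all uses of the defined register: sinking is only attempted if
+    // every use is a copy, and B accumulates the nearest common dominator of
+    // the blocks containing those uses.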
+ for (MachineInstr &MI : MRI->use_instructions(MO.getReg())) {
+ // FIXME: Come up with a proper cost model that estimates whether sinking
+ // the instruction (and thus possibly executing it on every loop
+ // iteration) is more expensive than a register.
+      // For now, assume that copies are cheap and thus almost always worth it.
+ if (!MI.isCopy()) {
+ CanSink = false;
+ break;
+ }
+ if (!B) {
+ B = MI.getParent();
+ continue;
+ }
+ B = DT->findNearestCommonDominator(B, MI.getParent());
+ if (!B) {
+ CanSink = false;
+ break;
+ }
+ }
+ if (!CanSink || !B || B == Preheader)
+ continue;
+ B->splice(B->getFirstNonPHI(), Preheader, I);
+ }
+}
+
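+/// Return true if this use operand reads the last use of the register value,
+/// i.e. it is marked as a kill or it is the register's only non-debug use.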
+static bool isOperandKill(const MachineOperand &MO, MachineRegisterInfo *MRI) {
+ return MO.isKill() || MRI->hasOneNonDBGUse(MO.getReg());
+}
+
+/// Find all virtual register references that are liveout of the preheader to
+/// initialize the starting "register pressure". Note this does not count live
+/// through (livein but not used) registers.
+void MachineLICM::InitRegPressure(MachineBasicBlock *BB) {
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+
+ // If the preheader has only a single predecessor and it ends with a
+ // fallthrough or an unconditional branch, then scan its predecessor for live
+ // defs as well. This happens whenever the preheader is created by splitting
+ // the critical edge from the loop predecessor to the loop header.
+ if (BB->pred_size() == 1) {
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*BB, TBB, FBB, Cond, false) && Cond.empty())
+ InitRegPressure(*BB->pred_begin());
+ }
+
+ for (const MachineInstr &MI : *BB)
+ UpdateRegPressure(&MI, /*ConsiderUnseenAsDef=*/true);
+}
+
+/// Update estimate of register pressure after the specified instruction.
+void MachineLICM::UpdateRegPressure(const MachineInstr *MI,
+ bool ConsiderUnseenAsDef) {
+ auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/true, ConsiderUnseenAsDef);
+ for (const auto &RPIdAndCost : Cost) {
+ unsigned Class = RPIdAndCost.first;
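+    // A negative cost (a last use) can only lower the pressure estimate down
+    // to zero; clamp rather than letting the unsigned counter wrap around.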
+ if (static_cast<int>(RegPressure[Class]) < -RPIdAndCost.second)
+ RegPressure[Class] = 0;
+ else
+ RegPressure[Class] += RPIdAndCost.second;
+ }
+}
+
+/// Calculate the additional register pressure that the registers used in MI
+/// cause.
+///
+/// If 'ConsiderSeen' is true, updates 'RegSeen' and uses the information to
+/// figure out which usages are live-ins.
+/// FIXME: Figure out a way to consider 'RegSeen' from all code paths.
+DenseMap<unsigned, int>
+MachineLICM::calcRegisterCost(const MachineInstr *MI, bool ConsiderSeen,
+ bool ConsiderUnseenAsDef) {
+ DenseMap<unsigned, int> Cost;
+ if (MI->isImplicitDef())
+ return Cost;
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // FIXME: It seems bad to use RegSeen only for some of these calculations.
+ bool isNew = ConsiderSeen ? RegSeen.insert(Reg).second : false;
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+
+ RegClassWeight W = TRI->getRegClassWeight(RC);
+ int RCCost = 0;
+ if (MO.isDef())
+ RCCost = W.RegWeight;
+ else {
+ bool isKill = isOperandKill(MO, MRI);
+ if (isNew && !isKill && ConsiderUnseenAsDef)
+ // Haven't seen this, it must be a livein.
+ RCCost = W.RegWeight;
+ else if (!isNew && isKill)
+ RCCost = -W.RegWeight;
+ }
+ if (RCCost == 0)
+ continue;
+ const int *PS = TRI->getRegClassPressureSets(RC);
+ for (; *PS != -1; ++PS) {
+ if (Cost.find(*PS) == Cost.end())
+ Cost[*PS] = RCCost;
+ else
+ Cost[*PS] += RCCost;
+ }
+ }
+ return Cost;
+}
+
+/// Return true if this machine instruction loads from the global offset table
+/// or the constant pool.
+static bool mayLoadFromGOTOrConstantPool(MachineInstr &MI) {
+ assert (MI.mayLoad() && "Expected MI that loads!");
+
+ // If we lost memory operands, conservatively assume that the instruction
+  // reads from everything.
+ if (MI.memoperands_empty())
+ return true;
+
+ for (MachineMemOperand *MemOp : MI.memoperands())
+ if (const PseudoSourceValue *PSV = MemOp->getPseudoValue())
+ if (PSV->isGOT() || PSV->isConstantPool())
+ return true;
+
+ return false;
+}
+
+/// Returns true if the instruction may be a suitable candidate for LICM.
+/// e.g. If the instruction is a call, then it's obviously not safe to hoist it.
+bool MachineLICM::IsLICMCandidate(MachineInstr &I) {
+ // Check if it's safe to move the instruction.
+ bool DontMoveAcrossStore = true;
+ if (!I.isSafeToMove(AA, DontMoveAcrossStore))
+ return false;
+
+  // If it is a load then check if it is guaranteed to execute by making sure
+  // that it dominates all exiting blocks. If it doesn't, then there is a path
+  // out of the loop which does not execute this load, so we can't hoist it.
+  // Loads from constant memory are not safe to speculate all the time, for
+  // example an indexed load from a jump table.
+ // Stores and side effects are already checked by isSafeToMove.
+ if (I.mayLoad() && !mayLoadFromGOTOrConstantPool(I) &&
+ !IsGuaranteedToExecute(I.getParent()))
+ return false;
+
+ return true;
+}
+
+/// Returns true if the instruction is loop invariant.
+/// I.e., all virtual register operands are defined outside of the loop,
+/// physical registers aren't accessed explicitly, and there are no side
+/// effects that aren't captured by the operands or other flags.
+///
+bool MachineLICM::IsLoopInvariantInst(MachineInstr &I) {
+ if (!IsLICMCandidate(I))
+ return false;
+
+ // The instruction is loop invariant if all of its operands are.
+ for (const MachineOperand &MO : I.operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ // Don't hoist an instruction that uses or defines a physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ if (!MRI->isConstantPhysReg(Reg, *I.getParent()->getParent()))
+ return false;
+ // Otherwise it's safe to move.
+ continue;
+ } else if (!MO.isDead()) {
+ // A def that isn't dead. We can't move it.
+ return false;
+ } else if (CurLoop->getHeader()->isLiveIn(Reg)) {
+ // If the reg is live into the loop, we can't hoist an instruction
+ // which would clobber it.
+ return false;
+ }
+ }
+
+ if (!MO.isUse())
+ continue;
+
+ assert(MRI->getVRegDef(Reg) &&
+ "Machine instr not mapped for this vreg?!");
+
+ // If the loop contains the definition of an operand, then the instruction
+ // isn't loop invariant.
+ if (CurLoop->contains(MRI->getVRegDef(Reg)))
+ return false;
+ }
+
+ // If we got this far, the instruction is loop invariant!
+ return true;
+}
+
+
+/// Return true if the specified instruction is used by a phi node and hoisting
+/// it could cause a copy to be inserted.
+bool MachineLICM::HasLoopPHIUse(const MachineInstr *MI) const {
+ SmallVector<const MachineInstr*, 8> Work(1, MI);
+ do {
+ MI = Work.pop_back_val();
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
+ // A PHI may cause a copy to be inserted.
+ if (UseMI.isPHI()) {
+ // A PHI inside the loop causes a copy because the live range of Reg is
+ // extended across the PHI.
+ if (CurLoop->contains(&UseMI))
+ return true;
+ // A PHI in an exit block can cause a copy to be inserted if the PHI
+ // has multiple predecessors in the loop with different values.
+ // For now, approximate by rejecting all exit blocks.
+ if (isExitBlock(UseMI.getParent()))
+ return true;
+ continue;
+ }
+ // Look past copies as well.
+ if (UseMI.isCopy() && CurLoop->contains(&UseMI))
+ Work.push_back(&UseMI);
+ }
+ }
+ } while (!Work.empty());
+ return false;
+}
+
+/// Compute operand latency between a def of 'Reg' and a use in the current
+/// loop, and return true if the target considers it high.
+bool MachineLICM::HasHighOperandLatency(MachineInstr &MI,
+ unsigned DefIdx, unsigned Reg) const {
+ if (MRI->use_nodbg_empty(Reg))
+ return false;
+
+ for (MachineInstr &UseMI : MRI->use_nodbg_instructions(Reg)) {
+ if (UseMI.isCopyLike())
+ continue;
+ if (!CurLoop->contains(UseMI.getParent()))
+ continue;
+ for (unsigned i = 0, e = UseMI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = UseMI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MOReg != Reg)
+ continue;
+
+ if (TII->hasHighOperandLatency(SchedModel, MRI, MI, DefIdx, UseMI, i))
+ return true;
+ }
+
+    // Only look at the first in-loop use.
+ break;
+ }
+
+ return false;
+}
+
+/// Return true if the instruction is marked "cheap" or the operand latency
+/// between its def and a use is one or less.
+bool MachineLICM::IsCheapInstruction(MachineInstr &MI) const {
+ if (TII->isAsCheapAsAMove(MI) || MI.isCopyLike())
+ return true;
+
+ bool isCheap = false;
+ unsigned NumDefs = MI.getDesc().getNumDefs();
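+  // Consider the instruction cheap only if every virtual register it defines
+  // has a low definition latency.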
+ for (unsigned i = 0, e = MI.getNumOperands(); NumDefs && i != e; ++i) {
+ MachineOperand &DefMO = MI.getOperand(i);
+ if (!DefMO.isReg() || !DefMO.isDef())
+ continue;
+ --NumDefs;
+ unsigned Reg = DefMO.getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+
+ if (!TII->hasLowDefLatency(SchedModel, MI, i))
+ return false;
+ isCheap = true;
+ }
+
+ return isCheap;
+}
+
+/// Visit BBs from header to current BB, and check whether hoisting an
+/// instruction with the given set of register pressure costs can cause high
+/// register pressure.
+bool MachineLICM::CanCauseHighRegPressure(const DenseMap<unsigned, int>& Cost,
+ bool CheapInstr) {
+ for (const auto &RPIdAndCost : Cost) {
+ if (RPIdAndCost.second <= 0)
+ continue;
+
+ unsigned Class = RPIdAndCost.first;
+ int Limit = RegLimit[Class];
+
+ // Don't hoist cheap instructions if they would increase register pressure,
+ // even if we're under the limit.
+ if (CheapInstr && !HoistCheapInsts)
+ return true;
+
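+    // Check the recorded pressure of every block on the path from the loop
+    // header to the current block against the target's limit for this
+    // pressure set.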
+ for (const auto &RP : BackTrace)
+ if (static_cast<int>(RP[Class]) + RPIdAndCost.second >= Limit)
+ return true;
+ }
+
+ return false;
+}
+
+/// Traverse the back trace from header to the current block and update their
+/// register pressures to reflect the effect of hoisting MI from the current
+/// block to the preheader.
+void MachineLICM::UpdateBackTraceRegPressure(const MachineInstr *MI) {
+ // First compute the 'cost' of the instruction, i.e. its contribution
+ // to register pressure.
+ auto Cost = calcRegisterCost(MI, /*ConsiderSeen=*/false,
+ /*ConsiderUnseenAsDef=*/false);
+
+ // Update register pressure of blocks from loop header to current block.
+ for (auto &RP : BackTrace)
+ for (const auto &RPIdAndCost : Cost)
+ RP[RPIdAndCost.first] += RPIdAndCost.second;
+}
+
+/// Return true if it is potentially profitable to hoist the given loop
+/// invariant.
+bool MachineLICM::IsProfitableToHoist(MachineInstr &MI) {
+ if (MI.isImplicitDef())
+ return true;
+
+ // Besides removing computation from the loop, hoisting an instruction has
+ // these effects:
+ //
+ // - The value defined by the instruction becomes live across the entire
+ // loop. This increases register pressure in the loop.
+ //
+ // - If the value is used by a PHI in the loop, a copy will be required for
+ // lowering the PHI after extending the live range.
+ //
+ // - When hoisting the last use of a value in the loop, that value no longer
+ // needs to be live in the loop. This lowers register pressure in the loop.
+
+ bool CheapInstr = IsCheapInstruction(MI);
+ bool CreatesCopy = HasLoopPHIUse(&MI);
+
+ // Don't hoist a cheap instruction if it would create a copy in the loop.
+ if (CheapInstr && CreatesCopy) {
+ DEBUG(dbgs() << "Won't hoist cheap instr with loop PHI use: " << MI);
+ return false;
+ }
+
+ // Rematerializable instructions should always be hoisted since the register
+ // allocator can just pull them down again when needed.
+ if (TII->isTriviallyReMaterializable(MI, AA))
+ return true;
+
+ // FIXME: If there are long latency loop-invariant instructions inside the
+ // loop at this point, why didn't the optimizer's LICM hoist them?
+ for (unsigned i = 0, e = MI.getDesc().getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (MO.isDef() && HasHighOperandLatency(MI, i, Reg)) {
+ DEBUG(dbgs() << "Hoist High Latency: " << MI);
+ ++NumHighLatency;
+ return true;
+ }
+ }
+
+  // Estimate register pressure to determine whether to LICM the instruction.
+  // In a low register pressure situation, we can be more aggressive about
+  // hoisting. We also favor hoisting long latency instructions even in a
+  // moderately high pressure situation.
+ // Cheap instructions will only be hoisted if they don't increase register
+ // pressure at all.
+ auto Cost = calcRegisterCost(&MI, /*ConsiderSeen=*/false,
+ /*ConsiderUnseenAsDef=*/false);
+
+ // Visit BBs from header to current BB, if hoisting this doesn't cause
+ // high register pressure, then it's safe to proceed.
+ if (!CanCauseHighRegPressure(Cost, CheapInstr)) {
+ DEBUG(dbgs() << "Hoist non-reg-pressure: " << MI);
+ ++NumLowRP;
+ return true;
+ }
+
+ // Don't risk increasing register pressure if it would create copies.
+ if (CreatesCopy) {
+ DEBUG(dbgs() << "Won't hoist instr with loop PHI use: " << MI);
+ return false;
+ }
+
+  // Do not "speculate" in a high register pressure situation. If an
+ // instruction is not guaranteed to be executed in the loop, it's best to be
+ // conservative.
+ if (AvoidSpeculation &&
+ (!IsGuaranteedToExecute(MI.getParent()) && !MayCSE(&MI))) {
+ DEBUG(dbgs() << "Won't speculate: " << MI);
+ return false;
+ }
+
+  // In a high register pressure situation, only hoist if the instruction is
+  // going to be remat'ed.
+ if (!TII->isTriviallyReMaterializable(MI, AA) && !MI.isInvariantLoad(AA)) {
+ DEBUG(dbgs() << "Can't remat / high reg-pressure: " << MI);
+ return false;
+ }
+
+ return true;
+}
+
+/// Unfold a load from the given MachineInstr if the load itself could be
+/// hoisted. Return the unfolded and hoistable load, or null if the load
+/// couldn't be unfolded or if it wouldn't be hoistable.
+MachineInstr *MachineLICM::ExtractHoistableLoad(MachineInstr *MI) {
+ // Don't unfold simple loads.
+ if (MI->canFoldAsLoad())
+ return nullptr;
+
+ // If not, we may be able to unfold a load and hoist that.
+ // First test whether the instruction is loading from an amenable
+ // memory location.
+ if (!MI->isInvariantLoad(AA))
+ return nullptr;
+
+ // Next determine the register class for a temporary register.
+ unsigned LoadRegIndex;
+ unsigned NewOpc =
+ TII->getOpcodeAfterMemoryUnfold(MI->getOpcode(),
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false,
+ &LoadRegIndex);
+ if (NewOpc == 0) return nullptr;
+ const MCInstrDesc &MID = TII->get(NewOpc);
+ MachineFunction &MF = *MI->getParent()->getParent();
+ const TargetRegisterClass *RC = TII->getRegClass(MID, LoadRegIndex, TRI, MF);
+ // Ok, we're unfolding. Create a temporary register and do the unfold.
+ unsigned Reg = MRI->createVirtualRegister(RC);
+
+ SmallVector<MachineInstr *, 2> NewMIs;
+ bool Success = TII->unfoldMemoryOperand(MF, *MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs);
+ (void)Success;
+ assert(Success &&
+ "unfoldMemoryOperand failed when getOpcodeAfterMemoryUnfold "
+ "succeeded!");
+ assert(NewMIs.size() == 2 &&
+ "Unfolded a load into multiple instructions!");
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator Pos = MI;
+ MBB->insert(Pos, NewMIs[0]);
+ MBB->insert(Pos, NewMIs[1]);
+ // If unfolding produced a load that wasn't loop-invariant or profitable to
+ // hoist, discard the new instructions and bail.
+ if (!IsLoopInvariantInst(*NewMIs[0]) || !IsProfitableToHoist(*NewMIs[0])) {
+ NewMIs[0]->eraseFromParent();
+ NewMIs[1]->eraseFromParent();
+ return nullptr;
+ }
+
+ // Update register pressure for the unfolded instruction.
+ UpdateRegPressure(NewMIs[1]);
+
+ // Otherwise we successfully unfolded a load that we can hoist.
+ MI->eraseFromParent();
+ return NewMIs[0];
+}
+
+/// Initialize the CSE map with instructions that are in the current loop
+/// preheader that may become duplicates of instructions that are hoisted
+/// out of the loop.
+void MachineLICM::InitCSEMap(MachineBasicBlock *BB) {
+ for (MachineInstr &MI : *BB)
+ CSEMap[MI.getOpcode()].push_back(&MI);
+}
+
+/// Find an instruction among PrevMIs that is a duplicate of MI.
+/// Return that instruction if it's found.
+const MachineInstr*
+MachineLICM::LookForDuplicate(const MachineInstr *MI,
+ std::vector<const MachineInstr*> &PrevMIs) {
+ for (const MachineInstr *PrevMI : PrevMIs)
+ if (TII->produceSameValue(*MI, *PrevMI, (PreRegAlloc ? MRI : nullptr)))
+ return PrevMI;
+
+ return nullptr;
+}
+
+/// Given a LICM'ed instruction, look for an instruction in the preheader that
+/// computes the same value. If one is found, replace all uses of the
+/// instruction's defs with the corresponding defs of the existing instruction
+/// rather than hoisting the instruction to the preheader.
+bool MachineLICM::EliminateCSE(MachineInstr *MI,
+ DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator &CI) {
+ // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+ // the undef property onto uses.
+ if (CI == CSEMap.end() || MI->isImplicitDef())
+ return false;
+
+ if (const MachineInstr *Dup = LookForDuplicate(MI, CI->second)) {
+ DEBUG(dbgs() << "CSEing " << *MI << " with " << *Dup);
+
+ // Replace virtual registers defined by MI by their counterparts defined
+ // by Dup.
+ SmallVector<unsigned, 2> Defs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+
+ // Physical registers may not differ here.
+ assert((!MO.isReg() || MO.getReg() == 0 ||
+ !TargetRegisterInfo::isPhysicalRegister(MO.getReg()) ||
+ MO.getReg() == Dup->getOperand(i).getReg()) &&
+ "Instructions with different phys regs are not identical!");
+
+ if (MO.isReg() && MO.isDef() &&
+ !TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ Defs.push_back(i);
+ }
+
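+    // Constrain the register class of each of Dup's defs to be compatible
+    // with the corresponding def of MI; if any constraint fails, restore the
+    // original classes and give up on CSE.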
+ SmallVector<const TargetRegisterClass*, 2> OrigRCs;
+ for (unsigned i = 0, e = Defs.size(); i != e; ++i) {
+ unsigned Idx = Defs[i];
+ unsigned Reg = MI->getOperand(Idx).getReg();
+ unsigned DupReg = Dup->getOperand(Idx).getReg();
+ OrigRCs.push_back(MRI->getRegClass(DupReg));
+
+ if (!MRI->constrainRegClass(DupReg, MRI->getRegClass(Reg))) {
+        // Restore the old RCs if there is more than one def.
+ for (unsigned j = 0; j != i; ++j)
+ MRI->setRegClass(Dup->getOperand(Defs[j]).getReg(), OrigRCs[j]);
+ return false;
+ }
+ }
+
+ for (unsigned Idx : Defs) {
+ unsigned Reg = MI->getOperand(Idx).getReg();
+ unsigned DupReg = Dup->getOperand(Idx).getReg();
+ MRI->replaceRegWith(Reg, DupReg);
+ MRI->clearKillFlags(DupReg);
+ }
+
+ MI->eraseFromParent();
+ ++NumCSEed;
+ return true;
+ }
+ return false;
+}
+
+/// Return true if the given instruction will be CSE'd if it's hoisted out of
+/// the loop.
+bool MachineLICM::MayCSE(MachineInstr *MI) {
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+ CI = CSEMap.find(Opcode);
+ // Do not CSE implicit_def so ProcessImplicitDefs can properly propagate
+ // the undef property onto uses.
+ if (CI == CSEMap.end() || MI->isImplicitDef())
+ return false;
+
+ return LookForDuplicate(MI, CI->second) != nullptr;
+}
+
+/// When an instruction is found to use only loop invariant operands
+/// that are safe to hoist, this function is called to do the dirty work.
+/// It returns true if the instruction is hoisted.
+bool MachineLICM::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
+ // First check whether we should hoist this instruction.
+ if (!IsLoopInvariantInst(*MI) || !IsProfitableToHoist(*MI)) {
+ // If not, try unfolding a hoistable load.
+ MI = ExtractHoistableLoad(MI);
+ if (!MI) return false;
+ }
+
+  // Now move the instruction to the preheader, inserting it before any
+  // terminator instructions.
+ DEBUG({
+ dbgs() << "Hoisting " << *MI;
+ if (MI->getParent()->getBasicBlock())
+ dbgs() << " from BB#" << MI->getParent()->getNumber();
+ if (Preheader->getBasicBlock())
+ dbgs() << " to BB#" << Preheader->getNumber();
+ dbgs() << "\n";
+ });
+
+ // If this is the first instruction being hoisted to the preheader,
+ // initialize the CSE map with potential common expressions.
+ if (FirstInLoop) {
+ InitCSEMap(Preheader);
+ FirstInLoop = false;
+ }
+
+ // Look for opportunity to CSE the hoisted instruction.
+ unsigned Opcode = MI->getOpcode();
+ DenseMap<unsigned, std::vector<const MachineInstr*> >::iterator
+ CI = CSEMap.find(Opcode);
+ if (!EliminateCSE(MI, CI)) {
+ // Otherwise, splice the instruction to the preheader.
+ Preheader->splice(Preheader->getFirstTerminator(),MI->getParent(),MI);
+
+ // Update register pressure for BBs from header to this block.
+ UpdateBackTraceRegPressure(MI);
+
+ // Clear the kill flags of any register this instruction defines,
+ // since they may need to be live throughout the entire loop
+ // rather than just live for part of it.
+ for (MachineOperand &MO : MI->operands())
+ if (MO.isReg() && MO.isDef() && !MO.isDead())
+ MRI->clearKillFlags(MO.getReg());
+
+ // Add to the CSE map.
+ if (CI != CSEMap.end())
+ CI->second.push_back(MI);
+ else
+ CSEMap[Opcode].push_back(MI);
+ }
+
+ ++NumHoisted;
+ Changed = true;
+
+ return true;
+}
+
+/// Get the preheader for the current loop, splitting a critical edge if needed.
+MachineBasicBlock *MachineLICM::getCurPreheader() {
+ // Determine the block to which to hoist instructions. If we can't find a
+ // suitable loop predecessor, we can't do any hoisting.
+
+ // If we've tried to get a preheader and failed, don't try again.
+ if (CurPreheader == reinterpret_cast<MachineBasicBlock *>(-1))
+ return nullptr;
+
+ if (!CurPreheader) {
+ CurPreheader = CurLoop->getLoopPreheader();
+ if (!CurPreheader) {
+ MachineBasicBlock *Pred = CurLoop->getLoopPredecessor();
+ if (!Pred) {
+ CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+ return nullptr;
+ }
+
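+      // No preheader exists; create one by splitting the critical edge from
+      // the loop predecessor to the loop header.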
+ CurPreheader = Pred->SplitCriticalEdge(CurLoop->getHeader(), *this);
+ if (!CurPreheader) {
+ CurPreheader = reinterpret_cast<MachineBasicBlock *>(-1);
+ return nullptr;
+ }
+ }
+ }
+ return CurPreheader;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
new file mode 100644
index 000000000000..376f78fda1c4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineLoopInfo.cpp
@@ -0,0 +1,84 @@
+//===- MachineLoopInfo.cpp - Natural Loop Calculator ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the MachineLoopInfo class that is used to identify natural
+// loops and determine the loop depth of various nodes of the CFG. Note that
+// the loops identified may actually be several natural loops that share the
+// same header node... not just a single natural loop.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/Analysis/LoopInfoImpl.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+// Explicitly instantiate methods in LoopInfoImpl.h for MI-level Loops.
+template class llvm::LoopBase<MachineBasicBlock, MachineLoop>;
+template class llvm::LoopInfoBase<MachineBasicBlock, MachineLoop>;
+
+char MachineLoopInfo::ID = 0;
+INITIALIZE_PASS_BEGIN(MachineLoopInfo, "machine-loops",
+ "Machine Natural Loop Construction", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(MachineLoopInfo, "machine-loops",
+ "Machine Natural Loop Construction", true, true)
+
+char &llvm::MachineLoopInfoID = MachineLoopInfo::ID;
+
+bool MachineLoopInfo::runOnMachineFunction(MachineFunction &) {
+ releaseMemory();
+ LI.analyze(getAnalysis<MachineDominatorTree>().getBase());
+ return false;
+}
+
+void MachineLoopInfo::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineBasicBlock *MachineLoop::getTopBlock() {
+ MachineBasicBlock *TopMBB = getHeader();
+ MachineFunction::iterator Begin = TopMBB->getParent()->begin();
+ if (TopMBB->getIterator() != Begin) {
+ MachineBasicBlock *PriorMBB = &*std::prev(TopMBB->getIterator());
+ while (contains(PriorMBB)) {
+ TopMBB = PriorMBB;
+ if (TopMBB->getIterator() == Begin)
+ break;
+ PriorMBB = &*std::prev(TopMBB->getIterator());
+ }
+ }
+ return TopMBB;
+}
+
+MachineBasicBlock *MachineLoop::getBottomBlock() {
+ MachineBasicBlock *BotMBB = getHeader();
+ MachineFunction::iterator End = BotMBB->getParent()->end();
+ if (BotMBB->getIterator() != std::prev(End)) {
+ MachineBasicBlock *NextMBB = &*std::next(BotMBB->getIterator());
+ while (contains(NextMBB)) {
+ BotMBB = NextMBB;
+      if (BotMBB == &*std::prev(End))
+ break;
+ NextMBB = &*std::next(BotMBB->getIterator());
+ }
+ }
+ return BotMBB;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineLoop::dump() const {
+ print(dbgs());
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
new file mode 100644
index 000000000000..244e3fbc4e8f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -0,0 +1,463 @@
+//===-- llvm/CodeGen/MachineModuleInfo.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/ADT/PointerUnion.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
+using namespace llvm;
+using namespace llvm::dwarf;
+
+// Handle the pass registration stuff necessary to use DataLayout.
+INITIALIZE_PASS(MachineModuleInfo, "machinemoduleinfo",
+ "Machine Module Information", false, false)
+char MachineModuleInfo::ID = 0;
+
+// Out of line virtual method.
+MachineModuleInfoImpl::~MachineModuleInfoImpl() {}
+
+namespace llvm {
+class MMIAddrLabelMapCallbackPtr final : CallbackVH {
+ MMIAddrLabelMap *Map;
+public:
+ MMIAddrLabelMapCallbackPtr() : Map(nullptr) {}
+ MMIAddrLabelMapCallbackPtr(Value *V) : CallbackVH(V), Map(nullptr) {}
+
+ void setPtr(BasicBlock *BB) {
+ ValueHandleBase::operator=(BB);
+ }
+
+ void setMap(MMIAddrLabelMap *map) { Map = map; }
+
+ void deleted() override;
+ void allUsesReplacedWith(Value *V2) override;
+};
+
+class MMIAddrLabelMap {
+ MCContext &Context;
+ struct AddrLabelSymEntry {
+ /// Symbols - The symbols for the label.
+ TinyPtrVector<MCSymbol *> Symbols;
+
+ Function *Fn; // The containing function of the BasicBlock.
+ unsigned Index; // The index in BBCallbacks for the BasicBlock.
+ };
+
+ DenseMap<AssertingVH<BasicBlock>, AddrLabelSymEntry> AddrLabelSymbols;
+
+ /// BBCallbacks - Callbacks for the BasicBlock's that we have entries for. We
+ /// use this so we get notified if a block is deleted or RAUWd.
+ std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;
+
+ /// DeletedAddrLabelsNeedingEmission - This is a per-function list of symbols
+ /// whose corresponding BasicBlock got deleted. These symbols need to be
+ /// emitted at some point in the file, so AsmPrinter emits them after the
+ /// function body.
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >
+ DeletedAddrLabelsNeedingEmission;
+public:
+
+ MMIAddrLabelMap(MCContext &context) : Context(context) {}
+ ~MMIAddrLabelMap() {
+ assert(DeletedAddrLabelsNeedingEmission.empty() &&
+ "Some labels for deleted blocks never got emitted");
+ }
+
+ ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(BasicBlock *BB);
+
+ void takeDeletedSymbolsForFunction(Function *F,
+ std::vector<MCSymbol*> &Result);
+
+ void UpdateForDeletedBlock(BasicBlock *BB);
+ void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
+};
+}
+
+ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
+ assert(BB->hasAddressTaken() &&
+ "Shouldn't get label for block without address taken");
+ AddrLabelSymEntry &Entry = AddrLabelSymbols[BB];
+
+ // If we already had an entry for this block, just return it.
+ if (!Entry.Symbols.empty()) {
+ assert(BB->getParent() == Entry.Fn && "Parent changed");
+ return Entry.Symbols;
+ }
+
+ // Otherwise, this is a new entry, create a new symbol for it and add an
+ // entry to BBCallbacks so we can be notified if the BB is deleted or RAUWd.
+ BBCallbacks.emplace_back(BB);
+ BBCallbacks.back().setMap(this);
+ Entry.Index = BBCallbacks.size() - 1;
+ Entry.Fn = BB->getParent();
+ Entry.Symbols.push_back(Context.createTempSymbol());
+ return Entry.Symbols;
+}
+
+/// takeDeletedSymbolsForFunction - If we have any deleted symbols for F, return
+/// them.
+void MMIAddrLabelMap::
+takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
+ DenseMap<AssertingVH<Function>, std::vector<MCSymbol*> >::iterator I =
+ DeletedAddrLabelsNeedingEmission.find(F);
+
+ // If there are no entries for the function, just return.
+ if (I == DeletedAddrLabelsNeedingEmission.end()) return;
+
+ // Otherwise, take the list.
+ std::swap(Result, I->second);
+ DeletedAddrLabelsNeedingEmission.erase(I);
+}
+
+
+void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
+ // If the block got deleted, there is no need for the symbol. If the symbol
+ // was already emitted, we can just forget about it, otherwise we need to
+ // queue it up for later emission when the function is output.
+ AddrLabelSymEntry Entry = std::move(AddrLabelSymbols[BB]);
+ AddrLabelSymbols.erase(BB);
+ assert(!Entry.Symbols.empty() && "Didn't have a symbol, why a callback?");
+ BBCallbacks[Entry.Index] = nullptr; // Clear the callback.
+
+ assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) &&
+ "Block/parent mismatch");
+
+ for (MCSymbol *Sym : Entry.Symbols) {
+ if (Sym->isDefined())
+ return;
+
+ // If the block is not yet defined, we need to emit it at the end of the
+ // function. Add the symbol to the DeletedAddrLabelsNeedingEmission list
+ // for the containing Function. Since the block is being deleted, its
+ // parent may already be removed, we have to get the function from 'Entry'.
+ DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
+ }
+}
+
+void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
+ // Get the entry for the RAUW'd block and remove it from our map.
+ AddrLabelSymEntry OldEntry = std::move(AddrLabelSymbols[Old]);
+ AddrLabelSymbols.erase(Old);
+ assert(!OldEntry.Symbols.empty() && "Didn't have a symbol, why a callback?");
+
+ AddrLabelSymEntry &NewEntry = AddrLabelSymbols[New];
+
+ // If New is not address taken, just move our symbol over to it.
+ if (NewEntry.Symbols.empty()) {
+ BBCallbacks[OldEntry.Index].setPtr(New); // Update the callback.
+ NewEntry = std::move(OldEntry); // Set New's entry.
+ return;
+ }
+
+ BBCallbacks[OldEntry.Index] = nullptr; // Update the callback.
+
+ // Otherwise, we need to add the old symbols to the new block's set.
+ NewEntry.Symbols.insert(NewEntry.Symbols.end(), OldEntry.Symbols.begin(),
+ OldEntry.Symbols.end());
+}
+
+
+void MMIAddrLabelMapCallbackPtr::deleted() {
+ Map->UpdateForDeletedBlock(cast<BasicBlock>(getValPtr()));
+}
+
+void MMIAddrLabelMapCallbackPtr::allUsesReplacedWith(Value *V2) {
+ Map->UpdateForRAUWBlock(cast<BasicBlock>(getValPtr()), cast<BasicBlock>(V2));
+}
+
+
+//===----------------------------------------------------------------------===//
+
+MachineModuleInfo::MachineModuleInfo(const MCAsmInfo &MAI,
+ const MCRegisterInfo &MRI,
+ const MCObjectFileInfo *MOFI)
+ : ImmutablePass(ID), Context(&MAI, &MRI, MOFI, nullptr, false) {
+ initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry());
+}
+
+MachineModuleInfo::MachineModuleInfo()
+ : ImmutablePass(ID), Context(nullptr, nullptr, nullptr) {
+ llvm_unreachable("This MachineModuleInfo constructor should never be called, "
+ "MMI should always be explicitly constructed by "
+ "LLVMTargetMachine");
+}
+
+MachineModuleInfo::~MachineModuleInfo() {
+}
+
+bool MachineModuleInfo::doInitialization(Module &M) {
+
+ ObjFileMMI = nullptr;
+ CurCallSite = 0;
+ CallsEHReturn = false;
+ CallsUnwindInit = false;
+ HasEHFunclets = false;
+ DbgInfoAvailable = UsesVAFloatArgument = UsesMorestackAddr = false;
+ PersonalityTypeCache = EHPersonality::Unknown;
+ AddrLabelSymbols = nullptr;
+ TheModule = nullptr;
+
+ return false;
+}
+
+bool MachineModuleInfo::doFinalization(Module &M) {
+
+ Personalities.clear();
+
+ delete AddrLabelSymbols;
+ AddrLabelSymbols = nullptr;
+
+ Context.reset();
+
+ delete ObjFileMMI;
+ ObjFileMMI = nullptr;
+
+ return false;
+}
+
+/// EndFunction - Discard function meta information.
+///
+void MachineModuleInfo::EndFunction() {
+ // Clean up frame info.
+ FrameInstructions.clear();
+
+ // Clean up exception info.
+ LandingPads.clear();
+ PersonalityTypeCache = EHPersonality::Unknown;
+ CallSiteMap.clear();
+ TypeInfos.clear();
+ FilterIds.clear();
+ FilterEnds.clear();
+ CallsEHReturn = false;
+ CallsUnwindInit = false;
+ HasEHFunclets = false;
+ VariableDbgInfos.clear();
+}
+
+//===- Address of Block Management ----------------------------------------===//
+
+/// getAddrLabelSymbolToEmit - Return the symbol to be used for the specified
+/// basic block when its address is taken. If other blocks were RAUW'd to
+/// this one, we may have to emit them as well, return the whole set.
+ArrayRef<MCSymbol *>
+MachineModuleInfo::getAddrLabelSymbolToEmit(const BasicBlock *BB) {
+ // Lazily create AddrLabelSymbols.
+ if (!AddrLabelSymbols)
+ AddrLabelSymbols = new MMIAddrLabelMap(Context);
+ return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast<BasicBlock*>(BB));
+}
+
+
+/// takeDeletedSymbolsForFunction - If the specified function has had any
+/// references to address-taken blocks generated, but the block got deleted,
+/// return the symbol now so we can emit it. This prevents emitting a
+/// reference to a symbol that has no definition.
+void MachineModuleInfo::
+takeDeletedSymbolsForFunction(const Function *F,
+ std::vector<MCSymbol*> &Result) {
+ // If no blocks have had their addresses taken, we're done.
+ if (!AddrLabelSymbols) return;
+ return AddrLabelSymbols->
+ takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
+}
+
+//===- EH -----------------------------------------------------------------===//
+
+/// getOrCreateLandingPadInfo - Find or create an LandingPadInfo for the
+/// specified MachineBasicBlock.
+LandingPadInfo &MachineModuleInfo::getOrCreateLandingPadInfo
+ (MachineBasicBlock *LandingPad) {
+ unsigned N = LandingPads.size();
+ for (unsigned i = 0; i < N; ++i) {
+ LandingPadInfo &LP = LandingPads[i];
+ if (LP.LandingPadBlock == LandingPad)
+ return LP;
+ }
+
+ LandingPads.push_back(LandingPadInfo(LandingPad));
+ return LandingPads[N];
+}
+
+/// addInvoke - Provide the begin and end labels of an invoke style call and
+/// associate it with a try landing pad block.
+void MachineModuleInfo::addInvoke(MachineBasicBlock *LandingPad,
+ MCSymbol *BeginLabel, MCSymbol *EndLabel) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.BeginLabels.push_back(BeginLabel);
+ LP.EndLabels.push_back(EndLabel);
+}
+
+/// addLandingPad - Provide the label of a try LandingPad block.
+///
+MCSymbol *MachineModuleInfo::addLandingPad(MachineBasicBlock *LandingPad) {
+ MCSymbol *LandingPadLabel = Context.createTempSymbol();
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.LandingPadLabel = LandingPadLabel;
+ return LandingPadLabel;
+}
+
+void MachineModuleInfo::addPersonality(const Function *Personality) {
+ for (unsigned i = 0; i < Personalities.size(); ++i)
+ if (Personalities[i] == Personality)
+ return;
+ Personalities.push_back(Personality);
+}
+
+/// addCatchTypeInfo - Provide the catch typeinfo for a landing pad.
+///
+void MachineModuleInfo::
+addCatchTypeInfo(MachineBasicBlock *LandingPad,
+ ArrayRef<const GlobalValue *> TyInfo) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ for (unsigned N = TyInfo.size(); N; --N)
+ LP.TypeIds.push_back(getTypeIDFor(TyInfo[N - 1]));
+}
+
+/// addFilterTypeInfo - Provide the filter typeinfo for a landing pad.
+///
+void MachineModuleInfo::
+addFilterTypeInfo(MachineBasicBlock *LandingPad,
+ ArrayRef<const GlobalValue *> TyInfo) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ std::vector<unsigned> IdsInFilter(TyInfo.size());
+ for (unsigned I = 0, E = TyInfo.size(); I != E; ++I)
+ IdsInFilter[I] = getTypeIDFor(TyInfo[I]);
+ LP.TypeIds.push_back(getFilterIDFor(IdsInFilter));
+}
+
+/// addCleanup - Add a cleanup action for a landing pad.
+///
+void MachineModuleInfo::addCleanup(MachineBasicBlock *LandingPad) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ LP.TypeIds.push_back(0);
+}
+
+void MachineModuleInfo::addSEHCatchHandler(MachineBasicBlock *LandingPad,
+ const Function *Filter,
+ const BlockAddress *RecoverBA) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ SEHHandler Handler;
+ Handler.FilterOrFinally = Filter;
+ Handler.RecoverBA = RecoverBA;
+ LP.SEHHandlers.push_back(Handler);
+}
+
+void MachineModuleInfo::addSEHCleanupHandler(MachineBasicBlock *LandingPad,
+ const Function *Cleanup) {
+ LandingPadInfo &LP = getOrCreateLandingPadInfo(LandingPad);
+ SEHHandler Handler;
+ Handler.FilterOrFinally = Cleanup;
+ Handler.RecoverBA = nullptr;
+ LP.SEHHandlers.push_back(Handler);
+}
+
+/// TidyLandingPads - Remap landing pad labels and remove any deleted landing
+/// pads.
+void MachineModuleInfo::TidyLandingPads(DenseMap<MCSymbol*, uintptr_t> *LPMap) {
+ for (unsigned i = 0; i != LandingPads.size(); ) {
+ LandingPadInfo &LandingPad = LandingPads[i];
+ if (LandingPad.LandingPadLabel &&
+ !LandingPad.LandingPadLabel->isDefined() &&
+ (!LPMap || (*LPMap)[LandingPad.LandingPadLabel] == 0))
+ LandingPad.LandingPadLabel = nullptr;
+
+    // Special case: we *should* emit LPs with null LP MBB. This indicates the
+    // "nounwind" case.
+ if (!LandingPad.LandingPadLabel && LandingPad.LandingPadBlock) {
+ LandingPads.erase(LandingPads.begin() + i);
+ continue;
+ }
+
+ for (unsigned j = 0, e = LandingPads[i].BeginLabels.size(); j != e; ++j) {
+ MCSymbol *BeginLabel = LandingPad.BeginLabels[j];
+ MCSymbol *EndLabel = LandingPad.EndLabels[j];
+ if ((BeginLabel->isDefined() ||
+ (LPMap && (*LPMap)[BeginLabel] != 0)) &&
+ (EndLabel->isDefined() ||
+ (LPMap && (*LPMap)[EndLabel] != 0))) continue;
+
+ LandingPad.BeginLabels.erase(LandingPad.BeginLabels.begin() + j);
+ LandingPad.EndLabels.erase(LandingPad.EndLabels.begin() + j);
+ --j;
+ --e;
+ }
+
+ // Remove landing pads with no try-ranges.
+ if (LandingPads[i].BeginLabels.empty()) {
+ LandingPads.erase(LandingPads.begin() + i);
+ continue;
+ }
+
+ // If there is no landing pad, ensure that the list of typeids is empty.
+ // If the only typeid is a cleanup, this is the same as having no typeids.
+ if (!LandingPad.LandingPadBlock ||
+ (LandingPad.TypeIds.size() == 1 && !LandingPad.TypeIds[0]))
+ LandingPad.TypeIds.clear();
+ ++i;
+ }
+}
+
+/// setCallSiteLandingPad - Map the landing pad's EH symbol to the call site
+/// indexes.
+void MachineModuleInfo::setCallSiteLandingPad(MCSymbol *Sym,
+ ArrayRef<unsigned> Sites) {
+ LPadToCallSiteMap[Sym].append(Sites.begin(), Sites.end());
+}
+
+/// getTypeIDFor - Return the type id for the specified typeinfo. This is
+/// function wide.
+unsigned MachineModuleInfo::getTypeIDFor(const GlobalValue *TI) {
+ for (unsigned i = 0, N = TypeInfos.size(); i != N; ++i)
+ if (TypeInfos[i] == TI) return i + 1;
+
+ TypeInfos.push_back(TI);
+ return TypeInfos.size();
+}
+
+/// getFilterIDFor - Return the filter id for the specified typeinfos. This is
+/// function wide.
+int MachineModuleInfo::getFilterIDFor(std::vector<unsigned> &TyIds) {
+ // If the new filter coincides with the tail of an existing filter, then
+ // re-use the existing filter. Folding filters more than this requires
+ // re-ordering filters and/or their elements - probably not worth it.
+ for (std::vector<unsigned>::iterator I = FilterEnds.begin(),
+ E = FilterEnds.end(); I != E; ++I) {
+ unsigned i = *I, j = TyIds.size();
+
+ while (i && j)
+ if (FilterIds[--i] != TyIds[--j])
+ goto try_next;
+
+ if (!j)
+ // The new filter coincides with range [i, end) of the existing filter.
+ return -(1 + i);
+
+try_next:;
+ }
+
+ // Add the new filter.
+ int FilterID = -(1 + FilterIds.size());
+ FilterIds.reserve(FilterIds.size() + TyIds.size() + 1);
+ FilterIds.insert(FilterIds.end(), TyIds.begin(), TyIds.end());
+ FilterEnds.push_back(FilterIds.size());
+ FilterIds.push_back(0); // terminator
+ return FilterID;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
new file mode 100644
index 000000000000..22d519e5d88f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -0,0 +1,44 @@
+//===-- llvm/CodeGen/MachineModuleInfoImpls.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements object-file format specific implementations of
+// MachineModuleInfoImpl.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/MC/MCSymbol.h"
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+// MachineModuleInfoMachO
+//===----------------------------------------------------------------------===//
+
+// Out of line virtual method.
+void MachineModuleInfoMachO::anchor() {}
+void MachineModuleInfoELF::anchor() {}
+
+static int SortSymbolPair(const void *LHS, const void *RHS) {
+ typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
+ const MCSymbol *LHSS = ((const PairTy *)LHS)->first;
+ const MCSymbol *RHSS = ((const PairTy *)RHS)->first;
+ return LHSS->getName().compare(RHSS->getName());
+}
+
+MachineModuleInfoImpl::SymbolListTy MachineModuleInfoImpl::getSortedStubs(
+ DenseMap<MCSymbol *, MachineModuleInfoImpl::StubValueTy> &Map) {
+ MachineModuleInfoImpl::SymbolListTy List(Map.begin(), Map.end());
+
+ if (!List.empty())
+ qsort(&List[0], List.size(), sizeof(List[0]), SortSymbolPair);
+
+ Map.clear();
+ return List;
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp b/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp
new file mode 100644
index 000000000000..3ee3e40b27e2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachinePassRegistry.cpp
@@ -0,0 +1,55 @@
+//===-- CodeGen/MachinePassRegistry.cpp -----------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the machine function pass registry for register allocators
+// and instruction schedulers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePassRegistry.h"
+
+using namespace llvm;
+
+void MachinePassRegistryListener::anchor() { }
+
+/// setDefault - Set the default constructor by name.
+void MachinePassRegistry::setDefault(StringRef Name) {
+ MachinePassCtor Ctor = nullptr;
+ for(MachinePassRegistryNode *R = getList(); R; R = R->getNext()) {
+ if (R->getName() == Name) {
+ Ctor = R->getCtor();
+ break;
+ }
+ }
+ assert(Ctor && "Unregistered pass name");
+ setDefault(Ctor);
+}
+
+/// Add - Adds a function pass to the registration list.
+///
+void MachinePassRegistry::Add(MachinePassRegistryNode *Node) {
+ Node->setNext(List);
+ List = Node;
+ if (Listener) Listener->NotifyAdd(Node->getName(),
+ Node->getCtor(),
+ Node->getDescription());
+}
+
+
+/// Remove - Removes a function pass from the registration list.
+///
+void MachinePassRegistry::Remove(MachinePassRegistryNode *Node) {
+ for (MachinePassRegistryNode **I = &List; *I; I = (*I)->getNextAddress()) {
+ if (*I == Node) {
+ if (Listener) Listener->NotifyRemove(Node->getName());
+ *I = (*I)->getNext();
+ break;
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
new file mode 100644
index 000000000000..c3f6e9249e7d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachinePostDominators.cpp
@@ -0,0 +1,55 @@
+//===- MachinePostDominators.cpp -Machine Post Dominator Calculation ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// post dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+using namespace llvm;
+
+char MachinePostDominatorTree::ID = 0;
+
+// Declare initializeMachinePostDominatorTreePass.
+INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
+ "MachinePostDominator Tree Construction", true, true)
+
+MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+ initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
+  // The "true" argument requests a post-dominator tree.
+  DT = new DominatorTreeBase<MachineBasicBlock>(true);
+}
+
+FunctionPass *
+MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+ return new MachinePostDominatorTree();
+}
+
+bool
+MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ DT->recalculate(F);
+ return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+ delete DT;
+}
+
+void
+MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void
+MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
+ DT->print(OS);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
new file mode 100644
index 000000000000..fc32183c7f63
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineRegionInfo.cpp
@@ -0,0 +1,140 @@
+
+#include "llvm/CodeGen/MachineRegionInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/RegionInfoImpl.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+#define DEBUG_TYPE "region"
+
+using namespace llvm;
+
+STATISTIC(numMachineRegions, "The # of machine regions");
+STATISTIC(numMachineSimpleRegions, "The # of simple machine regions");
+
+namespace llvm {
+template class RegionBase<RegionTraits<MachineFunction>>;
+template class RegionNodeBase<RegionTraits<MachineFunction>>;
+template class RegionInfoBase<RegionTraits<MachineFunction>>;
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegion implementation
+//
+
+MachineRegion::MachineRegion(MachineBasicBlock *Entry, MachineBasicBlock *Exit,
+ MachineRegionInfo* RI,
+ MachineDominatorTree *DT, MachineRegion *Parent) :
+ RegionBase<RegionTraits<MachineFunction>>(Entry, Exit, RI, DT, Parent) {
+
+}
+
+MachineRegion::~MachineRegion() { }
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfo implementation
+//
+
+MachineRegionInfo::MachineRegionInfo() :
+ RegionInfoBase<RegionTraits<MachineFunction>>() {
+
+}
+
+MachineRegionInfo::~MachineRegionInfo() {
+
+}
+
+void MachineRegionInfo::updateStatistics(MachineRegion *R) {
+ ++numMachineRegions;
+
+ // TODO: Slow. Should only be enabled if -stats is used.
+ if (R->isSimple())
+ ++numMachineSimpleRegions;
+}
+
+void MachineRegionInfo::recalculate(MachineFunction &F,
+ MachineDominatorTree *DT_,
+ MachinePostDominatorTree *PDT_,
+ MachineDominanceFrontier *DF_) {
+ DT = DT_;
+ PDT = PDT_;
+ DF = DF_;
+
+ MachineBasicBlock *Entry = GraphTraits<MachineFunction*>::getEntryNode(&F);
+
+ TopLevelRegion = new MachineRegion(Entry, nullptr, this, DT, nullptr);
+ updateStatistics(TopLevelRegion);
+ calculate(F);
+}
+
+//===----------------------------------------------------------------------===//
+// MachineRegionInfoPass implementation
+//
+
+MachineRegionInfoPass::MachineRegionInfoPass() : MachineFunctionPass(ID) {
+ initializeMachineRegionInfoPassPass(*PassRegistry::getPassRegistry());
+}
+
+MachineRegionInfoPass::~MachineRegionInfoPass() {
+
+}
+
+bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
+ releaseMemory();
+
+ auto DT = &getAnalysis<MachineDominatorTree>();
+ auto PDT = &getAnalysis<MachinePostDominatorTree>();
+ auto DF = &getAnalysis<MachineDominanceFrontier>();
+
+ RI.recalculate(F, DT, PDT, DF);
+ return false;
+}
+
+void MachineRegionInfoPass::releaseMemory() {
+ RI.releaseMemory();
+}
+
+void MachineRegionInfoPass::verifyAnalysis() const {
+  // Only do verification when the user wants to; otherwise this expensive
+  // check will be invoked by PMDataManager::verifyPreservedAnalysis when
+  // a region pass (marked PreservedAll) finishes.
+ if (MachineRegionInfo::VerifyRegionInfo)
+ RI.verifyAnalysis();
+}
+
+void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<DominanceFrontierWrapperPass>();
+}
+
+void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
+ RI.print(OS);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
+ RI.dump();
+}
+#endif
+
+char MachineRegionInfoPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
+ "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
+ "Detect single entry single exit regions", true, true)
+
+// Create methods available outside of this file so that they can be used from
+// "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
+// link time optimization.
+
+namespace llvm {
+ FunctionPass *createMachineRegionInfoPass() {
+ return new MachineRegionInfoPass();
+ }
+}
+
diff --git a/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
new file mode 100644
index 000000000000..613598dbe215
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -0,0 +1,529 @@
+//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the MachineRegisterInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+// Pin the vtable to this file.
+void MachineRegisterInfo::Delegate::anchor() {}
+
+MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
+ : MF(MF), TheDelegate(nullptr), TracksSubRegLiveness(false) {
+ unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
+ VRegInfo.reserve(256);
+ RegAllocHints.reserve(256);
+ UsedPhysRegMask.resize(NumRegs);
+ PhysRegUseDefLists.reset(new MachineOperand*[NumRegs]());
+}
+
+/// setRegClass - Set the register class of the specified virtual register.
+///
+void
+MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
+ assert(RC && RC->isAllocatable() && "Invalid RC for virtual register");
+ VRegInfo[Reg].first = RC;
+}
+
+void MachineRegisterInfo::setRegBank(unsigned Reg,
+ const RegisterBank &RegBank) {
+ VRegInfo[Reg].first = &RegBank;
+}
+
+const TargetRegisterClass *
+MachineRegisterInfo::constrainRegClass(unsigned Reg,
+ const TargetRegisterClass *RC,
+ unsigned MinNumRegs) {
+ const TargetRegisterClass *OldRC = getRegClass(Reg);
+ if (OldRC == RC)
+ return RC;
+ const TargetRegisterClass *NewRC =
+ getTargetRegisterInfo()->getCommonSubClass(OldRC, RC);
+ if (!NewRC || NewRC == OldRC)
+ return NewRC;
+ if (NewRC->getNumRegs() < MinNumRegs)
+ return nullptr;
+ setRegClass(Reg, NewRC);
+ return NewRC;
+}
+
+bool
+MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetRegisterClass *OldRC = getRegClass(Reg);
+ const TargetRegisterClass *NewRC =
+ getTargetRegisterInfo()->getLargestLegalSuperClass(OldRC, *MF);
+
+ // Stop early if there is no room to grow.
+ if (NewRC == OldRC)
+ return false;
+
+ // Accumulate constraints from all uses.
+ for (MachineOperand &MO : reg_nodbg_operands(Reg)) {
+ // Apply the effect of the given operand to NewRC.
+ MachineInstr *MI = MO.getParent();
+ unsigned OpNo = &MO - &MI->getOperand(0);
+ NewRC = MI->getRegClassConstraintEffect(OpNo, NewRC, TII,
+ getTargetRegisterInfo());
+ if (!NewRC || NewRC == OldRC)
+ return false;
+ }
+ setRegClass(Reg, NewRC);
+ return true;
+}
+
+/// createVirtualRegister - Create and return a new virtual register in the
+/// function with the specified register class.
+///
+unsigned
+MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){
+ assert(RegClass && "Cannot create register without RegClass!");
+ assert(RegClass->isAllocatable() &&
+ "Virtual register RegClass must be allocatable.");
+
+ // New virtual register number.
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
+ VRegInfo.grow(Reg);
+ VRegInfo[Reg].first = RegClass;
+ RegAllocHints.grow(Reg);
+ if (TheDelegate)
+ TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ return Reg;
+}
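+
+// Illustrative usage sketch (editor's note, not part of the original source):
+// clients usually create a virtual register and immediately give it a
+// definition, e.g. a COPY from an incoming physical register (TargetRC and
+// PhysReg are placeholders):
+//
+//   unsigned VReg = MRI.createVirtualRegister(TargetRC);
+//   BuildMI(MBB, MBB.begin(), DebugLoc(), TII.get(TargetOpcode::COPY), VReg)
+//       .addReg(PhysReg);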
+
+unsigned
+MachineRegisterInfo::getSize(unsigned VReg) const {
+ VRegToSizeMap::const_iterator SizeIt = getVRegToSize().find(VReg);
+ return SizeIt != getVRegToSize().end() ? SizeIt->second : 0;
+}
+
+void MachineRegisterInfo::setSize(unsigned VReg, unsigned Size) {
+ getVRegToSize()[VReg] = Size;
+}
+
+unsigned
+MachineRegisterInfo::createGenericVirtualRegister(unsigned Size) {
+ assert(Size && "Cannot create empty virtual register");
+
+ // New virtual register number.
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs());
+ VRegInfo.grow(Reg);
+ // FIXME: Should we use a dummy register class?
+ VRegInfo[Reg].first = static_cast<TargetRegisterClass *>(nullptr);
+ getVRegToSize()[Reg] = Size;
+ RegAllocHints.grow(Reg);
+ if (TheDelegate)
+ TheDelegate->MRI_NoteNewVirtualRegister(Reg);
+ return Reg;
+}
+
+/// clearVirtRegs - Remove all virtual registers (after physreg assignment).
+void MachineRegisterInfo::clearVirtRegs() {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (!VRegInfo[Reg].second)
+ continue;
+ verifyUseList(Reg);
+ llvm_unreachable("Remaining virtual register operands");
+ }
+#endif
+ VRegInfo.clear();
+ for (auto &I : LiveIns)
+ I.second = 0;
+}
+
+void MachineRegisterInfo::verifyUseList(unsigned Reg) const {
+#ifndef NDEBUG
+ bool Valid = true;
+ for (MachineOperand &M : reg_operands(Reg)) {
+ MachineOperand *MO = &M;
+ MachineInstr *MI = MO->getParent();
+ if (!MI) {
+ errs() << PrintReg(Reg, getTargetRegisterInfo())
+ << " use list MachineOperand " << MO
+ << " has no parent instruction.\n";
+ Valid = false;
+ continue;
+ }
+ MachineOperand *MO0 = &MI->getOperand(0);
+ unsigned NumOps = MI->getNumOperands();
+ if (!(MO >= MO0 && MO < MO0+NumOps)) {
+ errs() << PrintReg(Reg, getTargetRegisterInfo())
+ << " use list MachineOperand " << MO
+ << " doesn't belong to parent MI: " << *MI;
+ Valid = false;
+ }
+ if (!MO->isReg()) {
+ errs() << PrintReg(Reg, getTargetRegisterInfo())
+ << " MachineOperand " << MO << ": " << *MO
+ << " is not a register\n";
+ Valid = false;
+ }
+ if (MO->getReg() != Reg) {
+ errs() << PrintReg(Reg, getTargetRegisterInfo())
+ << " use-list MachineOperand " << MO << ": "
+ << *MO << " is the wrong register\n";
+ Valid = false;
+ }
+ }
+ assert(Valid && "Invalid use list");
+#endif
+}
+
+void MachineRegisterInfo::verifyUseLists() const {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i)
+ verifyUseList(TargetRegisterInfo::index2VirtReg(i));
+ for (unsigned i = 1, e = getTargetRegisterInfo()->getNumRegs(); i != e; ++i)
+ verifyUseList(i);
+#endif
+}
+
+/// Add MO to the linked list of operands for its register.
+void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) {
+ assert(!MO->isOnRegUseList() && "Already on list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+
+ // Head points to the first list element.
+ // Next is NULL on the last list element.
+ // Prev pointers are circular, so Head->Prev == Last.
+
+ // Head is NULL for an empty list.
+ if (!Head) {
+ MO->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Next = nullptr;
+ HeadRef = MO;
+ return;
+ }
+ assert(MO->getReg() == Head->getReg() && "Different regs on the same list!");
+
+ // Insert MO between Last and Head in the circular Prev chain.
+ MachineOperand *Last = Head->Contents.Reg.Prev;
+ assert(Last && "Inconsistent use list");
+ assert(MO->getReg() == Last->getReg() && "Different regs on the same list!");
+ Head->Contents.Reg.Prev = MO;
+ MO->Contents.Reg.Prev = Last;
+
+ // Def operands always precede uses. This allows def_iterator to stop early.
+ // Insert def operands at the front, and use operands at the back.
+ if (MO->isDef()) {
+ // Insert def at the front.
+ MO->Contents.Reg.Next = Head;
+ HeadRef = MO;
+ } else {
+ // Insert use at the end.
+ MO->Contents.Reg.Next = nullptr;
+ Last->Contents.Reg.Next = MO;
+ }
+}
+
+/// Remove MO from its use-def list.
+void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) {
+ assert(MO->isOnRegUseList() && "Operand not on use list");
+ MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg());
+ MachineOperand *const Head = HeadRef;
+ assert(Head && "List already empty");
+
+ // Unlink this from the doubly linked list of operands.
+ MachineOperand *Next = MO->Contents.Reg.Next;
+ MachineOperand *Prev = MO->Contents.Reg.Prev;
+
+ // Prev links are circular, next link is NULL instead of looping back to Head.
+ if (MO == Head)
+ HeadRef = Next;
+ else
+ Prev->Contents.Reg.Next = Next;
+
+ (Next ? Next : Head)->Contents.Reg.Prev = Prev;
+
+ MO->Contents.Reg.Prev = nullptr;
+ MO->Contents.Reg.Next = nullptr;
+}
+
+/// Move NumOps operands from Src to Dst, updating use-def lists as needed.
+///
+/// The Dst range is assumed to be uninitialized memory. (Or it may contain
+/// operands that won't be destroyed, which is OK because the MO destructor is
+/// trivial anyway).
+///
+/// The Src and Dst ranges may overlap.
+void MachineRegisterInfo::moveOperands(MachineOperand *Dst,
+ MachineOperand *Src,
+ unsigned NumOps) {
+ assert(Src != Dst && NumOps && "Noop moveOperands");
+
+ // Copy backwards if Dst is within the Src range.
+ int Stride = 1;
+ if (Dst >= Src && Dst < Src + NumOps) {
+ Stride = -1;
+ Dst += NumOps - 1;
+ Src += NumOps - 1;
+ }
+
+ // Copy one operand at a time.
+ do {
+ new (Dst) MachineOperand(*Src);
+
+ // Dst takes Src's place in the use-def chain.
+ if (Src->isReg()) {
+ MachineOperand *&Head = getRegUseDefListHead(Src->getReg());
+ MachineOperand *Prev = Src->Contents.Reg.Prev;
+ MachineOperand *Next = Src->Contents.Reg.Next;
+ assert(Head && "List empty, but operand is chained");
+ assert(Prev && "Operand was not on use-def list");
+
+ // Prev links are circular, next link is NULL instead of looping back to
+ // Head.
+ if (Src == Head)
+ Head = Dst;
+ else
+ Prev->Contents.Reg.Next = Dst;
+
+ // Update Prev pointer. This also works when Src was pointing to itself
+ // in a 1-element list. In that case Head == Dst.
+ (Next ? Next : Head)->Contents.Reg.Prev = Dst;
+ }
+
+ Dst += Stride;
+ Src += Stride;
+ } while (--NumOps);
+}
+
+/// replaceRegWith - Replace all instances of FromReg with ToReg in the
+/// machine function. This is like the IR-level X->replaceAllUsesWith(Y),
+/// except that it also changes any definitions of the register.
+/// If ToReg is a physical register, each operand's subregister index is
+/// applied to it to obtain the final/proper physical register.
+void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
+ assert(FromReg != ToReg && "Cannot replace a reg with itself");
+
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+
+ // TODO: This could be more efficient by bulk changing the operands.
+ for (reg_iterator I = reg_begin(FromReg), E = reg_end(); I != E; ) {
+ MachineOperand &O = *I;
+ ++I;
+ if (TargetRegisterInfo::isPhysicalRegister(ToReg)) {
+ O.substPhysReg(ToReg, *TRI);
+ } else {
+ O.setReg(ToReg);
+ }
+ }
+}
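+
+// Illustrative usage sketch (editor's note, not part of the original source):
+// a pass that merges two virtual registers typically rewrites one into the
+// other and then drops the now-unreliable kill flags on the survivor:
+//
+//   MRI.replaceRegWith(SrcReg, DstReg);
+//   MRI.clearKillFlags(DstReg);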
+
+/// getVRegDef - Return the machine instr that defines the specified virtual
+/// register or null if none is found. This assumes that the code is in SSA
+/// form, so there should only be one definition.
+MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
+ // Since we are in SSA form, we can use the first definition.
+ def_instr_iterator I = def_instr_begin(Reg);
+ assert((I.atEnd() || std::next(I) == def_instr_end()) &&
+ "getVRegDef assumes a single definition or no definition");
+ return !I.atEnd() ? &*I : nullptr;
+}
+
+/// getUniqueVRegDef - Return the unique machine instr that defines the
+/// specified virtual register or null if none is found. If there are
+/// multiple definitions or no definition, return null.
+MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
+ if (def_empty(Reg)) return nullptr;
+ def_instr_iterator I = def_instr_begin(Reg);
+ if (std::next(I) != def_instr_end())
+ return nullptr;
+ return &*I;
+}
+
+bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const {
+ use_nodbg_iterator UI = use_nodbg_begin(RegNo);
+ if (UI == use_nodbg_end())
+ return false;
+ return ++UI == use_nodbg_end();
+}
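+
+// Illustrative usage sketch (editor's note, not part of the original source):
+// these queries are commonly combined in peephole-style checks, e.g. only
+// fold the defining instruction when the value has a single non-debug user
+// (tryToFold is a hypothetical helper):
+//
+//   if (MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg))
+//     if (MRI.hasOneNonDBGUse(Reg))
+//       tryToFold(*DefMI);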
+
+/// clearKillFlags - Iterate over all the uses of the given register and
+/// clear the kill flag from the MachineOperand. This function is used by
+/// optimization passes which extend register lifetimes and need only
+/// preserve conservative kill flag information.
+void MachineRegisterInfo::clearKillFlags(unsigned Reg) const {
+ for (MachineOperand &MO : use_operands(Reg))
+ MO.setIsKill(false);
+}
+
+bool MachineRegisterInfo::isLiveIn(unsigned Reg) const {
+ for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+ if (I->first == Reg || I->second == Reg)
+ return true;
+ return false;
+}
+
+/// getLiveInPhysReg - If VReg is a live-in virtual register, return the
+/// corresponding live-in physical register.
+unsigned MachineRegisterInfo::getLiveInPhysReg(unsigned VReg) const {
+ for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+ if (I->second == VReg)
+ return I->first;
+ return 0;
+}
+
+/// getLiveInVirtReg - If PReg is a live-in physical register, return the
+/// corresponding live-in virtual register.
+unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const {
+ for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
+ if (I->first == PReg)
+ return I->second;
+ return 0;
+}
+
+/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
+/// into the given entry block.
+void
+MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
+ const TargetRegisterInfo &TRI,
+ const TargetInstrInfo &TII) {
+ // Emit the copies into the top of the block.
+ for (unsigned i = 0, e = LiveIns.size(); i != e; ++i)
+ if (LiveIns[i].second) {
+ if (use_empty(LiveIns[i].second)) {
+ // The livein has no uses. Drop it.
+ //
+ // It would be preferable to have isel avoid creating live-in
+ // records for unused arguments in the first place, but it's
+ // complicated by the debug info code for arguments.
+ LiveIns.erase(LiveIns.begin() + i);
+ --i; --e;
+ } else {
+ // Emit a copy.
+ BuildMI(*EntryMBB, EntryMBB->begin(), DebugLoc(),
+ TII.get(TargetOpcode::COPY), LiveIns[i].second)
+ .addReg(LiveIns[i].first);
+
+ // Add the register to the entry block live-in set.
+ EntryMBB->addLiveIn(LiveIns[i].first);
+ }
+ } else {
+ // Add the register to the entry block live-in set.
+ EntryMBB->addLiveIn(LiveIns[i].first);
+ }
+}
+
+LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
+ // Lane masks are only defined for vregs.
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const TargetRegisterClass &TRC = *getRegClass(Reg);
+ return TRC.getLaneMask();
+}
+
+#ifndef NDEBUG
+void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+ for (MachineInstr &I : use_instructions(Reg))
+ I.dump();
+}
+#endif
+
+void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
+ ReservedRegs = getTargetRegisterInfo()->getReservedRegs(MF);
+ assert(ReservedRegs.size() == getTargetRegisterInfo()->getNumRegs() &&
+ "Invalid ReservedRegs vector from target");
+}
+
+bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg,
+ const MachineFunction &MF) const {
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
+
+ // Check if any overlapping register is modified, or is allocatable and may
+ // therefore be used later.
+ for (MCRegAliasIterator AI(PhysReg, getTargetRegisterInfo(), true);
+ AI.isValid(); ++AI)
+ if (!def_empty(*AI) || isAllocatable(*AI))
+ return false;
+ return true;
+}
+
+/// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
+/// specified register as undefined, which causes the DBG_VALUE to be
+/// deleted during LiveDebugVariables analysis.
+void MachineRegisterInfo::markUsesInDebugValueAsUndef(unsigned Reg) const {
+ // Mark any DBG_VALUE that uses Reg as undef (but don't delete it.)
+ MachineRegisterInfo::use_instr_iterator nextI;
+ for (use_instr_iterator I = use_instr_begin(Reg), E = use_instr_end();
+ I != E; I = nextI) {
+ nextI = std::next(I); // I is invalidated by the setReg
+ MachineInstr *UseMI = &*I;
+ if (UseMI->isDebugValue())
+ UseMI->getOperand(0).setReg(0U);
+ }
+}
+
+static const Function *getCalledFunction(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isGlobal())
+ continue;
+ const Function *Func = dyn_cast<Function>(MO.getGlobal());
+ if (Func != nullptr)
+ return Func;
+ }
+ return nullptr;
+}
+
+static bool isNoReturnDef(const MachineOperand &MO) {
+ // Anything which is not a noreturn function is a real def.
+ const MachineInstr &MI = *MO.getParent();
+ if (!MI.isCall())
+ return false;
+ const MachineBasicBlock &MBB = *MI.getParent();
+ if (!MBB.succ_empty())
+ return false;
+ const MachineFunction &MF = *MBB.getParent();
+ // We need to keep correct unwind information even if the function will
+ // not return, since the runtime may need it.
+ if (MF.getFunction()->hasFnAttribute(Attribute::UWTable))
+ return false;
+ const Function *Called = getCalledFunction(MI);
+ return !(Called == nullptr || !Called->hasFnAttribute(Attribute::NoReturn) ||
+ !Called->hasFnAttribute(Attribute::NoUnwind));
+}
+
+bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg,
+ bool SkipNoReturnDef) const {
+ if (UsedPhysRegMask.test(PhysReg))
+ return true;
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegAliasIterator AI(PhysReg, TRI, true); AI.isValid(); ++AI) {
+ for (const MachineOperand &MO : make_range(def_begin(*AI), def_end())) {
+ if (!SkipNoReturnDef && isNoReturnDef(MO))
+ continue;
+ return true;
+ }
+ }
+ return false;
+}
+
+bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
+ if (UsedPhysRegMask.test(PhysReg))
+ return true;
+ const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+ for (MCRegAliasIterator AliasReg(PhysReg, TRI, true); AliasReg.isValid();
+ ++AliasReg) {
+ if (!reg_nodbg_empty(*AliasReg))
+ return true;
+ }
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
new file mode 100644
index 000000000000..47ad60c5dd56
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -0,0 +1,355 @@
+//===- MachineSSAUpdater.cpp - Unstructured SSA Update Tool ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the MachineSSAUpdater class. It is based on the
+// SSAUpdater class in lib/Transforms/Utils.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineSSAUpdater.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/AlignOf.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-ssaupdater"
+
+typedef DenseMap<MachineBasicBlock*, unsigned> AvailableValsTy;
+static AvailableValsTy &getAvailableVals(void *AV) {
+ return *static_cast<AvailableValsTy*>(AV);
+}
+
+MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF,
+ SmallVectorImpl<MachineInstr*> *NewPHI)
+ : AV(nullptr), InsertedPHIs(NewPHI) {
+ TII = MF.getSubtarget().getInstrInfo();
+ MRI = &MF.getRegInfo();
+}
+
+MachineSSAUpdater::~MachineSSAUpdater() {
+ delete static_cast<AvailableValsTy*>(AV);
+}
+
+/// Initialize - Reset this object to get ready for a new set of SSA
+/// updates. V is the prototype virtual register; new PHI nodes created by the
+/// updater take its register class.
+void MachineSSAUpdater::Initialize(unsigned V) {
+ if (!AV)
+ AV = new AvailableValsTy();
+ else
+ getAvailableVals(AV).clear();
+
+ VR = V;
+ VRC = MRI->getRegClass(VR);
+}
+
+/// HasValueForBlock - Return true if the MachineSSAUpdater already has a value for
+/// the specified block.
+bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
+ return getAvailableVals(AV).count(BB);
+}
+
+/// AddAvailableValue - Indicate that a rewritten value is available in the
+/// specified block with the specified value.
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+ getAvailableVals(AV)[BB] = V;
+}
+
+/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
+/// live at the end of the specified block.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+ return GetValueAtEndOfBlockInternal(BB);
+}
+
+static
+unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
+ SmallVectorImpl<std::pair<MachineBasicBlock*, unsigned> > &PredValues) {
+ if (BB->empty())
+ return 0;
+
+ MachineBasicBlock::iterator I = BB->begin();
+ if (!I->isPHI())
+ return 0;
+
+ AvailableValsTy AVals;
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+ AVals[PredValues[i].first] = PredValues[i].second;
+ while (I != BB->end() && I->isPHI()) {
+ bool Same = true;
+ for (unsigned i = 1, e = I->getNumOperands(); i != e; i += 2) {
+ unsigned SrcReg = I->getOperand(i).getReg();
+ MachineBasicBlock *SrcBB = I->getOperand(i+1).getMBB();
+ if (AVals[SrcBB] != SrcReg) {
+ Same = false;
+ break;
+ }
+ }
+ if (Same)
+ return I->getOperand(0).getReg();
+ ++I;
+ }
+ return 0;
+}
+
+/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which defines
+/// a value of the given register class at the start of the specified basic
+/// block. It returns the virtual register defined by the instruction.
+static
+MachineInstrBuilder InsertNewDef(unsigned Opcode,
+ MachineBasicBlock *BB, MachineBasicBlock::iterator I,
+ const TargetRegisterClass *RC,
+ MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII) {
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ return BuildMI(*BB, I, DebugLoc(), TII->get(Opcode), NewVR);
+}
+
+/// GetValueInMiddleOfBlock - Construct SSA form, materializing a value that
+/// is live in the middle of the specified block.
+///
+/// GetValueInMiddleOfBlock is the same as GetValueAtEndOfBlock except in one
+/// important case: if there is a definition of the rewritten value after the
+/// 'use' in BB. Consider code like this:
+///
+/// X1 = ...
+/// SomeBB:
+/// use(X)
+/// X2 = ...
+/// br Cond, SomeBB, OutBB
+///
+/// In this case, there are two values (X1 and X2) added to the AvailableVals
+/// set by the client of the rewriter, and those values are both live out of
+/// their respective blocks. However, the use of X happens in the *middle* of
+/// a block. Because of this, we need to insert a new PHI node in SomeBB to
+/// merge the appropriate values, and this value isn't live out of the block.
+///
+unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+ // If there is no definition of the renamed variable in this block, just use
+ // GetValueAtEndOfBlock to do our work.
+ if (!HasValueForBlock(BB))
+ return GetValueAtEndOfBlockInternal(BB);
+
+ // If there are no predecessors, just return undef.
+ if (BB->pred_empty()) {
+ // Insert an implicit_def to represent an undef value.
+ MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+ BB, BB->getFirstTerminator(),
+ VRC, MRI, TII);
+ return NewDef->getOperand(0).getReg();
+ }
+
+ // Otherwise, we have the hard case. Get the live-in values for each
+ // predecessor.
+ SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
+ unsigned SingularValue = 0;
+
+ bool isFirstPred = true;
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ E = BB->pred_end(); PI != E; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+ unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+ PredValues.push_back(std::make_pair(PredBB, PredVal));
+
+ // Compute SingularValue.
+ if (isFirstPred) {
+ SingularValue = PredVal;
+ isFirstPred = false;
+ } else if (PredVal != SingularValue)
+ SingularValue = 0;
+ }
+
+ // If all the merged values are the same, just use it.
+ if (SingularValue != 0)
+ return SingularValue;
+
+ // If an identical PHI is already in BB, just reuse it.
+ unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+ if (DupPHI)
+ return DupPHI;
+
+ // Otherwise, we do need a PHI: insert one now.
+ MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+ MachineInstrBuilder InsertedPHI = InsertNewDef(TargetOpcode::PHI, BB,
+ Loc, VRC, MRI, TII);
+
+ // Fill in all the predecessors of the PHI.
+ for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
+ InsertedPHI.addReg(PredValues[i].second).addMBB(PredValues[i].first);
+
+ // See if the PHI node can be merged to a single value. This can happen in
+ // loop cases when we get a PHI of itself and one other value.
+ if (unsigned ConstVal = InsertedPHI->isConstantValuePHI()) {
+ InsertedPHI->eraseFromParent();
+ return ConstVal;
+ }
+
+ // If the client wants to know about all new instructions, tell it.
+ if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
+
+ DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
+ return InsertedPHI->getOperand(0).getReg();
+}
+
+static
+MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
+ MachineOperand *U) {
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ if (&MI->getOperand(i) == U)
+ return MI->getOperand(i+1).getMBB();
+ }
+
+ llvm_unreachable("MachineOperand::getParent() failure?");
+}
+
+/// RewriteUse - Rewrite a use of the symbolic value. This handles PHI nodes,
+/// which use their value in the corresponding predecessor.
+void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
+ MachineInstr *UseMI = U.getParent();
+ unsigned NewVR = 0;
+ if (UseMI->isPHI()) {
+ MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
+ NewVR = GetValueAtEndOfBlockInternal(SourceBB);
+ } else {
+ NewVR = GetValueInMiddleOfBlock(UseMI->getParent());
+ }
+
+ U.setReg(NewVR);
+}
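+
+// Illustrative usage sketch (editor's note, not part of the original source):
+// a typical client (e.g. a tail-duplication style transform) drives the
+// updater roughly as follows; OldReg, NewReg1/2 and DefBB1/2 are placeholders.
+// Uses are collected first because RewriteUse() changes the operand's
+// register and would otherwise invalidate the use iterator.
+//
+//   MachineSSAUpdater SSAUpdate(MF);
+//   SSAUpdate.Initialize(OldReg);
+//   SSAUpdate.AddAvailableValue(DefBB1, NewReg1);
+//   SSAUpdate.AddAvailableValue(DefBB2, NewReg2);
+//
+//   SmallVector<MachineOperand *, 8> Uses;
+//   for (MachineOperand &MO : MRI->use_operands(OldReg))
+//     Uses.push_back(&MO);
+//   for (MachineOperand *MO : Uses)
+//     SSAUpdate.RewriteUse(*MO);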
+
+/// SSAUpdaterTraits<MachineSSAUpdater> - Traits for the SSAUpdaterImpl
+/// template, specialized for MachineSSAUpdater.
+namespace llvm {
+template<>
+class SSAUpdaterTraits<MachineSSAUpdater> {
+public:
+ typedef MachineBasicBlock BlkT;
+ typedef unsigned ValT;
+ typedef MachineInstr PhiT;
+
+ typedef MachineBasicBlock::succ_iterator BlkSucc_iterator;
+ static BlkSucc_iterator BlkSucc_begin(BlkT *BB) { return BB->succ_begin(); }
+ static BlkSucc_iterator BlkSucc_end(BlkT *BB) { return BB->succ_end(); }
+
+ /// Iterator for PHI operands.
+ class PHI_iterator {
+ private:
+ MachineInstr *PHI;
+ unsigned idx;
+
+ public:
+ explicit PHI_iterator(MachineInstr *P) // begin iterator
+ : PHI(P), idx(1) {}
+ PHI_iterator(MachineInstr *P, bool) // end iterator
+ : PHI(P), idx(PHI->getNumOperands()) {}
+
+ PHI_iterator &operator++() { idx += 2; return *this; }
+ bool operator==(const PHI_iterator& x) const { return idx == x.idx; }
+ bool operator!=(const PHI_iterator& x) const { return !operator==(x); }
+ unsigned getIncomingValue() { return PHI->getOperand(idx).getReg(); }
+ MachineBasicBlock *getIncomingBlock() {
+ return PHI->getOperand(idx+1).getMBB();
+ }
+ };
+ static inline PHI_iterator PHI_begin(PhiT *PHI) { return PHI_iterator(PHI); }
+ static inline PHI_iterator PHI_end(PhiT *PHI) {
+ return PHI_iterator(PHI, true);
+ }
+
+ /// FindPredecessorBlocks - Put the predecessors of BB into the Preds
+ /// vector.
+ static void FindPredecessorBlocks(MachineBasicBlock *BB,
+ SmallVectorImpl<MachineBasicBlock*> *Preds){
+ for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
+ E = BB->pred_end(); PI != E; ++PI)
+ Preds->push_back(*PI);
+ }
+
+ /// GetUndefVal - Create an IMPLICIT_DEF instruction with a new register.
+ /// Add it into the specified block and return the register.
+ static unsigned GetUndefVal(MachineBasicBlock *BB,
+ MachineSSAUpdater *Updater) {
+ // Insert an implicit_def to represent an undef value.
+ MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
+ BB, BB->getFirstTerminator(),
+ Updater->VRC, Updater->MRI,
+ Updater->TII);
+ return NewDef->getOperand(0).getReg();
+ }
+
+ /// CreateEmptyPHI - Create a PHI instruction that defines a new register.
+ /// Add it into the specified block and return the register.
+ static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
+ MachineSSAUpdater *Updater) {
+ MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
+ MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
+ Updater->VRC, Updater->MRI,
+ Updater->TII);
+ return PHI->getOperand(0).getReg();
+ }
+
+ /// AddPHIOperand - Add the specified value as an operand of the PHI for
+ /// the specified predecessor block.
+ static void AddPHIOperand(MachineInstr *PHI, unsigned Val,
+ MachineBasicBlock *Pred) {
+ MachineInstrBuilder(*Pred->getParent(), PHI).addReg(Val).addMBB(Pred);
+ }
+
+ /// InstrIsPHI - Check if an instruction is a PHI.
+ ///
+ static MachineInstr *InstrIsPHI(MachineInstr *I) {
+ if (I && I->isPHI())
+ return I;
+ return nullptr;
+ }
+
+ /// ValueIsPHI - Check if the instruction that defines the specified register
+ /// is a PHI instruction.
+ static MachineInstr *ValueIsPHI(unsigned Val, MachineSSAUpdater *Updater) {
+ return InstrIsPHI(Updater->MRI->getVRegDef(Val));
+ }
+
+ /// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
+ /// operands, i.e., it was just added.
+ static MachineInstr *ValueIsNewPHI(unsigned Val, MachineSSAUpdater *Updater) {
+ MachineInstr *PHI = ValueIsPHI(Val, Updater);
+ if (PHI && PHI->getNumOperands() <= 1)
+ return PHI;
+ return nullptr;
+ }
+
+ /// GetPHIValue - For the specified PHI instruction, return the register
+ /// that it defines.
+ static unsigned GetPHIValue(MachineInstr *PHI) {
+ return PHI->getOperand(0).getReg();
+ }
+};
+
+} // End llvm namespace
+
+/// GetValueAtEndOfBlockInternal - Check to see if AvailableVals has an entry
+/// for the specified BB and if so, return it. If not, construct SSA form by
+/// first calculating the required placement of PHIs and then inserting new
+/// PHIs where needed.
+unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+ AvailableValsTy &AvailableVals = getAvailableVals(AV);
+ if (unsigned V = AvailableVals[BB])
+ return V;
+
+ SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
+ return Impl.GetValue(BB);
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineScheduler.cpp b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
new file mode 100644
index 000000000000..d921e2977cc7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -0,0 +1,3553 @@
+//===- MachineScheduler.cpp - Machine Instruction Scheduler ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// MachineScheduler schedules machine instructions after phi elimination. It
+// preserves LiveIntervals so it can be invoked before register allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace llvm {
+cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
+ cl::desc("Force top-down list scheduling"));
+cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
+ cl::desc("Force bottom-up list scheduling"));
+cl::opt<bool>
+DumpCriticalPathLength("misched-dcpl", cl::Hidden,
+ cl::desc("Print critical path length to stdout"));
+}
+
+#ifndef NDEBUG
+static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show MISched dags after they are processed"));
+
+/// In some situations a few uninteresting nodes depend on nearly all other
+/// nodes in the graph; provide a cutoff to hide them.
+static cl::opt<unsigned> ViewMISchedCutoff("view-misched-cutoff", cl::Hidden,
+ cl::desc("Hide nodes with more predecessor/successor than cutoff"));
+
+static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
+ cl::desc("Stop scheduling after N instructions"), cl::init(~0U));
+
+static cl::opt<std::string> SchedOnlyFunc("misched-only-func", cl::Hidden,
+ cl::desc("Only schedule this function"));
+static cl::opt<unsigned> SchedOnlyBlock("misched-only-block", cl::Hidden,
+ cl::desc("Only schedule this MBB#"));
+#else
+static bool ViewMISchedDAGs = false;
+#endif // NDEBUG
+
+/// Avoid quadratic complexity in unusually large basic blocks by limiting the
+/// size of the ready lists.
+static cl::opt<unsigned> ReadyListLimit("misched-limit", cl::Hidden,
+ cl::desc("Limit ready list to N instructions"), cl::init(256));
+
+static cl::opt<bool> EnableRegPressure("misched-regpressure", cl::Hidden,
+ cl::desc("Enable register pressure scheduling."), cl::init(true));
+
+static cl::opt<bool> EnableCyclicPath("misched-cyclicpath", cl::Hidden,
+ cl::desc("Enable cyclic critical path analysis."), cl::init(true));
+
+static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
+ cl::desc("Enable memop clustering."),
+ cl::init(true));
+
+// Experimental heuristics
+static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
+ cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
+ cl::desc("Verify machine instrs before and after machine scheduling"));
+
+// DAG subtrees must have at least this many nodes.
+static const unsigned MinSubtreeSize = 8;
+
+// Pin the vtables to this file.
+void MachineSchedStrategy::anchor() {}
+void ScheduleDAGMutation::anchor() {}
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Scheduling Pass and Registry
+//===----------------------------------------------------------------------===//
+
+MachineSchedContext::MachineSchedContext():
+ MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) {
+ RegClassInfo = new RegisterClassInfo();
+}
+
+MachineSchedContext::~MachineSchedContext() {
+ delete RegClassInfo;
+}
+
+namespace {
+/// Base class for a machine scheduler class that can run at any point.
+class MachineSchedulerBase : public MachineSchedContext,
+ public MachineFunctionPass {
+public:
+ MachineSchedulerBase(char &ID): MachineFunctionPass(ID) {}
+
+ void print(raw_ostream &O, const Module* = nullptr) const override;
+
+protected:
+ void scheduleRegions(ScheduleDAGInstrs &Scheduler, bool FixKillFlags);
+};
+
+/// MachineScheduler runs after coalescing and before register allocation.
+class MachineScheduler : public MachineSchedulerBase {
+public:
+ MachineScheduler();
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createMachineScheduler();
+};
+
+/// PostMachineScheduler runs shortly before code emission.
+class PostMachineScheduler : public MachineSchedulerBase {
+public:
+ PostMachineScheduler();
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ static char ID; // Class identification, replacement for typeinfo
+
+protected:
+ ScheduleDAGInstrs *createPostMachineScheduler();
+};
+} // namespace
+
+char MachineScheduler::ID = 0;
+
+char &llvm::MachineSchedulerID = MachineScheduler::ID;
+
+INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
+ "Machine Instruction Scheduler", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
+ "Machine Instruction Scheduler", false, false)
+
+MachineScheduler::MachineScheduler()
+: MachineSchedulerBase(ID) {
+ initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void MachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+char PostMachineScheduler::ID = 0;
+
+char &llvm::PostMachineSchedulerID = PostMachineScheduler::ID;
+
+INITIALIZE_PASS(PostMachineScheduler, "postmisched",
+ "PostRA Machine Instruction Scheduler", false, false)
+
+PostMachineScheduler::PostMachineScheduler()
+: MachineSchedulerBase(ID) {
+ initializePostMachineSchedulerPass(*PassRegistry::getPassRegistry());
+}
+
+void PostMachineScheduler::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<TargetPassConfig>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachinePassRegistry MachineSchedRegistry::Registry;
+
+/// A dummy default scheduler factory, used to indicate whether the scheduler
+/// has been overridden on the command line.
+static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
+ return nullptr;
+}
+
+/// MachineSchedOpt allows command line selection of the scheduler.
+static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
+ RegisterPassParser<MachineSchedRegistry> >
+MachineSchedOpt("misched",
+ cl::init(&useDefaultMachineSched), cl::Hidden,
+ cl::desc("Machine instruction scheduler to use"));
+
+static MachineSchedRegistry
+DefaultSchedRegistry("default", "Use the target's default scheduler choice.",
+ useDefaultMachineSched);
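+
+// Illustrative sketch (editor's note, not part of the original source): a
+// target or plugin can expose its own scheduler through the same registry,
+// making it selectable with -misched=<name>. createMyCustomSched is a
+// placeholder factory with the ScheduleDAGCtor signature.
+//
+//   static ScheduleDAGInstrs *createMyCustomSched(MachineSchedContext *C) {
+//     return createGenericSchedLive(C);  // or construct a custom scheduler
+//   }
+//   static MachineSchedRegistry
+//   MySchedRegistry("my-sched", "Run my custom scheduler.",
+//                   createMyCustomSched);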
+
+static cl::opt<bool> EnableMachineSched(
+ "enable-misched",
+ cl::desc("Enable the machine instruction scheduling pass."), cl::init(true),
+ cl::Hidden);
+
+static cl::opt<bool> EnablePostRAMachineSched(
+ "enable-post-misched",
+ cl::desc("Enable the post-ra machine instruction scheduling pass."),
+ cl::init(true), cl::Hidden);
+
+/// Forward declare the standard machine scheduler. This will be used as the
+/// default scheduler if the target does not set a default.
+static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C);
+static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C);
+
+/// Decrement this iterator until reaching the top or a non-debug instr.
+static MachineBasicBlock::const_iterator
+priorNonDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ assert(I != Beg && "reached the top of the region, cannot decrement");
+ while (--I != Beg) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+priorNonDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator Beg) {
+ return const_cast<MachineInstr*>(
+ &*priorNonDebug(MachineBasicBlock::const_iterator(I), Beg));
+}
+
+/// If this iterator is a debug value, increment until reaching the End or a
+/// non-debug instruction.
+static MachineBasicBlock::const_iterator
+nextIfDebug(MachineBasicBlock::const_iterator I,
+ MachineBasicBlock::const_iterator End) {
+ for(; I != End; ++I) {
+ if (!I->isDebugValue())
+ break;
+ }
+ return I;
+}
+
+/// Non-const version.
+static MachineBasicBlock::iterator
+nextIfDebug(MachineBasicBlock::iterator I,
+ MachineBasicBlock::const_iterator End) {
+ // Cast the return value to nonconst MachineInstr, then cast to an
+ // instr_iterator, which does not check for null, and finally return a
+ // bundle_iterator.
+ return MachineBasicBlock::instr_iterator(
+ const_cast<MachineInstr*>(
+ &*nextIfDebug(MachineBasicBlock::const_iterator(I), End)));
+}
+
+/// Instantiate a ScheduleDAGInstrs that will be owned by the caller.
+ScheduleDAGInstrs *MachineScheduler::createMachineScheduler() {
+ // Select the scheduler, or set the default.
+ MachineSchedRegistry::ScheduleDAGCtor Ctor = MachineSchedOpt;
+ if (Ctor != useDefaultMachineSched)
+ return Ctor(this);
+
+ // Get the default scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedLive(this);
+}
+
+/// Instantiate a ScheduleDAGInstrs for PostRA scheduling that will be owned by
+/// the caller. We don't have a command line option to override the postRA
+/// scheduler. The Target must configure it.
+ScheduleDAGInstrs *PostMachineScheduler::createPostMachineScheduler() {
+ // Get the postRA scheduler set by the target for this function.
+ ScheduleDAGInstrs *Scheduler = PassConfig->createPostMachineScheduler(this);
+ if (Scheduler)
+ return Scheduler;
+
+ // Default to GenericScheduler.
+ return createGenericSchedPostRA(this);
+}
+
+/// Top-level MachineScheduler pass driver.
+///
+/// Visit blocks in function order. Divide each block into scheduling regions
+/// and visit them bottom-up. Visiting regions bottom-up is not required, but is
+/// consistent with the DAG builder, which traverses the interior of the
+/// scheduling regions bottom-up.
+///
+/// This design avoids exposing scheduling boundaries to the DAG builder,
+/// simplifying the DAG builder's support for "special" target instructions.
+/// At the same time the design allows target schedulers to operate across
+/// scheduling boundaries, for example to bundle the boundary instructions
+/// without reordering them. This creates complexity, because the target
+/// scheduler must update the RegionBegin and RegionEnd positions cached by
+/// ScheduleDAGInstrs whenever adding or removing instructions. A much simpler
+/// design would be to split blocks at scheduling boundaries, but LLVM has a
+/// general bias against block splitting purely for implementation simplicity.
+bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(*mf.getFunction()))
+ return false;
+
+ if (EnableMachineSched.getNumOccurrences()) {
+ if (!EnableMachineSched)
+ return false;
+ } else if (!mf.getSubtarget().enableMachineScheduler())
+ return false;
+
+ DEBUG(dbgs() << "Before MISched:\n"; mf.print(dbgs()));
+
+ // Initialize the context of the pass.
+ MF = &mf;
+ MLI = &getAnalysis<MachineLoopInfo>();
+ MDT = &getAnalysis<MachineDominatorTree>();
+ PassConfig = &getAnalysis<TargetPassConfig>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ LIS = &getAnalysis<LiveIntervals>();
+
+ if (VerifyScheduling) {
+ DEBUG(LIS->dump());
+ MF->verify(this, "Before machine scheduling.");
+ }
+ RegClassInfo->runOnMachineFunction(*MF);
+
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ std::unique_ptr<ScheduleDAGInstrs> Scheduler(createMachineScheduler());
+ scheduleRegions(*Scheduler, false);
+
+ DEBUG(LIS->dump());
+ if (VerifyScheduling)
+ MF->verify(this, "After machine scheduling.");
+ return true;
+}
+
+bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) {
+ if (skipFunction(*mf.getFunction()))
+ return false;
+
+ if (EnablePostRAMachineSched.getNumOccurrences()) {
+ if (!EnablePostRAMachineSched)
+ return false;
+ } else if (!mf.getSubtarget().enablePostRAScheduler()) {
+ DEBUG(dbgs() << "Subtarget disables post-MI-sched.\n");
+ return false;
+ }
+ DEBUG(dbgs() << "Before post-MI-sched:\n"; mf.print(dbgs()));
+
+ // Initialize the context of the pass.
+ MF = &mf;
+ PassConfig = &getAnalysis<TargetPassConfig>();
+
+ if (VerifyScheduling)
+ MF->verify(this, "Before post machine scheduling.");
+
+ // Instantiate the selected scheduler for this target, function, and
+ // optimization level.
+ std::unique_ptr<ScheduleDAGInstrs> Scheduler(createPostMachineScheduler());
+ scheduleRegions(*Scheduler, true);
+
+ if (VerifyScheduling)
+ MF->verify(this, "After post machine scheduling.");
+ return true;
+}
+
+/// Return true if the given instruction should not be included in a scheduling
+/// region.
+///
+/// MachineScheduler does not currently support scheduling across calls. To
+/// handle calls, the DAG builder needs to be modified to create register
+/// anti/output dependencies on the registers clobbered by the call's regmask
+/// operand. In PreRA scheduling, the stack pointer adjustment already prevents
+/// scheduling across calls. In PostRA scheduling, we need the isCall to enforce
+/// the boundary, but there would be no benefit to postRA scheduling across
+/// calls this late anyway.
+static bool isSchedBoundary(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB,
+ MachineFunction *MF,
+ const TargetInstrInfo *TII) {
+ return MI->isCall() || TII->isSchedulingBoundary(*MI, MBB, *MF);
+}
+
+/// Main driver for both MachineScheduler and PostMachineScheduler.
+void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
+ bool FixKillFlags) {
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ // Visit all machine basic blocks.
+ //
+ // TODO: Visit blocks in global postorder or postorder within the bottom-up
+ // loop tree. Then we can optionally compute global RegPressure.
+ for (MachineFunction::iterator MBB = MF->begin(), MBBEnd = MF->end();
+ MBB != MBBEnd; ++MBB) {
+
+ Scheduler.startBlock(&*MBB);
+
+#ifndef NDEBUG
+ if (SchedOnlyFunc.getNumOccurrences() && SchedOnlyFunc != MF->getName())
+ continue;
+ if (SchedOnlyBlock.getNumOccurrences()
+ && (int)SchedOnlyBlock != MBB->getNumber())
+ continue;
+#endif
+
+ // Break the block into scheduling regions [I, RegionEnd), and schedule each
+ // region as soon as it is discovered. RegionEnd points to the scheduling
+ // boundary at the bottom of the region. The DAG does not include RegionEnd,
+ // but the region does (i.e. the next RegionEnd is above the previous
+ // RegionBegin). If the current block has no terminator then RegionEnd ==
+ // MBB->end() for the bottom region.
+ //
+ // The Scheduler may insert instructions during either schedule() or
+ // exitRegion(), even for empty regions. So the local iterators 'I' and
+ // 'RegionEnd' are invalid across these calls.
+ //
+ // MBB::size() uses instr_iterator to count. Here we need a bundle to count
+ // as a single instruction.
+ for(MachineBasicBlock::iterator RegionEnd = MBB->end();
+ RegionEnd != MBB->begin(); RegionEnd = Scheduler.begin()) {
+
+ // Avoid decrementing RegionEnd for blocks with no terminator.
+ if (RegionEnd != MBB->end() ||
+ isSchedBoundary(&*std::prev(RegionEnd), &*MBB, MF, TII)) {
+ --RegionEnd;
+ }
+
+ // The next region starts above the previous region. Look backward in the
+ // instruction stream until we find the nearest boundary.
+ unsigned NumRegionInstrs = 0;
+ MachineBasicBlock::iterator I = RegionEnd;
+ for (;I != MBB->begin(); --I) {
+ if (isSchedBoundary(&*std::prev(I), &*MBB, MF, TII))
+ break;
+ if (!I->isDebugValue())
+ ++NumRegionInstrs;
+ }
+ // Notify the scheduler of the region, even if we may skip scheduling
+ // it. Perhaps it still needs to be bundled.
+ Scheduler.enterRegion(&*MBB, I, RegionEnd, NumRegionInstrs);
+
+ // Skip empty scheduling regions (0 or 1 schedulable instructions).
+ if (I == RegionEnd || I == std::prev(RegionEnd)) {
+ // Close the current region. Bundle the terminator if needed.
+ // This invalidates 'RegionEnd' and 'I'.
+ Scheduler.exitRegion();
+ continue;
+ }
+ DEBUG(dbgs() << "********** MI Scheduling **********\n");
+ DEBUG(dbgs() << MF->getName()
+ << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+ << "\n From: " << *I << " To: ";
+ if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+ else dbgs() << "End";
+ dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << MF->getName();
+ errs() << ":BB# " << MBB->getNumber();
+ errs() << " " << MBB->getName() << " \n";
+ }
+
+ // Schedule a region: possibly reorder instructions.
+ // This invalidates 'RegionEnd' and 'I'.
+ Scheduler.schedule();
+
+ // Close the current region.
+ Scheduler.exitRegion();
+
+ // Scheduling has invalidated the current iterator 'I'. Ask the
+ // scheduler for the top of its scheduled region.
+ RegionEnd = Scheduler.begin();
+ }
+ Scheduler.finishBlock();
+ // FIXME: Ideally, no further passes should rely on kill flags. However,
+ // thumb2 size reduction is currently an exception, so the PostMIScheduler
+ // needs to do this.
+ if (FixKillFlags)
+ Scheduler.fixupKills(&*MBB);
+ }
+ Scheduler.finalizeSchedule();
+}
+
+void MachineSchedulerBase::print(raw_ostream &O, const Module* m) const {
+ // unimplemented
+}
+
+LLVM_DUMP_METHOD
+void ReadyQueue::dump() {
+ dbgs() << "Queue " << Name << ": ";
+ for (unsigned i = 0, e = Queue.size(); i < e; ++i)
+ dbgs() << Queue[i]->NodeNum << " ";
+ dbgs() << "\n";
+}
+
+//===----------------------------------------------------------------------===//
+// ScheduleDAGMI - Basic machine instruction scheduling. This is
+// independent of PreRA/PostRA scheduling and involves no extra book-keeping for
+// virtual registers.
+//===----------------------------------------------------------------------===//
+
+// Provide a vtable anchor.
+ScheduleDAGMI::~ScheduleDAGMI() {
+}
+
+bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
+ return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
+}
+
+bool ScheduleDAGMI::addEdge(SUnit *SuccSU, const SDep &PredDep) {
+ if (SuccSU != &ExitSU) {
+ // Do not use WillCreateCycle, it assumes SD scheduling.
+ // If Pred is reachable from Succ, then the edge creates a cycle.
+ if (Topo.IsReachable(PredDep.getSUnit(), SuccSU))
+ return false;
+ Topo.AddPred(SuccSU, PredDep.getSUnit());
+ }
+ SuccSU->addPred(PredDep, /*Required=*/!PredDep.isArtificial());
+ // Return true regardless of whether a new edge needed to be inserted.
+ return true;
+}
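+
+// Illustrative sketch (editor's note, not part of the original source): DAG
+// mutations use this pair of helpers to add edges without creating a cycle,
+// e.g. to keep two memory operations adjacent via a cluster edge:
+//
+//   if (DAG->canAddEdge(SUb, SUa))
+//     DAG->addEdge(SUb, SDep(SUa, SDep::Cluster));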
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. When
+/// NumPredsLeft reaches zero, release the successor node.
+///
+/// FIXME: Adjust SuccSU height based on MinLatency.
+void ScheduleDAGMI::releaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ if (SuccEdge->isCluster())
+ NextClusterSucc = SuccSU;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // SU->TopReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (SuccSU->TopReadyCycle < SU->TopReadyCycle + SuccEdge->getLatency())
+ SuccSU->TopReadyCycle = SU->TopReadyCycle + SuccEdge->getLatency();
+
+ --SuccSU->NumPredsLeft;
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ SchedImpl->releaseTopNode(SuccSU);
+}
+
+/// releaseSuccessors - Call releaseSucc on each of SU's successors.
+void ScheduleDAGMI::releaseSuccessors(SUnit *SU) {
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ releaseSucc(SU, &*I);
+ }
+}
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. When
+/// NumSuccsLeft reaches zero, release the predecessor node.
+///
+/// FIXME: Adjust PredSU height based on MinLatency.
+void ScheduleDAGMI::releasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+ if (PredEdge->isWeak()) {
+ --PredSU->WeakSuccsLeft;
+ if (PredEdge->isCluster())
+ NextClusterPred = PredSU;
+ return;
+ }
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ PredSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ // SU->BotReadyCycle was set to CurrCycle when it was scheduled. However,
+ // CurrCycle may have advanced since then.
+ if (PredSU->BotReadyCycle < SU->BotReadyCycle + PredEdge->getLatency())
+ PredSU->BotReadyCycle = SU->BotReadyCycle + PredEdge->getLatency();
+
+ --PredSU->NumSuccsLeft;
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU)
+ SchedImpl->releaseBottomNode(PredSU);
+}
+
+/// releasePredecessors - Call releasePred on each of SU's predecessors.
+void ScheduleDAGMI::releasePredecessors(SUnit *SU) {
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ releasePred(SU, &*I);
+ }
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMI::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
+
+ SchedImpl->initPolicy(begin, end, regioninstrs);
+}
+
+/// This is normally called from the main scheduler loop but may also be invoked
+/// by the scheduling strategy to perform additional code motion.
+void ScheduleDAGMI::moveInstruction(
+ MachineInstr *MI, MachineBasicBlock::iterator InsertPos) {
+ // Advance RegionBegin if the first instruction moves down.
+ if (&*RegionBegin == MI)
+ ++RegionBegin;
+
+ // Update the instruction stream.
+ BB->splice(InsertPos, BB, MI);
+
+ // Update LiveIntervals
+ if (LIS)
+ LIS->handleMove(*MI, /*UpdateFlags=*/true);
+
+ // Recede RegionBegin if an instruction moves above the first.
+ if (RegionBegin == InsertPos)
+ RegionBegin = MI;
+}
+
+bool ScheduleDAGMI::checkSchedLimit() {
+#ifndef NDEBUG
+ if (NumInstrsScheduled == MISchedCutoff && MISchedCutoff != ~0U) {
+ CurrentTop = CurrentBottom;
+ return false;
+ }
+ ++NumInstrsScheduled;
+#endif
+ return true;
+}
+
+/// Per-region scheduling driver, called back from
+/// MachineScheduler::runOnMachineFunction. This is a simplified driver that
+/// does not consider liveness or register pressure. It is useful for PostRA
+/// scheduling and potentially other custom schedulers.
+void ScheduleDAGMI::schedule() {
+ DEBUG(dbgs() << "ScheduleDAGMI::schedule starting\n");
+ DEBUG(SchedImpl->dumpPolicy());
+
+ // Build the DAG.
+ buildSchedGraph(AA);
+
+ Topo.InitDAGTopologicalSorting();
+
+ postprocessDAG();
+
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ // Initialize the strategy before modifying the DAG.
+ // This may initialize a DFSResult to be used for queue priority.
+ SchedImpl->initialize(this);
+
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+ if (ViewMISchedDAGs) viewGraph();
+
+ // Initialize ready queues now that the DAG and priority data are finalized.
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) {
+ DEBUG(dbgs() << "** ScheduleDAGMI::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break;
+
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
+
+ MachineInstr *MI = SU->getInstr();
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else
+ moveInstruction(MI, CurrentTop);
+ } else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ }
+ // Notify the scheduling strategy before updating the DAG.
+ // This sets the scheduled node's ReadyCycle to CurrCycle. When updateQueues
+ // runs, it can then use the accurate ReadyCycle time to determine whether
+ // newly released nodes can move to the readyQ.
+ SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void ScheduleDAGMI::postprocessDAG() {
+ for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
+ Mutations[i]->apply(this);
+ }
+}
+
+void ScheduleDAGMI::
+findRootsAndBiasEdges(SmallVectorImpl<SUnit*> &TopRoots,
+ SmallVectorImpl<SUnit*> &BotRoots) {
+ for (std::vector<SUnit>::iterator
+ I = SUnits.begin(), E = SUnits.end(); I != E; ++I) {
+ SUnit *SU = &(*I);
+ assert(!SU->isBoundaryNode() && "Boundary node should not be in SUnits");
+
+ // Order predecessors so DFSResult follows the critical path.
+ SU->biasCriticalPath();
+
+ // A SUnit is ready to top schedule if it has no predecessors.
+ if (!I->NumPredsLeft)
+ TopRoots.push_back(SU);
+ // A SUnit is ready to bottom schedule if it has no successors.
+ if (!I->NumSuccsLeft)
+ BotRoots.push_back(SU);
+ }
+ ExitSU.biasCriticalPath();
+}
+
+/// Identify DAG roots and set up scheduler queues.
+void ScheduleDAGMI::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ NextClusterSucc = nullptr;
+ NextClusterPred = nullptr;
+
+ // Release all DAG roots for scheduling, not including EntrySU/ExitSU.
+ //
+ // Nodes with unreleased weak edges can still be roots.
+ // Release top roots in forward order.
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = TopRoots.begin(), E = TopRoots.end(); I != E; ++I) {
+ SchedImpl->releaseTopNode(*I);
+ }
+ // Release bottom roots in reverse order so the higher priority nodes appear
+ // first. This is more natural and slightly more efficient.
+ for (SmallVectorImpl<SUnit*>::const_reverse_iterator
+ I = BotRoots.rbegin(), E = BotRoots.rend(); I != E; ++I) {
+ SchedImpl->releaseBottomNode(*I);
+ }
+
+ releaseSuccessors(&EntrySU);
+ releasePredecessors(&ExitSU);
+
+ SchedImpl->registerRoots();
+
+ // Advance past initial DebugValues.
+ CurrentTop = nextIfDebug(RegionBegin, RegionEnd);
+ CurrentBottom = RegionEnd;
+}
+
+/// Update scheduler queues after scheduling an instruction.
+void ScheduleDAGMI::updateQueues(SUnit *SU, bool IsTopNode) {
+ // Release dependent instructions for scheduling.
+ if (IsTopNode)
+ releaseSuccessors(SU);
+ else
+ releasePredecessors(SU);
+
+ SU->isScheduled = true;
+}
+
+/// Reinsert any remaining debug_values, just like the PostRA scheduler.
+void ScheduleDAGMI::placeDebugValues() {
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue) {
+ BB->splice(RegionBegin, BB, FirstDbgValue);
+ RegionBegin = FirstDbgValue;
+ }
+
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
+ MachineInstr *DbgValue = P.first;
+ MachineBasicBlock::iterator OrigPrevMI = P.second;
+ if (&*RegionBegin == DbgValue)
+ ++RegionBegin;
+ BB->splice(++OrigPrevMI, BB, DbgValue);
+ if (OrigPrevMI == std::prev(RegionEnd))
+ RegionEnd = DbgValue;
+ }
+ DbgValues.clear();
+ FirstDbgValue = nullptr;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGMI::dumpSchedule() const {
+ for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
+ if (SUnit *SU = getSUnit(&(*MI)))
+ SU->dump(this);
+ else
+ dbgs() << "Missing SUnit\n";
+ }
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// ScheduleDAGMILive - Base class for MachineInstr scheduling with LiveIntervals
+// preservation.
+//===----------------------------------------------------------------------===//
+
+ScheduleDAGMILive::~ScheduleDAGMILive() {
+ delete DFSResult;
+}
+
+/// enterRegion - Called back from MachineScheduler::runOnMachineFunction after
+/// crossing a scheduling boundary. [begin, end) includes all instructions in
+/// the region, including the boundary itself and single-instruction regions
+/// that don't get scheduled.
+void ScheduleDAGMILive::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs)
+{
+ // ScheduleDAGMI initializes SchedImpl's per-region policy.
+ ScheduleDAGMI::enterRegion(bb, begin, end, regioninstrs);
+
+ // For convenience remember the end of the liveness region.
+ LiveRegionEnd = (RegionEnd == bb->end()) ? RegionEnd : std::next(RegionEnd);
+
+ SUPressureDiffs.clear();
+
+ ShouldTrackPressure = SchedImpl->shouldTrackPressure();
+ ShouldTrackLaneMasks = SchedImpl->shouldTrackLaneMasks();
+
+ assert((!ShouldTrackLaneMasks || ShouldTrackPressure) &&
+ "ShouldTrackLaneMasks requires ShouldTrackPressure");
+}
+
+// Set up the register pressure trackers for the top and bottom scheduled
+// regions.
+void ScheduleDAGMILive::initRegPressure() {
+ TopRPTracker.init(&MF, RegClassInfo, LIS, BB, RegionBegin,
+ ShouldTrackLaneMasks, false);
+ BotRPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ ShouldTrackLaneMasks, false);
+
+ // Close the RPTracker to finalize live ins.
+ RPTracker.closeRegion();
+
+ DEBUG(RPTracker.dump());
+
+ // Initialize the live ins and live outs.
+ TopRPTracker.addLiveRegs(RPTracker.getPressure().LiveInRegs);
+ BotRPTracker.addLiveRegs(RPTracker.getPressure().LiveOutRegs);
+
+ // Close one end of the tracker so we can call
+ // getMaxUpward/DownwardPressureDelta before advancing across any
+ // instructions. This converts currently live regs into live ins/outs.
+ TopRPTracker.closeTop();
+ BotRPTracker.closeBottom();
+
+ BotRPTracker.initLiveThru(RPTracker);
+ if (!BotRPTracker.getLiveThru().empty()) {
+ TopRPTracker.initLiveThru(BotRPTracker.getLiveThru());
+ DEBUG(dbgs() << "Live Thru: ";
+ dumpRegSetPressure(BotRPTracker.getLiveThru(), TRI));
+ }
+
+ // For each live out vreg reduce the pressure change associated with other
+ // uses of the same vreg below the live-out reaching def.
+ updatePressureDiffs(RPTracker.getPressure().LiveOutRegs);
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd) {
+ SmallVector<RegisterMaskPair, 8> LiveUses;
+ BotRPTracker.recede(&LiveUses);
+ updatePressureDiffs(LiveUses);
+ }
+
+ DEBUG(
+ dbgs() << "Top Pressure:\n";
+ dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+ dbgs() << "Bottom Pressure:\n";
+ dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
+ );
+
+ assert(BotRPTracker.getPos() == RegionEnd && "Can't find the region bottom");
+
+ // Cache the list of excess pressure sets in this region. This will also track
+ // the max pressure in the scheduled code for these sets.
+ RegionCriticalPSets.clear();
+ const std::vector<unsigned> &RegionPressure =
+ RPTracker.getPressure().MaxSetPressure;
+ for (unsigned i = 0, e = RegionPressure.size(); i < e; ++i) {
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(i);
+ if (RegionPressure[i] > Limit) {
+ DEBUG(dbgs() << TRI->getRegPressureSetName(i)
+ << " Limit " << Limit
+ << " Actual " << RegionPressure[i] << "\n");
+ RegionCriticalPSets.push_back(PressureChange(i));
+ }
+ }
+ DEBUG(dbgs() << "Excess PSets: ";
+ for (unsigned i = 0, e = RegionCriticalPSets.size(); i != e; ++i)
+ dbgs() << TRI->getRegPressureSetName(
+ RegionCriticalPSets[i].getPSet()) << " ";
+ dbgs() << "\n");
+}
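+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the excess-set detection above reduces to flagging every pressure set whose
+// recorded maximum exceeds its target-provided limit. The helper below is
+// hypothetical and operates on plain vectors rather than LLVM types; it
+// assumes both vectors have one entry per pressure set.
+#include <vector>
+
+static std::vector<unsigned>
+sketchFindExcessPressureSets(const std::vector<unsigned> &MaxPressure,
+ const std::vector<unsigned> &Limits) {
+ std::vector<unsigned> Excess;
+ for (unsigned i = 0, e = (unsigned)MaxPressure.size(); i != e; ++i) {
+ // A set is "critical" for the region when its max pressure is above the
+ // limit, mirroring how RegionCriticalPSets is populated above.
+ if (MaxPressure[i] > Limits[i])
+ Excess.push_back(i);
+ }
+ return Excess;
+}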
+
+void ScheduleDAGMILive::
+updateScheduledPressure(const SUnit *SU,
+ const std::vector<unsigned> &NewMaxPressure) {
+ const PressureDiff &PDiff = getPressureDiff(SU);
+ unsigned CritIdx = 0, CritEnd = RegionCriticalPSets.size();
+ for (PressureDiff::const_iterator I = PDiff.begin(), E = PDiff.end();
+ I != E; ++I) {
+ if (!I->isValid())
+ break;
+ unsigned ID = I->getPSet();
+ while (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() < ID)
+ ++CritIdx;
+ if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
+ if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
+ && NewMaxPressure[ID] <= INT16_MAX)
+ RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
+ }
+ unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
+ if (NewMaxPressure[ID] >= Limit - 2) {
+ DEBUG(dbgs() << " " << TRI->getRegPressureSetName(ID) << ": "
+ << NewMaxPressure[ID]
+ << ((NewMaxPressure[ID] > Limit) ? " > " : " <= ") << Limit
+ << "(+ " << BotRPTracker.getLiveThru()[ID] << " livethru)\n");
+ }
+ }
+}
+
+/// Update the PressureDiff array for liveness after scheduling this
+/// instruction.
+void ScheduleDAGMILive::updatePressureDiffs(
+ ArrayRef<RegisterMaskPair> LiveUses) {
+ for (const RegisterMaskPair &P : LiveUses) {
+ unsigned Reg = P.RegUnit;
+ /// FIXME: Currently assuming single-use physregs.
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+
+ if (ShouldTrackLaneMasks) {
+ // If the register has just become live then other uses won't change
+ // this fact anymore => decrement pressure.
+ // If the register has just become dead then other uses make it come
+ // back to life => increment pressure.
+ bool Decrement = P.LaneMask != 0;
+
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit &SU = *V2SU.SU;
+ if (SU.isScheduled || &SU == &ExitSU)
+ continue;
+
+ PressureDiff &PDiff = getPressureDiff(&SU);
+ PDiff.addPressureChange(Reg, Decrement, &MRI);
+ DEBUG(
+ dbgs() << " UpdateRegP: SU(" << SU.NodeNum << ") "
+ << PrintReg(Reg, TRI) << ':' << PrintLaneMask(P.LaneMask)
+ << ' ' << *SU.getInstr();
+ dbgs() << " to ";
+ PDiff.dump(*TRI);
+ );
+ }
+ } else {
+ assert(P.LaneMask != 0);
+ DEBUG(dbgs() << " LiveReg: " << PrintVRegOrUnit(Reg, TRI) << "\n");
+ // This may be called before CurrentBottom has been initialized. However,
+ // BotRPTracker must have a valid position. We want the value live into the
+ // instruction or live out of the block, so ask for the previous
+ // instruction's live-out.
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ VNInfo *VNI;
+ MachineBasicBlock::const_iterator I =
+ nextIfDebug(BotRPTracker.getPos(), BB->end());
+ if (I == BB->end())
+ VNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ else {
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*I));
+ VNI = LRQ.valueIn();
+ }
+ // RegisterPressureTracker guarantees that readsReg is true for LiveUses.
+ assert(VNI && "No live value at use.");
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit *SU = V2SU.SU;
+ // If this use comes before the reaching def, it cannot be a last use,
+ // so decrease its pressure change.
+ if (!SU->isScheduled && SU != &ExitSU) {
+ LiveQueryResult LRQ =
+ LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
+ if (LRQ.valueIn() == VNI) {
+ PressureDiff &PDiff = getPressureDiff(SU);
+ PDiff.addPressureChange(Reg, true, &MRI);
+ DEBUG(
+ dbgs() << " UpdateRegP: SU(" << SU->NodeNum << ") "
+ << *SU->getInstr();
+ dbgs() << " to ";
+ PDiff.dump(*TRI);
+ );
+ }
+ }
+ }
+ }
+ }
+}
+
+/// schedule - Called back from MachineScheduler::runOnMachineFunction
+/// after setting up the current scheduling region. [RegionBegin, RegionEnd)
+/// only includes instructions that have DAG nodes, not scheduling boundaries.
+///
+/// This is a skeletal driver, with all the functionality pushed into helpers,
+/// so that it can be easily extended by experimental schedulers. Generally,
+/// implementing MachineSchedStrategy should be sufficient to implement a new
+/// scheduling algorithm. However, if a scheduler further subclasses
+/// ScheduleDAGMILive then it will want to override this virtual method in order
+/// to update any specialized state.
+void ScheduleDAGMILive::schedule() {
+ DEBUG(dbgs() << "ScheduleDAGMILive::schedule starting\n");
+ DEBUG(SchedImpl->dumpPolicy());
+ buildDAGWithRegPressure();
+
+ Topo.InitDAGTopologicalSorting();
+
+ postprocessDAG();
+
+ SmallVector<SUnit*, 8> TopRoots, BotRoots;
+ findRootsAndBiasEdges(TopRoots, BotRoots);
+
+ // Initialize the strategy before modifying the DAG.
+ // This may initialize a DFSResult to be used for queue priority.
+ SchedImpl->initialize(this);
+
+ DEBUG(
+ for (const SUnit &SU : SUnits) {
+ SU.dumpAll(this);
+ if (ShouldTrackPressure) {
+ dbgs() << " Pressure Diff : ";
+ getPressureDiff(&SU).dump(*TRI);
+ }
+ dbgs() << '\n';
+ }
+ );
+ if (ViewMISchedDAGs) viewGraph();
+
+ // Initialize ready queues now that the DAG and priority data are finalized.
+ initQueues(TopRoots, BotRoots);
+
+ bool IsTopNode = false;
+ while (true) {
+ DEBUG(dbgs() << "** ScheduleDAGMILive::schedule picking next node\n");
+ SUnit *SU = SchedImpl->pickNode(IsTopNode);
+ if (!SU) break;
+
+ assert(!SU->isScheduled && "Node already scheduled");
+ if (!checkSchedLimit())
+ break;
+
+ scheduleMI(SU, IsTopNode);
+
+ if (DFSResult) {
+ unsigned SubtreeID = DFSResult->getSubtreeID(SU);
+ if (!ScheduledTrees.test(SubtreeID)) {
+ ScheduledTrees.set(SubtreeID);
+ DFSResult->scheduleTree(SubtreeID);
+ SchedImpl->scheduleTree(SubtreeID);
+ }
+ }
+
+ // Notify the scheduling strategy after updating the DAG.
+ SchedImpl->schedNode(SU, IsTopNode);
+
+ updateQueues(SU, IsTopNode);
+ }
+ assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
+
+ placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = begin()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+/// Build the DAG and set up three register pressure trackers.
+void ScheduleDAGMILive::buildDAGWithRegPressure() {
+ if (!ShouldTrackPressure) {
+ RPTracker.reset();
+ RegionCriticalPSets.clear();
+ buildSchedGraph(AA);
+ return;
+ }
+
+ // Initialize the register pressure tracker used by buildSchedGraph.
+ RPTracker.init(&MF, RegClassInfo, LIS, BB, LiveRegionEnd,
+ ShouldTrackLaneMasks, /*TrackUntiedDefs=*/true);
+
+ // Account for liveness generated by the region boundary.
+ if (LiveRegionEnd != RegionEnd)
+ RPTracker.recede();
+
+ // Build the DAG, and compute current register pressure.
+ buildSchedGraph(AA, &RPTracker, &SUPressureDiffs, LIS, ShouldTrackLaneMasks);
+
+ // Initialize top/bottom trackers after computing region pressure.
+ initRegPressure();
+}
+
+void ScheduleDAGMILive::computeDFSResult() {
+ if (!DFSResult)
+ DFSResult = new SchedDFSResult(/*BottomU*/true, MinSubtreeSize);
+ DFSResult->clear();
+ ScheduledTrees.clear();
+ DFSResult->resize(SUnits.size());
+ DFSResult->compute(SUnits);
+ ScheduledTrees.resize(DFSResult->getNumSubtrees());
+}
+
+/// Compute the max cyclic critical path through the DAG. The scheduling DAG
+/// only provides the critical path for single block loops. To handle loops that
+/// span blocks, we could use the vreg path latencies provided by
+/// MachineTraceMetrics instead. However, MachineTraceMetrics is not currently
+/// available for use in the scheduler.
+///
+/// The cyclic path estimation identifies a def-use pair that crosses the back
+/// edge and considers the depth and height of the nodes. For example, consider
+/// the following instruction sequence where each instruction has unit latency
+/// and defines an eponymous virtual register:
+///
+/// a->b(a,c)->c(b)->d(c)->exit
+///
+/// The cyclic critical path is two cycles: b->c->b
+/// The acyclic critical path is four cycles: a->b->c->d->exit
+/// LiveOutHeight = height(c) = len(c->d->exit) = 2
+/// LiveOutDepth = depth(c) + 1 = len(a->b->c) + 1 = 3
+/// LiveInHeight = height(b) + 1 = len(b->c->d->exit) + 1 = 4
+/// LiveInDepth = depth(b) = len(a->b) = 1
+///
+/// LiveOutDepth - LiveInDepth = 3 - 1 = 2
+/// LiveInHeight - LiveOutHeight = 4 - 2 = 2
+/// CyclicCriticalPath = min(2, 2) = 2
+///
+/// This could be relevant to PostRA scheduling, but is currently implemented
+/// assuming LiveIntervals.
+unsigned ScheduleDAGMILive::computeCyclicCriticalPath() {
+ // This only applies to single-block loops.
+ if (!BB->isSuccessor(BB))
+ return 0;
+
+ unsigned MaxCyclicLatency = 0;
+ // Visit each live out vreg def to find def/use pairs that cross iterations.
+ for (const RegisterMaskPair &P : RPTracker.getPressure().LiveOutRegs) {
+ unsigned Reg = P.RegUnit;
+ if (!TRI->isVirtualRegister(Reg))
+ continue;
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const VNInfo *DefVNI = LI.getVNInfoBefore(LIS->getMBBEndIdx(BB));
+ if (!DefVNI)
+ continue;
+
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(DefVNI->def);
+ const SUnit *DefSU = getSUnit(DefMI);
+ if (!DefSU)
+ continue;
+
+ unsigned LiveOutHeight = DefSU->getHeight();
+ unsigned LiveOutDepth = DefSU->getDepth() + DefSU->Latency;
+ // Visit all local users of the vreg def.
+ for (const VReg2SUnit &V2SU
+ : make_range(VRegUses.find(Reg), VRegUses.end())) {
+ SUnit *SU = V2SU.SU;
+ if (SU == &ExitSU)
+ continue;
+
+ // Only consider uses of the phi.
+ LiveQueryResult LRQ = LI.Query(LIS->getInstructionIndex(*SU->getInstr()));
+ if (!LRQ.valueIn()->isPHIDef())
+ continue;
+
+ // Assume that a path spanning two iterations is a cycle, which could
+ // overestimate in strange cases. This allows cyclic latency to be
+ // estimated as the minimum slack of the vreg's depth or height.
+ unsigned CyclicLatency = 0;
+ if (LiveOutDepth > SU->getDepth())
+ CyclicLatency = LiveOutDepth - SU->getDepth();
+
+ unsigned LiveInHeight = SU->getHeight() + DefSU->Latency;
+ if (LiveInHeight > LiveOutHeight) {
+ if (LiveInHeight - LiveOutHeight < CyclicLatency)
+ CyclicLatency = LiveInHeight - LiveOutHeight;
+ } else
+ CyclicLatency = 0;
+
+ DEBUG(dbgs() << "Cyclic Path: SU(" << DefSU->NodeNum << ") -> SU("
+ << SU->NodeNum << ") = " << CyclicLatency << "c\n");
+ if (CyclicLatency > MaxCyclicLatency)
+ MaxCyclicLatency = CyclicLatency;
+ }
+ }
+ DEBUG(dbgs() << "Cyclic Critical Path: " << MaxCyclicLatency << "c\n");
+ return MaxCyclicLatency;
+}
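+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the per-use cyclic latency computed above is the minimum of the depth slack
+// and the height slack, with each slack clamped at zero. The helper below is
+// hypothetical and reproduces the worked example from the comment
+// (a->b(a,c)->c(b)->d(c)->exit), where both slacks evaluate to 2.
+static unsigned sketchCyclicLatency(unsigned LiveOutDepth, unsigned UseDepth,
+ unsigned LiveInHeight, unsigned LiveOutHeight) {
+ unsigned DepthSlack = LiveOutDepth > UseDepth ? LiveOutDepth - UseDepth : 0;
+ unsigned HeightSlack =
+ LiveInHeight > LiveOutHeight ? LiveInHeight - LiveOutHeight : 0;
+ // A path spanning two iterations is treated as a cycle, so the estimate is
+ // the smaller of the two slacks (zero if either direction has none).
+ return DepthSlack < HeightSlack ? DepthSlack : HeightSlack;
+}
+// For the documented example, sketchCyclicLatency(3, 1, 4, 2) returns 2.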
+
+/// Release ExitSU predecessors and set up scheduler queues. Re-position
+/// the Top RP tracker in case the region beginning has changed.
+void ScheduleDAGMILive::initQueues(ArrayRef<SUnit*> TopRoots,
+ ArrayRef<SUnit*> BotRoots) {
+ ScheduleDAGMI::initQueues(TopRoots, BotRoots);
+ if (ShouldTrackPressure) {
+ assert(TopRPTracker.getPos() == RegionBegin && "bad initial Top tracker");
+ TopRPTracker.setPos(CurrentTop);
+ }
+}
+
+/// Move an instruction and update register pressure.
+void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
+ // Move the instruction to its new location in the instruction stream.
+ MachineInstr *MI = SU->getInstr();
+
+ if (IsTopNode) {
+ assert(SU->isTopReady() && "node still has unscheduled dependencies");
+ if (&*CurrentTop == MI)
+ CurrentTop = nextIfDebug(++CurrentTop, CurrentBottom);
+ else {
+ moveInstruction(MI, CurrentTop);
+ TopRPTracker.setPos(MI);
+ }
+
+ if (ShouldTrackPressure) {
+ // Update top scheduled pressure.
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ TopRPTracker.advance(RegOpers);
+ assert(TopRPTracker.getPos() == CurrentTop && "out of sync");
+ DEBUG(
+ dbgs() << "Top Pressure:\n";
+ dumpRegSetPressure(TopRPTracker.getRegSetPressureAtPos(), TRI);
+ );
+
+ updateScheduledPressure(SU, TopRPTracker.getPressure().MaxSetPressure);
+ }
+ } else {
+ assert(SU->isBottomReady() && "node still has unscheduled dependencies");
+ MachineBasicBlock::iterator priorII =
+ priorNonDebug(CurrentBottom, CurrentTop);
+ if (&*priorII == MI)
+ CurrentBottom = priorII;
+ else {
+ if (&*CurrentTop == MI) {
+ CurrentTop = nextIfDebug(++CurrentTop, priorII);
+ TopRPTracker.setPos(CurrentTop);
+ }
+ moveInstruction(MI, CurrentBottom);
+ CurrentBottom = MI;
+ }
+ if (ShouldTrackPressure) {
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+ if (ShouldTrackLaneMasks) {
+ // Adjust liveness and add missing dead+read-undef flags.
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+ } else {
+ // Adjust for missing dead-def flags.
+ RegOpers.detectDeadDefs(*MI, *LIS);
+ }
+
+ BotRPTracker.recedeSkipDebugValues();
+ SmallVector<RegisterMaskPair, 8> LiveUses;
+ BotRPTracker.recede(RegOpers, &LiveUses);
+ assert(BotRPTracker.getPos() == CurrentBottom && "out of sync");
+ DEBUG(
+ dbgs() << "Bottom Pressure:\n";
+ dumpRegSetPressure(BotRPTracker.getRegSetPressureAtPos(), TRI);
+ );
+
+ updateScheduledPressure(SU, BotRPTracker.getPressure().MaxSetPressure);
+ updatePressureDiffs(LiveUses);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// BaseMemOpClusterMutation - DAG post-processing to cluster loads or stores.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between neighboring
+/// loads or between neighboring stores.
+class BaseMemOpClusterMutation : public ScheduleDAGMutation {
+ struct MemOpInfo {
+ SUnit *SU;
+ unsigned BaseReg;
+ int64_t Offset;
+ MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
+ : SU(su), BaseReg(reg), Offset(ofs) {}
+
+ bool operator<(const MemOpInfo&RHS) const {
+ return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
+ }
+ };
+
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ bool IsLoad;
+
+public:
+ BaseMemOpClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri, bool IsLoad)
+ : TII(tii), TRI(tri), IsLoad(IsLoad) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+protected:
+ void clusterNeighboringMemOps(ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG);
+};
+
+class StoreClusterMutation : public BaseMemOpClusterMutation {
+public:
+ StoreClusterMutation(const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, false) {}
+};
+
+class LoadClusterMutation : public BaseMemOpClusterMutation {
+public:
+ LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
+ : BaseMemOpClusterMutation(tii, tri, true) {}
+};
+} // anonymous
+
+void BaseMemOpClusterMutation::clusterNeighboringMemOps(
+ ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
+ SmallVector<MemOpInfo, 32> MemOpRecords;
+ for (unsigned Idx = 0, End = MemOps.size(); Idx != End; ++Idx) {
+ SUnit *SU = MemOps[Idx];
+ unsigned BaseReg;
+ int64_t Offset;
+ if (TII->getMemOpBaseRegImmOfs(*SU->getInstr(), BaseReg, Offset, TRI))
+ MemOpRecords.push_back(MemOpInfo(SU, BaseReg, Offset));
+ }
+ if (MemOpRecords.size() < 2)
+ return;
+
+ std::sort(MemOpRecords.begin(), MemOpRecords.end());
+ unsigned ClusterLength = 1;
+ for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
+ if (MemOpRecords[Idx].BaseReg != MemOpRecords[Idx+1].BaseReg) {
+ ClusterLength = 1;
+ continue;
+ }
+
+ SUnit *SUa = MemOpRecords[Idx].SU;
+ SUnit *SUb = MemOpRecords[Idx+1].SU;
+ if (TII->shouldClusterMemOps(*SUa->getInstr(), *SUb->getInstr(),
+ ClusterLength) &&
+ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
+ DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
+ // Copy successor edges from SUa to SUb. Interleaving computation
+ // dependent on SUa can prevent load combining due to register reuse.
+ // Predecessor edges do not need to be copied from SUb to SUa since nearby
+ // loads should have effectively the same inputs.
+ for (SUnit::const_succ_iterator
+ SI = SUa->Succs.begin(), SE = SUa->Succs.end(); SI != SE; ++SI) {
+ if (SI->getSUnit() == SUb)
+ continue;
+ DEBUG(dbgs() << " Copy Succ SU(" << SI->getSUnit()->NodeNum << ")\n");
+ DAG->addEdge(SI->getSUnit(), SDep(SUb, SDep::Artificial));
+ }
+ ++ClusterLength;
+ } else
+ ClusterLength = 1;
+ }
+}
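+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the clustering above relies on sorting (BaseReg, Offset) records so that
+// memory operations off the same base register become adjacent, in offset
+// order. A hypothetical, self-contained version of that ordering:
+#include <algorithm>
+#include <cstdint>
+#include <tuple>
+#include <vector>
+
+namespace memop_sketch {
+struct MemRecord {
+ unsigned BaseReg;
+ int64_t Offset;
+ bool operator<(const MemRecord &RHS) const {
+ // Lexicographic order: group by base register first, then by offset,
+ // exactly the property the clustering loop above depends on.
+ return std::tie(BaseReg, Offset) < std::tie(RHS.BaseReg, RHS.Offset);
+ }
+};
+inline void sortMemRecords(std::vector<MemRecord> &Records) {
+ std::sort(Records.begin(), Records.end());
+}
+} // namespace memop_sketch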
+
+/// \brief Callback from DAG postProcessing to create cluster edges for loads
+/// or stores.
+void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
+
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ // Map DAG NodeNum to store chain ID.
+ DenseMap<unsigned, unsigned> StoreChainIDs;
+ // Map each store chain to a set of dependent MemOps.
+ SmallVector<SmallVector<SUnit*,4>, 32> StoreChainDependents;
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ SUnit *SU = &DAG->SUnits[Idx];
+ if ((IsLoad && !SU->getInstr()->mayLoad()) ||
+ (!IsLoad && !SU->getInstr()->mayStore()))
+ continue;
+
+ unsigned ChainPredID = DAG->SUnits.size();
+ for (SUnit::const_pred_iterator
+ PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+ if (PI->isCtrl()) {
+ ChainPredID = PI->getSUnit()->NodeNum;
+ break;
+ }
+ }
+ // Check if this chain-like pred has been seen
+ // before. ChainPredID==MaxNodeID at the top of the schedule.
+ unsigned NumChains = StoreChainDependents.size();
+ std::pair<DenseMap<unsigned, unsigned>::iterator, bool> Result =
+ StoreChainIDs.insert(std::make_pair(ChainPredID, NumChains));
+ if (Result.second)
+ StoreChainDependents.resize(NumChains + 1);
+ StoreChainDependents[Result.first->second].push_back(SU);
+ }
+
+ // Iterate over the store chains.
+ for (unsigned Idx = 0, End = StoreChainDependents.size(); Idx != End; ++Idx)
+ clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
+}
+
+//===----------------------------------------------------------------------===//
+// MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class MacroFusion : public ScheduleDAGMutation {
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+public:
+ MacroFusion(const TargetInstrInfo &TII, const TargetRegisterInfo &TRI)
+ : TII(TII), TRI(TRI) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+} // anonymous
+
+/// Returns true if \p MI reads a register written by \p Other.
+static bool HasDataDep(const TargetRegisterInfo &TRI, const MachineInstr &MI,
+ const MachineInstr &Other) {
+ for (const MachineOperand &MO : MI.uses()) {
+ if (!MO.isReg() || !MO.readsReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (Other.modifiesRegister(Reg, &TRI))
+ return true;
+ }
+ return false;
+}
+
+/// \brief Callback from DAG postProcessing to create cluster edges to encourage
+/// fused operations.
+void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+ // For now, assume targets can only fuse with the branch.
+ SUnit &ExitSU = DAG->ExitSU;
+ MachineInstr *Branch = ExitSU.getInstr();
+ if (!Branch)
+ return;
+
+ for (SUnit &SU : DAG->SUnits) {
+ // SUnits with successors can't be scheduled in front of the ExitSU.
+ if (!SU.Succs.empty())
+ continue;
+ // We only care if the node writes to a register that the branch reads.
+ MachineInstr *Pred = SU.getInstr();
+ if (!HasDataDep(TRI, *Branch, *Pred))
+ continue;
+
+ if (!TII.shouldScheduleAdjacent(*Pred, *Branch))
+ continue;
+
+ // Create a single weak edge from SU to ExitSU. The only effect is to cause
+ // bottom-up scheduling to heavily prioritize the clustered SU. There is no
+ // need to copy predecessor edges from ExitSU to SU, since top-down
+ // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+ // of SU, we could create an artificial edge from the deepest root, but it
+ // hasn't been needed yet.
+ bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+ (void)Success;
+ assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+ DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
+ break;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// CopyConstrain - DAG post-processing to encourage copy elimination.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Post-process the DAG to create weak edges from all uses of a copy to
+/// the one use that defines the copy's source vreg, most likely an induction
+/// variable increment.
+class CopyConstrain : public ScheduleDAGMutation {
+ // Transient state.
+ SlotIndex RegionBeginIdx;
+ // RegionEndIdx is the slot index of the last non-debug instruction in the
+ // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
+ SlotIndex RegionEndIdx;
+public:
+ CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
+
+ void apply(ScheduleDAGInstrs *DAGInstrs) override;
+
+protected:
+ void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
+};
+} // anonymous
+
+/// constrainLocalCopy handles two possibilities:
+/// 1) Local src:
+/// I0: = dst
+/// I1: src = ...
+/// I2: = dst
+/// I3: dst = src (copy)
+/// (create pred->succ edges I0->I1, I2->I1)
+///
+/// 2) Local copy:
+/// I0: dst = src (copy)
+/// I1: = dst
+/// I2: src = ...
+/// I3: = dst
+/// (create pred->succ edges I1->I2, I3->I2)
+///
+/// Although the MachineScheduler is currently constrained to single blocks,
+/// this algorithm should handle extended blocks. An EBB is a set of
+/// contiguously numbered blocks such that the previous block in the EBB is
+/// always the single predecessor.
+void CopyConstrain::constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG) {
+ LiveIntervals *LIS = DAG->getLIS();
+ MachineInstr *Copy = CopySU->getInstr();
+
+ // Check for pure vreg copies.
+ const MachineOperand &SrcOp = Copy->getOperand(1);
+ unsigned SrcReg = SrcOp.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) || !SrcOp.readsReg())
+ return;
+
+ const MachineOperand &DstOp = Copy->getOperand(0);
+ unsigned DstReg = DstOp.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg) || DstOp.isDead())
+ return;
+
+ // Check if either the dest or source is local. If it's live across a back
+ // edge, it's not local. Note that if both vregs are live across the back
+ // edge, we cannot successfully constrain the copy without cyclic scheduling.
+ // If both the copy's source and dest are local live intervals, then we
+ // should treat the dest as the global for the purpose of adding
+ // constraints. This adds edges from source's other uses to the copy.
+ unsigned LocalReg = SrcReg;
+ unsigned GlobalReg = DstReg;
+ LiveInterval *LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx)) {
+ LocalReg = DstReg;
+ GlobalReg = SrcReg;
+ LocalLI = &LIS->getInterval(LocalReg);
+ if (!LocalLI->isLocal(RegionBeginIdx, RegionEndIdx))
+ return;
+ }
+ LiveInterval *GlobalLI = &LIS->getInterval(GlobalReg);
+
+ // Find the global segment after the start of the local LI.
+ LiveInterval::iterator GlobalSegment = GlobalLI->find(LocalLI->beginIndex());
+ // If GlobalLI does not overlap LocalLI->start, then a copy directly feeds a
+ // local live range. We could create edges from other global uses to the local
+ // start, but the coalescer should have already eliminated these cases, so
+ // don't bother dealing with it.
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // If GlobalSegment is killed at the LocalLI->start, the call to find()
+ // returned the next global segment. But if GlobalSegment overlaps with
+ // LocalLI->start, then advance to the next segment. If a hole in GlobalLI
+ // exists in LocalLI's vicinity, GlobalSegment will be the end of the hole.
+ if (GlobalSegment->contains(LocalLI->beginIndex()))
+ ++GlobalSegment;
+
+ if (GlobalSegment == GlobalLI->end())
+ return;
+
+ // Check if GlobalLI contains a hole in the vicinity of LocalLI.
+ if (GlobalSegment != GlobalLI->begin()) {
+ // Two address defs have no hole.
+ if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->end,
+ GlobalSegment->start)) {
+ return;
+ }
+ // If the prior global segment may be defined by the same two-address
+ // instruction that also defines LocalLI, then we can't make a hole here.
+ if (SlotIndex::isSameInstr(std::prev(GlobalSegment)->start,
+ LocalLI->beginIndex())) {
+ return;
+ }
+ // If GlobalLI has a prior segment, it must be live into the EBB. Otherwise
+ // it would be a disconnected component in the live range.
+ assert(std::prev(GlobalSegment)->start < LocalLI->beginIndex() &&
+ "Disconnected LRG within the scheduling region.");
+ }
+ MachineInstr *GlobalDef = LIS->getInstructionFromIndex(GlobalSegment->start);
+ if (!GlobalDef)
+ return;
+
+ SUnit *GlobalSU = DAG->getSUnit(GlobalDef);
+ if (!GlobalSU)
+ return;
+
+ // GlobalDef is the bottom of the GlobalLI hole. Open the hole by
+ // constraining the uses of the last local def to precede GlobalDef.
+ SmallVector<SUnit*,8> LocalUses;
+ const VNInfo *LastLocalVN = LocalLI->getVNInfoBefore(LocalLI->endIndex());
+ MachineInstr *LastLocalDef = LIS->getInstructionFromIndex(LastLocalVN->def);
+ SUnit *LastLocalSU = DAG->getSUnit(LastLocalDef);
+ for (SUnit::const_succ_iterator
+ I = LastLocalSU->Succs.begin(), E = LastLocalSU->Succs.end();
+ I != E; ++I) {
+ if (I->getKind() != SDep::Data || I->getReg() != LocalReg)
+ continue;
+ if (I->getSUnit() == GlobalSU)
+ continue;
+ if (!DAG->canAddEdge(GlobalSU, I->getSUnit()))
+ return;
+ LocalUses.push_back(I->getSUnit());
+ }
+ // Open the top of the GlobalLI hole by constraining any earlier global uses
+ // to precede the start of LocalLI.
+ SmallVector<SUnit*,8> GlobalUses;
+ MachineInstr *FirstLocalDef =
+ LIS->getInstructionFromIndex(LocalLI->beginIndex());
+ SUnit *FirstLocalSU = DAG->getSUnit(FirstLocalDef);
+ for (SUnit::const_pred_iterator
+ I = GlobalSU->Preds.begin(), E = GlobalSU->Preds.end(); I != E; ++I) {
+ if (I->getKind() != SDep::Anti || I->getReg() != GlobalReg)
+ continue;
+ if (I->getSUnit() == FirstLocalSU)
+ continue;
+ if (!DAG->canAddEdge(FirstLocalSU, I->getSUnit()))
+ return;
+ GlobalUses.push_back(I->getSUnit());
+ }
+ DEBUG(dbgs() << "Constraining copy SU(" << CopySU->NodeNum << ")\n");
+ // Add the weak edges.
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = LocalUses.begin(), E = LocalUses.end(); I != E; ++I) {
+ DEBUG(dbgs() << " Local use SU(" << (*I)->NodeNum << ") -> SU("
+ << GlobalSU->NodeNum << ")\n");
+ DAG->addEdge(GlobalSU, SDep(*I, SDep::Weak));
+ }
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = GlobalUses.begin(), E = GlobalUses.end(); I != E; ++I) {
+ DEBUG(dbgs() << " Global use SU(" << (*I)->NodeNum << ") -> SU("
+ << FirstLocalSU->NodeNum << ")\n");
+ DAG->addEdge(FirstLocalSU, SDep(*I, SDep::Weak));
+ }
+}
+
+/// \brief Callback from DAG postProcessing to create weak edges to encourage
+/// copy elimination.
+void CopyConstrain::apply(ScheduleDAGInstrs *DAGInstrs) {
+ ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+ assert(DAG->hasVRegLiveness() && "Expect VRegs with LiveIntervals");
+
+ MachineBasicBlock::iterator FirstPos = nextIfDebug(DAG->begin(), DAG->end());
+ if (FirstPos == DAG->end())
+ return;
+ RegionBeginIdx = DAG->getLIS()->getInstructionIndex(*FirstPos);
+ RegionEndIdx = DAG->getLIS()->getInstructionIndex(
+ *priorNonDebug(DAG->end(), DAG->begin()));
+
+ for (unsigned Idx = 0, End = DAG->SUnits.size(); Idx != End; ++Idx) {
+ SUnit *SU = &DAG->SUnits[Idx];
+ if (!SU->getInstr()->isCopy())
+ continue;
+
+ constrainLocalCopy(SU, static_cast<ScheduleDAGMILive*>(DAG));
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// MachineSchedStrategy helpers used by GenericScheduler, GenericPostScheduler
+// and possibly other custom schedulers.
+//===----------------------------------------------------------------------===//
+
+static const unsigned InvalidCycle = ~0U;
+
+SchedBoundary::~SchedBoundary() { delete HazardRec; }
+
+void SchedBoundary::reset() {
+ // A new HazardRec is created for each DAG and owned by SchedBoundary.
+ // Destroying and reconstructing it is very expensive though. So keep
+ // invalid, placeholder HazardRecs.
+ if (HazardRec && HazardRec->isEnabled()) {
+ delete HazardRec;
+ HazardRec = nullptr;
+ }
+ Available.clear();
+ Pending.clear();
+ CheckPending = false;
+ NextSUs.clear();
+ CurrCycle = 0;
+ CurrMOps = 0;
+ MinReadyCycle = UINT_MAX;
+ ExpectedLatency = 0;
+ DependentLatency = 0;
+ RetiredMOps = 0;
+ MaxExecutedResCount = 0;
+ ZoneCritResIdx = 0;
+ IsResourceLimited = false;
+ ReservedCycles.clear();
+#ifndef NDEBUG
+ // Track the maximum number of stall cycles that could arise either from the
+ // latency of a DAG edge or the number of cycles that a processor resource is
+ // reserved (SchedBoundary::ReservedCycles).
+ MaxObservedStall = 0;
+#endif
+ // Reserve a zero-count for invalid CritResIdx.
+ ExecutedResCounts.resize(1);
+ assert(!ExecutedResCounts[0] && "nonzero count for bad resource");
+}
+
+void SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+ reset();
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+ RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
+ for (std::vector<SUnit>::iterator
+ I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
+ RemIssueCount += SchedModel->getNumMicroOps(I->getInstr(), SC)
+ * SchedModel->getMicroOpFactor();
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ RemainingCounts[PIdx] += (Factor * PI->Cycles);
+ }
+ }
+}
+
+void SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+ reset();
+ DAG = dag;
+ SchedModel = smodel;
+ Rem = rem;
+ if (SchedModel->hasInstrSchedModel()) {
+ ExecutedResCounts.resize(SchedModel->getNumProcResourceKinds());
+ ReservedCycles.resize(SchedModel->getNumProcResourceKinds(), InvalidCycle);
+ }
+}
+
+/// Compute the stall cycles based on this SUnit's ready time. Heuristics treat
+/// these "soft stalls" differently than the hard stall cycles based on CPU
+/// resources and computed by checkHazard(). A fully in-order model
+/// (MicroOpBufferSize==0) will not make use of this since instructions are not
+/// available for scheduling until they are ready. However, a weaker in-order
+/// model may use this for heuristics. For example, if a processor has in-order
+/// behavior when reading certain resources, this may come into play.
+unsigned SchedBoundary::getLatencyStallCycles(SUnit *SU) {
+ if (!SU->isUnbuffered)
+ return 0;
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ if (ReadyCycle > CurrCycle)
+ return ReadyCycle - CurrCycle;
+ return 0;
+}
+
+/// Compute the next cycle at which the given processor resource can be
+/// scheduled.
+unsigned SchedBoundary::
+getNextResourceCycle(unsigned PIdx, unsigned Cycles) {
+ unsigned NextUnreserved = ReservedCycles[PIdx];
+ // If this resource has never been used, always return cycle zero.
+ if (NextUnreserved == InvalidCycle)
+ return 0;
+ // For bottom-up scheduling add the cycles needed for the current operation.
+ if (!isTop())
+ NextUnreserved += Cycles;
+ return NextUnreserved;
+}
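+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// a hypothetical reservation-table lookup with the same shape as
+// getNextResourceCycle. An entry that was never used answers "cycle zero";
+// for bottom-up scheduling the current operation's own cycles are added.
+static unsigned sketchNextResourceCycle(unsigned ReservedUntil,
+ unsigned InvalidMark, bool TopDown,
+ unsigned OpCycles) {
+ if (ReservedUntil == InvalidMark)
+ return 0; // Resource not yet used in this zone.
+ return TopDown ? ReservedUntil : ReservedUntil + OpCycles;
+}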
+
+/// Does this SU have a hazard within the current instruction group.
+///
+/// The scheduler supports two modes of hazard recognition. The first is the
+/// ScheduleHazardRecognizer API. It is a fully general hazard recognizer that
+/// supports highly complicated in-order reservation tables
+/// (ScoreboardHazardRecognizer) and arbitrary target-specific logic.
+///
+/// The second is a streamlined mechanism that checks for hazards based on
+/// simple counters that the scheduler itself maintains. It explicitly checks
+/// for instruction dispatch limitations, including the number of micro-ops that
+/// can dispatch per cycle.
+///
+/// TODO: Also check whether the SU must start a new group.
+bool SchedBoundary::checkHazard(SUnit *SU) {
+ if (HazardRec->isEnabled()
+ && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
+ return true;
+ }
+ unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
+ if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
+ return true;
+ }
+ if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned NRCycle = getNextResourceCycle(PI->ProcResourceIdx, PI->Cycles);
+ if (NRCycle > CurrCycle) {
+#ifndef NDEBUG
+ MaxObservedStall = std::max(PI->Cycles, MaxObservedStall);
+#endif
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") "
+ << SchedModel->getResourceName(PI->ProcResourceIdx)
+ << "=" << NRCycle << "c\n");
+ return true;
+ }
+ }
+ }
+ return false;
+}
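+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the counter-based part of the hazard check above amounts to asking whether
+// the candidate's micro-ops still fit into the current issue group. Names
+// below are hypothetical.
+static bool sketchExceedsIssueWidth(unsigned CurrMOps, unsigned CandMOps,
+ unsigned IssueWidth) {
+ // A fresh group (CurrMOps == 0) always accepts the instruction; otherwise
+ // the group must not be pushed past the machine's issue width.
+ return CurrMOps > 0 && CurrMOps + CandMOps > IssueWidth;
+}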
+
+// Find the unscheduled node in ReadySUs with the highest latency.
+unsigned SchedBoundary::
+findMaxLatency(ArrayRef<SUnit*> ReadySUs) {
+ SUnit *LateSU = nullptr;
+ unsigned RemLatency = 0;
+ for (ArrayRef<SUnit*>::iterator I = ReadySUs.begin(), E = ReadySUs.end();
+ I != E; ++I) {
+ unsigned L = getUnscheduledLatency(*I);
+ if (L > RemLatency) {
+ RemLatency = L;
+ LateSU = *I;
+ }
+ }
+ if (LateSU) {
+ DEBUG(dbgs() << Available.getName() << " RemLatency SU("
+ << LateSU->NodeNum << ") " << RemLatency << "c\n");
+ }
+ return RemLatency;
+}
+
+// Count resources in this zone and the remaining unscheduled
+// instructions. Return the max count, scaled. Set OtherCritIdx to the critical
+// resource index, or zero if the zone is issue limited.
+unsigned SchedBoundary::
+getOtherResourceCount(unsigned &OtherCritIdx) {
+ OtherCritIdx = 0;
+ if (!SchedModel->hasInstrSchedModel())
+ return 0;
+
+ unsigned OtherCritCount = Rem->RemIssueCount
+ + (RetiredMOps * SchedModel->getMicroOpFactor());
+ DEBUG(dbgs() << " " << Available.getName() << " + Remain MOps: "
+ << OtherCritCount / SchedModel->getMicroOpFactor() << '\n');
+ for (unsigned PIdx = 1, PEnd = SchedModel->getNumProcResourceKinds();
+ PIdx != PEnd; ++PIdx) {
+ unsigned OtherCount = getResourceCount(PIdx) + Rem->RemainingCounts[PIdx];
+ if (OtherCount > OtherCritCount) {
+ OtherCritCount = OtherCount;
+ OtherCritIdx = PIdx;
+ }
+ }
+ if (OtherCritIdx) {
+ DEBUG(dbgs() << " " << Available.getName() << " + Remain CritRes: "
+ << OtherCritCount / SchedModel->getResourceFactor(OtherCritIdx)
+ << " " << SchedModel->getResourceName(OtherCritIdx) << "\n");
+ }
+ return OtherCritCount;
+}
+
+void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
+ assert(SU->getInstr() && "Scheduled SUnit must have instr");
+
+#ifndef NDEBUG
+ // ReadyCycle has been bumped up to CurrCycle when this node was
+ // scheduled, but CurrCycle may have been eagerly advanced immediately after
+ // scheduling, so may now be greater than ReadyCycle.
+ if (ReadyCycle > CurrCycle)
+ MaxObservedStall = std::max(ReadyCycle - CurrCycle, MaxObservedStall);
+#endif
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ // Check for interlocks first. For the purpose of other heuristics, an
+ // instruction that cannot issue appears as if it's not in the ReadyQueue.
+ bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+ if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU) ||
+ Available.size() >= ReadyListLimit)
+ Pending.push(SU);
+ else
+ Available.push(SU);
+
+ // Record this node as an immediate dependent of the scheduled node.
+ NextSUs.insert(SU);
+}
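+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the Pending/Available split above can be read as a single predicate. The
+// helper below is hypothetical.
+static bool sketchDeferToPending(bool IsBuffered, unsigned ReadyCycle,
+ unsigned CurrCycle, bool HasHazard,
+ unsigned AvailableSize, unsigned Limit) {
+ // An unbuffered (strictly in-order) zone may not expose a node before its
+ // ready cycle; a hazard or a full Available list also defers it.
+ return (!IsBuffered && ReadyCycle > CurrCycle) || HasHazard ||
+ AvailableSize >= Limit;
+}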
+
+void SchedBoundary::releaseTopNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ releaseNode(SU, SU->TopReadyCycle);
+}
+
+void SchedBoundary::releaseBottomNode(SUnit *SU) {
+ if (SU->isScheduled)
+ return;
+
+ releaseNode(SU, SU->BotReadyCycle);
+}
+
+/// Move the boundary of scheduled code by one cycle.
+void SchedBoundary::bumpCycle(unsigned NextCycle) {
+ if (SchedModel->getMicroOpBufferSize() == 0) {
+ assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+ if (MinReadyCycle > NextCycle)
+ NextCycle = MinReadyCycle;
+ }
+ // Update the current micro-ops, which will issue in the next cycle.
+ unsigned DecMOps = SchedModel->getIssueWidth() * (NextCycle - CurrCycle);
+ CurrMOps = (CurrMOps <= DecMOps) ? 0 : CurrMOps - DecMOps;
+
+ // Decrement DependentLatency based on the next cycle.
+ if ((NextCycle - CurrCycle) > DependentLatency)
+ DependentLatency = 0;
+ else
+ DependentLatency -= (NextCycle - CurrCycle);
+
+ if (!HazardRec->isEnabled()) {
+ // Bypass HazardRec virtual calls.
+ CurrCycle = NextCycle;
+ } else {
+ // Bypass getHazardType calls in case of long latency.
+ for (; CurrCycle != NextCycle; ++CurrCycle) {
+ if (isTop())
+ HazardRec->AdvanceCycle();
+ else
+ HazardRec->RecedeCycle();
+ }
+ }
+ CheckPending = true;
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ IsResourceLimited =
+ (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+ > (int)LFactor;
+
+ DEBUG(dbgs() << "Cycle: " << CurrCycle << ' ' << Available.getName() << '\n');
+}
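+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// the IsResourceLimited update above compares the critical resource count
+// against the scheduled latency once both are expressed in the same scaled
+// units. A hypothetical standalone form of that test:
+static bool sketchIsResourceLimited(unsigned CriticalCount,
+ unsigned ScheduledLatency,
+ unsigned LatencyFactor) {
+ // The zone is resource limited when the critical resource runs ahead of the
+ // latency-derived count by more than one full cycle's worth of units.
+ return (int)(CriticalCount - ScheduledLatency * LatencyFactor) >
+ (int)LatencyFactor;
+}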
+
+void SchedBoundary::incExecutedResources(unsigned PIdx, unsigned Count) {
+ ExecutedResCounts[PIdx] += Count;
+ if (ExecutedResCounts[PIdx] > MaxExecutedResCount)
+ MaxExecutedResCount = ExecutedResCounts[PIdx];
+}
+
+/// Add the given processor resource to this scheduled zone.
+///
+/// \param Cycles indicates the number of consecutive (non-pipelined) cycles
+/// during which this resource is consumed.
+///
+/// \return the next cycle at which the instruction may execute without
+/// oversubscribing resources.
+unsigned SchedBoundary::
+countResource(unsigned PIdx, unsigned Cycles, unsigned NextCycle) {
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ unsigned Count = Factor * Cycles;
+ DEBUG(dbgs() << " " << SchedModel->getResourceName(PIdx)
+ << " +" << Cycles << "x" << Factor << "u\n");
+
+ // Update Executed resources counts.
+ incExecutedResources(PIdx, Count);
+ assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+ Rem->RemainingCounts[PIdx] -= Count;
+
+ // Check if this resource exceeds the current critical resource. If so, it
+ // becomes the critical resource.
+ if (ZoneCritResIdx != PIdx && (getResourceCount(PIdx) > getCriticalCount())) {
+ ZoneCritResIdx = PIdx;
+ DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getResourceName(PIdx) << ": "
+ << getResourceCount(PIdx) / SchedModel->getLatencyFactor() << "c\n");
+ }
+ // For reserved resources, record the highest cycle using the resource.
+ unsigned NextAvailable = getNextResourceCycle(PIdx, Cycles);
+ if (NextAvailable > CurrCycle) {
+ DEBUG(dbgs() << " Resource conflict: "
+ << SchedModel->getProcResource(PIdx)->Name << " reserved until @"
+ << NextAvailable << "\n");
+ }
+ return NextAvailable;
+}
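+
+// Illustrative sketch (editorial addition, not part of the upstream sources):
+// resource usage is accumulated in scaled "factor" units so that resources of
+// different widths can be compared directly. Hypothetical helper:
+static unsigned sketchScaledResourceUnits(unsigned Factor, unsigned Cycles) {
+ // For example, a resource with factor 2 held for 3 cycles contributes 6
+ // units, matching the Count = Factor * Cycles computation above.
+ return Factor * Cycles;
+}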
+
+/// Move the boundary of scheduled code by one SUnit.
+void SchedBoundary::bumpNode(SUnit *SU) {
+ // Update the reservation table.
+ if (HazardRec->isEnabled()) {
+ if (!isTop() && SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+ HazardRec->EmitInstruction(SU);
+ }
+ // checkHazard should prevent scheduling multiple instructions per cycle that
+ // exceed the issue width.
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ unsigned IncMOps = SchedModel->getNumMicroOps(SU->getInstr());
+ assert(
+ (CurrMOps == 0 || (CurrMOps + IncMOps) <= SchedModel->getIssueWidth()) &&
+ "Cannot schedule this instruction's MicroOps in the current cycle.");
+
+ unsigned ReadyCycle = (isTop() ? SU->TopReadyCycle : SU->BotReadyCycle);
+ DEBUG(dbgs() << " Ready @" << ReadyCycle << "c\n");
+
+ unsigned NextCycle = CurrCycle;
+ switch (SchedModel->getMicroOpBufferSize()) {
+ case 0:
+ assert(ReadyCycle <= CurrCycle && "Broken PendingQueue");
+ break;
+ case 1:
+ if (ReadyCycle > NextCycle) {
+ NextCycle = ReadyCycle;
+ DEBUG(dbgs() << " *** Stall until: " << ReadyCycle << "\n");
+ }
+ break;
+ default:
+ // We don't currently model the OOO reorder buffer, so consider all
+ // scheduled MOps to be "retired". We do loosely model in-order resource
+ // latency. If this instruction uses an in-order resource, account for any
+ // likely stall cycles.
+ if (SU->isUnbuffered && ReadyCycle > NextCycle)
+ NextCycle = ReadyCycle;
+ break;
+ }
+ RetiredMOps += IncMOps;
+
+ // Update resource counts and critical resource.
+ if (SchedModel->hasInstrSchedModel()) {
+ unsigned DecRemIssue = IncMOps * SchedModel->getMicroOpFactor();
+ assert(Rem->RemIssueCount >= DecRemIssue && "MOps double counted");
+ Rem->RemIssueCount -= DecRemIssue;
+ if (ZoneCritResIdx) {
+ // Scale scheduled micro-ops for comparing with the critical resource.
+ unsigned ScaledMOps =
+ RetiredMOps * SchedModel->getMicroOpFactor();
+
+ // If scaled micro-ops are now more than the previous critical resource by
+ // a full cycle, then micro-ops issue becomes critical.
+ if ((int)(ScaledMOps - getResourceCount(ZoneCritResIdx))
+ >= (int)SchedModel->getLatencyFactor()) {
+ ZoneCritResIdx = 0;
+ DEBUG(dbgs() << " *** Critical resource NumMicroOps: "
+ << ScaledMOps / SchedModel->getLatencyFactor() << "c\n");
+ }
+ }
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned RCycle =
+ countResource(PI->ProcResourceIdx, PI->Cycles, NextCycle);
+ if (RCycle > NextCycle)
+ NextCycle = RCycle;
+ }
+ if (SU->hasReservedResource) {
+ // For reserved resources, record the highest cycle using the resource.
+ // For top-down scheduling, this is the cycle in which we schedule this
+ // instruction plus the number of cycles the operation reserves the
+ // resource. For bottom-up it is simply the instruction's cycle.
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ if (SchedModel->getProcResource(PIdx)->BufferSize == 0) {
+ if (isTop()) {
+ ReservedCycles[PIdx] =
+ std::max(getNextResourceCycle(PIdx, 0), NextCycle + PI->Cycles);
+ }
+ else
+ ReservedCycles[PIdx] = NextCycle;
+ }
+ }
+ }
+ }
+ // Update ExpectedLatency and DependentLatency.
+ unsigned &TopLatency = isTop() ? ExpectedLatency : DependentLatency;
+ unsigned &BotLatency = isTop() ? DependentLatency : ExpectedLatency;
+ if (SU->getDepth() > TopLatency) {
+ TopLatency = SU->getDepth();
+ DEBUG(dbgs() << " " << Available.getName()
+ << " TopLatency SU(" << SU->NodeNum << ") " << TopLatency << "c\n");
+ }
+ if (SU->getHeight() > BotLatency) {
+ BotLatency = SU->getHeight();
+ DEBUG(dbgs() << " " << Available.getName()
+ << " BotLatency SU(" << SU->NodeNum << ") " << BotLatency << "c\n");
+ }
+ // If we stall for any reason, bump the cycle.
+ if (NextCycle > CurrCycle) {
+ bumpCycle(NextCycle);
+ } else {
+ // After updating ZoneCritResIdx and ExpectedLatency, check if we're
+ // resource limited. If a stall occurred, bumpCycle does this.
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ IsResourceLimited =
+ (int)(getCriticalCount() - (getScheduledLatency() * LFactor))
+ > (int)LFactor;
+ }
+ // Update CurrMOps after calling bumpCycle to handle stalls, since bumpCycle
+ // resets CurrMOps. Loop to handle instructions with more MOps than issue in
+ // one cycle. Since we commonly reach the max MOps here, opportunistically
+ // bump the cycle to avoid uselessly checking everything in the readyQ.
+ CurrMOps += IncMOps;
+ while (CurrMOps >= SchedModel->getIssueWidth()) {
+ DEBUG(dbgs() << " *** Max MOps " << CurrMOps
+ << " at cycle " << CurrCycle << '\n');
+ bumpCycle(++NextCycle);
+ }
+ DEBUG(dumpScheduledState());
+}
+
+/// Release pending ready nodes into the available queue. This makes them
+/// visible to heuristics.
+void SchedBoundary::releasePending() {
+ // If the available queue is empty, it is safe to reset MinReadyCycle.
+ if (Available.empty())
+ MinReadyCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
+ for (unsigned i = 0, e = Pending.size(); i != e; ++i) {
+ SUnit *SU = *(Pending.begin()+i);
+ unsigned ReadyCycle = isTop() ? SU->TopReadyCycle : SU->BotReadyCycle;
+
+ if (ReadyCycle < MinReadyCycle)
+ MinReadyCycle = ReadyCycle;
+
+ if (!IsBuffered && ReadyCycle > CurrCycle)
+ continue;
+
+ if (checkHazard(SU))
+ continue;
+
+ if (Available.size() >= ReadyListLimit)
+ break;
+
+ Available.push(SU);
+ Pending.remove(Pending.begin()+i);
+ --i; --e;
+ }
+ CheckPending = false;
+}
+
+/// Remove SU from the ready set for this boundary.
+void SchedBoundary::removeReady(SUnit *SU) {
+ if (Available.isInQueue(SU))
+ Available.remove(Available.find(SU));
+ else {
+ assert(Pending.isInQueue(SU) && "bad ready count");
+ Pending.remove(Pending.find(SU));
+ }
+}
+
+/// If this queue only has one ready candidate, return it. As a side effect,
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
+SUnit *SchedBoundary::pickOnlyChoice() {
+ if (CheckPending)
+ releasePending();
+
+ if (CurrMOps > 0) {
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
+ }
+ ++I;
+ }
+ }
+ for (unsigned i = 0; Available.empty(); ++i) {
+// FIXME: Re-enable assert once PR20057 is resolved.
+// assert(i <= (HazardRec->getMaxLookAhead() + MaxObservedStall) &&
+// "permanent hazard");
+ (void)i;
+ bumpCycle(CurrCycle + 1);
+ releasePending();
+ }
+
+ DEBUG(Pending.dump());
+ DEBUG(Available.dump());
+
+ if (Available.size() == 1)
+ return *Available.begin();
+ return nullptr;
+}
+
+#ifndef NDEBUG
+// This is useful information to dump after bumpNode.
+// Note that the Queue contents are more useful before pickNodeFromQueue.
+void SchedBoundary::dumpScheduledState() {
+ unsigned ResFactor;
+ unsigned ResCount;
+ if (ZoneCritResIdx) {
+ ResFactor = SchedModel->getResourceFactor(ZoneCritResIdx);
+ ResCount = getResourceCount(ZoneCritResIdx);
+ } else {
+ ResFactor = SchedModel->getMicroOpFactor();
+ ResCount = RetiredMOps * SchedModel->getMicroOpFactor();
+ }
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ dbgs() << Available.getName() << " @" << CurrCycle << "c\n"
+ << " Retired: " << RetiredMOps;
+ dbgs() << "\n Executed: " << getExecutedCount() / LFactor << "c";
+ dbgs() << "\n Critical: " << ResCount / LFactor << "c, "
+ << ResCount / ResFactor << " "
+ << SchedModel->getResourceName(ZoneCritResIdx)
+ << "\n ExpectedLatency: " << ExpectedLatency << "c\n"
+ << (IsResourceLimited ? " - Resource" : " - Latency")
+ << " limited.\n";
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// GenericScheduler - Generic implementation of MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+void GenericSchedulerBase::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel) {
+ if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+ return;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+ ResDelta.CritResources += PI->Cycles;
+ if (PI->ProcResourceIdx == Policy.DemandResIdx)
+ ResDelta.DemandedResources += PI->Cycles;
+ }
+}
+
+/// Set the CandPolicy for a scheduling zone given the current resources and
+/// latencies inside and outside the zone.
+void GenericSchedulerBase::setPolicy(CandPolicy &Policy, bool IsPostRA,
+ SchedBoundary &CurrZone,
+ SchedBoundary *OtherZone) {
+ // Apply preemptive heuristics based on the total latency and resources
+ // inside and outside this zone. Potential stalls should be considered before
+ // following this policy.
+
+ // Compute remaining latency. We need this both to determine whether the
+ // overall schedule has become latency-limited and whether the instructions
+ // outside this zone are resource or latency limited.
+ //
+ // The "dependent" latency is updated incrementally during scheduling as the
+ // max height/depth of scheduled nodes minus the cycles since it was
+ // scheduled:
+ // DLat = max(N.depth - (CurrCycle - N.ReadyCycle) for N in Zone)
+ //
+ // The "independent" latency is the max ready queue depth:
+ // ILat = max(N.depth for N in Available|Pending)
+ //
+ // RemainingLatency is the greater of independent and dependent latency.
+ unsigned RemLatency = CurrZone.getDependentLatency();
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Available.elements()));
+ RemLatency = std::max(RemLatency,
+ CurrZone.findMaxLatency(CurrZone.Pending.elements()));
+
+ // Compute the critical resource outside the zone.
+ unsigned OtherCritIdx = 0;
+ unsigned OtherCount =
+ OtherZone ? OtherZone->getOtherResourceCount(OtherCritIdx) : 0;
+
+ bool OtherResLimited = false;
+ if (SchedModel->hasInstrSchedModel()) {
+ unsigned LFactor = SchedModel->getLatencyFactor();
+ OtherResLimited = (int)(OtherCount - (RemLatency * LFactor)) > (int)LFactor;
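+    // Illustrative numbers: with LFactor = 2, OtherCount = 30 and
+    // RemLatency = 12, 30 - 24 = 6 > 2, so the other zone is treated as
+    // resource limited.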
+ }
+ // Schedule aggressively for latency in PostRA mode. We don't check for
+ // acyclic latency during PostRA, and highly out-of-order processors will
+ // skip PostRA scheduling.
+ if (!OtherResLimited) {
+ if (IsPostRA || (RemLatency + CurrZone.getCurrCycle() > Rem.CriticalPath)) {
+ Policy.ReduceLatency |= true;
+ DEBUG(dbgs() << " " << CurrZone.Available.getName()
+ << " RemainingLatency " << RemLatency << " + "
+ << CurrZone.getCurrCycle() << "c > CritPath "
+ << Rem.CriticalPath << "\n");
+ }
+ }
+ // If the same resource is limiting inside and outside the zone, do nothing.
+ if (CurrZone.getZoneCritResIdx() == OtherCritIdx)
+ return;
+
+ DEBUG(
+ if (CurrZone.isResourceLimited()) {
+ dbgs() << " " << CurrZone.Available.getName() << " ResourceLimited: "
+ << SchedModel->getResourceName(CurrZone.getZoneCritResIdx())
+ << "\n";
+ }
+ if (OtherResLimited)
+ dbgs() << " RemainingLimit: "
+ << SchedModel->getResourceName(OtherCritIdx) << "\n";
+ if (!CurrZone.isResourceLimited() && !OtherResLimited)
+ dbgs() << " Latency limited both directions.\n");
+
+ if (CurrZone.isResourceLimited() && !Policy.ReduceResIdx)
+ Policy.ReduceResIdx = CurrZone.getZoneCritResIdx();
+
+ if (OtherResLimited)
+ Policy.DemandResIdx = OtherCritIdx;
+}
+
+#ifndef NDEBUG
+const char *GenericSchedulerBase::getReasonStr(
+ GenericSchedulerBase::CandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND ";
+ case Only1: return "ONLY1 ";
+ case PhysRegCopy: return "PREG-COPY ";
+ case RegExcess: return "REG-EXCESS";
+ case RegCritical: return "REG-CRIT ";
+ case Stall: return "STALL ";
+ case Cluster: return "CLUSTER ";
+ case Weak: return "WEAK ";
+ case RegMax: return "REG-MAX ";
+ case ResourceReduce: return "RES-REDUCE";
+ case ResourceDemand: return "RES-DEMAND";
+ case TopDepthReduce: return "TOP-DEPTH ";
+ case TopPathReduce: return "TOP-PATH ";
+ case BotHeightReduce:return "BOT-HEIGHT";
+ case BotPathReduce: return "BOT-PATH ";
+ case NextDefUse: return "DEF-USE ";
+ case NodeOrder: return "ORDER ";
+  }
+ llvm_unreachable("Unknown reason!");
+}
+
+void GenericSchedulerBase::traceCandidate(const SchedCandidate &Cand) {
+ PressureChange P;
+ unsigned ResIdx = 0;
+ unsigned Latency = 0;
+ switch (Cand.Reason) {
+ default:
+ break;
+ case RegExcess:
+ P = Cand.RPDelta.Excess;
+ break;
+ case RegCritical:
+ P = Cand.RPDelta.CriticalMax;
+ break;
+ case RegMax:
+ P = Cand.RPDelta.CurrentMax;
+ break;
+ case ResourceReduce:
+ ResIdx = Cand.Policy.ReduceResIdx;
+ break;
+ case ResourceDemand:
+ ResIdx = Cand.Policy.DemandResIdx;
+ break;
+ case TopDepthReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ case TopPathReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotHeightReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotPathReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ }
+ dbgs() << " Cand SU(" << Cand.SU->NodeNum << ") " << getReasonStr(Cand.Reason);
+ if (P.isValid())
+ dbgs() << " " << TRI->getRegPressureSetName(P.getPSet())
+ << ":" << P.getUnitInc() << " ";
+ else
+ dbgs() << " ";
+ if (ResIdx)
+ dbgs() << " " << SchedModel->getProcResource(ResIdx)->Name << " ";
+ else
+ dbgs() << " ";
+ if (Latency)
+ dbgs() << " " << Latency << " cycles ";
+ else
+ dbgs() << " ";
+ dbgs() << '\n';
+}
+#endif
+
+/// Return true if this heuristic determines order.
+static bool tryLess(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+
+static bool tryGreater(int TryVal, int CandVal,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+
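+/// Return true if a latency-oriented preference determines the order. For a
+/// top-down zone, prefer the shallower node once the current best candidate's
+/// depth exceeds the zone's scheduled latency, and otherwise prefer the node
+/// with the greater remaining height; the bottom-up case is symmetric with
+/// depth and height swapped.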
+static bool tryLatency(GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ SchedBoundary &Zone) {
+ if (Zone.isTop()) {
+ if (Cand.SU->getDepth() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::TopDepthReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::TopPathReduce))
+ return true;
+ } else {
+ if (Cand.SU->getHeight() > Zone.getScheduledLatency()) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, GenericSchedulerBase::BotHeightReduce))
+ return true;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, GenericSchedulerBase::BotPathReduce))
+ return true;
+ }
+ return false;
+}
+
+static void tracePick(GenericSchedulerBase::CandReason Reason, bool IsTop) {
+ DEBUG(dbgs() << "Pick " << (IsTop ? "Top " : "Bot ")
+ << GenericSchedulerBase::getReasonStr(Reason) << '\n');
+}
+
+static void tracePick(const GenericSchedulerBase::SchedCandidate &Cand) {
+ tracePick(Cand.Reason, Cand.AtTop);
+}
+
+void GenericScheduler::initialize(ScheduleDAGMI *dag) {
+ assert(dag->hasVRegLiveness() &&
+ "(PreRA)GenericScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ Bot.init(DAG, SchedModel, &Rem);
+
+ // Initialize resource counts.
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
+ // are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+ if (!Bot.HazardRec) {
+ Bot.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+ TopCand.SU = nullptr;
+ BotCand.SU = nullptr;
+}
+
+/// Initialize the per-region scheduling policy.
+void GenericScheduler::initPolicy(MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End,
+ unsigned NumRegionInstrs) {
+ const MachineFunction &MF = *Begin->getParent()->getParent();
+ const TargetLowering *TLI = MF.getSubtarget().getTargetLowering();
+
+ // Avoid setting up the register pressure tracker for small regions to save
+ // compile time. As a rough heuristic, only track pressure when the number of
+ // schedulable instructions exceeds half the integer register file.
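+  // Illustrative example: with 32 allocatable integer registers, pressure is
+  // only tracked for regions of more than 16 instructions.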
+ RegionPolicy.ShouldTrackPressure = true;
+ for (unsigned VT = MVT::i32; VT > (unsigned)MVT::i1; --VT) {
+ MVT::SimpleValueType LegalIntVT = (MVT::SimpleValueType)VT;
+ if (TLI->isTypeLegal(LegalIntVT)) {
+ unsigned NIntRegs = Context->RegClassInfo->getNumAllocatableRegs(
+ TLI->getRegClassFor(LegalIntVT));
+ RegionPolicy.ShouldTrackPressure = NumRegionInstrs > (NIntRegs / 2);
+ }
+ }
+
+ // For generic targets, we default to bottom-up, because it's simpler and more
+ // compile-time optimizations have been implemented in that direction.
+ RegionPolicy.OnlyBottomUp = true;
+
+ // Allow the subtarget to override default policy.
+ MF.getSubtarget().overrideSchedPolicy(RegionPolicy, NumRegionInstrs);
+
+ // After subtarget overrides, apply command line options.
+ if (!EnableRegPressure)
+ RegionPolicy.ShouldTrackPressure = false;
+
+ // Check -misched-topdown/bottomup can force or unforce scheduling direction.
+ // e.g. -misched-bottomup=false allows scheduling in both directions.
+ assert((!ForceTopDown || !ForceBottomUp) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ if (ForceBottomUp.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyBottomUp = ForceBottomUp;
+ if (RegionPolicy.OnlyBottomUp)
+ RegionPolicy.OnlyTopDown = false;
+ }
+ if (ForceTopDown.getNumOccurrences() > 0) {
+ RegionPolicy.OnlyTopDown = ForceTopDown;
+ if (RegionPolicy.OnlyTopDown)
+ RegionPolicy.OnlyBottomUp = false;
+ }
+}
+
+void GenericScheduler::dumpPolicy() {
+ dbgs() << "GenericScheduler RegionPolicy: "
+ << " ShouldTrackPressure=" << RegionPolicy.ShouldTrackPressure
+ << " OnlyTopDown=" << RegionPolicy.OnlyTopDown
+ << " OnlyBottomUp=" << RegionPolicy.OnlyBottomUp
+ << "\n";
+}
+
+/// Set IsAcyclicLatencyLimited if the acyclic path is longer than the cyclic
+/// critical path by more cycles than it takes to drain the instruction buffer.
+/// We estimate an upper bound on in-flight instructions as:
+///
+/// CyclesPerIteration = max( CyclicPath, Loop-Resource-Height )
+/// InFlightIterations = AcyclicPath / CyclesPerIteration
+/// InFlightResources = InFlightIterations * LoopResources
+///
+/// TODO: Check execution resources in addition to IssueCount.
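+///
+/// Worked example with assumed numbers: CyclicPath = 10c, Loop-Resource-Height
+/// = 8c and AcyclicPath = 40c give CyclesPerIteration = 10 and
+/// InFlightIterations = 4; at 10 micro-ops per iteration that is 40 in-flight
+/// micro-ops, which exceeds a 32-entry micro-op buffer, so the loop is marked
+/// acyclic-latency limited.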
+void GenericScheduler::checkAcyclicLatency() {
+ if (Rem.CyclicCritPath == 0 || Rem.CyclicCritPath >= Rem.CriticalPath)
+ return;
+
+ // Scaled number of cycles per loop iteration.
+ unsigned IterCount =
+ std::max(Rem.CyclicCritPath * SchedModel->getLatencyFactor(),
+ Rem.RemIssueCount);
+ // Scaled acyclic critical path.
+ unsigned AcyclicCount = Rem.CriticalPath * SchedModel->getLatencyFactor();
+ // InFlightCount = (AcyclicPath / IterCycles) * InstrPerLoop
+ unsigned InFlightCount =
+ (AcyclicCount * Rem.RemIssueCount + IterCount-1) / IterCount;
+ unsigned BufferLimit =
+ SchedModel->getMicroOpBufferSize() * SchedModel->getMicroOpFactor();
+
+ Rem.IsAcyclicLatencyLimited = InFlightCount > BufferLimit;
+
+ DEBUG(dbgs() << "IssueCycles="
+ << Rem.RemIssueCount / SchedModel->getLatencyFactor() << "c "
+ << "IterCycles=" << IterCount / SchedModel->getLatencyFactor()
+ << "c NumIters=" << (AcyclicCount + IterCount-1) / IterCount
+ << " InFlight=" << InFlightCount / SchedModel->getMicroOpFactor()
+ << "m BufferLim=" << SchedModel->getMicroOpBufferSize() << "m\n";
+ if (Rem.IsAcyclicLatencyLimited)
+ dbgs() << " ACYCLIC LATENCY LIMIT\n");
+}
+
+void GenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+  // Some roots may not feed into ExitSU. Check all of them just in case.
+ for (std::vector<SUnit*>::const_iterator
+ I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
+ }
+ DEBUG(dbgs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << "Critical Path(GS-RR ): " << Rem.CriticalPath << " \n";
+ }
+
+ if (EnableCyclicPath) {
+ Rem.CyclicCritPath = DAG->computeCyclicCriticalPath();
+ checkAcyclicLatency();
+ }
+}
+
+static bool tryPressure(const PressureChange &TryP,
+ const PressureChange &CandP,
+ GenericSchedulerBase::SchedCandidate &TryCand,
+ GenericSchedulerBase::SchedCandidate &Cand,
+ GenericSchedulerBase::CandReason Reason,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) {
+ // If one candidate decreases and the other increases, go with it.
+ // Invalid candidates have UnitInc==0.
+ if (tryGreater(TryP.getUnitInc() < 0, CandP.getUnitInc() < 0, TryCand, Cand,
+ Reason)) {
+ return true;
+ }
+ // Do not compare the magnitude of pressure changes between top and bottom
+ // boundary.
+ if (Cand.AtTop != TryCand.AtTop)
+ return false;
+
+ // If both candidates affect the same set in the same boundary, go with the
+ // smallest increase.
+ unsigned TryPSet = TryP.getPSetOrMax();
+ unsigned CandPSet = CandP.getPSetOrMax();
+ if (TryPSet == CandPSet) {
+ return tryLess(TryP.getUnitInc(), CandP.getUnitInc(), TryCand, Cand,
+ Reason);
+ }
+
+ int TryRank = TryP.isValid() ? TRI->getRegPressureSetScore(MF, TryPSet) :
+ std::numeric_limits<int>::max();
+
+ int CandRank = CandP.isValid() ? TRI->getRegPressureSetScore(MF, CandPSet) :
+ std::numeric_limits<int>::max();
+
+ // If the candidates are decreasing pressure, reverse priority.
+ if (TryP.getUnitInc() < 0)
+ std::swap(TryRank, CandRank);
+ return tryGreater(TryRank, CandRank, TryCand, Cand, Reason);
+}
+
+static unsigned getWeakLeft(const SUnit *SU, bool isTop) {
+ return (isTop) ? SU->WeakPredsLeft : SU->WeakSuccsLeft;
+}
+
+/// Minimize physical register live ranges. Regalloc wants them adjacent to
+/// their physreg def/use.
+///
+/// FIXME: This is an unnecessary check on the critical path. Most are root/leaf
+/// copies which can be prescheduled. The rest (e.g. x86 MUL) could be bundled
+/// with the operation that produces or consumes the physreg. We'll do this when
+/// regalloc has support for parallel copies.
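+///
+/// Illustrative example (bottom-up zone): for "%EAX = COPY %vreg5" whose
+/// physreg result feeds an already scheduled instruction below, the copy is
+/// biased (+1) so it is picked immediately and placed adjacent to that use.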
+static int biasPhysRegCopy(const SUnit *SU, bool isTop) {
+ const MachineInstr *MI = SU->getInstr();
+ if (!MI->isCopy())
+ return 0;
+
+ unsigned ScheduledOper = isTop ? 1 : 0;
+ unsigned UnscheduledOper = isTop ? 0 : 1;
+  // If we have already scheduled the physreg producer/consumer, immediately
+ // schedule the copy.
+ if (TargetRegisterInfo::isPhysicalRegister(
+ MI->getOperand(ScheduledOper).getReg()))
+ return 1;
+ // If the physreg is at the boundary, defer it. Otherwise schedule it
+ // immediately to free the dependent. We can hoist the copy later.
+ bool AtBoundary = isTop ? !SU->NumSuccsLeft : !SU->NumPredsLeft;
+ if (TargetRegisterInfo::isPhysicalRegister(
+ MI->getOperand(UnscheduledOper).getReg()))
+ return AtBoundary ? -1 : 1;
+ return 0;
+}
+
+void GenericScheduler::initCandidate(SchedCandidate &Cand, SUnit *SU,
+ bool AtTop,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+ Cand.SU = SU;
+ Cand.AtTop = AtTop;
+ if (DAG->isTrackingPressure()) {
+ if (AtTop) {
+ TempTracker.getMaxDownwardPressureDelta(
+ Cand.SU->getInstr(),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ } else {
+ if (VerifyScheduling) {
+ TempTracker.getMaxUpwardPressureDelta(
+ Cand.SU->getInstr(),
+ &DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ } else {
+ RPTracker.getUpwardPressureDelta(
+ Cand.SU->getInstr(),
+ DAG->getPressureDiff(Cand.SU),
+ Cand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+ }
+ }
+ }
+ DEBUG(if (Cand.RPDelta.Excess.isValid())
+ dbgs() << " Try SU(" << Cand.SU->NodeNum << ") "
+ << TRI->getRegPressureSetName(Cand.RPDelta.Excess.getPSet())
+ << ":" << Cand.RPDelta.Excess.getUnitInc() << "\n");
+}
+
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending, or nullptr
+///   if Cand is from a different zone than TryCand.
+void GenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary *Zone) {
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ if (tryGreater(biasPhysRegCopy(TryCand.SU, TryCand.AtTop),
+ biasPhysRegCopy(Cand.SU, Cand.AtTop),
+ TryCand, Cand, PhysRegCopy))
+ return;
+
+ // Avoid exceeding the target's limit.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.Excess,
+ Cand.RPDelta.Excess,
+ TryCand, Cand, RegExcess, TRI,
+ DAG->MF))
+ return;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CriticalMax,
+ Cand.RPDelta.CriticalMax,
+ TryCand, Cand, RegCritical, TRI,
+ DAG->MF))
+ return;
+
+  // We only compare a subset of features when comparing nodes between the
+  // Top and Bottom boundaries. Some properties are simply incomparable; in
+  // many other instances we should only override the other boundary if
+  // something is a clearly good pick on one boundary. Skip heuristics that
+  // are more "tie-breaking" in nature.
+ bool SameBoundary = Zone != nullptr;
+ if (SameBoundary) {
+ // For loops that are acyclic path limited, aggressively schedule for
+ // latency. This can result in very long dependence chains scheduled in
+ // sequence, so once every cycle (when CurrMOps == 0), switch to normal
+ // heuristics.
+ if (Rem.IsAcyclicLatencyLimited && !Zone->getCurrMOps() &&
+ tryLatency(TryCand, Cand, *Zone))
+ return;
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Zone->getLatencyStallCycles(TryCand.SU),
+ Zone->getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+ }
+
+ // Keep clustered nodes together to encourage downstream peephole
+ // optimizations which may reduce resource requirements.
+ //
+ // This is a best effort to set things up for a post-RA pass. Optimizations
+ // like generating loads of multiple registers should ideally be done within
+ // the scheduler pass by combining the loads during DAG postprocessing.
+ const SUnit *CandNextClusterSU =
+ Cand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ const SUnit *TryCandNextClusterSU =
+ TryCand.AtTop ? DAG->getNextClusterSucc() : DAG->getNextClusterPred();
+ if (tryGreater(TryCand.SU == TryCandNextClusterSU,
+ Cand.SU == CandNextClusterSU,
+ TryCand, Cand, Cluster))
+ return;
+
+ if (SameBoundary) {
+ // Weak edges are for clustering and other constraints.
+ if (tryLess(getWeakLeft(TryCand.SU, TryCand.AtTop),
+ getWeakLeft(Cand.SU, Cand.AtTop),
+ TryCand, Cand, Weak))
+ return;
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (DAG->isTrackingPressure() && tryPressure(TryCand.RPDelta.CurrentMax,
+ Cand.RPDelta.CurrentMax,
+ TryCand, Cand, RegMax, TRI,
+ DAG->MF))
+ return;
+
+ if (SameBoundary) {
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ // For acyclic path limited loops, latency was already checked above.
+ if (!RegionPolicy.DisableLatencyHeuristic && TryCand.Policy.ReduceLatency &&
+ !Rem.IsAcyclicLatencyLimited && tryLatency(TryCand, Cand, *Zone))
+ return;
+
+ // Prefer immediate defs/users of the last scheduled instruction. This is a
+ // local pressure avoidance strategy that also makes the machine code
+ // readable.
+ if (tryGreater(Zone->isNextSU(TryCand.SU), Zone->isNextSU(Cand.SU),
+ TryCand, Cand, NextDefUse))
+ return;
+
+ // Fall through to original instruction order.
+ if ((Zone->isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ || (!Zone->isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ }
+ }
+}
+
+/// Pick the best candidate from the queue.
+///
+/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
+/// DAG building. To adjust for the current scheduling location we need to
+/// maintain the number of vreg uses remaining to be top-scheduled.
+void GenericScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const CandPolicy &ZonePolicy,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ // getMaxPressureDelta temporarily modifies the tracker.
+ RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
+
+ ReadyQueue &Q = Zone.Available;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+
+ SchedCandidate TryCand(ZonePolicy);
+ initCandidate(TryCand, *I, Zone.isTop(), RPTracker, TempTracker);
+ // Pass SchedBoundary only when comparing nodes from the same boundary.
+ SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
+ tryCandidate(Cand, TryCand, ZoneArg);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(DAG, SchedModel);
+ Cand.setBest(TryCand);
+ DEBUG(traceCandidate(Cand));
+ }
+ }
+}
+
+/// Pick the best candidate node from either the top or bottom queue.
+SUnit *GenericScheduler::pickNodeBidirectional(bool &IsTopNode) {
+ // Schedule as far as possible in the direction of no choice. This is most
+ // efficient, but also provides the best heuristics for CriticalPSets.
+ if (SUnit *SU = Bot.pickOnlyChoice()) {
+ IsTopNode = false;
+ tracePick(Only1, false);
+ return SU;
+ }
+ if (SUnit *SU = Top.pickOnlyChoice()) {
+ IsTopNode = true;
+ tracePick(Only1, true);
+ return SU;
+ }
+ // Set the bottom-up policy based on the state of the current bottom zone and
+ // the instructions outside the zone, including the top zone.
+ CandPolicy BotPolicy;
+ setPolicy(BotPolicy, /*IsPostRA=*/false, Bot, &Top);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ CandPolicy TopPolicy;
+ setPolicy(TopPolicy, /*IsPostRA=*/false, Top, &Bot);
+
+ // See if BotCand is still valid (because we previously scheduled from Top).
+ DEBUG(dbgs() << "Picking from Bot:\n");
+ if (!BotCand.isValid() || BotCand.SU->isScheduled ||
+ BotCand.Policy != BotPolicy) {
+ BotCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(BotCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Bot, BotPolicy, DAG->getBotRPTracker(), TCand);
+ assert(TCand.SU == BotCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
+ }
+
+ // Check if the top Q has a better candidate.
+ DEBUG(dbgs() << "Picking from Top:\n");
+ if (!TopCand.isValid() || TopCand.SU->isScheduled ||
+ TopCand.Policy != TopPolicy) {
+ TopCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
+ } else {
+ DEBUG(traceCandidate(TopCand));
+#ifndef NDEBUG
+ if (VerifyScheduling) {
+ SchedCandidate TCand;
+ TCand.reset(CandPolicy());
+ pickNodeFromQueue(Top, TopPolicy, DAG->getTopRPTracker(), TCand);
+ assert(TCand.SU == TopCand.SU &&
+ "Last pick result should correspond to re-picking right now");
+ }
+#endif
+ }
+
+ // Pick best from BotCand and TopCand.
+ assert(BotCand.isValid());
+ assert(TopCand.isValid());
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
+ DEBUG(traceCandidate(Cand));
+ }
+
+ IsTopNode = Cand.AtTop;
+ tracePick(Cand);
+ return Cand.SU;
+}
+
+/// Pick the best node to balance the schedule. Implements MachineSchedStrategy.
+SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() &&
+ Bot.Available.empty() && Bot.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ if (RegionPolicy.OnlyTopDown) {
+ SU = Top.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ TopCand.reset(NoPolicy);
+ pickNodeFromQueue(Top, NoPolicy, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ IsTopNode = true;
+ } else if (RegionPolicy.OnlyBottomUp) {
+ SU = Bot.pickOnlyChoice();
+ if (!SU) {
+ CandPolicy NoPolicy;
+ BotCand.reset(NoPolicy);
+ pickNodeFromQueue(Bot, NoPolicy, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(BotCand);
+ SU = BotCand.SU;
+ }
+ IsTopNode = false;
+ } else {
+ SU = pickNodeBidirectional(IsTopNode);
+ }
+ } while (SU->isScheduled);
+
+ if (SU->isTopReady())
+ Top.removeReady(SU);
+ if (SU->isBottomReady())
+ Bot.removeReady(SU);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ return SU;
+}
+
+void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
+
+ MachineBasicBlock::iterator InsertPos = SU->getInstr();
+ if (!isTop)
+ ++InsertPos;
+ SmallVectorImpl<SDep> &Deps = isTop ? SU->Preds : SU->Succs;
+
+ // Find already scheduled copies with a single physreg dependence and move
+ // them just above the scheduled instruction.
+ for (SmallVectorImpl<SDep>::iterator I = Deps.begin(), E = Deps.end();
+ I != E; ++I) {
+ if (I->getKind() != SDep::Data || !TRI->isPhysicalRegister(I->getReg()))
+ continue;
+ SUnit *DepSU = I->getSUnit();
+ if (isTop ? DepSU->Succs.size() > 1 : DepSU->Preds.size() > 1)
+ continue;
+ MachineInstr *Copy = DepSU->getInstr();
+ if (!Copy->isCopy())
+ continue;
+ DEBUG(dbgs() << " Rescheduling physreg copy ";
+ I->getSUnit()->dump(DAG));
+ DAG->moveInstruction(Copy, InsertPos);
+ }
+}
+
+/// Update the scheduler's state after scheduling a node. This is the same node
+/// that was just returned by pickNode(). However, ScheduleDAGMILive needs to
+/// update its state based on the current cycle before MachineSchedStrategy
+/// does.
+///
+/// FIXME: Eventually, we may bundle physreg copies rather than rescheduling
+/// them here. See comments in biasPhysRegCopy.
+void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ if (IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+ if (SU->hasPhysRegUses)
+ reschedulePhysRegCopies(SU, true);
+ } else {
+ SU->BotReadyCycle = std::max(SU->BotReadyCycle, Bot.getCurrCycle());
+ Bot.bumpNode(SU);
+ if (SU->hasPhysRegDefs)
+ reschedulePhysRegCopies(SU, false);
+ }
+}
+
+/// Create the standard converging machine scheduler. This will be used as the
+/// default scheduler if the target does not set a default.
+static ScheduleDAGInstrs *createGenericSchedLive(MachineSchedContext *C) {
+ ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
+ // Register DAG post-processors.
+ //
+ // FIXME: extend the mutation API to allow earlier mutations to instantiate
+ // data and pass it to later mutations. Have a single mutation that gathers
+ // the interesting nodes in one pass.
+ DAG->addMutation(make_unique<CopyConstrain>(DAG->TII, DAG->TRI));
+ if (EnableMemOpCluster) {
+ if (DAG->TII->enableClusterLoads())
+ DAG->addMutation(make_unique<LoadClusterMutation>(DAG->TII, DAG->TRI));
+ if (DAG->TII->enableClusterStores())
+ DAG->addMutation(make_unique<StoreClusterMutation>(DAG->TII, DAG->TRI));
+ }
+ if (EnableMacroFusion)
+ DAG->addMutation(make_unique<MacroFusion>(*DAG->TII, *DAG->TRI));
+ return DAG;
+}
+
+static MachineSchedRegistry
+GenericSchedRegistry("converge", "Standard converging scheduler.",
+ createGenericSchedLive);
+
+//===----------------------------------------------------------------------===//
+// PostGenericScheduler - Generic PostRA implementation of MachineSchedStrategy.
+//===----------------------------------------------------------------------===//
+
+void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
+ DAG = Dag;
+ SchedModel = DAG->getSchedModel();
+ TRI = DAG->TRI;
+
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ BotRoots.clear();
+
+ // Initialize the HazardRecognizers. If itineraries don't exist, are empty,
+ // or are disabled, then these HazardRecs will be disabled.
+ const InstrItineraryData *Itin = SchedModel->getInstrItineraries();
+ if (!Top.HazardRec) {
+ Top.HazardRec =
+ DAG->MF.getSubtarget().getInstrInfo()->CreateTargetMIHazardRecognizer(
+ Itin, DAG);
+ }
+}
+
+
+void PostGenericScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+
+  // Some roots may not feed into ExitSU. Check all of them just in case.
+ for (SmallVectorImpl<SUnit*>::const_iterator
+ I = BotRoots.begin(), E = BotRoots.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
+ }
+ DEBUG(dbgs() << "Critical Path: (PGS-RR) " << Rem.CriticalPath << '\n');
+ if (DumpCriticalPathLength) {
+ errs() << "Critical Path(PGS-RR ): " << Rem.CriticalPath << " \n";
+ }
+}
+
+/// Apply a set of heuristics to a new candidate for PostRA scheduling.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+void PostGenericScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+
+ // Prioritize instructions that read unbuffered resources by stall cycles.
+ if (tryLess(Top.getLatencyStallCycles(TryCand.SU),
+ Top.getLatencyStallCycles(Cand.SU), TryCand, Cand, Stall))
+ return;
+
+ // Avoid critical resource consumption and balance the schedule.
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency && tryLatency(TryCand, Cand, Top)) {
+ return;
+ }
+
+ // Fall through to original instruction order.
+ if (TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ TryCand.Reason = NodeOrder;
+}
+
+void PostGenericScheduler::pickNodeFromQueue(SchedCandidate &Cand) {
+ ReadyQueue &Q = Top.Available;
+ for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
+ SchedCandidate TryCand(Cand.Policy);
+ TryCand.SU = *I;
+ TryCand.AtTop = true;
+ TryCand.initResourceDelta(DAG, SchedModel);
+ tryCandidate(Cand, TryCand);
+ if (TryCand.Reason != NoCand) {
+ Cand.setBest(TryCand);
+ DEBUG(traceCandidate(Cand));
+ }
+ }
+}
+
+/// Pick the next node to schedule.
+SUnit *PostGenericScheduler::pickNode(bool &IsTopNode) {
+ if (DAG->top() == DAG->bottom()) {
+ assert(Top.Available.empty() && Top.Pending.empty() && "ReadyQ garbage");
+ return nullptr;
+ }
+ SUnit *SU;
+ do {
+ SU = Top.pickOnlyChoice();
+ if (SU) {
+ tracePick(Only1, true);
+ } else {
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ // Set the top-down policy based on the state of the current top zone and
+ // the instructions outside the zone, including the bottom zone.
+ setPolicy(TopCand.Policy, /*IsPostRA=*/true, Top, nullptr);
+ pickNodeFromQueue(TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find a candidate");
+ tracePick(TopCand);
+ SU = TopCand.SU;
+ }
+ } while (SU->isScheduled);
+
+ IsTopNode = true;
+ Top.removeReady(SU);
+
+ DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
+ return SU;
+}
+
+/// Called after ScheduleDAGMI has scheduled an instruction and updated
+/// scheduled/remaining flags in the DAG nodes.
+void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
+ SU->TopReadyCycle = std::max(SU->TopReadyCycle, Top.getCurrCycle());
+ Top.bumpNode(SU);
+}
+
+/// Create a generic scheduler with no vreg liveness or DAG mutation passes.
+static ScheduleDAGInstrs *createGenericSchedPostRA(MachineSchedContext *C) {
+ return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C), /*IsPostRA=*/true);
+}
+
+//===----------------------------------------------------------------------===//
+// ILP Scheduler. Currently for experimental analysis of heuristics.
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// \brief Order nodes by the ILP metric.
+struct ILPOrder {
+ const SchedDFSResult *DFSResult;
+ const BitVector *ScheduledTrees;
+ bool MaximizeILP;
+
+ ILPOrder(bool MaxILP)
+ : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {}
+
+ /// \brief Apply a less-than relation on node priority.
+ ///
+ /// (Return true if A comes after B in the Q.)
+ bool operator()(const SUnit *A, const SUnit *B) const {
+ unsigned SchedTreeA = DFSResult->getSubtreeID(A);
+ unsigned SchedTreeB = DFSResult->getSubtreeID(B);
+ if (SchedTreeA != SchedTreeB) {
+ // Unscheduled trees have lower priority.
+ if (ScheduledTrees->test(SchedTreeA) != ScheduledTrees->test(SchedTreeB))
+ return ScheduledTrees->test(SchedTreeB);
+
+      // Trees with shallower connections have lower priority.
+ if (DFSResult->getSubtreeLevel(SchedTreeA)
+ != DFSResult->getSubtreeLevel(SchedTreeB)) {
+ return DFSResult->getSubtreeLevel(SchedTreeA)
+ < DFSResult->getSubtreeLevel(SchedTreeB);
+ }
+ }
+ if (MaximizeILP)
+ return DFSResult->getILP(A) < DFSResult->getILP(B);
+ else
+ return DFSResult->getILP(A) > DFSResult->getILP(B);
+ }
+};
+
+/// \brief Schedule based on the ILP metric.
+class ILPScheduler : public MachineSchedStrategy {
+ ScheduleDAGMILive *DAG;
+ ILPOrder Cmp;
+
+ std::vector<SUnit*> ReadyQ;
+public:
+ ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {}
+
+ void initialize(ScheduleDAGMI *dag) override {
+ assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
+ DAG = static_cast<ScheduleDAGMILive*>(dag);
+ DAG->computeDFSResult();
+ Cmp.DFSResult = DAG->getDFSResult();
+ Cmp.ScheduledTrees = &DAG->getScheduledTrees();
+ ReadyQ.clear();
+ }
+
+ void registerRoots() override {
+ // Restore the heap in ReadyQ with the updated DFS results.
+ std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+
+ /// Implement MachineSchedStrategy interface.
+ /// -----------------------------------------
+
+ /// Callback to select the highest priority node from the ready Q.
+ SUnit *pickNode(bool &IsTopNode) override {
+ if (ReadyQ.empty()) return nullptr;
+ std::pop_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ SUnit *SU = ReadyQ.back();
+ ReadyQ.pop_back();
+ IsTopNode = false;
+ DEBUG(dbgs() << "Pick node " << "SU(" << SU->NodeNum << ") "
+ << " ILP: " << DAG->getDFSResult()->getILP(SU)
+ << " Tree: " << DAG->getDFSResult()->getSubtreeID(SU) << " @"
+ << DAG->getDFSResult()->getSubtreeLevel(
+ DAG->getDFSResult()->getSubtreeID(SU)) << '\n'
+ << "Scheduling " << *SU->getInstr());
+ return SU;
+ }
+
+ /// \brief Scheduler callback to notify that a new subtree is scheduled.
+ void scheduleTree(unsigned SubtreeID) override {
+ std::make_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+
+ /// Callback after a node is scheduled. Mark a newly scheduled tree, notify
+ /// DFSResults, and resort the priority Q.
+ void schedNode(SUnit *SU, bool IsTopNode) override {
+ assert(!IsTopNode && "SchedDFSResult needs bottom-up");
+ }
+
+ void releaseTopNode(SUnit *) override { /*only called for top roots*/ }
+
+ void releaseBottomNode(SUnit *SU) override {
+ ReadyQ.push_back(SU);
+ std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
+ }
+};
+} // namespace
+
+static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true));
+}
+static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
+ return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false));
+}
+static MachineSchedRegistry ILPMaxRegistry(
+ "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
+static MachineSchedRegistry ILPMinRegistry(
+ "ilpmin", "Schedule bottom-up for min ILP", createILPMinScheduler);
+
+//===----------------------------------------------------------------------===//
+// Machine Instruction Shuffler for Correctness Testing
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace {
+/// Apply a less-than relation on the node order, which corresponds to the
+/// instruction order prior to scheduling. IsReverse implements greater-than.
+template<bool IsReverse>
+struct SUnitOrder {
+ bool operator()(SUnit *A, SUnit *B) const {
+ if (IsReverse)
+ return A->NodeNum > B->NodeNum;
+ else
+ return A->NodeNum < B->NodeNum;
+ }
+};
+
+/// Reorder instructions as much as possible.
+class InstructionShuffler : public MachineSchedStrategy {
+ bool IsAlternating;
+ bool IsTopDown;
+
+  // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
+  // gives nodes with a higher number higher priority, causing the latest
+  // instructions to be scheduled first.
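+  // Illustrative example: with SU(3) and SU(7) both in TopQ, SU(7) sits at
+  // the top of the max-heap and is scheduled first.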
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
+ TopQ;
+ // When scheduling bottom-up, use greater-than as the queue priority.
+ PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
+ BottomQ;
+public:
+ InstructionShuffler(bool alternate, bool topdown)
+ : IsAlternating(alternate), IsTopDown(topdown) {}
+
+ void initialize(ScheduleDAGMI*) override {
+ TopQ.clear();
+ BottomQ.clear();
+ }
+
+ /// Implement MachineSchedStrategy interface.
+ /// -----------------------------------------
+
+ SUnit *pickNode(bool &IsTopNode) override {
+ SUnit *SU;
+ if (IsTopDown) {
+ do {
+ if (TopQ.empty()) return nullptr;
+ SU = TopQ.top();
+ TopQ.pop();
+ } while (SU->isScheduled);
+ IsTopNode = true;
+ } else {
+ do {
+ if (BottomQ.empty()) return nullptr;
+ SU = BottomQ.top();
+ BottomQ.pop();
+ } while (SU->isScheduled);
+ IsTopNode = false;
+ }
+ if (IsAlternating)
+ IsTopDown = !IsTopDown;
+ return SU;
+ }
+
+ void schedNode(SUnit *SU, bool IsTopNode) override {}
+
+ void releaseTopNode(SUnit *SU) override {
+ TopQ.push(SU);
+ }
+ void releaseBottomNode(SUnit *SU) override {
+ BottomQ.push(SU);
+ }
+};
+} // namespace
+
+static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
+ bool Alternate = !ForceTopDown && !ForceBottomUp;
+ bool TopDown = !ForceBottomUp;
+ assert((TopDown || !ForceTopDown) &&
+ "-misched-topdown incompatible with -misched-bottomup");
+ return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown));
+}
+static MachineSchedRegistry ShufflerRegistry(
+ "shuffle", "Shuffle machine instructions alternating directions",
+ createInstructionShuffler);
+#endif // !NDEBUG
+
+//===----------------------------------------------------------------------===//
+// GraphWriter support for ScheduleDAGMILive.
+//===----------------------------------------------------------------------===//
+
+#ifndef NDEBUG
+namespace llvm {
+
+template<> struct GraphTraits<
+ ScheduleDAGMI*> : public GraphTraits<ScheduleDAG*> {};
+
+template<>
+struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return G->MF.getName();
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool isNodeHidden(const SUnit *Node) {
+ if (ViewMISchedCutoff == 0)
+ return false;
+ return (Node->Preds.size() > ViewMISchedCutoff
+ || Node->Succs.size() > ViewMISchedCutoff);
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node,
+ SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+ static std::string getNodeLabel(const SUnit *SU, const ScheduleDAG *G) {
+ std::string Str;
+ raw_string_ostream SS(Str);
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
+ SS << "SU:" << SU->NodeNum;
+ if (DFS)
+ SS << " I:" << DFS->getNumInstrs(SU);
+ return SS.str();
+ }
+ static std::string getNodeDescription(const SUnit *SU, const ScheduleDAG *G) {
+ return G->getGraphNodeLabel(SU);
+ }
+
+ static std::string getNodeAttributes(const SUnit *N, const ScheduleDAG *G) {
+ std::string Str("shape=Mrecord");
+ const ScheduleDAGMI *DAG = static_cast<const ScheduleDAGMI*>(G);
+ const SchedDFSResult *DFS = DAG->hasVRegLiveness() ?
+ static_cast<const ScheduleDAGMILive*>(G)->getDFSResult() : nullptr;
+ if (DFS) {
+ Str += ",style=filled,fillcolor=\"#";
+ Str += DOT::getColorString(DFS->getSubtreeID(N));
+ Str += '"';
+ }
+ return Str;
+ }
+};
+} // namespace llvm
+#endif // NDEBUG
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAGMI::viewGraph(const Twine &Name, const Twine &Title) {
+#ifndef NDEBUG
+ ViewGraph(this, Name, false, Title);
+#else
+ errs() << "ScheduleDAGMI::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAGMI::viewGraph() {
+ viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineSink.cpp b/contrib/llvm/lib/CodeGen/MachineSink.cpp
new file mode 100644
index 000000000000..571a5c1d8005
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineSink.cpp
@@ -0,0 +1,857 @@
+//===-- MachineSink.cpp - Sinking for machine instructions ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass moves instructions into successor blocks when possible, so that
+// they aren't executed on paths where their results aren't needed.
+//
+// This pass is not intended to be a replacement or a complete alternative
+// for an LLVM-IR-level sinking pass. It is only designed to sink simple
+// constructs that are not exposed before lowering and instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-sink"
+
+static cl::opt<bool>
+SplitEdges("machine-sink-split",
+ cl::desc("Split critical edges during machine sinking"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool>
+UseBlockFreqInfo("machine-sink-bfi",
+ cl::desc("Use block frequency info to find successors to sink"),
+ cl::init(true), cl::Hidden);
+
+
+STATISTIC(NumSunk, "Number of machine instructions sunk");
+STATISTIC(NumSplit, "Number of critical edges split");
+STATISTIC(NumCoalesces, "Number of copies coalesced");
+
+namespace {
+ class MachineSinking : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI; // Machine register information
+ MachineDominatorTree *DT; // Machine dominator tree
+ MachinePostDominatorTree *PDT; // Machine post dominator tree
+ MachineLoopInfo *LI;
+ const MachineBlockFrequencyInfo *MBFI;
+ AliasAnalysis *AA;
+
+ // Remember which edges have been considered for breaking.
+ SmallSet<std::pair<MachineBasicBlock*,MachineBasicBlock*>, 8>
+ CEBCandidates;
+    // Remember which edges we are about to split. Unlike CEBCandidates,
+    // these edges will actually be split.
+ SetVector<std::pair<MachineBasicBlock*,MachineBasicBlock*> > ToSplit;
+
+ SparseBitVector<> RegsToClearKillFlags;
+
+ typedef std::map<MachineBasicBlock *, SmallVector<MachineBasicBlock *, 4>>
+ AllSuccsCache;
+
+ public:
+ static char ID; // Pass identification
+ MachineSinking() : MachineFunctionPass(ID) {
+ initializeMachineSinkingPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ if (UseBlockFreqInfo)
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ }
+
+ void releaseMemory() override {
+ CEBCandidates.clear();
+ }
+
+ private:
+ bool ProcessBlock(MachineBasicBlock &MBB);
+ bool isWorthBreakingCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To);
+ /// \brief Postpone the splitting of the given critical
+ /// edge (\p From, \p To).
+ ///
+ /// We do not split the edges on the fly. Indeed, this invalidates
+ /// the dominance information and thus triggers a lot of updates
+ /// of that information underneath.
+    /// Instead, we postpone all the splits until after each iteration of
+    /// the main loop. That way, the information is at least valid
+ /// for the lifetime of an iteration.
+ ///
+ /// \return True if the edge is marked as toSplit, false otherwise.
+ /// False can be returned if, for instance, this is not profitable.
+ bool PostponeSplitCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To,
+ bool BreakPHIEdge);
+ bool SinkInstruction(MachineInstr &MI, bool &SawStore,
+ AllSuccsCache &AllSuccessors);
+ bool AllUsesDominatedByBlock(unsigned Reg, MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB,
+ bool &BreakPHIEdge, bool &LocalUse) const;
+ MachineBasicBlock *FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool &BreakPHIEdge, AllSuccsCache &AllSuccessors);
+ bool isProfitableToSinkTo(unsigned Reg, MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *SuccToSinkTo,
+ AllSuccsCache &AllSuccessors);
+
+ bool PerformTrivialForwardCoalescing(MachineInstr &MI,
+ MachineBasicBlock *MBB);
+
+ SmallVector<MachineBasicBlock *, 4> &
+ GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
+ AllSuccsCache &AllSuccessors) const;
+ };
+} // end anonymous namespace
+
+char MachineSinking::ID = 0;
+char &llvm::MachineSinkingID = MachineSinking::ID;
+INITIALIZE_PASS_BEGIN(MachineSinking, "machine-sink",
+ "Machine code sinking", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(MachineSinking, "machine-sink",
+ "Machine code sinking", false, false)
+
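+/// Fold a trivial forward COPY: when MI is "%dst = COPY %src" between virtual
+/// registers of the same class and the COPY is the only non-debug use of
+/// %src, rewrite %dst to %src and erase the COPY. Illustrative example:
+///   %vreg1 = ADD %vreg2, %vreg3
+///   %vreg0 = COPY %vreg1   ; only non-debug use of %vreg1
+/// After coalescing, users of %vreg0 read %vreg1 directly and the COPY is
+/// removed.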
+bool MachineSinking::PerformTrivialForwardCoalescing(MachineInstr &MI,
+ MachineBasicBlock *MBB) {
+ if (!MI.isCopy())
+ return false;
+
+ unsigned SrcReg = MI.getOperand(1).getReg();
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
+ !TargetRegisterInfo::isVirtualRegister(DstReg) ||
+ !MRI->hasOneNonDBGUse(SrcReg))
+ return false;
+
+ const TargetRegisterClass *SRC = MRI->getRegClass(SrcReg);
+ const TargetRegisterClass *DRC = MRI->getRegClass(DstReg);
+ if (SRC != DRC)
+ return false;
+
+ MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
+ if (DefMI->isCopyLike())
+ return false;
+ DEBUG(dbgs() << "Coalescing: " << *DefMI);
+ DEBUG(dbgs() << "*** to: " << MI);
+ MRI->replaceRegWith(DstReg, SrcReg);
+ MI.eraseFromParent();
+
+ // Conservatively, clear any kill flags, since it's possible that they are no
+ // longer correct.
+ MRI->clearKillFlags(SrcReg);
+
+ ++NumCoalesces;
+ return true;
+}
+
+/// AllUsesDominatedByBlock - Return true if all uses of the specified register
+/// occur in blocks dominated by the specified block. If any use is in the
+/// definition block, then return false, since it is never legal to move a def
+/// after its uses.
+bool
+MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB,
+ bool &BreakPHIEdge,
+ bool &LocalUse) const {
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Only makes sense for vregs");
+
+ // Ignore debug uses because debug info doesn't affect the code.
+ if (MRI->use_nodbg_empty(Reg))
+ return true;
+
+  // BreakPHIEdge is true if all the uses are in the successor MBB being sunk
+ // into and they are all PHI nodes. In this case, machine-sink must break
+ // the critical edge first. e.g.
+ //
+ // BB#1: derived from LLVM BB %bb4.preheader
+ // Predecessors according to CFG: BB#0
+ // ...
+ // %reg16385<def> = DEC64_32r %reg16437, %EFLAGS<imp-def,dead>
+ // ...
+ // JE_4 <BB#37>, %EFLAGS<imp-use>
+ // Successors according to CFG: BB#37 BB#2
+ //
+ // BB#2: derived from LLVM BB %bb.nph
+ // Predecessors according to CFG: BB#0 BB#1
+ // %reg16386<def> = PHI %reg16434, <BB#0>, %reg16385, <BB#1>
+ BreakPHIEdge = true;
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ MachineInstr *UseInst = MO.getParent();
+ unsigned OpNo = &MO - &UseInst->getOperand(0);
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ if (!(UseBlock == MBB && UseInst->isPHI() &&
+ UseInst->getOperand(OpNo+1).getMBB() == DefMBB)) {
+ BreakPHIEdge = false;
+ break;
+ }
+ }
+ if (BreakPHIEdge)
+ return true;
+
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ // Determine the block of the use.
+ MachineInstr *UseInst = MO.getParent();
+ unsigned OpNo = &MO - &UseInst->getOperand(0);
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ if (UseInst->isPHI()) {
+ // PHI nodes use the operand in the predecessor block, not the block with
+ // the PHI.
+ UseBlock = UseInst->getOperand(OpNo+1).getMBB();
+ } else if (UseBlock == DefMBB) {
+ LocalUse = true;
+ return false;
+ }
+
+ // Check that it dominates.
+ if (!DT->dominates(MBB, UseBlock))
+ return false;
+ }
+
+ return true;
+}
+
+bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ DEBUG(dbgs() << "******** Machine Sinking ********\n");
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
+ LI = &getAnalysis<MachineLoopInfo>();
+ MBFI = UseBlockFreqInfo ? &getAnalysis<MachineBlockFrequencyInfo>() : nullptr;
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ bool EverMadeChange = false;
+
+ while (1) {
+ bool MadeChange = false;
+
+ // Process all basic blocks.
+ CEBCandidates.clear();
+ ToSplit.clear();
+ for (auto &MBB: MF)
+ MadeChange |= ProcessBlock(MBB);
+
+ // If we have anything we marked as toSplit, split it now.
+ for (auto &Pair : ToSplit) {
+ auto NewSucc = Pair.first->SplitCriticalEdge(Pair.second, *this);
+ if (NewSucc != nullptr) {
+ DEBUG(dbgs() << " *** Splitting critical edge:"
+ " BB#" << Pair.first->getNumber()
+ << " -- BB#" << NewSucc->getNumber()
+ << " -- BB#" << Pair.second->getNumber() << '\n');
+ MadeChange = true;
+ ++NumSplit;
+ } else
+ DEBUG(dbgs() << " *** Not legal to break critical edge\n");
+ }
+ // If this iteration over the code changed anything, keep iterating.
+ if (!MadeChange) break;
+ EverMadeChange = true;
+ }
+
+ // Now clear any kill flags for recorded registers.
+ for (auto I : RegsToClearKillFlags)
+ MRI->clearKillFlags(I);
+ RegsToClearKillFlags.clear();
+
+ return EverMadeChange;
+}
+
+bool MachineSinking::ProcessBlock(MachineBasicBlock &MBB) {
+  // Can't sink anything out of a block that has fewer than two successors.
+ if (MBB.succ_size() <= 1 || MBB.empty()) return false;
+
+ // Don't bother sinking code out of unreachable blocks. In addition to being
+ // unprofitable, it can also lead to infinite looping, because in an
+ // unreachable loop there may be nowhere to stop.
+ if (!DT->isReachableFromEntry(&MBB)) return false;
+
+ bool MadeChange = false;
+
+ // Cache all successors, sorted by frequency info and loop depth.
+ AllSuccsCache AllSuccessors;
+
+ // Walk the basic block bottom-up. Remember if we saw a store.
+ MachineBasicBlock::iterator I = MBB.end();
+ --I;
+ bool ProcessedBegin, SawStore = false;
+ do {
+ MachineInstr &MI = *I; // The instruction to sink.
+
+ // Predecrement I (if it's not begin) so that it isn't invalidated by
+ // sinking.
+ ProcessedBegin = I == MBB.begin();
+ if (!ProcessedBegin)
+ --I;
+
+ if (MI.isDebugValue())
+ continue;
+
+ bool Joined = PerformTrivialForwardCoalescing(MI, &MBB);
+ if (Joined) {
+ MadeChange = true;
+ continue;
+ }
+
+ if (SinkInstruction(MI, SawStore, AllSuccessors)) {
+ ++NumSunk;
+ MadeChange = true;
+ }
+
+ // If we just processed the first instruction in the block, we're done.
+ } while (!ProcessedBegin);
+
+ return MadeChange;
+}
+
+bool MachineSinking::isWorthBreakingCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *From,
+ MachineBasicBlock *To) {
+ // FIXME: Need much better heuristics.
+
+ // If the pass has already considered breaking this edge (during this pass
+ // through the function), then let's go ahead and break it. This means
+ // sinking multiple "cheap" instructions into the same block.
+ if (!CEBCandidates.insert(std::make_pair(From, To)).second)
+ return true;
+
+ if (!MI.isCopy() && !TII->isAsCheapAsAMove(MI))
+ return true;
+
+ // MI is cheap, we probably don't want to break the critical edge for it.
+ // However, if this would allow some definitions of its source operands
+ // to be sunk then it's probably worth it.
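+  // Illustrative case: a cheap "%vreg2 = ADD %vreg1, 8" that is the only user
+  // of %vreg1, where %vreg1 is defined in this same block, is worth the split
+  // because both instructions can then be sunk together.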
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ // We don't move live definitions of physical registers,
+ // so sinking their uses won't enable any opportunities.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+
+ // If this instruction is the only user of a virtual register,
+ // check if breaking the edge will enable sinking
+ // both this instruction and the defining instruction.
+ if (MRI->hasOneNonDBGUse(Reg)) {
+      // If the definition resides in the same MBB, it's likely we can sink
+      // these together. If the definition resides elsewhere, we aren't
+      // blocking it from being sunk, so don't break the edge.
+ MachineInstr *DefMI = MRI->getVRegDef(Reg);
+ if (DefMI->getParent() == MI.getParent())
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool MachineSinking::PostponeSplitCriticalEdge(MachineInstr &MI,
+ MachineBasicBlock *FromBB,
+ MachineBasicBlock *ToBB,
+ bool BreakPHIEdge) {
+ if (!isWorthBreakingCriticalEdge(MI, FromBB, ToBB))
+ return false;
+
+  // Avoid breaking a back edge. From == To means a backedge for a
+  // single-BB loop.
+ if (!SplitEdges || FromBB == ToBB)
+ return false;
+
+ // Check for backedges of more "complex" loops.
+ if (LI->getLoopFor(FromBB) == LI->getLoopFor(ToBB) &&
+ LI->isLoopHeader(ToBB))
+ return false;
+
+ // It's not always legal to break critical edges and sink the computation
+ // to the edge.
+ //
+ // BB#1:
+ // v1024
+ // Beq BB#3
+ // <fallthrough>
+ // BB#2:
+ // ... no uses of v1024
+ // <fallthrough>
+ // BB#3:
+ // ...
+ // = v1024
+ //
+ // If BB#1 -> BB#3 edge is broken and computation of v1024 is inserted:
+ //
+ // BB#1:
+ // ...
+ // Bne BB#2
+ // BB#4:
+ // v1024 =
+ // B BB#3
+ // BB#2:
+ // ... no uses of v1024
+ // <fallthrough>
+ // BB#3:
+ // ...
+ // = v1024
+ //
+ // This is incorrect since v1024 is not computed along the BB#1->BB#2->BB#3
+  // flow. We need to ensure that the new basic block to which the computation
+  // is sunk dominates all the uses.
+  // It's only legal to break the critical edge and sink the computation to
+  // the new block if all the predecessors of "To", except for "From", are
+  // not dominated by "From". Given the SSA property, this means these
+  // predecessors are dominated by "To".
+ //
+ // There is no need to do this check if all the uses are PHI nodes. PHI
+ // sources are only defined on the specific predecessor edges.
+ if (!BreakPHIEdge) {
+ for (MachineBasicBlock::pred_iterator PI = ToBB->pred_begin(),
+ E = ToBB->pred_end(); PI != E; ++PI) {
+ if (*PI == FromBB)
+ continue;
+ if (!DT->dominates(ToBB, *PI))
+ return false;
+ }
+ }
+
+ ToSplit.insert(std::make_pair(FromBB, ToBB));
+
+ return true;
+}
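+// Note on the dominance check above: beyond the backedge tests, the
+// FromBB -> ToBB edge is only recorded in ToSplit when every predecessor of
+// ToBB other than FromBB is dominated by ToBB itself, so the block inserted on
+// the split edge will dominate all uses of the sunk value. When all uses are
+// PHI nodes (BreakPHIEdge), that check is skipped because PHI sources are only
+// read on their specific predecessor edge.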
+
+/// collectDebugValues - Scan instructions following MI and collect any
+/// matching DBG_VALUEs.
+static void collectDebugValues(MachineInstr &MI,
+ SmallVectorImpl<MachineInstr *> &DbgValues) {
+ DbgValues.clear();
+ if (!MI.getOperand(0).isReg())
+ return;
+
+ MachineBasicBlock::iterator DI = MI; ++DI;
+ for (MachineBasicBlock::iterator DE = MI.getParent()->end();
+ DI != DE; ++DI) {
+ if (!DI->isDebugValue())
+ return;
+ if (DI->getOperand(0).isReg() &&
+ DI->getOperand(0).getReg() == MI.getOperand(0).getReg())
+ DbgValues.push_back(&*DI);
+ }
+}
+
+/// isProfitableToSinkTo - Return true if it is profitable to sink MI.
+bool MachineSinking::isProfitableToSinkTo(unsigned Reg, MachineInstr &MI,
+ MachineBasicBlock *MBB,
+ MachineBasicBlock *SuccToSinkTo,
+ AllSuccsCache &AllSuccessors) {
+ assert (SuccToSinkTo && "Invalid SinkTo Candidate BB");
+
+ if (MBB == SuccToSinkTo)
+ return false;
+
+ // It is profitable if SuccToSinkTo does not post dominate current block.
+ if (!PDT->dominates(SuccToSinkTo, MBB))
+ return true;
+
+ // It is profitable to sink an instruction from a deeper loop to a shallower
+ // loop, even if the latter post-dominates the former (PR21115).
+ if (LI->getLoopDepth(MBB) > LI->getLoopDepth(SuccToSinkTo))
+ return true;
+
+  // Check whether all uses in the post-dominated block are PHI instructions.
+ bool NonPHIUse = false;
+ for (MachineInstr &UseInst : MRI->use_nodbg_instructions(Reg)) {
+ MachineBasicBlock *UseBlock = UseInst.getParent();
+ if (UseBlock == SuccToSinkTo && !UseInst.isPHI())
+ NonPHIUse = true;
+ }
+ if (!NonPHIUse)
+ return true;
+
+  // Even if SuccToSinkTo post dominates MBB, sinking may still be profitable
+  // if MI can be profitably sunk further into another block in a later round.
+ bool BreakPHIEdge = false;
+ // FIXME - If finding successor is compile time expensive then cache results.
+ if (MachineBasicBlock *MBB2 =
+ FindSuccToSinkTo(MI, SuccToSinkTo, BreakPHIEdge, AllSuccessors))
+ return isProfitableToSinkTo(Reg, MI, SuccToSinkTo, MBB2, AllSuccessors);
+
+  // If SuccToSinkTo is the final destination and it post-dominates the current
+  // block, it is not profitable to sink MI into SuccToSinkTo.
+ return false;
+}
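+// Summary of the tests above: sinking is considered profitable when
+// SuccToSinkTo does not post-dominate MBB, when it sits at a shallower loop
+// depth, when its only uses of Reg are PHI nodes, or when MI could be sunk
+// onward from SuccToSinkTo in a later round; a post-dominating final
+// destination is rejected.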
+
+/// Get the sorted sequence of successors for this MachineBasicBlock, possibly
+/// computing it if it was not already cached.
+SmallVector<MachineBasicBlock *, 4> &
+MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
+ AllSuccsCache &AllSuccessors) const {
+
+ // Do we have the sorted successors in cache ?
+ auto Succs = AllSuccessors.find(MBB);
+ if (Succs != AllSuccessors.end())
+ return Succs->second;
+
+ SmallVector<MachineBasicBlock *, 4> AllSuccs(MBB->succ_begin(),
+ MBB->succ_end());
+
+ // Handle cases where sinking can happen but where the sink point isn't a
+ // successor. For example:
+ //
+ // x = computation
+ // if () {} else {}
+ // use x
+ //
+ const std::vector<MachineDomTreeNode *> &Children =
+ DT->getNode(MBB)->getChildren();
+ for (const auto &DTChild : Children)
+ // DomTree children of MBB that have MBB as immediate dominator are added.
+ if (DTChild->getIDom()->getBlock() == MI.getParent() &&
+ // Skip MBBs already added to the AllSuccs vector above.
+ !MBB->isSuccessor(DTChild->getBlock()))
+ AllSuccs.push_back(DTChild->getBlock());
+
+ // Sort Successors according to their loop depth or block frequency info.
+ std::stable_sort(
+ AllSuccs.begin(), AllSuccs.end(),
+ [this](const MachineBasicBlock *L, const MachineBasicBlock *R) {
+ uint64_t LHSFreq = MBFI ? MBFI->getBlockFreq(L).getFrequency() : 0;
+ uint64_t RHSFreq = MBFI ? MBFI->getBlockFreq(R).getFrequency() : 0;
+ bool HasBlockFreq = LHSFreq != 0 && RHSFreq != 0;
+ return HasBlockFreq ? LHSFreq < RHSFreq
+ : LI->getLoopDepth(L) < LI->getLoopDepth(R);
+ });
+
+ auto it = AllSuccessors.insert(std::make_pair(MBB, AllSuccs));
+
+ return it.first->second;
+}
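+// Example of the resulting order: with block frequency info available, a
+// successor with frequency 100 sorts before one with frequency 1000; if either
+// frequency is 0, the comparison falls back to loop depth, so a depth-0
+// successor sorts before a depth-2 one.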
+
+/// FindSuccToSinkTo - Find a successor to sink this instruction to.
+MachineBasicBlock *
+MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
+ bool &BreakPHIEdge,
+ AllSuccsCache &AllSuccessors) {
+ assert (MBB && "Invalid MachineBasicBlock!");
+
+ // Loop over all the operands of the specified instruction. If there is
+ // anything we can't handle, bail out.
+
+ // SuccToSinkTo - This is the successor to sink this instruction to, once we
+ // decide.
+ MachineBasicBlock *SuccToSinkTo = nullptr;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue; // Ignore non-register operands.
+
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ if (!MRI->isConstantPhysReg(Reg, *MBB->getParent()))
+ return nullptr;
+ } else if (!MO.isDead()) {
+ // A def that isn't dead. We can't move it.
+ return nullptr;
+ }
+ } else {
+ // Virtual register uses are always safe to sink.
+ if (MO.isUse()) continue;
+
+ // If it's not safe to move defs of the register class, then abort.
+ if (!TII->isSafeToMoveRegClassDefs(MRI->getRegClass(Reg)))
+ return nullptr;
+
+ // Virtual register defs can only be sunk if all their uses are in blocks
+ // dominated by one of the successors.
+ if (SuccToSinkTo) {
+ // If a previous operand picked a block to sink to, then this operand
+ // must be sinkable to the same block.
+ bool LocalUse = false;
+ if (!AllUsesDominatedByBlock(Reg, SuccToSinkTo, MBB,
+ BreakPHIEdge, LocalUse))
+ return nullptr;
+
+ continue;
+ }
+
+ // Otherwise, we should look at all the successors and decide which one
+ // we should sink to. If we have reliable block frequency information
+ // (frequency != 0) available, give successors with smaller frequencies
+ // higher priority, otherwise prioritize smaller loop depths.
+ for (MachineBasicBlock *SuccBlock :
+ GetAllSortedSuccessors(MI, MBB, AllSuccessors)) {
+ bool LocalUse = false;
+ if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB,
+ BreakPHIEdge, LocalUse)) {
+ SuccToSinkTo = SuccBlock;
+ break;
+ }
+ if (LocalUse)
+ // Def is used locally, it's never safe to move this def.
+ return nullptr;
+ }
+
+ // If we couldn't find a block to sink to, ignore this instruction.
+ if (!SuccToSinkTo)
+ return nullptr;
+ if (!isProfitableToSinkTo(Reg, MI, MBB, SuccToSinkTo, AllSuccessors))
+ return nullptr;
+ }
+ }
+
+ // It is not possible to sink an instruction into its own block. This can
+ // happen with loops.
+ if (MBB == SuccToSinkTo)
+ return nullptr;
+
+ // It's not safe to sink instructions to EH landing pad. Control flow into
+ // landing pad is implicitly defined.
+ if (SuccToSinkTo && SuccToSinkTo->isEHPad())
+ return nullptr;
+
+ return SuccToSinkTo;
+}
+
+/// \brief Return true if MI is likely to be usable as a memory operation by the
+/// implicit null check optimization.
+///
+/// This is a "best effort" heuristic, and should not be relied upon for
+/// correctness. This returning true does not guarantee that the implicit null
+/// check optimization is legal over MI, and this returning false does not
+/// guarantee MI cannot possibly be used to do a null check.
+static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ typedef TargetInstrInfo::MachineBranchPredicate MachineBranchPredicate;
+
+ auto *MBB = MI.getParent();
+ if (MBB->pred_size() != 1)
+ return false;
+
+ auto *PredMBB = *MBB->pred_begin();
+ auto *PredBB = PredMBB->getBasicBlock();
+
+ // Frontends that don't use implicit null checks have no reason to emit
+ // branches with make.implicit metadata, and this function should always
+ // return false for them.
+ if (!PredBB ||
+ !PredBB->getTerminator()->getMetadata(LLVMContext::MD_make_implicit))
+ return false;
+
+ unsigned BaseReg;
+ int64_t Offset;
+ if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI))
+ return false;
+
+ if (!(MI.mayLoad() && !MI.isPredicable()))
+ return false;
+
+ MachineBranchPredicate MBP;
+ if (TII->analyzeBranchPredicate(*PredMBB, MBP, false))
+ return false;
+
+ return MBP.LHS.isReg() && MBP.RHS.isImm() && MBP.RHS.getImm() == 0 &&
+ (MBP.Predicate == MachineBranchPredicate::PRED_NE ||
+ MBP.Predicate == MachineBranchPredicate::PRED_EQ) &&
+ MBP.LHS.getReg() == BaseReg;
+}
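+// Shape of the pattern matched above: MBB has a single predecessor whose IR
+// terminator carries make.implicit metadata and branches on BaseReg compared
+// (EQ or NE) against 0, and MI is a non-predicable load addressed off that
+// same BaseReg. Sinking such a load could keep the implicit-null-check pass
+// from folding it into the null test.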
+
+/// SinkInstruction - Determine whether it is safe to sink the specified machine
+/// instruction out of its current block into a successor.
+bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
+ AllSuccsCache &AllSuccessors) {
+ // Don't sink instructions that the target prefers not to sink.
+ if (!TII->shouldSink(MI))
+ return false;
+
+ // Check if it's safe to move the instruction.
+ if (!MI.isSafeToMove(AA, SawStore))
+ return false;
+
+ // Convergent operations may not be made control-dependent on additional
+ // values.
+ if (MI.isConvergent())
+ return false;
+
+ // Don't break implicit null checks. This is a performance heuristic, and not
+ // required for correctness.
+ if (SinkingPreventsImplicitNullCheck(MI, TII, TRI))
+ return false;
+
+ // FIXME: This should include support for sinking instructions within the
+ // block they are currently in to shorten the live ranges. We often get
+ // instructions sunk into the top of a large block, but it would be better to
+ // also sink them down before their first use in the block. This xform has to
+ // be careful not to *increase* register pressure though, e.g. sinking
+ // "x = y + z" down if it kills y and z would increase the live ranges of y
+ // and z and only shrink the live range of x.
+
+ bool BreakPHIEdge = false;
+ MachineBasicBlock *ParentBlock = MI.getParent();
+ MachineBasicBlock *SuccToSinkTo =
+ FindSuccToSinkTo(MI, ParentBlock, BreakPHIEdge, AllSuccessors);
+
+  // FindSuccToSinkTo returns null when there is nowhere suitable to sink to,
+  // e.g. when MI defines no sinkable virtual register.
+ if (!SuccToSinkTo)
+ return false;
+
+
+ // If the instruction to move defines a dead physical register which is live
+ // when leaving the basic block, don't move it because it could turn into a
+ // "zombie" define of that preg. E.g., EFLAGS. (<rdar://problem/8030636>)
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI.getOperand(I);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0 || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ if (SuccToSinkTo->isLiveIn(Reg))
+ return false;
+ }
+
+ DEBUG(dbgs() << "Sink instr " << MI << "\tinto block " << *SuccToSinkTo);
+
+ // If the block has multiple predecessors, this is a critical edge.
+ // Decide if we can sink along it or need to break the edge.
+ if (SuccToSinkTo->pred_size() > 1) {
+ // We cannot sink a load across a critical edge - there may be stores in
+ // other code paths.
+ bool TryBreak = false;
+ bool store = true;
+ if (!MI.isSafeToMove(AA, store)) {
+ DEBUG(dbgs() << " *** NOTE: Won't sink load along critical edge.\n");
+ TryBreak = true;
+ }
+
+ // We don't want to sink across a critical edge if we don't dominate the
+ // successor. We could be introducing calculations to new code paths.
+ if (!TryBreak && !DT->dominates(ParentBlock, SuccToSinkTo)) {
+ DEBUG(dbgs() << " *** NOTE: Critical edge found\n");
+ TryBreak = true;
+ }
+
+ // Don't sink instructions into a loop.
+ if (!TryBreak && LI->isLoopHeader(SuccToSinkTo)) {
+ DEBUG(dbgs() << " *** NOTE: Loop header found\n");
+ TryBreak = true;
+ }
+
+ // Otherwise we are OK with sinking along a critical edge.
+ if (!TryBreak)
+ DEBUG(dbgs() << "Sinking along critical edge.\n");
+ else {
+ // Mark this edge as to be split.
+ // If the edge can actually be split, the next iteration of the main loop
+ // will sink MI in the newly created block.
+ bool Status =
+ PostponeSplitCriticalEdge(MI, ParentBlock, SuccToSinkTo, BreakPHIEdge);
+ if (!Status)
+ DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
+ // The instruction will not be sunk this time.
+ return false;
+ }
+ }
+
+ if (BreakPHIEdge) {
+    // BreakPHIEdge is true if all the uses are in the successor MBB being
+    // sunk into and they are all PHI nodes. In this case, machine-sink must
+    // break the critical edge first.
+ bool Status = PostponeSplitCriticalEdge(MI, ParentBlock,
+ SuccToSinkTo, BreakPHIEdge);
+ if (!Status)
+ DEBUG(dbgs() << " *** PUNTING: Not legal or profitable to "
+ "break critical edge\n");
+ // The instruction will not be sunk this time.
+ return false;
+ }
+
+ // Determine where to insert into. Skip phi nodes.
+ MachineBasicBlock::iterator InsertPos = SuccToSinkTo->begin();
+ while (InsertPos != SuccToSinkTo->end() && InsertPos->isPHI())
+ ++InsertPos;
+
+  // Collect matching debug values.
+ SmallVector<MachineInstr *, 2> DbgValuesToSink;
+ collectDebugValues(MI, DbgValuesToSink);
+
+ // Move the instruction.
+ SuccToSinkTo->splice(InsertPos, ParentBlock, MI,
+ ++MachineBasicBlock::iterator(MI));
+
+ // Move debug values.
+ for (SmallVectorImpl<MachineInstr *>::iterator DBI = DbgValuesToSink.begin(),
+ DBE = DbgValuesToSink.end(); DBI != DBE; ++DBI) {
+ MachineInstr *DbgMI = *DBI;
+ SuccToSinkTo->splice(InsertPos, ParentBlock, DbgMI,
+ ++MachineBasicBlock::iterator(DbgMI));
+ }
+
+ // Conservatively, clear any kill flags, since it's possible that they are no
+ // longer correct.
+ // Note that we have to clear the kill flags for any register this instruction
+ // uses as we may sink over another instruction which currently kills the
+ // used registers.
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isUse())
+ RegsToClearKillFlags.set(MO.getReg()); // Remember to clear kill flags.
+ }
+
+ return true;
+}
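+// Outline of SinkInstruction above: after the early bail-outs (target
+// preference, safety, convergence, implicit null checks), pick a destination
+// with FindSuccToSinkTo, refuse if MI defines a physical register live into
+// that block, postpone via PostponeSplitCriticalEdge when a critical edge must
+// be split, and otherwise splice MI and its trailing DBG_VALUEs in front of
+// the first non-PHI instruction of the destination, recording MI's used
+// registers so their kill flags can be cleared.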
diff --git a/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
new file mode 100644
index 000000000000..86332c8a93a5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineTraceMetrics.cpp
@@ -0,0 +1,1328 @@
+//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "machine-trace-metrics"
+
+char MachineTraceMetrics::ID = 0;
+char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID;
+
+INITIALIZE_PASS_BEGIN(MachineTraceMetrics,
+ "machine-trace-metrics", "Machine Trace Metrics", false, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(MachineTraceMetrics,
+ "machine-trace-metrics", "Machine Trace Metrics", false, true)
+
+MachineTraceMetrics::MachineTraceMetrics()
+ : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr),
+ MRI(nullptr), Loops(nullptr) {
+ std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
+}
+
+void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const TargetSubtargetInfo &ST = MF->getSubtarget();
+ TII = ST.getInstrInfo();
+ TRI = ST.getRegisterInfo();
+ MRI = &MF->getRegInfo();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
+ BlockInfo.resize(MF->getNumBlockIDs());
+ ProcResourceCycles.resize(MF->getNumBlockIDs() *
+ SchedModel.getNumProcResourceKinds());
+ return false;
+}
+
+void MachineTraceMetrics::releaseMemory() {
+ MF = nullptr;
+ BlockInfo.clear();
+ for (unsigned i = 0; i != TS_NumStrategies; ++i) {
+ delete Ensembles[i];
+ Ensembles[i] = nullptr;
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Fixed block information
+//===----------------------------------------------------------------------===//
+//
+// The number of instructions in a basic block and the CPU resources used by
+// those instructions don't depend on any given trace strategy.
+
+/// Compute the resource usage in basic block MBB.
+const MachineTraceMetrics::FixedBlockInfo*
+MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) {
+ assert(MBB && "No basic block");
+ FixedBlockInfo *FBI = &BlockInfo[MBB->getNumber()];
+ if (FBI->hasResources())
+ return FBI;
+
+ // Compute resource usage in the block.
+ FBI->HasCalls = false;
+ unsigned InstrCount = 0;
+
+ // Add up per-processor resource cycles as well.
+ unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+ SmallVector<unsigned, 32> PRCycles(PRKinds);
+
+ for (const auto &MI : *MBB) {
+ if (MI.isTransient())
+ continue;
+ ++InstrCount;
+ if (MI.isCall())
+ FBI->HasCalls = true;
+
+ // Count processor resources used.
+ if (!SchedModel.hasInstrSchedModel())
+ continue;
+ const MCSchedClassDesc *SC = SchedModel.resolveSchedClass(&MI);
+ if (!SC->isValid())
+ continue;
+
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+ assert(PI->ProcResourceIdx < PRKinds && "Bad processor resource kind");
+ PRCycles[PI->ProcResourceIdx] += PI->Cycles;
+ }
+ }
+ FBI->InstrCount = InstrCount;
+
+ // Scale the resource cycles so they are comparable.
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceCycles[PROffset + K] =
+ PRCycles[K] * SchedModel.getResourceFactor(K);
+
+ return FBI;
+}
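+// The per-block numbers stored above are pre-scaled: the entry
+// ProcResourceCycles[MBB->getNumber() * PRKinds + K] holds the cycles spent on
+// resource kind K multiplied by getResourceFactor(K), so usage of different
+// resource kinds can be compared and summed directly. getCycles() is used
+// later to convert a scaled total back into a cycle count.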
+
+ArrayRef<unsigned>
+MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
+ assert(BlockInfo[MBBNum].hasResources() &&
+ "getResources() must be called before getProcResourceCycles()");
+ unsigned PRKinds = SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceCycles.size());
+ return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Ensemble utility functions
+//===----------------------------------------------------------------------===//
+
+MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
+ : MTM(*ct) {
+ BlockInfo.resize(MTM.BlockInfo.size());
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ ProcResourceDepths.resize(MTM.BlockInfo.size() * PRKinds);
+ ProcResourceHeights.resize(MTM.BlockInfo.size() * PRKinds);
+}
+
+// Virtual destructor serves as an anchor.
+MachineTraceMetrics::Ensemble::~Ensemble() {}
+
+const MachineLoop*
+MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
+ return MTM.Loops->getLoopFor(MBB);
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace above MBB.
+void MachineTraceMetrics::Ensemble::
+computeDepthResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+
+ // Compute resources from trace above. The top block is simple.
+ if (!TBI->Pred) {
+ TBI->InstrDepth = 0;
+ TBI->Head = MBB->getNumber();
+ std::fill(ProcResourceDepths.begin() + PROffset,
+ ProcResourceDepths.begin() + PROffset + PRKinds, 0);
+ return;
+ }
+
+ // Compute from the block above. A post-order traversal ensures the
+ // predecessor is always computed first.
+ unsigned PredNum = TBI->Pred->getNumber();
+ TraceBlockInfo *PredTBI = &BlockInfo[PredNum];
+ assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet");
+ const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred);
+ TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount;
+ TBI->Head = PredTBI->Head;
+
+ // Compute per-resource depths.
+ ArrayRef<unsigned> PredPRDepths = getProcResourceDepths(PredNum);
+ ArrayRef<unsigned> PredPRCycles = MTM.getProcResourceCycles(PredNum);
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceDepths[PROffset + K] = PredPRDepths[K] + PredPRCycles[K];
+}
+
+// Update resource-related information in the TraceBlockInfo for MBB.
+// Only update resources related to the trace below MBB.
+void MachineTraceMetrics::Ensemble::
+computeHeightResources(const MachineBasicBlock *MBB) {
+ TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ unsigned PROffset = MBB->getNumber() * PRKinds;
+
+ // Compute resources for the current block.
+ TBI->InstrHeight = MTM.getResources(MBB)->InstrCount;
+ ArrayRef<unsigned> PRCycles = MTM.getProcResourceCycles(MBB->getNumber());
+
+ // The trace tail is done.
+ if (!TBI->Succ) {
+ TBI->Tail = MBB->getNumber();
+ std::copy(PRCycles.begin(), PRCycles.end(),
+ ProcResourceHeights.begin() + PROffset);
+ return;
+ }
+
+  // Compute from the block below. A post-order traversal ensures the
+  // successor is always computed first.
+ unsigned SuccNum = TBI->Succ->getNumber();
+ TraceBlockInfo *SuccTBI = &BlockInfo[SuccNum];
+ assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet");
+ TBI->InstrHeight += SuccTBI->InstrHeight;
+ TBI->Tail = SuccTBI->Tail;
+
+ // Compute per-resource heights.
+ ArrayRef<unsigned> SuccPRHeights = getProcResourceHeights(SuccNum);
+ for (unsigned K = 0; K != PRKinds; ++K)
+ ProcResourceHeights[PROffset + K] = SuccPRHeights[K] + PRCycles[K];
+}
+
+// Check if depth resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getDepthResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidDepth() ? TBI : nullptr;
+}
+
+// Check if height resources for MBB are valid and return the TBI.
+// Return NULL if the resources have been invalidated.
+const MachineTraceMetrics::TraceBlockInfo*
+MachineTraceMetrics::Ensemble::
+getHeightResources(const MachineBasicBlock *MBB) const {
+ const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()];
+ return TBI->hasValidHeight() ? TBI : nullptr;
+}
+
+/// Get an array of processor resource depths for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by all blocks preceding MBB in its trace. It does not include instructions
+/// in MBB.
+///
+/// Compare TraceBlockInfo::InstrDepth.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceDepths(unsigned MBBNum) const {
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceDepths.size());
+ return makeArrayRef(ProcResourceDepths.data() + MBBNum * PRKinds, PRKinds);
+}
+
+/// Get an array of processor resource heights for MBB. Indexed by processor
+/// resource kind, this array contains the scaled processor resources consumed
+/// by this block and all blocks following it in its trace.
+///
+/// Compare TraceBlockInfo::InstrHeight.
+ArrayRef<unsigned>
+MachineTraceMetrics::Ensemble::
+getProcResourceHeights(unsigned MBBNum) const {
+ unsigned PRKinds = MTM.SchedModel.getNumProcResourceKinds();
+ assert((MBBNum+1) * PRKinds <= ProcResourceHeights.size());
+ return makeArrayRef(ProcResourceHeights.data() + MBBNum * PRKinds, PRKinds);
+}
+
+//===----------------------------------------------------------------------===//
+// Trace Selection Strategies
+//===----------------------------------------------------------------------===//
+//
+// A trace selection strategy is implemented as a sub-class of Ensemble. The
+// trace through a block B is computed by two DFS traversals of the CFG
+// starting from B. One upwards, and one downwards. During the upwards DFS,
+// pickTracePred() is called on the post-ordered blocks. During the downwards
+// DFS, pickTraceSucc() is called in a post-order.
+//
+
+// We never allow traces that leave loops, but we do allow traces to enter
+// nested loops. We also never allow traces to contain back-edges.
+//
+// This means that a loop header can never appear above the center block of a
+// trace, except as the trace head. Below the center block, loop exiting edges
+// are banned.
+//
+// Return true if an edge from the From loop to the To loop is leaving a loop.
+// Either of To and From can be null.
+static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
+ return From && !From->contains(To);
+}
+
+// MinInstrCountEnsemble - Pick the trace that executes the least number of
+// instructions.
+namespace {
+class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
+ const char *getName() const override { return "MinInstr"; }
+ const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override;
+ const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) override;
+
+public:
+ MinInstrCountEnsemble(MachineTraceMetrics *mtm)
+ : MachineTraceMetrics::Ensemble(mtm) {}
+};
+}
+
+// Select the preferred predecessor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) {
+ if (MBB->pred_empty())
+ return nullptr;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ // Don't leave loops, and never follow back-edges.
+ if (CurLoop && MBB == CurLoop->getHeader())
+ return nullptr;
+ unsigned CurCount = MTM.getResources(MBB)->InstrCount;
+ const MachineBasicBlock *Best = nullptr;
+ unsigned BestDepth = 0;
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ const MachineTraceMetrics::TraceBlockInfo *PredTBI =
+ getDepthResources(Pred);
+ // Ignore cycles that aren't natural loops.
+ if (!PredTBI)
+ continue;
+ // Pick the predecessor that would give this block the smallest InstrDepth.
+ unsigned Depth = PredTBI->InstrDepth + CurCount;
+ if (!Best || Depth < BestDepth) {
+ Best = Pred;
+ BestDepth = Depth;
+ }
+ }
+ return Best;
+}
+
+// Select the preferred successor for MBB.
+const MachineBasicBlock*
+MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) {
+ if (MBB->pred_empty())
+ return nullptr;
+ const MachineLoop *CurLoop = getLoopFor(MBB);
+ const MachineBasicBlock *Best = nullptr;
+ unsigned BestHeight = 0;
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ // Don't consider back-edges.
+ if (CurLoop && Succ == CurLoop->getHeader())
+ continue;
+ // Don't consider successors exiting CurLoop.
+ if (isExitingLoop(CurLoop, getLoopFor(Succ)))
+ continue;
+ const MachineTraceMetrics::TraceBlockInfo *SuccTBI =
+ getHeightResources(Succ);
+ // Ignore cycles that aren't natural loops.
+ if (!SuccTBI)
+ continue;
+ // Pick the successor that would give this block the smallest InstrHeight.
+ unsigned Height = SuccTBI->InstrHeight;
+ if (!Best || Height < BestHeight) {
+ Best = Succ;
+ BestHeight = Height;
+ }
+ }
+ return Best;
+}
+
+// Get an Ensemble sub-class for the requested trace strategy.
+MachineTraceMetrics::Ensemble *
+MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) {
+ assert(strategy < TS_NumStrategies && "Invalid trace strategy enum");
+ Ensemble *&E = Ensembles[strategy];
+ if (E)
+ return E;
+
+ // Allocate new Ensemble on demand.
+ switch (strategy) {
+ case TS_MinInstrCount: return (E = new MinInstrCountEnsemble(this));
+ default: llvm_unreachable("Invalid trace strategy enum");
+ }
+}
+
+void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Invalidate traces through BB#" << MBB->getNumber() << '\n');
+ BlockInfo[MBB->getNumber()].invalidate();
+ for (unsigned i = 0; i != TS_NumStrategies; ++i)
+ if (Ensembles[i])
+ Ensembles[i]->invalidate(MBB);
+}
+
+void MachineTraceMetrics::verifyAnalysis() const {
+ if (!MF)
+ return;
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size");
+ for (unsigned i = 0; i != TS_NumStrategies; ++i)
+ if (Ensembles[i])
+ Ensembles[i]->verify();
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Trace building
+//===----------------------------------------------------------------------===//
+//
+// Traces are built by two CFG traversals. To avoid recomputing too much, use a
+// set abstraction that confines the search to the current loop, and doesn't
+// revisit blocks.
+
+namespace {
+struct LoopBounds {
+ MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
+ SmallPtrSet<const MachineBasicBlock*, 8> Visited;
+ const MachineLoopInfo *Loops;
+ bool Downward;
+ LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
+ const MachineLoopInfo *loops)
+ : Blocks(blocks), Loops(loops), Downward(false) {}
+};
+}
+
+// Specialize po_iterator_storage in order to prune the post-order traversal so
+// it is limited to the current loop and doesn't traverse the loop back edges.
+namespace llvm {
+template<>
+class po_iterator_storage<LoopBounds, true> {
+ LoopBounds &LB;
+public:
+ po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+ void finishPostorder(const MachineBasicBlock*) {}
+
+ bool insertEdge(const MachineBasicBlock *From, const MachineBasicBlock *To) {
+ // Skip already visited To blocks.
+ MachineTraceMetrics::TraceBlockInfo &TBI = LB.Blocks[To->getNumber()];
+ if (LB.Downward ? TBI.hasValidHeight() : TBI.hasValidDepth())
+ return false;
+ // From is null once when To is the trace center block.
+ if (From) {
+ if (const MachineLoop *FromLoop = LB.Loops->getLoopFor(From)) {
+ // Don't follow backedges, don't leave FromLoop when going upwards.
+ if ((LB.Downward ? To : From) == FromLoop->getHeader())
+ return false;
+ // Don't leave FromLoop.
+ if (isExitingLoop(FromLoop, LB.Loops->getLoopFor(To)))
+ return false;
+ }
+ }
+ // To is a new block. Mark the block as visited in case the CFG has cycles
+ // that MachineLoopInfo didn't recognize as a natural loop.
+ return LB.Visited.insert(To).second;
+ }
+};
+}
+
+/// Compute the trace through MBB.
+void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Computing " << getName() << " trace through BB#"
+ << MBB->getNumber() << '\n');
+ // Set up loop bounds for the backwards post-order traversal.
+ LoopBounds Bounds(BlockInfo, MTM.Loops);
+
+ // Run an upwards post-order search for the trace start.
+ Bounds.Downward = false;
+ Bounds.Visited.clear();
+ for (auto I : inverse_post_order_ext(MBB, Bounds)) {
+ DEBUG(dbgs() << " pred for BB#" << I->getNumber() << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the predecessors have been visited, pick the preferred one.
+ TBI.Pred = pickTracePred(I);
+ DEBUG({
+ if (TBI.Pred)
+ dbgs() << "BB#" << TBI.Pred->getNumber() << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leading to I is now known, compute the depth resources.
+ computeDepthResources(I);
+ }
+
+ // Run a downwards post-order search for the trace end.
+ Bounds.Downward = true;
+ Bounds.Visited.clear();
+ for (auto I : post_order_ext(MBB, Bounds)) {
+ DEBUG(dbgs() << " succ for BB#" << I->getNumber() << ": ");
+ TraceBlockInfo &TBI = BlockInfo[I->getNumber()];
+ // All the successors have been visited, pick the preferred one.
+ TBI.Succ = pickTraceSucc(I);
+ DEBUG({
+ if (TBI.Succ)
+ dbgs() << "BB#" << TBI.Succ->getNumber() << '\n';
+ else
+ dbgs() << "null\n";
+ });
+ // The trace leaving I is now known, compute the height resources.
+ computeHeightResources(I);
+ }
+}
+
+/// Invalidate traces through BadMBB.
+void
+MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) {
+ SmallVector<const MachineBasicBlock*, 16> WorkList;
+ TraceBlockInfo &BadTBI = BlockInfo[BadMBB->getNumber()];
+
+ // Invalidate height resources of blocks above MBB.
+ if (BadTBI.hasValidHeight()) {
+ BadTBI.invalidateHeight();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+ << " height.\n");
+ // Find any MBB predecessors that have MBB as their preferred successor.
+ // They are the only ones that need to be invalidated.
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ TraceBlockInfo &TBI = BlockInfo[Pred->getNumber()];
+ if (!TBI.hasValidHeight())
+ continue;
+ if (TBI.Succ == MBB) {
+ TBI.invalidateHeight();
+ WorkList.push_back(Pred);
+ continue;
+ }
+        // Verify that TBI.Succ is actually a successor of Pred.
+ assert((!TBI.Succ || Pred->isSuccessor(TBI.Succ)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Invalidate depth resources of blocks below MBB.
+ if (BadTBI.hasValidDepth()) {
+ BadTBI.invalidateDepth();
+ WorkList.push_back(BadMBB);
+ do {
+ const MachineBasicBlock *MBB = WorkList.pop_back_val();
+ DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName()
+ << " depth.\n");
+ // Find any MBB successors that have MBB as their preferred predecessor.
+ // They are the only ones that need to be invalidated.
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ TraceBlockInfo &TBI = BlockInfo[Succ->getNumber()];
+ if (!TBI.hasValidDepth())
+ continue;
+ if (TBI.Pred == MBB) {
+ TBI.invalidateDepth();
+ WorkList.push_back(Succ);
+ continue;
+ }
+        // Verify that TBI.Pred is actually a predecessor of Succ.
+ assert((!TBI.Pred || Succ->isPredecessor(TBI.Pred)) && "CFG changed");
+ }
+ } while (!WorkList.empty());
+ }
+
+ // Clear any per-instruction data. We only have to do this for BadMBB itself
+ // because the instructions in that block may change. Other blocks may be
+ // invalidated, but their instructions will stay the same, so there is no
+ // need to erase the Cycle entries. They will be overwritten when we
+ // recompute.
+ for (const auto &I : *BadMBB)
+ Cycles.erase(&I);
+}
+
+void MachineTraceMetrics::Ensemble::verify() const {
+#ifndef NDEBUG
+ assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() &&
+ "Outdated BlockInfo size");
+ for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) {
+ const TraceBlockInfo &TBI = BlockInfo[Num];
+ if (TBI.hasValidDepth() && TBI.Pred) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isPredecessor(TBI.Pred) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Pred->getNumber()].hasValidDepth() &&
+ "Trace is broken, depth should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ assert(!(Loop && MBB == Loop->getHeader()) && "Trace contains backedge");
+ }
+ if (TBI.hasValidHeight() && TBI.Succ) {
+ const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num);
+ assert(MBB->isSuccessor(TBI.Succ) && "CFG doesn't match trace");
+ assert(BlockInfo[TBI.Succ->getNumber()].hasValidHeight() &&
+ "Trace is broken, height should have been invalidated.");
+ const MachineLoop *Loop = getLoopFor(MBB);
+ const MachineLoop *SuccLoop = getLoopFor(TBI.Succ);
+ assert(!(Loop && Loop == SuccLoop && TBI.Succ == Loop->getHeader()) &&
+ "Trace contains backedge");
+ }
+ }
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Data Dependencies
+//===----------------------------------------------------------------------===//
+//
+// Compute the depth and height of each instruction based on data dependencies
+// and instruction latencies. These cycle numbers assume that the CPU can issue
+// an infinite number of instructions per cycle as long as their dependencies
+// are ready.
+
+// A data dependency is represented as a defining MI and operand numbers on the
+// defining and using MI.
+namespace {
+struct DataDep {
+ const MachineInstr *DefMI;
+ unsigned DefOp;
+ unsigned UseOp;
+
+ DataDep(const MachineInstr *DefMI, unsigned DefOp, unsigned UseOp)
+ : DefMI(DefMI), DefOp(DefOp), UseOp(UseOp) {}
+
+ /// Create a DataDep from an SSA form virtual register.
+ DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp)
+ : UseOp(UseOp) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg));
+ MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg);
+ assert(!DefI.atEnd() && "Register has no defs");
+ DefMI = DefI->getParent();
+ DefOp = DefI.getOperandNo();
+ assert((++DefI).atEnd() && "Register has multiple defs");
+ }
+};
+}
+
+// Get the input data dependencies that must be ready before UseMI can issue.
+// Return true if UseMI has any physreg operands.
+static bool getDataDeps(const MachineInstr &UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineRegisterInfo *MRI) {
+ // Debug values should not be included in any calculations.
+ if (UseMI.isDebugValue())
+ return false;
+
+ bool HasPhysRegs = false;
+ for (MachineInstr::const_mop_iterator I = UseMI.operands_begin(),
+ E = UseMI.operands_end(); I != E; ++I) {
+ const MachineOperand &MO = *I;
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg)
+ continue;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ HasPhysRegs = true;
+ continue;
+ }
+ // Collect virtual register reads.
+ if (MO.readsReg())
+ Deps.push_back(DataDep(MRI, Reg, UseMI.getOperandNo(I)));
+ }
+ return HasPhysRegs;
+}
+
+// Get the input data dependencies of a PHI instruction, using Pred as the
+// preferred predecessor.
+// This will add at most one dependency to Deps.
+static void getPHIDeps(const MachineInstr &UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ const MachineBasicBlock *Pred,
+ const MachineRegisterInfo *MRI) {
+ // No predecessor at the beginning of a trace. Ignore dependencies.
+ if (!Pred)
+ return;
+ assert(UseMI.isPHI() && UseMI.getNumOperands() % 2 && "Bad PHI");
+ for (unsigned i = 1; i != UseMI.getNumOperands(); i += 2) {
+ if (UseMI.getOperand(i + 1).getMBB() == Pred) {
+ unsigned Reg = UseMI.getOperand(i).getReg();
+ Deps.push_back(DataDep(MRI, Reg, i));
+ return;
+ }
+ }
+}
+
+// Keep track of physreg data dependencies by recording each live register unit.
+// Associate each regunit with an instruction operand. Depending on the
+// direction instructions are scanned, it could be the operand that defined the
+// regunit, or the highest operand to read the regunit.
+namespace {
+struct LiveRegUnit {
+ unsigned RegUnit;
+ unsigned Cycle;
+ const MachineInstr *MI;
+ unsigned Op;
+
+ unsigned getSparseSetIndex() const { return RegUnit; }
+
+ LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {}
+};
+}
+
+// Identify physreg dependencies for UseMI, and update the live regunit
+// tracking set when scanning instructions downwards.
+static void updatePhysDepsDownwards(const MachineInstr *UseMI,
+ SmallVectorImpl<DataDep> &Deps,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<unsigned, 8> Kills;
+ SmallVector<unsigned, 8> LiveDefOps;
+
+ for (MachineInstr::const_mop_iterator MI = UseMI->operands_begin(),
+ ME = UseMI->operands_end(); MI != ME; ++MI) {
+ const MachineOperand &MO = *MI;
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ // Track live defs and kills for updating RegUnits.
+ if (MO.isDef()) {
+ if (MO.isDead())
+ Kills.push_back(Reg);
+ else
+ LiveDefOps.push_back(UseMI->getOperandNo(MI));
+ } else if (MO.isKill())
+ Kills.push_back(Reg);
+ // Identify dependencies.
+ if (!MO.readsReg())
+ continue;
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+ if (I == RegUnits.end())
+ continue;
+ Deps.push_back(DataDep(I->MI, I->Op, UseMI->getOperandNo(MI)));
+ break;
+ }
+ }
+
+ // Update RegUnits to reflect live registers after UseMI.
+ // First kills.
+ for (unsigned Kill : Kills)
+ for (MCRegUnitIterator Units(Kill, TRI); Units.isValid(); ++Units)
+ RegUnits.erase(*Units);
+
+ // Second, live defs.
+ for (unsigned DefOp : LiveDefOps) {
+ for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI);
+ Units.isValid(); ++Units) {
+ LiveRegUnit &LRU = RegUnits[*Units];
+ LRU.MI = UseMI;
+ LRU.Op = DefOp;
+ }
+ }
+}
+
+/// The length of the critical path through a trace is the maximum of two path
+/// lengths:
+///
+/// 1. The maximum height+depth over all instructions in the trace center block.
+///
+/// 2. The longest cross-block dependency chain. For small blocks, it is
+/// possible that the critical path through the trace doesn't include any
+/// instructions in the block.
+///
+/// This function computes the second number from the live-in list of the
+/// center block.
+unsigned MachineTraceMetrics::Ensemble::
+computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) {
+ assert(TBI.HasValidInstrDepths && "Missing depth info");
+ assert(TBI.HasValidInstrHeights && "Missing height info");
+ unsigned MaxLen = 0;
+ for (const LiveInReg &LIR : TBI.LiveIns) {
+ if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg))
+ continue;
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ // Ignore dependencies outside the current trace.
+ const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()];
+ if (!DefTBI.isUsefulDominator(TBI))
+ continue;
+ unsigned Len = LIR.Height + Cycles[DefMI].Depth;
+ MaxLen = std::max(MaxLen, Len);
+ }
+ return MaxLen;
+}
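+// Put another way, the value computed above is the maximum over the trace
+// live-in virtual registers r of (Height(r) + Depth(def(r))), restricted to
+// definitions whose block is a useful dominator within this trace;
+// physical-register live-ins are skipped.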
+
+/// Compute instruction depths for all instructions above or in MBB in its
+/// trace. This assumes that the trace through MBB has already been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrDepths(const MachineBasicBlock *MBB) {
+ // The top of the trace may already be computed, and HasValidInstrDepths
+ // implies Head->HasValidInstrDepths, so we only need to start from the first
+ // block in the trace that needs to be recomputed.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidDepth() && "Incomplete trace");
+ if (TBI.HasValidInstrDepths)
+ break;
+ Stack.push_back(MBB);
+ MBB = TBI.Pred;
+ } while (MBB);
+
+ // FIXME: If MBB is non-null at this point, it is the last pre-computed block
+ // in the trace. We should track any live-out physregs that were defined in
+ // the trace. This is quite rare in SSA form, typically created by CSE
+ // hoisting a compare.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // Go through trace blocks in top-down order, stopping after the center block.
+ SmallVector<DataDep, 8> Deps;
+ while (!Stack.empty()) {
+ MBB = Stack.pop_back_val();
+ DEBUG(dbgs() << "\nDepths for BB#" << MBB->getNumber() << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrDepths = true;
+ TBI.CriticalPath = 0;
+
+ // Print out resource depths here as well.
+ DEBUG({
+ dbgs() << format("%7u Instructions\n", TBI.InstrDepth);
+ ArrayRef<unsigned> PRDepths = getProcResourceDepths(MBB->getNumber());
+ for (unsigned K = 0; K != PRDepths.size(); ++K)
+ if (PRDepths[K]) {
+ unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+ dbgs() << format("%6uc @ ", MTM.getCycles(PRDepths[K]))
+ << MTM.SchedModel.getProcResource(K)->Name << " ("
+ << PRDepths[K]/Factor << " ops x" << Factor << ")\n";
+ }
+ });
+
+ // Also compute the critical path length through MBB when possible.
+ if (TBI.HasValidInstrHeights)
+ TBI.CriticalPath = computeCrossBlockCriticalPath(TBI);
+
+ for (const auto &UseMI : *MBB) {
+ // Collect all data dependencies.
+ Deps.clear();
+ if (UseMI.isPHI())
+ getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI);
+ else if (getDataDeps(UseMI, Deps, MTM.MRI))
+ updatePhysDepsDownwards(&UseMI, Deps, RegUnits, MTM.TRI);
+
+ // Filter and process dependencies, computing the earliest issue cycle.
+ unsigned Cycle = 0;
+ for (const DataDep &Dep : Deps) {
+        const TraceBlockInfo &DepTBI =
+ BlockInfo[Dep.DefMI->getParent()->getNumber()];
+ // Ignore dependencies from outside the current trace.
+ if (!DepTBI.isUsefulDominator(TBI))
+ continue;
+ assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency");
+ unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += MTM.SchedModel
+ .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp);
+ Cycle = std::max(Cycle, DepCycle);
+ }
+ // Remember the instruction depth.
+ InstrCycles &MICycles = Cycles[&UseMI];
+ MICycles.Depth = Cycle;
+
+ if (!TBI.HasValidInstrHeights) {
+ DEBUG(dbgs() << Cycle << '\t' << UseMI);
+ continue;
+ }
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height);
+ DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << UseMI);
+ }
+ }
+}
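+// Net effect of the loop above: for each instruction, Cycles[MI].Depth becomes
+// the maximum over its in-trace data dependencies of the defining
+// instruction's depth plus the operand latency (transient defs contribute
+// latency 0), and TBI.CriticalPath is raised to depth + height whenever the
+// instruction heights are already known.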
+
+// Identify physreg dependencies for MI when scanning instructions upwards.
+// Return the issue height of MI after considering any live regunits.
+// Height is the issue height computed from virtual register dependencies alone.
+static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
+ SparseSet<LiveRegUnit> &RegUnits,
+ const TargetSchedModel &SchedModel,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SmallVector<unsigned, 8> ReadOps;
+
+ for (MachineInstr::const_mop_iterator MOI = MI.operands_begin(),
+ MOE = MI.operands_end();
+ MOI != MOE; ++MOI) {
+ const MachineOperand &MO = *MOI;
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+ if (MO.readsReg())
+ ReadOps.push_back(MI.getOperandNo(MOI));
+ if (!MO.isDef())
+ continue;
+ // This is a def of Reg. Remove corresponding entries from RegUnits, and
+ // update MI Height to consider the physreg dependencies.
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units);
+ if (I == RegUnits.end())
+ continue;
+ unsigned DepHeight = I->Cycle;
+ if (!MI.isTransient()) {
+ // We may not know the UseMI of this dependency, if it came from the
+ // live-in list. SchedModel can handle a NULL UseMI.
+ DepHeight += SchedModel.computeOperandLatency(&MI, MI.getOperandNo(MOI),
+ I->MI, I->Op);
+ }
+ Height = std::max(Height, DepHeight);
+ // This regunit is dead above MI.
+ RegUnits.erase(I);
+ }
+ }
+
+ // Now we know the height of MI. Update any regunits read.
+ for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) {
+ unsigned Reg = MI.getOperand(ReadOps[i]).getReg();
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ LiveRegUnit &LRU = RegUnits[*Units];
+ // Set the height to the highest reader of the unit.
+ if (LRU.Cycle <= Height && LRU.MI != &MI) {
+ LRU.Cycle = Height;
+ LRU.MI = &MI;
+ LRU.Op = ReadOps[i];
+ }
+ }
+ }
+
+ return Height;
+}
+
+
+typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
+
+// Push the height of DefMI upwards if required to match UseMI.
+// Return true if this is the first time DefMI was seen.
+static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI,
+ unsigned UseHeight, MIHeightMap &Heights,
+ const TargetSchedModel &SchedModel,
+ const TargetInstrInfo *TII) {
+ // Adjust height by Dep.DefMI latency.
+ if (!Dep.DefMI->isTransient())
+ UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI,
+ Dep.UseOp);
+
+ // Update Heights[DefMI] to be the maximum height seen.
+ MIHeightMap::iterator I;
+ bool New;
+ std::tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight));
+ if (New)
+ return true;
+
+ // DefMI has been pushed before. Give it the max height.
+ if (I->second < UseHeight)
+ I->second = UseHeight;
+ return false;
+}
+
+/// Assuming that the virtual register defined by DefMI:DefOp was used by
+/// Trace.back(), add it to the live-in lists of all the blocks in Trace. Stop
+/// when reaching the block that contains DefMI.
+void MachineTraceMetrics::Ensemble::
+addLiveIns(const MachineInstr *DefMI, unsigned DefOp,
+ ArrayRef<const MachineBasicBlock*> Trace) {
+ assert(!Trace.empty() && "Trace should contain at least one block");
+ unsigned Reg = DefMI->getOperand(DefOp).getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ const MachineBasicBlock *DefMBB = DefMI->getParent();
+
+ // Reg is live-in to all blocks in Trace that follow DefMBB.
+ for (unsigned i = Trace.size(); i; --i) {
+ const MachineBasicBlock *MBB = Trace[i-1];
+ if (MBB == DefMBB)
+ return;
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ // Just add the register. The height will be updated later.
+ TBI.LiveIns.push_back(Reg);
+ }
+}
+
+/// Compute instruction heights in the trace through MBB. This updates MBB and
+/// the blocks below it in the trace. It is assumed that the trace has already
+/// been computed.
+void MachineTraceMetrics::Ensemble::
+computeInstrHeights(const MachineBasicBlock *MBB) {
+ // The bottom of the trace may already be computed.
+ // Find the blocks that need updating.
+ SmallVector<const MachineBasicBlock*, 8> Stack;
+ do {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ assert(TBI.hasValidHeight() && "Incomplete trace");
+ if (TBI.HasValidInstrHeights)
+ break;
+ Stack.push_back(MBB);
+ TBI.LiveIns.clear();
+ MBB = TBI.Succ;
+ } while (MBB);
+
+ // As we move upwards in the trace, keep track of instructions that are
+ // required by deeper trace instructions. Map MI -> height required so far.
+ MIHeightMap Heights;
+
+ // For physregs, the def isn't known when we see the use.
+ // Instead, keep track of the highest use of each regunit.
+ SparseSet<LiveRegUnit> RegUnits;
+ RegUnits.setUniverse(MTM.TRI->getNumRegUnits());
+
+ // If the bottom of the trace was already precomputed, initialize heights
+ // from its live-in list.
+ // MBB is the highest precomputed block in the trace.
+ if (MBB) {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ for (LiveInReg &LI : TBI.LiveIns) {
+ if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) {
+ // For virtual registers, the def latency is included.
+ unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)];
+ if (Height < LI.Height)
+ Height = LI.Height;
+ } else {
+ // For register units, the def latency is not included because we don't
+ // know the def yet.
+ RegUnits[LI.Reg].Cycle = LI.Height;
+ }
+ }
+ }
+
+ // Go through the trace blocks in bottom-up order.
+ SmallVector<DataDep, 8> Deps;
+ for (;!Stack.empty(); Stack.pop_back()) {
+ MBB = Stack.back();
+ DEBUG(dbgs() << "Heights for BB#" << MBB->getNumber() << ":\n");
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+ TBI.HasValidInstrHeights = true;
+ TBI.CriticalPath = 0;
+
+ DEBUG({
+ dbgs() << format("%7u Instructions\n", TBI.InstrHeight);
+ ArrayRef<unsigned> PRHeights = getProcResourceHeights(MBB->getNumber());
+ for (unsigned K = 0; K != PRHeights.size(); ++K)
+ if (PRHeights[K]) {
+ unsigned Factor = MTM.SchedModel.getResourceFactor(K);
+ dbgs() << format("%6uc @ ", MTM.getCycles(PRHeights[K]))
+ << MTM.SchedModel.getProcResource(K)->Name << " ("
+ << PRHeights[K]/Factor << " ops x" << Factor << ")\n";
+ }
+ });
+
+ // Get dependencies from PHIs in the trace successor.
+ const MachineBasicBlock *Succ = TBI.Succ;
+ // If MBB is the last block in the trace, and it has a back-edge to the
+ // loop header, get loop-carried dependencies from PHIs in the header. For
+ // that purpose, pretend that all the loop header PHIs have height 0.
+ if (!Succ)
+ if (const MachineLoop *Loop = getLoopFor(MBB))
+ if (MBB->isSuccessor(Loop->getHeader()))
+ Succ = Loop->getHeader();
+
+ if (Succ) {
+ for (const auto &PHI : *Succ) {
+ if (!PHI.isPHI())
+ break;
+ Deps.clear();
+ getPHIDeps(PHI, Deps, MBB, MTM.MRI);
+ if (!Deps.empty()) {
+ // Loop header PHI heights are all 0.
+ unsigned Height = TBI.Succ ? Cycles.lookup(&PHI).Height : 0;
+ DEBUG(dbgs() << "pred\t" << Height << '\t' << PHI);
+ if (pushDepHeight(Deps.front(), PHI, Height, Heights, MTM.SchedModel,
+ MTM.TII))
+ addLiveIns(Deps.front().DefMI, Deps.front().DefOp, Stack);
+ }
+ }
+ }
+
+ // Go through the block backwards.
+ for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin();
+ BI != BB;) {
+ const MachineInstr &MI = *--BI;
+
+ // Find the MI height as determined by virtual register uses in the
+ // trace below.
+ unsigned Cycle = 0;
+ MIHeightMap::iterator HeightI = Heights.find(&MI);
+ if (HeightI != Heights.end()) {
+ Cycle = HeightI->second;
+ // We won't be seeing any more MI uses.
+ Heights.erase(HeightI);
+ }
+
+ // Don't process PHI deps. They depend on the specific predecessor, and
+ // we'll get them when visiting the predecessor.
+ Deps.clear();
+ bool HasPhysRegs = !MI.isPHI() && getDataDeps(MI, Deps, MTM.MRI);
+
+ // There may also be regunit dependencies to include in the height.
+ if (HasPhysRegs)
+ Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits, MTM.SchedModel,
+ MTM.TII, MTM.TRI);
+
+ // Update the required height of any virtual registers read by MI.
+ for (const DataDep &Dep : Deps)
+ if (pushDepHeight(Dep, MI, Cycle, Heights, MTM.SchedModel, MTM.TII))
+ addLiveIns(Dep.DefMI, Dep.DefOp, Stack);
+
+ InstrCycles &MICycles = Cycles[&MI];
+ MICycles.Height = Cycle;
+ if (!TBI.HasValidInstrDepths) {
+ DEBUG(dbgs() << Cycle << '\t' << MI);
+ continue;
+ }
+ // Update critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth);
+ DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << MI);
+ }
+
+ // Update virtual live-in heights. They were added by addLiveIns() with a 0
+ // height because the final height isn't known until now.
+ DEBUG(dbgs() << "BB#" << MBB->getNumber() << " Live-ins:");
+ for (LiveInReg &LIR : TBI.LiveIns) {
+ const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg);
+ LIR.Height = Heights.lookup(DefMI);
+ DEBUG(dbgs() << ' ' << PrintReg(LIR.Reg) << '@' << LIR.Height);
+ }
+
+ // Transfer the live regunits to the live-in list.
+ for (SparseSet<LiveRegUnit>::const_iterator
+ RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) {
+ TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle));
+ DEBUG(dbgs() << ' ' << PrintRegUnit(RI->RegUnit, MTM.TRI)
+ << '@' << RI->Cycle);
+ }
+ DEBUG(dbgs() << '\n');
+
+ if (!TBI.HasValidInstrDepths)
+ continue;
+ // Add live-ins to the critical path length.
+ TBI.CriticalPath = std::max(TBI.CriticalPath,
+ computeCrossBlockCriticalPath(TBI));
+ DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n');
+ }
+}
+
+MachineTraceMetrics::Trace
+MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) {
+ TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()];
+
+ if (!TBI.hasValidDepth() || !TBI.hasValidHeight())
+ computeTrace(MBB);
+ if (!TBI.HasValidInstrDepths)
+ computeInstrDepths(MBB);
+ if (!TBI.HasValidInstrHeights)
+ computeInstrHeights(MBB);
+
+ return Trace(*this, TBI);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr &MI) const {
+ assert(getBlockNum() == unsigned(MI.getParent()->getNumber()) &&
+ "MI must be in the trace center block");
+ InstrCycles Cyc = getInstrCycles(MI);
+ return getCriticalPath() - (Cyc.Depth + Cyc.Height);
+}
+
+unsigned
+MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const {
+ const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum());
+ SmallVector<DataDep, 1> Deps;
+ getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI);
+ assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor");
+ DataDep &Dep = Deps.front();
+ unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth;
+ // Add latency if DefMI is a real instruction. Transients get latency 0.
+ if (!Dep.DefMI->isTransient())
+ DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp,
+ &PHI, Dep.UseOp);
+ return DepCycle;
+}
+
+/// When Bottom is set, include instructions in the current block in the
+/// estimate.
+unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const {
+ // Find the limiting processor resource.
+ // Numbers have been pre-scaled to be comparable.
+ unsigned PRMax = 0;
+ ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+ if (Bottom) {
+ ArrayRef<unsigned> PRCycles = TE.MTM.getProcResourceCycles(getBlockNum());
+ for (unsigned K = 0; K != PRDepths.size(); ++K)
+ PRMax = std::max(PRMax, PRDepths[K] + PRCycles[K]);
+ } else {
+ for (unsigned K = 0; K != PRDepths.size(); ++K)
+ PRMax = std::max(PRMax, PRDepths[K]);
+ }
+ // Convert to cycle count.
+ PRMax = TE.MTM.getCycles(PRMax);
+
+  // All instructions before the current block...
+  unsigned Instrs = TBI.InstrDepth;
+  // ...plus the instructions in the current block when Bottom is set.
+ if (Bottom)
+ Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount;
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
+ // Assume issue width 1 without a schedule model.
+ return std::max(Instrs, PRMax);
+}
+
+unsigned MachineTraceMetrics::Trace::getResourceLength(
+ ArrayRef<const MachineBasicBlock *> Extrablocks,
+ ArrayRef<const MCSchedClassDesc *> ExtraInstrs,
+ ArrayRef<const MCSchedClassDesc *> RemoveInstrs) const {
+ // Add up resources above and below the center block.
+ ArrayRef<unsigned> PRDepths = TE.getProcResourceDepths(getBlockNum());
+ ArrayRef<unsigned> PRHeights = TE.getProcResourceHeights(getBlockNum());
+ unsigned PRMax = 0;
+
+  // Compute the resource cycles (scaled by the resource factor) that a set of
+  // extra instructions contributes to processor resource ResourceIdx.
+ auto extraCycles = [this](ArrayRef<const MCSchedClassDesc *> Instrs,
+ unsigned ResourceIdx)
+ ->unsigned {
+ unsigned Cycles = 0;
+ for (const MCSchedClassDesc *SC : Instrs) {
+ if (!SC->isValid())
+ continue;
+ for (TargetSchedModel::ProcResIter
+ PI = TE.MTM.SchedModel.getWriteProcResBegin(SC),
+ PE = TE.MTM.SchedModel.getWriteProcResEnd(SC);
+ PI != PE; ++PI) {
+ if (PI->ProcResourceIdx != ResourceIdx)
+ continue;
+ Cycles +=
+ (PI->Cycles * TE.MTM.SchedModel.getResourceFactor(ResourceIdx));
+ }
+ }
+ return Cycles;
+ };
+
+ for (unsigned K = 0; K != PRDepths.size(); ++K) {
+ unsigned PRCycles = PRDepths[K] + PRHeights[K];
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ PRCycles += TE.MTM.getProcResourceCycles(MBB->getNumber())[K];
+ PRCycles += extraCycles(ExtraInstrs, K);
+ PRCycles -= extraCycles(RemoveInstrs, K);
+ PRMax = std::max(PRMax, PRCycles);
+ }
+ // Convert to cycle count.
+ PRMax = TE.MTM.getCycles(PRMax);
+
+ // Instrs: #instructions in current trace outside current block.
+ unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight;
+ // Add instruction count from the extra blocks.
+ for (const MachineBasicBlock *MBB : Extrablocks)
+ Instrs += TE.MTM.getResources(MBB)->InstrCount;
+ Instrs += ExtraInstrs.size();
+ Instrs -= RemoveInstrs.size();
+ if (unsigned IW = TE.MTM.SchedModel.getIssueWidth())
+ Instrs /= IW;
+ // Assume issue width 1 without a schedule model.
+ return std::max(Instrs, PRMax);
+}
+
+bool MachineTraceMetrics::Trace::isDepInTrace(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) const {
+ if (DefMI.getParent() == UseMI.getParent())
+ return true;
+
+ const TraceBlockInfo &DepTBI = TE.BlockInfo[DefMI.getParent()->getNumber()];
+ const TraceBlockInfo &TBI = TE.BlockInfo[UseMI.getParent()->getNumber()];
+
+ return DepTBI.isUsefulDominator(TBI);
+}
+
+void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const {
+ OS << getName() << " ensemble:\n";
+ for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) {
+ OS << " BB#" << i << '\t';
+ BlockInfo[i].print(OS);
+ OS << '\n';
+ }
+}
+
+void MachineTraceMetrics::TraceBlockInfo::print(raw_ostream &OS) const {
+ if (hasValidDepth()) {
+ OS << "depth=" << InstrDepth;
+ if (Pred)
+ OS << " pred=BB#" << Pred->getNumber();
+ else
+ OS << " pred=null";
+ OS << " head=BB#" << Head;
+ if (HasValidInstrDepths)
+ OS << " +instrs";
+ } else
+ OS << "depth invalid";
+ OS << ", ";
+ if (hasValidHeight()) {
+ OS << "height=" << InstrHeight;
+ if (Succ)
+ OS << " succ=BB#" << Succ->getNumber();
+ else
+ OS << " succ=null";
+ OS << " tail=BB#" << Tail;
+ if (HasValidInstrHeights)
+ OS << " +instrs";
+ } else
+ OS << "height invalid";
+ if (HasValidInstrDepths && HasValidInstrHeights)
+ OS << ", crit=" << CriticalPath;
+}
+
+void MachineTraceMetrics::Trace::print(raw_ostream &OS) const {
+ unsigned MBBNum = &TBI - &TE.BlockInfo[0];
+
+ OS << TE.getName() << " trace BB#" << TBI.Head << " --> BB#" << MBBNum
+ << " --> BB#" << TBI.Tail << ':';
+ if (TBI.hasValidHeight() && TBI.hasValidDepth())
+ OS << ' ' << getInstrCount() << " instrs.";
+ if (TBI.HasValidInstrDepths && TBI.HasValidInstrHeights)
+ OS << ' ' << TBI.CriticalPath << " cycles.";
+
+ const MachineTraceMetrics::TraceBlockInfo *Block = &TBI;
+ OS << "\nBB#" << MBBNum;
+ while (Block->hasValidDepth() && Block->Pred) {
+ unsigned Num = Block->Pred->getNumber();
+ OS << " <- BB#" << Num;
+ Block = &TE.BlockInfo[Num];
+ }
+
+ Block = &TBI;
+ OS << "\n ";
+ while (Block->hasValidHeight() && Block->Succ) {
+ unsigned Num = Block->Succ->getNumber();
+ OS << " -> BB#" << Num;
+ Block = &TE.BlockInfo[Num];
+ }
+ OS << '\n';
+}
diff --git a/contrib/llvm/lib/CodeGen/MachineVerifier.cpp b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
new file mode 100644
index 000000000000..a70adb050455
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -0,0 +1,2049 @@
+//===-- MachineVerifier.cpp - Machine Code Verifier -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Pass to verify generated machine code. The following is checked:
+//
+// Operand counts: All explicit operands must be present.
+//
+// Register classes: All physical and virtual register operands must be
+// compatible with the register class required by the instruction descriptor.
+//
+// Register live intervals: Registers must be defined only once, and must be
+// defined before use.
+//
+// The machine code verifier is enabled from LLVMTargetMachine.cpp with the
+// command-line option -verify-machineinstrs, or by defining the environment
+// variable LLVM_VERIFY_MACHINEINSTRS to the name of a file that will receive
+// the verifier errors.
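+//
+// Illustrative invocations (the input file name is a placeholder):
+//
+//   llc -verify-machineinstrs input.ll
+//   LLVM_VERIFY_MACHINEINSTRS=verify.log llc input.ll
+//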
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+namespace {
+ struct MachineVerifier {
+
+ MachineVerifier(Pass *pass, const char *b) :
+ PASS(pass),
+ Banner(b)
+ {}
+
+ unsigned verify(MachineFunction &MF);
+
+ Pass *const PASS;
+ const char *Banner;
+ const MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const MachineRegisterInfo *MRI;
+
+ unsigned foundErrors;
+
+ typedef SmallVector<unsigned, 16> RegVector;
+ typedef SmallVector<const uint32_t*, 4> RegMaskVector;
+ typedef DenseSet<unsigned> RegSet;
+ typedef DenseMap<unsigned, const MachineInstr*> RegMap;
+ typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet;
+
+ const MachineInstr *FirstTerminator;
+ BlockSet FunctionBlocks;
+
+ BitVector regsReserved;
+ RegSet regsLive;
+ RegVector regsDefined, regsDead, regsKilled;
+ RegMaskVector regMasks;
+ RegSet regsLiveInButUnused;
+
+ SlotIndex lastIndex;
+
+ // Add Reg and any sub-registers to RV
+ void addRegWithSubRegs(RegVector &RV, unsigned Reg) {
+ RV.push_back(Reg);
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ RV.push_back(*SubRegs);
+ }
+
+ struct BBInfo {
+ // Is this MBB reachable from the MF entry point?
+ bool reachable;
+
+ // Vregs that must be live in because they are used without being
+ // defined. Map value is the user.
+ RegMap vregsLiveIn;
+
+ // Regs killed in MBB. They may be defined again, and will then be in both
+ // regsKilled and regsLiveOut.
+ RegSet regsKilled;
+
+ // Regs defined in MBB and live out. Note that vregs passing through may
+ // be live out without being mentioned here.
+ RegSet regsLiveOut;
+
+ // Vregs that pass through MBB untouched. This set is disjoint from
+ // regsKilled and regsLiveOut.
+ RegSet vregsPassed;
+
+ // Vregs that must pass through MBB because they are needed by a successor
+ // block. This set is disjoint from regsLiveOut.
+ RegSet vregsRequired;
+
+ // Set versions of block's predecessor and successor lists.
+ BlockSet Preds, Succs;
+
+ BBInfo() : reachable(false) {}
+
+ // Add register to vregsPassed if it belongs there. Return true if
+ // anything changed.
+ bool addPassed(unsigned Reg) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (regsKilled.count(Reg) || regsLiveOut.count(Reg))
+ return false;
+ return vregsPassed.insert(Reg).second;
+ }
+
+ // Same for a full set.
+ bool addPassed(const RegSet &RS) {
+ bool changed = false;
+ for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
+ if (addPassed(*I))
+ changed = true;
+ return changed;
+ }
+
+ // Add register to vregsRequired if it belongs there. Return true if
+ // anything changed.
+ bool addRequired(unsigned Reg) {
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return false;
+ if (regsLiveOut.count(Reg))
+ return false;
+ return vregsRequired.insert(Reg).second;
+ }
+
+ // Same for a full set.
+ bool addRequired(const RegSet &RS) {
+ bool changed = false;
+ for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
+ if (addRequired(*I))
+ changed = true;
+ return changed;
+ }
+
+ // Same for a full map.
+ bool addRequired(const RegMap &RM) {
+ bool changed = false;
+ for (RegMap::const_iterator I = RM.begin(), E = RM.end(); I != E; ++I)
+ if (addRequired(I->first))
+ changed = true;
+ return changed;
+ }
+
+ // Live-out registers are either in regsLiveOut or vregsPassed.
+ bool isLiveOut(unsigned Reg) const {
+ return regsLiveOut.count(Reg) || vregsPassed.count(Reg);
+ }
+ };
+
+ // Extra register info per MBB.
+ DenseMap<const MachineBasicBlock*, BBInfo> MBBInfoMap;
+
+ bool isReserved(unsigned Reg) {
+ return Reg < regsReserved.size() && regsReserved.test(Reg);
+ }
+
+ bool isAllocatable(unsigned Reg) {
+ return Reg < TRI->getNumRegs() && MRI->isAllocatable(Reg);
+ }
+
+ // Analysis information if available
+ LiveVariables *LiveVars;
+ LiveIntervals *LiveInts;
+ LiveStacks *LiveStks;
+ SlotIndexes *Indexes;
+
+ void visitMachineFunctionBefore();
+ void visitMachineBasicBlockBefore(const MachineBasicBlock *MBB);
+ void visitMachineBundleBefore(const MachineInstr *MI);
+ void visitMachineInstrBefore(const MachineInstr *MI);
+ void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
+ void visitMachineInstrAfter(const MachineInstr *MI);
+ void visitMachineBundleAfter(const MachineInstr *MI);
+ void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
+ void visitMachineFunctionAfter();
+
+ template <typename T> void report(const char *msg, ilist_iterator<T> I) {
+ report(msg, &*I);
+ }
+ void report(const char *msg, const MachineFunction *MF);
+ void report(const char *msg, const MachineBasicBlock *MBB);
+ void report(const char *msg, const MachineInstr *MI);
+ void report(const char *msg, const MachineOperand *MO, unsigned MONum);
+
+ void report_context(const LiveInterval &LI) const;
+ void report_context(const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask) const;
+ void report_context(const LiveRange::Segment &S) const;
+ void report_context(const VNInfo &VNI) const;
+ void report_context(SlotIndex Pos) const;
+ void report_context_liverange(const LiveRange &LR) const;
+ void report_context_lanemask(LaneBitmask LaneMask) const;
+ void report_context_vreg(unsigned VReg) const;
+ void report_context_vreg_regunit(unsigned VRegOrRegUnit) const;
+
+ void verifyInlineAsm(const MachineInstr *MI);
+
+ void checkLiveness(const MachineOperand *MO, unsigned MONum);
+ void checkLivenessAtUse(const MachineOperand *MO, unsigned MONum,
+ SlotIndex UseIdx, const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask = 0);
+ void checkLivenessAtDef(const MachineOperand *MO, unsigned MONum,
+ SlotIndex DefIdx, const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask = 0);
+
+ void markReachable(const MachineBasicBlock *MBB);
+ void calcRegsPassed();
+ void checkPHIOps(const MachineBasicBlock *MBB);
+
+ void calcRegsRequired();
+ void verifyLiveVariables();
+ void verifyLiveIntervals();
+ void verifyLiveInterval(const LiveInterval&);
+ void verifyLiveRangeValue(const LiveRange&, const VNInfo*, unsigned,
+ unsigned);
+ void verifyLiveRangeSegment(const LiveRange&,
+ const LiveRange::const_iterator I, unsigned,
+ unsigned);
+ void verifyLiveRange(const LiveRange&, unsigned, LaneBitmask LaneMask = 0);
+
+ void verifyStackFrame();
+
+ void verifySlotIndexes() const;
+ void verifyProperties(const MachineFunction &MF);
+ };
+
+ struct MachineVerifierPass : public MachineFunctionPass {
+ static char ID; // Pass ID, replacement for typeid
+ const std::string Banner;
+
+    MachineVerifierPass(const std::string &banner = std::string())
+ : MachineFunctionPass(ID), Banner(banner) {
+ initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF);
+ if (FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
+ return false;
+ }
+ };
+
+}
+
+char MachineVerifierPass::ID = 0;
+INITIALIZE_PASS(MachineVerifierPass, "machineverifier",
+ "Verify generated machine code", false, false)
+
+FunctionPass *llvm::createMachineVerifierPass(const std::string &Banner) {
+ return new MachineVerifierPass(Banner);
+}
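+
+// Illustrative use only (the pass manager and the preceding pass are
+// hypothetical); a pipeline could schedule the verifier after a pass it wants
+// to check, for example:
+//
+//   PM.add(createMachineVerifierPass("After MyCustomPass"));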
+
+bool MachineFunction::verify(Pass *p, const char *Banner, bool AbortOnErrors)
+ const {
+ MachineFunction &MF = const_cast<MachineFunction&>(*this);
+ unsigned FoundErrors = MachineVerifier(p, Banner).verify(MF);
+ if (AbortOnErrors && FoundErrors)
+ report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors.");
+ return FoundErrors == 0;
+}
+
+void MachineVerifier::verifySlotIndexes() const {
+ if (Indexes == nullptr)
+ return;
+
+ // Ensure the IdxMBB list is sorted by slot indexes.
+ SlotIndex Last;
+ for (SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin(),
+ E = Indexes->MBBIndexEnd(); I != E; ++I) {
+ assert(!Last.isValid() || I->first > Last);
+ Last = I->first;
+ }
+}
+
+void MachineVerifier::verifyProperties(const MachineFunction &MF) {
+ // If a pass has introduced virtual registers without clearing the
+ // AllVRegsAllocated property (or set it without allocating the vregs)
+ // then report an error.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::AllVRegsAllocated) &&
+ MRI->getNumVirtRegs()) {
+ report(
+ "Function has AllVRegsAllocated property but there are VReg operands",
+ &MF);
+ }
+}
+
+unsigned MachineVerifier::verify(MachineFunction &MF) {
+ foundErrors = 0;
+
+ this->MF = &MF;
+ TM = &MF.getTarget();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ LiveVars = nullptr;
+ LiveInts = nullptr;
+ LiveStks = nullptr;
+ Indexes = nullptr;
+ if (PASS) {
+ LiveInts = PASS->getAnalysisIfAvailable<LiveIntervals>();
+ // We don't want to verify LiveVariables if LiveIntervals is available.
+ if (!LiveInts)
+ LiveVars = PASS->getAnalysisIfAvailable<LiveVariables>();
+ LiveStks = PASS->getAnalysisIfAvailable<LiveStacks>();
+ Indexes = PASS->getAnalysisIfAvailable<SlotIndexes>();
+ }
+
+ verifySlotIndexes();
+
+ verifyProperties(MF);
+
+ visitMachineFunctionBefore();
+ for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
+ MFI!=MFE; ++MFI) {
+ visitMachineBasicBlockBefore(&*MFI);
+ // Keep track of the current bundle header.
+ const MachineInstr *CurBundle = nullptr;
+ // Do we expect the next instruction to be part of the same bundle?
+ bool InBundle = false;
+
+ for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(),
+ MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
+ if (MBBI->getParent() != &*MFI) {
+ report("Bad instruction parent pointer", MFI);
+ errs() << "Instruction: " << *MBBI;
+ continue;
+ }
+
+ // Check for consistent bundle flags.
+ if (InBundle && !MBBI->isBundledWithPred())
+ report("Missing BundledPred flag, "
+ "BundledSucc was set on predecessor",
+ &*MBBI);
+ if (!InBundle && MBBI->isBundledWithPred())
+ report("BundledPred flag is set, "
+ "but BundledSucc not set on predecessor",
+ &*MBBI);
+
+ // Is this a bundle header?
+ if (!MBBI->isInsideBundle()) {
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
+ CurBundle = &*MBBI;
+ visitMachineBundleBefore(CurBundle);
+ } else if (!CurBundle)
+ report("No bundle header", MBBI);
+ visitMachineInstrBefore(&*MBBI);
+ for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
+ const MachineInstr &MI = *MBBI;
+ const MachineOperand &Op = MI.getOperand(I);
+ if (Op.getParent() != &MI) {
+ // Make sure to use correct addOperand / RemoveOperand / ChangeTo
+ // functions when replacing operands of a MachineInstr.
+ report("Instruction has operand with wrong parent set", &MI);
+ }
+
+ visitMachineOperand(&Op, I);
+ }
+
+ visitMachineInstrAfter(&*MBBI);
+
+ // Was this the last bundled instruction?
+ InBundle = MBBI->isBundledWithSucc();
+ }
+ if (CurBundle)
+ visitMachineBundleAfter(CurBundle);
+ if (InBundle)
+ report("BundledSucc flag set on last instruction in block", &MFI->back());
+ visitMachineBasicBlockAfter(&*MFI);
+ }
+ visitMachineFunctionAfter();
+
+ // Clean up.
+ regsLive.clear();
+ regsDefined.clear();
+ regsDead.clear();
+ regsKilled.clear();
+ regMasks.clear();
+ regsLiveInButUnused.clear();
+ MBBInfoMap.clear();
+
+ return foundErrors;
+}
+
+void MachineVerifier::report(const char *msg, const MachineFunction *MF) {
+ assert(MF);
+ errs() << '\n';
+ if (!foundErrors++) {
+ if (Banner)
+ errs() << "# " << Banner << '\n';
+ if (LiveInts != nullptr)
+ LiveInts->print(errs());
+ else
+ MF->print(errs(), Indexes);
+ }
+ errs() << "*** Bad machine code: " << msg << " ***\n"
+ << "- function: " << MF->getName() << "\n";
+}
+
+void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) {
+ assert(MBB);
+ report(msg, MBB->getParent());
+ errs() << "- basic block: BB#" << MBB->getNumber()
+ << ' ' << MBB->getName()
+ << " (" << (const void*)MBB << ')';
+ if (Indexes)
+ errs() << " [" << Indexes->getMBBStartIdx(MBB)
+ << ';' << Indexes->getMBBEndIdx(MBB) << ')';
+ errs() << '\n';
+}
+
+void MachineVerifier::report(const char *msg, const MachineInstr *MI) {
+ assert(MI);
+ report(msg, MI->getParent());
+ errs() << "- instruction: ";
+ if (Indexes && Indexes->hasIndex(*MI))
+ errs() << Indexes->getInstructionIndex(*MI) << '\t';
+ MI->print(errs(), /*SkipOpers=*/true);
+ errs() << '\n';
+}
+
+void MachineVerifier::report(const char *msg,
+ const MachineOperand *MO, unsigned MONum) {
+ assert(MO);
+ report(msg, MO->getParent());
+ errs() << "- operand " << MONum << ": ";
+ MO->print(errs(), TRI);
+ errs() << "\n";
+}
+
+void MachineVerifier::report_context(SlotIndex Pos) const {
+ errs() << "- at: " << Pos << '\n';
+}
+
+void MachineVerifier::report_context(const LiveInterval &LI) const {
+ errs() << "- interval: " << LI << '\n';
+}
+
+void MachineVerifier::report_context(const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask) const {
+ report_context_liverange(LR);
+ errs() << "- register: " << PrintReg(Reg, TRI) << '\n';
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+}
+
+void MachineVerifier::report_context(const LiveRange::Segment &S) const {
+ errs() << "- segment: " << S << '\n';
+}
+
+void MachineVerifier::report_context(const VNInfo &VNI) const {
+ errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n";
+}
+
+void MachineVerifier::report_context_liverange(const LiveRange &LR) const {
+ errs() << "- liverange: " << LR << '\n';
+}
+
+void MachineVerifier::report_context_vreg(unsigned VReg) const {
+ errs() << "- v. register: " << PrintReg(VReg, TRI) << '\n';
+}
+
+void MachineVerifier::report_context_vreg_regunit(unsigned VRegOrUnit) const {
+ if (TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) {
+ report_context_vreg(VRegOrUnit);
+ } else {
+ errs() << "- regunit: " << PrintRegUnit(VRegOrUnit, TRI) << '\n';
+ }
+}
+
+void MachineVerifier::report_context_lanemask(LaneBitmask LaneMask) const {
+ errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n';
+}
+
+void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ if (!MInfo.reachable) {
+ MInfo.reachable = true;
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+ SuE = MBB->succ_end(); SuI != SuE; ++SuI)
+ markReachable(*SuI);
+ }
+}
+
+void MachineVerifier::visitMachineFunctionBefore() {
+ lastIndex = SlotIndex();
+ regsReserved = MRI->getReservedRegs();
+
+ // A sub-register of a reserved register is also reserved
+ for (int Reg = regsReserved.find_first(); Reg>=0;
+ Reg = regsReserved.find_next(Reg)) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ // FIXME: This should probably be:
+ // assert(regsReserved.test(*SubRegs) && "Non-reserved sub-register");
+ regsReserved.set(*SubRegs);
+ }
+ }
+
+ markReachable(&MF->front());
+
+ // Build a set of the basic blocks in the function.
+ FunctionBlocks.clear();
+ for (const auto &MBB : *MF) {
+ FunctionBlocks.insert(&MBB);
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ MInfo.Preds.insert(MBB.pred_begin(), MBB.pred_end());
+ if (MInfo.Preds.size() != MBB.pred_size())
+ report("MBB has duplicate entries in its predecessor list.", &MBB);
+
+ MInfo.Succs.insert(MBB.succ_begin(), MBB.succ_end());
+ if (MInfo.Succs.size() != MBB.succ_size())
+ report("MBB has duplicate entries in its successor list.", &MBB);
+ }
+
+ // Check that the register use lists are sane.
+ MRI->verifyUseLists();
+
+ verifyStackFrame();
+}
+
+// Does iterator point to a and b as the first two elements?
+static bool matchPair(MachineBasicBlock::const_succ_iterator i,
+ const MachineBasicBlock *a, const MachineBasicBlock *b) {
+ if (*i == a)
+ return *++i == b;
+ if (*i == b)
+ return *++i == a;
+ return false;
+}
+
+void
+MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
+ FirstTerminator = nullptr;
+
+ if (MRI->isSSA()) {
+ // If this block has allocatable physical registers live-in, check that
+ // it is an entry block or landing pad.
+ for (const auto &LI : MBB->liveins()) {
+ if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
+ MBB->getIterator() != MBB->getParent()->begin()) {
+ report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
+ }
+ }
+ }
+
+ // Count the number of landing pad successors.
+ SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs;
+ for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I) {
+ if ((*I)->isEHPad())
+ LandingPadSuccs.insert(*I);
+ if (!FunctionBlocks.count(*I))
+ report("MBB has successor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[*I].Preds.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ errs() << "MBB is not in the predecessor list of the successor BB#"
+ << (*I)->getNumber() << ".\n";
+ }
+ }
+
+ // Check the predecessor list.
+ for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
+ E = MBB->pred_end(); I != E; ++I) {
+ if (!FunctionBlocks.count(*I))
+ report("MBB has predecessor that isn't part of the function.", MBB);
+ if (!MBBInfoMap[*I].Succs.count(MBB)) {
+ report("Inconsistent CFG", MBB);
+ errs() << "MBB is not in the successor list of the predecessor BB#"
+ << (*I)->getNumber() << ".\n";
+ }
+ }
+
+ const MCAsmInfo *AsmInfo = TM->getMCAsmInfo();
+ const BasicBlock *BB = MBB->getBasicBlock();
+ const Function *Fn = MF->getFunction();
+ if (LandingPadSuccs.size() > 1 &&
+ !(AsmInfo &&
+ AsmInfo->getExceptionHandlingType() == ExceptionHandling::SjLj &&
+ BB && isa<SwitchInst>(BB->getTerminator())) &&
+ !isFuncletEHPersonality(classifyEHPersonality(Fn->getPersonalityFn())))
+ report("MBB has more than one landing pad successor", MBB);
+
+  // Call AnalyzeBranch. If it succeeds, there are several more conditions to
+  // check.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 4> Cond;
+ if (!TII->analyzeBranch(*const_cast<MachineBasicBlock *>(MBB), TBB, FBB,
+ Cond)) {
+ // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
+ // check whether its answers match up with reality.
+ if (!TBB && !FBB) {
+ // Block falls through to its successor.
+ MachineFunction::const_iterator MBBI = MBB->getIterator();
+ ++MBBI;
+ if (MBBI == MF->end()) {
+ // It's possible that the block legitimately ends with a noreturn
+ // call or an unreachable, in which case it won't actually fall
+ // out the bottom of the function.
+ } else if (MBB->succ_size() == LandingPadSuccs.size()) {
+ // It's possible that the block legitimately ends with a noreturn
+        // call or an unreachable, in which case it won't actually fall
+ // out of the block.
+ } else if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
+ report("MBB exits via unconditional fall-through but doesn't have "
+ "exactly one CFG successor!", MBB);
+ } else if (!MBB->isSuccessor(&*MBBI)) {
+ report("MBB exits via unconditional fall-through but its successor "
+ "differs from its CFG successor!", MBB);
+ }
+ if (!MBB->empty() && MBB->back().isBarrier() &&
+ !TII->isPredicated(MBB->back())) {
+ report("MBB exits via unconditional fall-through but ends with a "
+ "barrier instruction!", MBB);
+ }
+ if (!Cond.empty()) {
+ report("MBB exits via unconditional fall-through but has a condition!",
+ MBB);
+ }
+ } else if (TBB && !FBB && Cond.empty()) {
+ // Block unconditionally branches somewhere.
+      // If the block has exactly one successor and that successor happens to
+      // be a landing pad, accept it as valid control flow.
+ if (MBB->succ_size() != 1+LandingPadSuccs.size() &&
+ (MBB->succ_size() != 1 || LandingPadSuccs.size() != 1 ||
+ *MBB->succ_begin() != *LandingPadSuccs.begin())) {
+ report("MBB exits via unconditional branch but doesn't have "
+ "exactly one CFG successor!", MBB);
+ } else if (!MBB->isSuccessor(TBB)) {
+ report("MBB exits via unconditional branch but the CFG "
+ "successor doesn't match the actual successor!", MBB);
+ }
+ if (MBB->empty()) {
+ report("MBB exits via unconditional branch but doesn't contain "
+ "any instructions!", MBB);
+ } else if (!MBB->back().isBarrier()) {
+ report("MBB exits via unconditional branch but doesn't end with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via unconditional branch but the branch isn't a "
+ "terminator instruction!", MBB);
+ }
+ } else if (TBB && !FBB && !Cond.empty()) {
+ // Block conditionally branches somewhere, otherwise falls through.
+ MachineFunction::const_iterator MBBI = MBB->getIterator();
+ ++MBBI;
+ if (MBBI == MF->end()) {
+ report("MBB conditionally falls through out of function!", MBB);
+ } else if (MBB->succ_size() == 1) {
+ // A conditional branch with only one successor is weird, but allowed.
+ if (&*MBBI != TBB)
+ report("MBB exits via conditional branch/fall-through but only has "
+ "one CFG successor!", MBB);
+ else if (TBB != *MBB->succ_begin())
+ report("MBB exits via conditional branch/fall-through but the CFG "
+ "successor don't match the actual successor!", MBB);
+ } else if (MBB->succ_size() != 2) {
+ report("MBB exits via conditional branch/fall-through but doesn't have "
+ "exactly two CFG successors!", MBB);
+ } else if (!matchPair(MBB->succ_begin(), TBB, &*MBBI)) {
+ report("MBB exits via conditional branch/fall-through but the CFG "
+ "successors don't match the actual successors!", MBB);
+ }
+ if (MBB->empty()) {
+ report("MBB exits via conditional branch/fall-through but doesn't "
+ "contain any instructions!", MBB);
+ } else if (MBB->back().isBarrier()) {
+ report("MBB exits via conditional branch/fall-through but ends with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via conditional branch/fall-through but the branch "
+ "isn't a terminator instruction!", MBB);
+ }
+ } else if (TBB && FBB) {
+ // Block conditionally branches somewhere, otherwise branches
+ // somewhere else.
+ if (MBB->succ_size() == 1) {
+ // A conditional branch with only one successor is weird, but allowed.
+ if (FBB != TBB)
+ report("MBB exits via conditional branch/branch through but only has "
+ "one CFG successor!", MBB);
+ else if (TBB != *MBB->succ_begin())
+ report("MBB exits via conditional branch/branch through but the CFG "
+ "successor don't match the actual successor!", MBB);
+ } else if (MBB->succ_size() != 2) {
+ report("MBB exits via conditional branch/branch but doesn't have "
+ "exactly two CFG successors!", MBB);
+ } else if (!matchPair(MBB->succ_begin(), TBB, FBB)) {
+ report("MBB exits via conditional branch/branch but the CFG "
+ "successors don't match the actual successors!", MBB);
+ }
+ if (MBB->empty()) {
+ report("MBB exits via conditional branch/branch but doesn't "
+ "contain any instructions!", MBB);
+ } else if (!MBB->back().isBarrier()) {
+ report("MBB exits via conditional branch/branch but doesn't end with a "
+ "barrier instruction!", MBB);
+ } else if (!MBB->back().isTerminator()) {
+ report("MBB exits via conditional branch/branch but the branch "
+ "isn't a terminator instruction!", MBB);
+ }
+ if (Cond.empty()) {
+ report("MBB exits via conditinal branch/branch but there's no "
+ "condition!", MBB);
+ }
+ } else {
+ report("AnalyzeBranch returned invalid data!", MBB);
+ }
+ }
+
+ regsLive.clear();
+ for (const auto &LI : MBB->liveins()) {
+ if (!TargetRegisterInfo::isPhysicalRegister(LI.PhysReg)) {
+ report("MBB live-in list contains non-physical register", MBB);
+ continue;
+ }
+ for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ regsLive.insert(*SubRegs);
+ }
+ regsLiveInButUnused = regsLive;
+
+ const MachineFrameInfo *MFI = MF->getFrameInfo();
+ assert(MFI && "Function has no frame info");
+ BitVector PR = MFI->getPristineRegs(*MF);
+ for (int I = PR.find_first(); I>0; I = PR.find_next(I)) {
+ for (MCSubRegIterator SubRegs(I, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ regsLive.insert(*SubRegs);
+ }
+
+ regsKilled.clear();
+ regsDefined.clear();
+
+ if (Indexes)
+ lastIndex = Indexes->getMBBStartIdx(MBB);
+}
+
+// This function gets called for all bundle headers, including normal
+// stand-alone unbundled instructions.
+void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
+ if (Indexes && Indexes->hasIndex(*MI)) {
+ SlotIndex idx = Indexes->getInstructionIndex(*MI);
+ if (!(idx > lastIndex)) {
+ report("Instruction index out of order", MI);
+ errs() << "Last instruction was at " << lastIndex << '\n';
+ }
+ lastIndex = idx;
+ }
+
+ // Ensure non-terminators don't follow terminators.
+ // Ignore predicated terminators formed by if conversion.
+ // FIXME: If conversion shouldn't need to violate this rule.
+ if (MI->isTerminator() && !TII->isPredicated(*MI)) {
+ if (!FirstTerminator)
+ FirstTerminator = MI;
+ } else if (FirstTerminator) {
+ report("Non-terminator instruction after the first terminator", MI);
+ errs() << "First terminator was:\t" << *FirstTerminator;
+ }
+}
+
+// The operands on an INLINEASM instruction must follow a template.
+// Verify that the flag operands make sense.
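+//
+// Sketch of the expected operand layout (values are illustrative only):
+//
+//   op0: external symbol holding the asm string
+//   op1: immediate carrying the extra-info flags
+//   op2: immediate group descriptor, followed by the registers it declares;
+//        further groups repeat this pattern
+//   ...then an optional metadata operand and trailing implicit registers.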
+void MachineVerifier::verifyInlineAsm(const MachineInstr *MI) {
+ // The first two operands on INLINEASM are the asm string and global flags.
+ if (MI->getNumOperands() < 2) {
+ report("Too few operands on inline asm", MI);
+ return;
+ }
+ if (!MI->getOperand(0).isSymbol())
+ report("Asm string must be an external symbol", MI);
+ if (!MI->getOperand(1).isImm())
+ report("Asm flags must be an immediate", MI);
+  // Allowed flags are Extra_HasSideEffects = 1, Extra_IsAlignStack = 2,
+  // Extra_AsmDialect = 4, Extra_MayLoad = 8, Extra_MayStore = 16, and
+  // Extra_IsConvergent = 32.
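+  // For example (illustrative), an asm with side effects that may load memory
+  // would carry 1 | 8 = 9; any combination of the above fits in 6 bits, which
+  // is what the isUInt<6> check below enforces.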
+ if (!isUInt<6>(MI->getOperand(1).getImm()))
+ report("Unknown asm flags", &MI->getOperand(1), 1);
+
+ static_assert(InlineAsm::MIOp_FirstOperand == 2, "Asm format changed");
+
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
+ unsigned NumOps;
+ for (unsigned e = MI->getNumOperands(); OpNo < e; OpNo += NumOps) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ // There may be implicit ops after the fixed operands.
+ if (!MO.isImm())
+ break;
+ NumOps = 1 + InlineAsm::getNumOperandRegisters(MO.getImm());
+ }
+
+ if (OpNo > MI->getNumOperands())
+ report("Missing operands in last group", MI);
+
+ // An optional MDNode follows the groups.
+ if (OpNo < MI->getNumOperands() && MI->getOperand(OpNo).isMetadata())
+ ++OpNo;
+
+ // All trailing operands must be implicit registers.
+ for (unsigned e = MI->getNumOperands(); OpNo < e; ++OpNo) {
+ const MachineOperand &MO = MI->getOperand(OpNo);
+ if (!MO.isReg() || !MO.isImplicit())
+ report("Expected implicit register after groups", &MO, OpNo);
+ }
+}
+
+void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MI->getNumOperands() < MCID.getNumOperands()) {
+ report("Too few operands", MI);
+ errs() << MCID.getNumOperands() << " operands expected, but "
+ << MI->getNumOperands() << " given.\n";
+ }
+
+ // Check the tied operands.
+ if (MI->isInlineAsm())
+ verifyInlineAsm(MI);
+
+ // Check the MachineMemOperands for basic consistency.
+ for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+ E = MI->memoperands_end(); I != E; ++I) {
+ if ((*I)->isLoad() && !MI->mayLoad())
+ report("Missing mayLoad flag", MI);
+ if ((*I)->isStore() && !MI->mayStore())
+ report("Missing mayStore flag", MI);
+ }
+
+ // Debug values must not have a slot index.
+ // Other instructions must have one, unless they are inside a bundle.
+ if (LiveInts) {
+ bool mapped = !LiveInts->isNotInMIMap(*MI);
+ if (MI->isDebugValue()) {
+ if (mapped)
+ report("Debug instruction has a slot index", MI);
+ } else if (MI->isInsideBundle()) {
+ if (mapped)
+ report("Instruction inside bundle has a slot index", MI);
+ } else {
+ if (!mapped)
+ report("Missing slot index", MI);
+ }
+ }
+
+ StringRef ErrorInfo;
+ if (!TII->verifyInstruction(*MI, ErrorInfo))
+ report(ErrorInfo.data(), MI);
+}
+
+void
+MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
+ const MachineInstr *MI = MO->getParent();
+ const MCInstrDesc &MCID = MI->getDesc();
+ unsigned NumDefs = MCID.getNumDefs();
+ if (MCID.getOpcode() == TargetOpcode::PATCHPOINT)
+ NumDefs = (MONum == 0 && MO->isReg()) ? NumDefs : 0;
+
+ // The first MCID.NumDefs operands must be explicit register defines
+ if (MONum < NumDefs) {
+ const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
+ if (!MO->isReg())
+ report("Explicit definition must be a register", MO, MONum);
+ else if (!MO->isDef() && !MCOI.isOptionalDef())
+ report("Explicit definition marked as use", MO, MONum);
+ else if (MO->isImplicit())
+ report("Explicit definition marked as implicit", MO, MONum);
+ } else if (MONum < MCID.getNumOperands()) {
+ const MCOperandInfo &MCOI = MCID.OpInfo[MONum];
+ // Don't check if it's the last operand in a variadic instruction. See,
+    // e.g., LDM_RET in the ARM backend.
+ if (MO->isReg() &&
+ !(MI->isVariadic() && MONum == MCID.getNumOperands()-1)) {
+ if (MO->isDef() && !MCOI.isOptionalDef())
+ report("Explicit operand marked as def", MO, MONum);
+ if (MO->isImplicit())
+ report("Explicit operand marked as implicit", MO, MONum);
+ }
+
+ int TiedTo = MCID.getOperandConstraint(MONum, MCOI::TIED_TO);
+ if (TiedTo != -1) {
+ if (!MO->isReg())
+ report("Tied use must be a register", MO, MONum);
+ else if (!MO->isTied())
+ report("Operand should be tied", MO, MONum);
+ else if (unsigned(TiedTo) != MI->findTiedOperandIdx(MONum))
+ report("Tied def doesn't match MCInstrDesc", MO, MONum);
+ } else if (MO->isReg() && MO->isTied())
+ report("Explicit operand should not be tied", MO, MONum);
+ } else {
+ // ARM adds %reg0 operands to indicate predicates. We'll allow that.
+ if (MO->isReg() && !MO->isImplicit() && !MI->isVariadic() && MO->getReg())
+ report("Extra explicit operand on non-variadic instruction", MO, MONum);
+ }
+
+ switch (MO->getType()) {
+ case MachineOperand::MO_Register: {
+ const unsigned Reg = MO->getReg();
+ if (!Reg)
+ return;
+ if (MRI->tracksLiveness() && !MI->isDebugValue())
+ checkLiveness(MO, MONum);
+
+ // Verify the consistency of tied operands.
+ if (MO->isTied()) {
+ unsigned OtherIdx = MI->findTiedOperandIdx(MONum);
+ const MachineOperand &OtherMO = MI->getOperand(OtherIdx);
+ if (!OtherMO.isReg())
+ report("Must be tied to a register", MO, MONum);
+ if (!OtherMO.isTied())
+ report("Missing tie flags on tied operand", MO, MONum);
+ if (MI->findTiedOperandIdx(OtherIdx) != MONum)
+ report("Inconsistent tie links", MO, MONum);
+ if (MONum < MCID.getNumDefs()) {
+ if (OtherIdx < MCID.getNumOperands()) {
+ if (-1 == MCID.getOperandConstraint(OtherIdx, MCOI::TIED_TO))
+ report("Explicit def tied to explicit use without tie constraint",
+ MO, MONum);
+ } else {
+ if (!OtherMO.isImplicit())
+ report("Explicit def should be tied to implicit use", MO, MONum);
+ }
+ }
+ }
+
+ // Verify two-address constraints after leaving SSA form.
+ unsigned DefIdx;
+ if (!MRI->isSSA() && MO->isUse() &&
+ MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
+ Reg != MI->getOperand(DefIdx).getReg())
+ report("Two-address instruction operands must be identical", MO, MONum);
+
+ // Check register classes.
+ if (MONum < MCID.getNumOperands() && !MO->isImplicit()) {
+ unsigned SubIdx = MO->getSubReg();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (SubIdx) {
+ report("Illegal subregister index for physical register", MO, MONum);
+ return;
+ }
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
+ if (!DRC->contains(Reg)) {
+ report("Illegal physical register for instruction", MO, MONum);
+ errs() << TRI->getName(Reg) << " is not a "
+ << TRI->getRegClassName(DRC) << " register.\n";
+ }
+ }
+ } else {
+ // Virtual register.
+ const TargetRegisterClass *RC = MRI->getRegClassOrNull(Reg);
+ if (!RC) {
+ // This is a generic virtual register.
+ // It must have a size and it must not have a SubIdx.
+ unsigned Size = MRI->getSize(Reg);
+ if (!Size) {
+ report("Generic virtual register must have a size", MO, MONum);
+ return;
+ }
+ // Make sure the register fits into its register bank if any.
+ const RegisterBank *RegBank = MRI->getRegBankOrNull(Reg);
+ if (RegBank && RegBank->getSize() < Size) {
+ report("Register bank is too small for virtual register", MO,
+ MONum);
+ errs() << "Register bank " << RegBank->getName() << " too small("
+ << RegBank->getSize() << ") to fit " << Size << "-bits\n";
+ return;
+ }
+ if (SubIdx) {
+ report("Generic virtual register does not subregister index", MO, MONum);
+ return;
+ }
+ break;
+ }
+ if (SubIdx) {
+ const TargetRegisterClass *SRC =
+ TRI->getSubClassWithSubReg(RC, SubIdx);
+ if (!SRC) {
+ report("Invalid subregister index for virtual register", MO, MONum);
+ errs() << "Register class " << TRI->getRegClassName(RC)
+ << " does not support subreg index " << SubIdx << "\n";
+ return;
+ }
+ if (RC != SRC) {
+ report("Invalid register class for subregister index", MO, MONum);
+ errs() << "Register class " << TRI->getRegClassName(RC)
+ << " does not fully support subreg index " << SubIdx << "\n";
+ return;
+ }
+ }
+ if (const TargetRegisterClass *DRC =
+ TII->getRegClass(MCID, MONum, TRI, *MF)) {
+ if (SubIdx) {
+ const TargetRegisterClass *SuperRC =
+ TRI->getLargestLegalSuperClass(RC, *MF);
+ if (!SuperRC) {
+ report("No largest legal super class exists.", MO, MONum);
+ return;
+ }
+ DRC = TRI->getMatchingSuperRegClass(SuperRC, DRC, SubIdx);
+ if (!DRC) {
+ report("No matching super-reg register class.", MO, MONum);
+ return;
+ }
+ }
+ if (!RC->hasSuperClassEq(DRC)) {
+ report("Illegal virtual register for instruction", MO, MONum);
+ errs() << "Expected a " << TRI->getRegClassName(DRC)
+ << " register, but got a " << TRI->getRegClassName(RC)
+ << " register\n";
+ }
+ }
+ }
+ }
+ break;
+ }
+
+ case MachineOperand::MO_RegisterMask:
+ regMasks.push_back(MO->getRegMask());
+ break;
+
+ case MachineOperand::MO_MachineBasicBlock:
+ if (MI->isPHI() && !MO->getMBB()->isSuccessor(MI->getParent()))
+ report("PHI operand is not in the CFG", MO, MONum);
+ break;
+
+ case MachineOperand::MO_FrameIndex:
+ if (LiveStks && LiveStks->hasInterval(MO->getIndex()) &&
+ LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ int FI = MO->getIndex();
+ LiveInterval &LI = LiveStks->getInterval(FI);
+ SlotIndex Idx = LiveInts->getInstructionIndex(*MI);
+
+ bool stores = MI->mayStore();
+ bool loads = MI->mayLoad();
+ // For a memory-to-memory move, we need to check if the frame
+ // index is used for storing or loading, by inspecting the
+ // memory operands.
+ if (stores && loads) {
+ for (auto *MMO : MI->memoperands()) {
+ const PseudoSourceValue *PSV = MMO->getPseudoValue();
+ if (PSV == nullptr) continue;
+ const FixedStackPseudoSourceValue *Value =
+ dyn_cast<FixedStackPseudoSourceValue>(PSV);
+ if (Value == nullptr) continue;
+ if (Value->getFrameIndex() != FI) continue;
+
+ if (MMO->isStore())
+ loads = false;
+ else
+ stores = false;
+ break;
+ }
+ if (loads == stores)
+ report("Missing fixed stack memoperand.", MI);
+ }
+ if (loads && !LI.liveAt(Idx.getRegSlot(true))) {
+ report("Instruction loads from dead spill slot", MO, MONum);
+ errs() << "Live stack: " << LI << '\n';
+ }
+ if (stores && !LI.liveAt(Idx.getRegSlot())) {
+ report("Instruction stores to dead spill slot", MO, MONum);
+ errs() << "Live stack: " << LI << '\n';
+ }
+ }
+ break;
+
+ default:
+ break;
+ }
+}
+
+void MachineVerifier::checkLivenessAtUse(const MachineOperand *MO,
+ unsigned MONum, SlotIndex UseIdx, const LiveRange &LR, unsigned VRegOrUnit,
+ LaneBitmask LaneMask) {
+ LiveQueryResult LRQ = LR.Query(UseIdx);
+  // Check if we have a segment at the use. Note, however, that we only need
+  // one live subregister range; the others may be dead.
+ if (!LRQ.valueIn() && LaneMask == 0) {
+ report("No live segment at use", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ report_context(UseIdx);
+ }
+ if (MO->isKill() && !LRQ.isKill()) {
+ report("Live range continues after kill flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(UseIdx);
+ }
+}
+
+void MachineVerifier::checkLivenessAtDef(const MachineOperand *MO,
+ unsigned MONum, SlotIndex DefIdx, const LiveRange &LR, unsigned VRegOrUnit,
+ LaneBitmask LaneMask) {
+ if (const VNInfo *VNI = LR.getVNInfoAt(DefIdx)) {
+ assert(VNI && "NULL valno is not allowed");
+ if (VNI->def != DefIdx) {
+ report("Inconsistent valno->def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(*VNI);
+ report_context(DefIdx);
+ }
+ } else {
+ report("No live segment at def", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ report_context(DefIdx);
+ }
+ // Check that, if the dead def flag is present, LiveInts agree.
+ if (MO->isDead()) {
+ LiveQueryResult LRQ = LR.Query(DefIdx);
+ if (!LRQ.isDeadDef()) {
+ // In case of physregs we can have a non-dead definition on another
+ // operand.
+ bool otherDef = false;
+ if (!TargetRegisterInfo::isVirtualRegister(VRegOrUnit)) {
+ const MachineInstr &MI = *MO->getParent();
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() || MO.isDead())
+ continue;
+ unsigned Reg = MO.getReg();
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ if (*Units == VRegOrUnit) {
+ otherDef = true;
+ break;
+ }
+ }
+ }
+ }
+
+ if (!otherDef) {
+ report("Live range continues after dead def flag", MO, MONum);
+ report_context_liverange(LR);
+ report_context_vreg_regunit(VRegOrUnit);
+ if (LaneMask != 0)
+ report_context_lanemask(LaneMask);
+ }
+ }
+ }
+}
+
+void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
+ const MachineInstr *MI = MO->getParent();
+ const unsigned Reg = MO->getReg();
+
+ // Both use and def operands can read a register.
+ if (MO->readsReg()) {
+ regsLiveInButUnused.erase(Reg);
+
+ if (MO->isKill())
+ addRegWithSubRegs(regsKilled, Reg);
+
+ // Check that LiveVars knows this kill.
+ if (LiveVars && TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MO->isKill()) {
+ LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+ if (std::find(VI.Kills.begin(), VI.Kills.end(), MI) == VI.Kills.end())
+ report("Kill missing from LiveVariables", MO, MONum);
+ }
+
+ // Check LiveInts liveness and kill.
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex UseIdx = LiveInts->getInstructionIndex(*MI);
+ // Check the cached regunit intervals.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) {
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) {
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(*Units))
+ checkLivenessAtUse(MO, MONum, UseIdx, *LR, *Units);
+ }
+ }
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (LiveInts->hasInterval(Reg)) {
+ // This is a virtual register interval.
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ checkLivenessAtUse(MO, MONum, UseIdx, LI, Reg);
+
+ if (LI.hasSubRanges() && !MO->isDef()) {
+ unsigned SubRegIdx = MO->getSubReg();
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ LaneBitmask LiveInMask = 0;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((MOMask & SR.LaneMask) == 0)
+ continue;
+ checkLivenessAtUse(MO, MONum, UseIdx, SR, Reg, SR.LaneMask);
+ LiveQueryResult LRQ = SR.Query(UseIdx);
+ if (LRQ.valueIn())
+ LiveInMask |= SR.LaneMask;
+ }
+            // At least part of the register has to be live at the use.
+ if ((LiveInMask & MOMask) == 0) {
+ report("No live subrange at use", MO, MONum);
+ report_context(LI);
+ report_context(UseIdx);
+ }
+ }
+ } else {
+ report("Virtual register has no live interval", MO, MONum);
+ }
+ }
+ }
+
+ // Use of a dead register.
+ if (!regsLive.count(Reg)) {
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ // Reserved registers may be used even when 'dead'.
+ bool Bad = !isReserved(Reg);
+ // We are fine if just any subregister has a defined value.
+ if (Bad) {
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid();
+ ++SubRegs) {
+ if (regsLive.count(*SubRegs)) {
+ Bad = false;
+ break;
+ }
+ }
+ }
+        // If there is an additional implicit-use of a super register we stop
+        // here. By definition we are fine if the super register is not
+        // (completely) dead; if the complete super register is dead we will
+        // get a report for its operand.
+ if (Bad) {
+ for (const MachineOperand &MOP : MI->uses()) {
+ if (!MOP.isReg())
+ continue;
+ if (!MOP.isImplicit())
+ continue;
+ for (MCSubRegIterator SubRegs(MOP.getReg(), TRI); SubRegs.isValid();
+ ++SubRegs) {
+ if (*SubRegs == Reg) {
+ Bad = false;
+ break;
+ }
+ }
+ }
+ }
+ if (Bad)
+ report("Using an undefined physical register", MO, MONum);
+ } else if (MRI->def_empty(Reg)) {
+ report("Reading virtual register without a def", MO, MONum);
+ } else {
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ // We don't know which virtual registers are live in, so only complain
+ // if vreg was killed in this MBB. Otherwise keep track of vregs that
+ // must be live in. PHI instructions are handled separately.
+ if (MInfo.regsKilled.count(Reg))
+ report("Using a killed virtual register", MO, MONum);
+ else if (!MI->isPHI())
+ MInfo.vregsLiveIn.insert(std::make_pair(Reg, MI));
+ }
+ }
+ }
+
+ if (MO->isDef()) {
+ // Register defined.
+ // TODO: verify that earlyclobber ops are not used.
+ if (MO->isDead())
+ addRegWithSubRegs(regsDead, Reg);
+ else
+ addRegWithSubRegs(regsDefined, Reg);
+
+ // Verify SSA form.
+ if (MRI->isSSA() && TargetRegisterInfo::isVirtualRegister(Reg) &&
+ std::next(MRI->def_begin(Reg)) != MRI->def_end())
+ report("Multiple virtual register defs in SSA form", MO, MONum);
+
+ // Check LiveInts for a live segment, but only for virtual registers.
+ if (LiveInts && !LiveInts->isNotInMIMap(*MI)) {
+ SlotIndex DefIdx = LiveInts->getInstructionIndex(*MI);
+ DefIdx = DefIdx.getRegSlot(MO->isEarlyClobber());
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (LiveInts->hasInterval(Reg)) {
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ checkLivenessAtDef(MO, MONum, DefIdx, LI, Reg);
+
+ if (LI.hasSubRanges()) {
+ unsigned SubRegIdx = MO->getSubReg();
+ LaneBitmask MOMask = SubRegIdx != 0
+ ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((SR.LaneMask & MOMask) == 0)
+ continue;
+ checkLivenessAtDef(MO, MONum, DefIdx, SR, Reg, SR.LaneMask);
+ }
+ }
+ } else {
+ report("Virtual register has no Live interval", MO, MONum);
+ }
+ }
+ }
+ }
+}
+
+void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) {
+}
+
+// This function gets called after visiting all instructions in a bundle. The
+// argument points to the bundle header.
+// Normal stand-alone instructions are also considered 'bundles', and this
+// function is called for all of them.
+void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
+ BBInfo &MInfo = MBBInfoMap[MI->getParent()];
+ set_union(MInfo.regsKilled, regsKilled);
+ set_subtract(regsLive, regsKilled); regsKilled.clear();
+ // Kill any masked registers.
+ while (!regMasks.empty()) {
+ const uint32_t *Mask = regMasks.pop_back_val();
+ for (RegSet::iterator I = regsLive.begin(), E = regsLive.end(); I != E; ++I)
+ if (TargetRegisterInfo::isPhysicalRegister(*I) &&
+ MachineOperand::clobbersPhysReg(Mask, *I))
+ regsDead.push_back(*I);
+ }
+ set_subtract(regsLive, regsDead); regsDead.clear();
+ set_union(regsLive, regsDefined); regsDefined.clear();
+}
+
+void
+MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) {
+ MBBInfoMap[MBB].regsLiveOut = regsLive;
+ regsLive.clear();
+
+ if (Indexes) {
+ SlotIndex stop = Indexes->getMBBEndIdx(MBB);
+ if (!(stop > lastIndex)) {
+ report("Block ends before last instruction index", MBB);
+ errs() << "Block ends at " << stop
+ << " last instruction was at " << lastIndex << '\n';
+ }
+ lastIndex = stop;
+ }
+}
+
+// Calculate the largest possible vregsPassed sets. These are the registers that
+// can pass through an MBB live, but may not be live every time. It is assumed
+// that all vregsPassed sets are empty before the call.
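+//
+// A small illustrative example (register and block names are hypothetical):
+// if %v is in regsLiveOut of BB0 and the CFG is BB0 -> BB1 -> BB2, the first
+// loop adds %v to BB1's vregsPassed (assuming BB1 neither kills nor redefines
+// it), and the worklist loop then propagates it into BB2's vregsPassed, and
+// so on until no set changes.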
+void MachineVerifier::calcRegsPassed() {
+ // First push live-out regs to successors' vregsPassed. Remember the MBBs that
+ // have any vregsPassed.
+ SmallPtrSet<const MachineBasicBlock*, 8> todo;
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ if (!MInfo.reachable)
+ continue;
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB.succ_begin(),
+ SuE = MBB.succ_end(); SuI != SuE; ++SuI) {
+ BBInfo &SInfo = MBBInfoMap[*SuI];
+ if (SInfo.addPassed(MInfo.regsLiveOut))
+ todo.insert(*SuI);
+ }
+ }
+
+ // Iteratively push vregsPassed to successors. This will converge to the same
+ // final state regardless of DenseSet iteration order.
+ while (!todo.empty()) {
+ const MachineBasicBlock *MBB = *todo.begin();
+ todo.erase(MBB);
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
+ SuE = MBB->succ_end(); SuI != SuE; ++SuI) {
+ if (*SuI == MBB)
+ continue;
+ BBInfo &SInfo = MBBInfoMap[*SuI];
+ if (SInfo.addPassed(MInfo.vregsPassed))
+ todo.insert(*SuI);
+ }
+ }
+}
+
+// Calculate the set of virtual registers that must be passed through each basic
+// block in order to satisfy the requirements of successor blocks. This is very
+// similar to calcRegsPassed, only backwards.
+void MachineVerifier::calcRegsRequired() {
+ // First push live-in regs to predecessors' vregsRequired.
+ SmallPtrSet<const MachineBasicBlock*, 8> todo;
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ for (MachineBasicBlock::const_pred_iterator PrI = MBB.pred_begin(),
+ PrE = MBB.pred_end(); PrI != PrE; ++PrI) {
+ BBInfo &PInfo = MBBInfoMap[*PrI];
+ if (PInfo.addRequired(MInfo.vregsLiveIn))
+ todo.insert(*PrI);
+ }
+ }
+
+ // Iteratively push vregsRequired to predecessors. This will converge to the
+ // same final state regardless of DenseSet iteration order.
+ while (!todo.empty()) {
+ const MachineBasicBlock *MBB = *todo.begin();
+ todo.erase(MBB);
+ BBInfo &MInfo = MBBInfoMap[MBB];
+ for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+ PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+ if (*PrI == MBB)
+ continue;
+ BBInfo &SInfo = MBBInfoMap[*PrI];
+ if (SInfo.addRequired(MInfo.vregsRequired))
+ todo.insert(*PrI);
+ }
+ }
+}
+
+// Check PHI instructions at the beginning of MBB. It is assumed that
+// calcRegsPassed has been run so BBInfo::isLiveOut is valid.
+void MachineVerifier::checkPHIOps(const MachineBasicBlock *MBB) {
+ SmallPtrSet<const MachineBasicBlock*, 8> seen;
+ for (const auto &BBI : *MBB) {
+ if (!BBI.isPHI())
+ break;
+ seen.clear();
+
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2) {
+ unsigned Reg = BBI.getOperand(i).getReg();
+ const MachineBasicBlock *Pre = BBI.getOperand(i + 1).getMBB();
+ if (!Pre->isSuccessor(MBB))
+ continue;
+ seen.insert(Pre);
+ BBInfo &PrInfo = MBBInfoMap[Pre];
+ if (PrInfo.reachable && !PrInfo.isLiveOut(Reg))
+ report("PHI operand is not live-out from predecessor",
+ &BBI.getOperand(i), i);
+ }
+
+ // Did we see all predecessors?
+ for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
+ PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
+ if (!seen.count(*PrI)) {
+ report("Missing PHI operand", &BBI);
+ errs() << "BB#" << (*PrI)->getNumber()
+ << " is a predecessor according to the CFG.\n";
+ }
+ }
+ }
+}
+
+void MachineVerifier::visitMachineFunctionAfter() {
+ calcRegsPassed();
+
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ // Skip unreachable MBBs.
+ if (!MInfo.reachable)
+ continue;
+
+ checkPHIOps(&MBB);
+ }
+
+ // Now check liveness info if available
+ calcRegsRequired();
+
+ // Check for killed virtual registers that should be live out.
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ for (RegSet::iterator
+ I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
+ ++I)
+ if (MInfo.regsKilled.count(*I)) {
+ report("Virtual register killed in block, but needed live out.", &MBB);
+ errs() << "Virtual register " << PrintReg(*I)
+ << " is used after the block.\n";
+ }
+ }
+
+ if (!MF->empty()) {
+ BBInfo &MInfo = MBBInfoMap[&MF->front()];
+ for (RegSet::iterator
+ I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
+ ++I) {
+ report("Virtual register defs don't dominate all uses.", MF);
+ report_context_vreg(*I);
+ }
+ }
+
+ if (LiveVars)
+ verifyLiveVariables();
+ if (LiveInts)
+ verifyLiveIntervals();
+}
+
+void MachineVerifier::verifyLiveVariables() {
+ assert(LiveVars && "Don't call verifyLiveVariables without LiveVars");
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ LiveVariables::VarInfo &VI = LiveVars->getVarInfo(Reg);
+ for (const auto &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+
+ // Our vregsRequired should be identical to LiveVariables' AliveBlocks
+ if (MInfo.vregsRequired.count(Reg)) {
+ if (!VI.AliveBlocks.test(MBB.getNumber())) {
+ report("LiveVariables: Block missing from AliveBlocks", &MBB);
+ errs() << "Virtual register " << PrintReg(Reg)
+ << " must be live through the block.\n";
+ }
+ } else {
+ if (VI.AliveBlocks.test(MBB.getNumber())) {
+ report("LiveVariables: Block should not be in AliveBlocks", &MBB);
+ errs() << "Virtual register " << PrintReg(Reg)
+ << " is not needed live through the block.\n";
+ }
+ }
+ }
+ }
+}
+
+void MachineVerifier::verifyLiveIntervals() {
+ assert(LiveInts && "Don't call verifyLiveIntervals without LiveInts");
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+
+ // Spilling and splitting may leave unused registers around. Skip them.
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+
+ if (!LiveInts->hasInterval(Reg)) {
+ report("Missing live interval for virtual register", MF);
+ errs() << PrintReg(Reg, TRI) << " still has defs or uses\n";
+ continue;
+ }
+
+ const LiveInterval &LI = LiveInts->getInterval(Reg);
+ assert(Reg == LI.reg && "Invalid reg to interval mapping");
+ verifyLiveInterval(LI);
+ }
+
+ // Verify all the cached regunit intervals.
+ for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i)
+ if (const LiveRange *LR = LiveInts->getCachedRegUnit(i))
+ verifyLiveRange(*LR, i);
+}
+
+void MachineVerifier::verifyLiveRangeValue(const LiveRange &LR,
+ const VNInfo *VNI, unsigned Reg,
+ LaneBitmask LaneMask) {
+ if (VNI->isUnused())
+ return;
+
+ const VNInfo *DefVNI = LR.getVNInfoAt(VNI->def);
+
+ if (!DefVNI) {
+ report("Value not live at VNInfo def and not marked unused", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (DefVNI != VNI) {
+ report("Live segment at def has different VNInfo", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def);
+ if (!MBB) {
+ report("Invalid VNInfo definition index", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (VNI->isPHIDef()) {
+ if (VNI->def != LiveInts->getMBBStartIdx(MBB)) {
+ report("PHIDef VNInfo is not defined at MBB start", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ return;
+ }
+
+ // Non-PHI def.
+ const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def);
+ if (!MI) {
+ report("No instruction at VNInfo def index", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ return;
+ }
+
+ if (Reg != 0) {
+ bool hasDef = false;
+ bool isEarlyClobber = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || !MOI->isDef())
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MOI->getReg() != Reg)
+ continue;
+ } else {
+ if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) ||
+ !TRI->hasRegUnit(MOI->getReg(), Reg))
+ continue;
+ }
+ if (LaneMask != 0 &&
+ (TRI->getSubRegIndexLaneMask(MOI->getSubReg()) & LaneMask) == 0)
+ continue;
+ hasDef = true;
+ if (MOI->isEarlyClobber())
+ isEarlyClobber = true;
+ }
+
+ if (!hasDef) {
+ report("Defining instruction does not modify register", MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+
+ // Early clobber defs begin at USE slots, but other defs must begin at
+ // DEF slots.
+ if (isEarlyClobber) {
+ if (!VNI->def.isEarlyClobber()) {
+ report("Early clobber def must be at an early-clobber slot", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ } else if (!VNI->def.isRegister()) {
+ report("Non-PHI, non-early clobber def must be at a register slot", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ }
+ }
+}
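The def-position checks above lean on the SlotIndex convention that each instruction has Block (B), EarlyClobber (e), Register (r) and Dead (d) slots. A hedged, stand-alone restatement of the rule being enforced, using a hypothetical enum rather than the real SlotIndex API:

    enum class Slot { Block, EarlyClobber, Register, Dead };

    // Mirrors verifyLiveRangeValue: PHI values must be defined at the block
    // start, early-clobber defs at the e slot, and ordinary defs at the r slot.
    bool defSlotIsValid(bool IsPHIDef, bool IsEarlyClobberDef, Slot DefSlot) {
      if (IsPHIDef)
        return DefSlot == Slot::Block;
      if (IsEarlyClobberDef)
        return DefSlot == Slot::EarlyClobber;
      return DefSlot == Slot::Register;
    }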
+
+void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
+ const LiveRange::const_iterator I,
+ unsigned Reg, LaneBitmask LaneMask)
+{
+ const LiveRange::Segment &S = *I;
+ const VNInfo *VNI = S.valno;
+ assert(VNI && "Live segment has no valno");
+
+ if (VNI->id >= LR.getNumValNums() || VNI != LR.getValNumInfo(VNI->id)) {
+ report("Foreign valno in live segment", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ report_context(*VNI);
+ }
+
+ if (VNI->isUnused()) {
+ report("Live segment valno is marked unused", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(S.start);
+ if (!MBB) {
+ report("Bad start of live segment, no basic block", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+ SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB);
+ if (S.start != MBBStartIdx && S.start != VNI->def) {
+ report("Live segment must begin at MBB entry or valno def", MBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ const MachineBasicBlock *EndMBB =
+ LiveInts->getMBBFromIndex(S.end.getPrevSlot());
+ if (!EndMBB) {
+ report("Bad end of live segment, no basic block", MF);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+
+ // No more checks for live-out segments.
+ if (S.end == LiveInts->getMBBEndIdx(EndMBB))
+ return;
+
+ // RegUnit intervals are allowed to contain dead PHI-defs.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) && VNI->isPHIDef() &&
+ S.start == VNI->def && S.end == VNI->def.getDeadSlot())
+ return;
+
+ // The live segment is ending inside EndMBB
+ const MachineInstr *MI =
+ LiveInts->getInstructionFromIndex(S.end.getPrevSlot());
+ if (!MI) {
+ report("Live segment doesn't end at a valid instruction", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ return;
+ }
+
+ // The block slot must refer to a basic block boundary.
+ if (S.end.isBlock()) {
+ report("Live segment ends at B slot of an instruction", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+
+ if (S.end.isDead()) {
+ // Segment ends on the dead slot.
+ // That means there must be a dead def.
+ if (!SlotIndex::isSameInstr(S.start, S.end)) {
+ report("Live segment ending at dead slot spans instructions", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+
+ // A live segment can only end at an early-clobber slot if it is being
+ // redefined by an early-clobber def.
+ if (S.end.isEarlyClobber()) {
+ if (I+1 == LR.end() || (I+1)->start != S.end) {
+ report("Live segment ending at early clobber slot must be "
+ "redefined by an EC def in the same instruction", EndMBB);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+
+ // The following checks only apply to virtual registers. Physreg liveness
+ // is too weird to check.
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // A live segment can end with either a redefinition, a kill flag on a
+ // use, or a dead flag on a def.
+ bool hasRead = false;
+ bool hasSubRegDef = false;
+ bool hasDeadDef = false;
+ for (ConstMIBundleOperands MOI(*MI); MOI.isValid(); ++MOI) {
+ if (!MOI->isReg() || MOI->getReg() != Reg)
+ continue;
+ if (LaneMask != 0 &&
+ (LaneMask & TRI->getSubRegIndexLaneMask(MOI->getSubReg())) == 0)
+ continue;
+ if (MOI->isDef()) {
+ if (MOI->getSubReg() != 0)
+ hasSubRegDef = true;
+ if (MOI->isDead())
+ hasDeadDef = true;
+ }
+ if (MOI->readsReg())
+ hasRead = true;
+ }
+ if (S.end.isDead()) {
+ // Make sure that the corresponding machine operand for a "dead" live
+ // range has the dead flag. We cannot perform this check for subregister
+ // live ranges as partially dead values are allowed.
+ if (LaneMask == 0 && !hasDeadDef) {
+ report("Instruction ending live segment on dead slot has no dead flag",
+ MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ } else {
+ if (!hasRead) {
+ // When tracking subregister liveness, the main range must start new
+ // values on partial register writes, even if there is no read.
+ if (!MRI->shouldTrackSubRegLiveness(Reg) || LaneMask != 0 ||
+ !hasSubRegDef) {
+ report("Instruction ending live segment doesn't read the register",
+ MI);
+ report_context(LR, Reg, LaneMask);
+ report_context(S);
+ }
+ }
+ }
+ }
+
+ // Now check all the basic blocks in this live segment.
+ MachineFunction::const_iterator MFI = MBB->getIterator();
+ // Is this live segment the beginning of a non-PHIDef VN?
+ if (S.start == VNI->def && !VNI->isPHIDef()) {
+ // Not live-in to any blocks.
+ if (MBB == EndMBB)
+ return;
+ // Skip this block.
+ ++MFI;
+ }
+ for (;;) {
+ assert(LiveInts->isLiveInToMBB(LR, &*MFI));
+ // We don't know how to track physregs into a landing pad.
+ if (!TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MFI->isEHPad()) {
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ continue;
+ }
+
+ // Is VNI a PHI-def in the current block?
+ bool IsPHI = VNI->isPHIDef() &&
+ VNI->def == LiveInts->getMBBStartIdx(&*MFI);
+
+ // Check that VNI is live-out of all predecessors.
+ for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
+ PE = MFI->pred_end(); PI != PE; ++PI) {
+ SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
+ const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
+
+ // All predecessors must have a live-out value if this is not a
+ // subregister liverange.
+ if (!PVNI && LaneMask == 0) {
+ report("Register not marked live out of predecessor", *PI);
+ report_context(LR, Reg, LaneMask);
+ report_context(*VNI);
+ errs() << " live into BB#" << MFI->getNumber()
+ << '@' << LiveInts->getMBBStartIdx(&*MFI) << ", not live before "
+ << PEnd << '\n';
+ continue;
+ }
+
+ // Only PHI-defs can take different predecessor values.
+ if (!IsPHI && PVNI != VNI) {
+ report("Different value live out of predecessor", *PI);
+ report_context(LR, Reg, LaneMask);
+ errs() << "Valno #" << PVNI->id << " live out of BB#"
+ << (*PI)->getNumber() << '@' << PEnd << "\nValno #" << VNI->id
+ << " live into BB#" << MFI->getNumber() << '@'
+ << LiveInts->getMBBStartIdx(&*MFI) << '\n';
+ }
+ }
+ if (&*MFI == EndMBB)
+ break;
+ ++MFI;
+ }
+}
+
+void MachineVerifier::verifyLiveRange(const LiveRange &LR, unsigned Reg,
+ LaneBitmask LaneMask) {
+ for (const VNInfo *VNI : LR.valnos)
+ verifyLiveRangeValue(LR, VNI, Reg, LaneMask);
+
+ for (LiveRange::const_iterator I = LR.begin(), E = LR.end(); I != E; ++I)
+ verifyLiveRangeSegment(LR, I, Reg, LaneMask);
+}
+
+void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
+ unsigned Reg = LI.reg;
+ assert(TargetRegisterInfo::isVirtualRegister(Reg));
+ verifyLiveRange(LI, Reg);
+
+ LaneBitmask Mask = 0;
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((Mask & SR.LaneMask) != 0) {
+ report("Lane masks of sub ranges overlap in live interval", MF);
+ report_context(LI);
+ }
+ if ((SR.LaneMask & ~MaxMask) != 0) {
+ report("Subrange lanemask is invalid", MF);
+ report_context(LI);
+ }
+ if (SR.empty()) {
+ report("Subrange must not be empty", MF);
+ report_context(SR, LI.reg, SR.LaneMask);
+ }
+ Mask |= SR.LaneMask;
+ verifyLiveRange(SR, LI.reg, SR.LaneMask);
+ if (!LI.covers(SR)) {
+ report("A Subrange is not covered by the main range", MF);
+ report_context(LI);
+ }
+ }
+
+ // Check the LI only has one connected component.
+ ConnectedVNInfoEqClasses ConEQ(*LiveInts);
+ unsigned NumComp = ConEQ.Classify(LI);
+ if (NumComp > 1) {
+ report("Multiple connected components in live interval", MF);
+ report_context(LI);
+ for (unsigned comp = 0; comp != NumComp; ++comp) {
+ errs() << comp << ": valnos";
+ for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
+ E = LI.vni_end(); I!=E; ++I)
+ if (comp == ConEQ.getEqClass(*I))
+ errs() << ' ' << (*I)->id;
+ errs() << '\n';
+ }
+ }
+}
+
+namespace {
+ // FrameSetup and FrameDestroy can have zero adjustment, so a single integer
+ // can't tell whether a zero value came from a FrameSetup or a FrameDestroy.
+ // We therefore use a bool plus an integer to capture the stack state.
+ struct StackStateOfBB {
+ StackStateOfBB() : EntryValue(0), ExitValue(0), EntryIsSetup(false),
+ ExitIsSetup(false) { }
+ StackStateOfBB(int EntryVal, int ExitVal, bool EntrySetup, bool ExitSetup) :
+ EntryValue(EntryVal), ExitValue(ExitVal), EntryIsSetup(EntrySetup),
+ ExitIsSetup(ExitSetup) { }
+ // Can be negative, which means we are setting up a frame.
+ int EntryValue;
+ int ExitValue;
+ bool EntryIsSetup;
+ bool ExitIsSetup;
+ };
+}
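As a concrete, stand-alone trace of the (int, bool) encoding: a block that sets up and tears down a 16-byte call frame leaves its exit state equal to its entry state. The opcode names in the comments are illustrative only.

    #include <cstdio>

    int main() {
      int ExitValue = 0;           // entry state inherited from the DFS parent
      bool ExitIsSetup = false;

      // FrameSetup <16>
      ExitValue -= 16;             // ExitValue becomes -16 (frame being set up)
      ExitIsSetup = true;

      // ... the call itself ...

      // FrameDestroy <16>
      ExitValue += 16;             // back to 0
      ExitIsSetup = false;

      std::printf("exit state: (%d, %d)\n", ExitValue, (int)ExitIsSetup); // (0, 0)
    }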
+
+/// Make sure that on every path through the CFG, a FrameSetup <n> is always
+/// followed by a FrameDestroy <n>, that stack adjustments are identical on all
+/// CFG edges to a merge point, and that the frame is destroyed at the end of a
+/// return block.
+void MachineVerifier::verifyStackFrame() {
+ unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
+
+ SmallVector<StackStateOfBB, 8> SPState;
+ SPState.resize(MF->getNumBlockIDs());
+ SmallPtrSet<const MachineBasicBlock*, 8> Reachable;
+
+ // Visit the MBBs in DFS order.
+ for (df_ext_iterator<const MachineFunction*,
+ SmallPtrSet<const MachineBasicBlock*, 8> >
+ DFI = df_ext_begin(MF, Reachable), DFE = df_ext_end(MF, Reachable);
+ DFI != DFE; ++DFI) {
+ const MachineBasicBlock *MBB = *DFI;
+
+ StackStateOfBB BBState;
+ // Check the exit state of the DFS stack predecessor.
+ if (DFI.getPathLength() >= 2) {
+ const MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+ assert(Reachable.count(StackPred) &&
+ "DFS stack predecessor is already visited.\n");
+ BBState.EntryValue = SPState[StackPred->getNumber()].ExitValue;
+ BBState.EntryIsSetup = SPState[StackPred->getNumber()].ExitIsSetup;
+ BBState.ExitValue = BBState.EntryValue;
+ BBState.ExitIsSetup = BBState.EntryIsSetup;
+ }
+
+ // Update stack state by checking contents of MBB.
+ for (const auto &I : *MBB) {
+ if (I.getOpcode() == FrameSetupOpcode) {
+ // The first operand of a FrameOpcode should be i32.
+ int Size = I.getOperand(0).getImm();
+ assert(Size >= 0 &&
+ "Value should be non-negative in FrameSetup and FrameDestroy.\n");
+
+ if (BBState.ExitIsSetup)
+ report("FrameSetup is after another FrameSetup", &I);
+ BBState.ExitValue -= Size;
+ BBState.ExitIsSetup = true;
+ }
+
+ if (I.getOpcode() == FrameDestroyOpcode) {
+ // The first operand of a FrameOpcode should be i32.
+ int Size = I.getOperand(0).getImm();
+ assert(Size >= 0 &&
+ "Value should be non-negative in FrameSetup and FrameDestroy.\n");
+
+ if (!BBState.ExitIsSetup)
+ report("FrameDestroy is not after a FrameSetup", &I);
+ int AbsSPAdj = BBState.ExitValue < 0 ? -BBState.ExitValue :
+ BBState.ExitValue;
+ if (BBState.ExitIsSetup && AbsSPAdj != Size) {
+ report("FrameDestroy <n> is after FrameSetup <m>", &I);
+ errs() << "FrameDestroy <" << Size << "> is after FrameSetup <"
+ << AbsSPAdj << ">.\n";
+ }
+ BBState.ExitValue += Size;
+ BBState.ExitIsSetup = false;
+ }
+ }
+ SPState[MBB->getNumber()] = BBState;
+
+ // Make sure the exit state of any predecessor is consistent with the entry
+ // state.
+ for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
+ E = MBB->pred_end(); I != E; ++I) {
+ if (Reachable.count(*I) &&
+ (SPState[(*I)->getNumber()].ExitValue != BBState.EntryValue ||
+ SPState[(*I)->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
+ report("The exit stack state of a predecessor is inconsistent.", MBB);
+ errs() << "Predecessor BB#" << (*I)->getNumber() << " has exit state ("
+ << SPState[(*I)->getNumber()].ExitValue << ", "
+ << SPState[(*I)->getNumber()].ExitIsSetup
+ << "), while BB#" << MBB->getNumber() << " has entry state ("
+ << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n";
+ }
+ }
+
+ // Make sure the entry state of any successor is consistent with the exit
+ // state.
+ for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
+ E = MBB->succ_end(); I != E; ++I) {
+ if (Reachable.count(*I) &&
+ (SPState[(*I)->getNumber()].EntryValue != BBState.ExitValue ||
+ SPState[(*I)->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
+ report("The entry stack state of a successor is inconsistent.", MBB);
+ errs() << "Successor BB#" << (*I)->getNumber() << " has entry state ("
+ << SPState[(*I)->getNumber()].EntryValue << ", "
+ << SPState[(*I)->getNumber()].EntryIsSetup
+ << "), while BB#" << MBB->getNumber() << " has exit state ("
+ << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n";
+ }
+ }
+
+ // Make sure a basic block with return ends with zero stack adjustment.
+ if (!MBB->empty() && MBB->back().isReturn()) {
+ if (BBState.ExitIsSetup)
+ report("A return block ends with a FrameSetup.", MBB);
+ if (BBState.ExitValue)
+ report("A return block ends with a nonzero stack adjustment.", MBB);
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
new file mode 100644
index 000000000000..0177e414f8d9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/OptimizePHIs.cpp
@@ -0,0 +1,196 @@
+//===-- OptimizePHIs.cpp - Optimize machine instruction PHIs --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes machine instruction PHIs to take advantage of
+// opportunities created during DAG legalization.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "phi-opt"
+
+STATISTIC(NumPHICycles, "Number of PHI cycles replaced");
+STATISTIC(NumDeadPHICycles, "Number of dead PHI cycles");
+
+namespace {
+ class OptimizePHIs : public MachineFunctionPass {
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+
+ public:
+ static char ID; // Pass identification
+ OptimizePHIs() : MachineFunctionPass(ID) {
+ initializeOptimizePHIsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ private:
+ typedef SmallPtrSet<MachineInstr*, 16> InstrSet;
+ typedef SmallPtrSetIterator<MachineInstr*> InstrSetIterator;
+
+ bool IsSingleValuePHICycle(MachineInstr *MI, unsigned &SingleValReg,
+ InstrSet &PHIsInCycle);
+ bool IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle);
+ bool OptimizeBB(MachineBasicBlock &MBB);
+ };
+}
+
+char OptimizePHIs::ID = 0;
+char &llvm::OptimizePHIsID = OptimizePHIs::ID;
+INITIALIZE_PASS(OptimizePHIs, "opt-phis",
+ "Optimize machine instruction PHIs", false, false)
+
+bool OptimizePHIs::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ MRI = &Fn.getRegInfo();
+ TII = Fn.getSubtarget().getInstrInfo();
+
+ // Find dead PHI cycles and PHI cycles that can be replaced by a single
+ // value. InstCombine does these optimizations, but DAG legalization may
+ // introduce new opportunities, e.g., when i64 values are split up for
+ // 32-bit targets.
+ bool Changed = false;
+ for (MachineFunction::iterator I = Fn.begin(), E = Fn.end(); I != E; ++I)
+ Changed |= OptimizeBB(*I);
+
+ return Changed;
+}
+
+/// IsSingleValuePHICycle - Check if MI is a PHI where all the source operands
+/// are copies of SingleValReg, possibly via copies through other PHIs. If
+/// SingleValReg is zero on entry, it is set to the register with the single
+/// non-copy value. PHIsInCycle is a set used to keep track of the PHIs that
+/// have been scanned.
+bool OptimizePHIs::IsSingleValuePHICycle(MachineInstr *MI,
+ unsigned &SingleValReg,
+ InstrSet &PHIsInCycle) {
+ assert(MI->isPHI() && "IsSingleValuePHICycle expects a PHI instruction");
+ unsigned DstReg = MI->getOperand(0).getReg();
+
+ // See if we already saw this register.
+ if (!PHIsInCycle.insert(MI).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PHIsInCycle.size() == 16)
+ return false;
+
+ // Scan the PHI operands.
+ for (unsigned i = 1; i != MI->getNumOperands(); i += 2) {
+ unsigned SrcReg = MI->getOperand(i).getReg();
+ if (SrcReg == DstReg)
+ continue;
+ MachineInstr *SrcMI = MRI->getVRegDef(SrcReg);
+
+ // Skip over register-to-register moves.
+ if (SrcMI && SrcMI->isCopy() &&
+ !SrcMI->getOperand(0).getSubReg() &&
+ !SrcMI->getOperand(1).getSubReg() &&
+ TargetRegisterInfo::isVirtualRegister(SrcMI->getOperand(1).getReg()))
+ SrcMI = MRI->getVRegDef(SrcMI->getOperand(1).getReg());
+ if (!SrcMI)
+ return false;
+
+ if (SrcMI->isPHI()) {
+ if (!IsSingleValuePHICycle(SrcMI, SingleValReg, PHIsInCycle))
+ return false;
+ } else {
+ // Fail if there is more than one non-phi/non-move register.
+ if (SingleValReg != 0)
+ return false;
+ SingleValReg = SrcReg;
+ }
+ }
+ return true;
+}
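A stand-alone toy model of the same cycle test, with the copy-chasing omitted: a PHI whose sources are only other PHIs in the cycle plus at most one outside register collapses to that register. The containers and names are hypothetical, not the pass's.

    #include <map>
    #include <set>
    #include <vector>

    // Each PHI is keyed by its destination register and lists its source regs.
    using PHIMap = std::map<unsigned, std::vector<unsigned>>;

    bool isSingleValueCycle(const PHIMap &PHIs, unsigned Dst, unsigned &SingleVal,
                            std::set<unsigned> &Visited) {
      if (!Visited.insert(Dst).second)
        return true;                    // this PHI was already scanned
      for (unsigned Src : PHIs.at(Dst)) {
        if (Src == Dst)
          continue;
        if (PHIs.count(Src)) {          // source is another PHI: recurse
          if (!isSingleValueCycle(PHIs, Src, SingleVal, Visited))
            return false;
        } else if (SingleVal && SingleVal != Src) {
          return false;                 // a second distinct non-PHI value
        } else {
          SingleVal = Src;
        }
      }
      return true;
    }

    int main() {
      // %2 = PHI %1, %3   and   %3 = PHI %2, %1   both reduce to %1.
      PHIMap PHIs = {{2, {1, 3}}, {3, {2, 1}}};
      unsigned SingleVal = 0;
      std::set<unsigned> Visited;
      return isSingleValueCycle(PHIs, 2, SingleVal, Visited) ? 0 : 1;
    }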
+
+/// IsDeadPHICycle - Check if the register defined by a PHI is only used by
+/// other PHIs in a cycle.
+bool OptimizePHIs::IsDeadPHICycle(MachineInstr *MI, InstrSet &PHIsInCycle) {
+ assert(MI->isPHI() && "IsDeadPHICycle expects a PHI instruction");
+ unsigned DstReg = MI->getOperand(0).getReg();
+ assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ "PHI destination is not a virtual register");
+
+ // See if we already saw this register.
+ if (!PHIsInCycle.insert(MI).second)
+ return true;
+
+ // Don't scan crazily complex things.
+ if (PHIsInCycle.size() == 16)
+ return false;
+
+ for (MachineInstr &UseMI : MRI->use_instructions(DstReg)) {
+ if (!UseMI.isPHI() || !IsDeadPHICycle(&UseMI, PHIsInCycle))
+ return false;
+ }
+
+ return true;
+}
+
+/// OptimizeBB - Remove dead PHI cycles and PHI cycles that can be replaced by
+/// a single value.
+bool OptimizePHIs::OptimizeBB(MachineBasicBlock &MBB) {
+ bool Changed = false;
+ for (MachineBasicBlock::iterator
+ MII = MBB.begin(), E = MBB.end(); MII != E; ) {
+ MachineInstr *MI = &*MII++;
+ if (!MI->isPHI())
+ break;
+
+ // Check for single-value PHI cycles.
+ unsigned SingleValReg = 0;
+ InstrSet PHIsInCycle;
+ if (IsSingleValuePHICycle(MI, SingleValReg, PHIsInCycle) &&
+ SingleValReg != 0) {
+ unsigned OldReg = MI->getOperand(0).getReg();
+ if (!MRI->constrainRegClass(SingleValReg, MRI->getRegClass(OldReg)))
+ continue;
+
+ MRI->replaceRegWith(OldReg, SingleValReg);
+ MI->eraseFromParent();
+ ++NumPHICycles;
+ Changed = true;
+ continue;
+ }
+
+ // Check for dead PHI cycles.
+ PHIsInCycle.clear();
+ if (IsDeadPHICycle(MI, PHIsInCycle)) {
+ for (InstrSetIterator PI = PHIsInCycle.begin(), PE = PHIsInCycle.end();
+ PI != PE; ++PI) {
+ MachineInstr *PhiMI = *PI;
+ if (&*MII == PhiMI)
+ ++MII;
+ PhiMI->eraseFromParent();
+ }
+ ++NumDeadPHICycles;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/PHIElimination.cpp b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
new file mode 100644
index 000000000000..b8d54315d148
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIElimination.cpp
@@ -0,0 +1,652 @@
+//===-- PhiElimination.cpp - Eliminate PHI nodes by inserting copies ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass eliminates machine instruction PHI nodes by inserting copy
+// instructions. This destroys SSA information, but is the desired input for
+// some register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "phielim"
+
+static cl::opt<bool>
+DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false),
+ cl::Hidden, cl::desc("Disable critical edge splitting "
+ "during PHI elimination"));
+
+static cl::opt<bool>
+SplitAllCriticalEdges("phi-elim-split-all-critical-edges", cl::init(false),
+ cl::Hidden, cl::desc("Split all critical edges during "
+ "PHI elimination"));
+
+static cl::opt<bool> NoPhiElimLiveOutEarlyExit(
+ "no-phi-elim-live-out-early-exit", cl::init(false), cl::Hidden,
+ cl::desc("Do not use an early exit if isLiveOutPastPHIs returns true."));
+
+namespace {
+ class PHIElimination : public MachineFunctionPass {
+ MachineRegisterInfo *MRI; // Machine register information
+ LiveVariables *LV;
+ LiveIntervals *LIS;
+
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ PHIElimination() : MachineFunctionPass(ID) {
+ initializePHIEliminationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ private:
+ /// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions
+ /// in predecessor basic blocks.
+ ///
+ bool EliminatePHINodes(MachineFunction &MF, MachineBasicBlock &MBB);
+ void LowerPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator LastPHIIt);
+
+ /// analyzePHINodes - Gather information about the PHI nodes in the
+ /// function. In particular, for each virtual register used in a PHI node we
+ /// count the uses coming from each predecessor BB. This is used later to
+ /// determine when the vreg is killed in that BB.
+ ///
+ void analyzePHINodes(const MachineFunction& Fn);
+
+ /// Split critical edges where necessary for good coalescer performance.
+ bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineLoopInfo *MLI);
+
+ // These functions are temporary abstractions around LiveVariables and
+ // LiveIntervals, so they can go away when LiveVariables does.
+ bool isLiveIn(unsigned Reg, const MachineBasicBlock *MBB);
+ bool isLiveOutPastPHIs(unsigned Reg, const MachineBasicBlock *MBB);
+
+ typedef std::pair<unsigned, unsigned> BBVRegPair;
+ typedef DenseMap<BBVRegPair, unsigned> VRegPHIUse;
+
+ VRegPHIUse VRegPHIUseCount;
+
+ // Defs of PHI sources which are implicit_def.
+ SmallPtrSet<MachineInstr*, 4> ImpDefs;
+
+ // Map reusable lowered PHI node -> incoming join register.
+ typedef DenseMap<MachineInstr*, unsigned,
+ MachineInstrExpressionTrait> LoweredPHIMap;
+ LoweredPHIMap LoweredPHIs;
+ };
+}
+
+STATISTIC(NumLowered, "Number of phis lowered");
+STATISTIC(NumCriticalEdgesSplit, "Number of critical edges split");
+STATISTIC(NumReused, "Number of reused lowered phis");
+
+char PHIElimination::ID = 0;
+char& llvm::PHIEliminationID = PHIElimination::ID;
+
+INITIALIZE_PASS_BEGIN(PHIElimination, "phi-node-elimination",
+ "Eliminate PHI nodes for register allocation",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveVariables)
+INITIALIZE_PASS_END(PHIElimination, "phi-node-elimination",
+ "Eliminate PHI nodes for register allocation", false, false)
+
+void PHIElimination::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addUsedIfAvailable<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
+ MRI = &MF.getRegInfo();
+ LV = getAnalysisIfAvailable<LiveVariables>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+
+ bool Changed = false;
+
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
+ // Split critical edges to help the coalescer. This does not yet support
+ // updating LiveIntervals, so we disable it.
+ if (!DisableEdgeSplitting && (LV || LIS)) {
+ MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ for (auto &MBB : MF)
+ Changed |= SplitPHIEdges(MF, MBB, MLI);
+ }
+
+ // Populate VRegPHIUseCount
+ analyzePHINodes(MF);
+
+ // Eliminate PHI instructions by inserting copies into predecessor blocks.
+ for (auto &MBB : MF)
+ Changed |= EliminatePHINodes(MF, MBB);
+
+ // Remove dead IMPLICIT_DEF instructions.
+ for (MachineInstr *DefMI : ImpDefs) {
+ unsigned DefReg = DefMI->getOperand(0).getReg();
+ if (MRI->use_nodbg_empty(DefReg)) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*DefMI);
+ DefMI->eraseFromParent();
+ }
+ }
+
+ // Clean up the lowered PHI instructions.
+ for (auto &I : LoweredPHIs) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*I.first);
+ MF.DeleteMachineInstr(I.first);
+ }
+
+ LoweredPHIs.clear();
+ ImpDefs.clear();
+ VRegPHIUseCount.clear();
+
+ return Changed;
+}
+
+/// EliminatePHINodes - Eliminate phi nodes by inserting copy instructions in
+/// predecessor basic blocks.
+///
+bool PHIElimination::EliminatePHINodes(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ if (MBB.empty() || !MBB.front().isPHI())
+ return false; // Quick exit for basic blocks without PHIs.
+
+ // Get an iterator to the last of the leading PHI nodes and labels;
+ // SkipPHIsAndLabels returns the first instruction after them (possibly end()).
+ MachineBasicBlock::iterator LastPHIIt =
+ std::prev(MBB.SkipPHIsAndLabels(MBB.begin()));
+
+ while (MBB.front().isPHI())
+ LowerPHINode(MBB, LastPHIIt);
+
+ return true;
+}
+
+/// isImplicitlyDefined - Return true if all defs of VirtReg are implicit-defs.
+/// This includes registers with no defs.
+static bool isImplicitlyDefined(unsigned VirtReg,
+ const MachineRegisterInfo *MRI) {
+ for (MachineInstr &DI : MRI->def_instructions(VirtReg))
+ if (!DI.isImplicitDef())
+ return false;
+ return true;
+}
+
+/// isSourceDefinedByImplicitDef - Return true if all sources of the phi node
+/// are implicit_def's.
+static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi,
+ const MachineRegisterInfo *MRI) {
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+ if (!isImplicitlyDefined(MPhi->getOperand(i).getReg(), MRI))
+ return false;
+ return true;
+}
+
+
+/// LowerPHINode - Lower the PHI node at the top of the specified block.
+///
+void PHIElimination::LowerPHINode(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator LastPHIIt) {
+ ++NumLowered;
+
+ MachineBasicBlock::iterator AfterPHIsIt = std::next(LastPHIIt);
+
+ // Unlink the PHI node from the basic block, but don't delete the PHI yet.
+ MachineInstr *MPhi = MBB.remove(&*MBB.begin());
+
+ unsigned NumSrcs = (MPhi->getNumOperands() - 1) / 2;
+ unsigned DestReg = MPhi->getOperand(0).getReg();
+ assert(MPhi->getOperand(0).getSubReg() == 0 && "Can't handle sub-reg PHIs");
+ bool isDead = MPhi->getOperand(0).isDead();
+
+ // Create a new register for the incoming PHI arguments.
+ MachineFunction &MF = *MBB.getParent();
+ unsigned IncomingReg = 0;
+ bool reusedIncoming = false; // Is IncomingReg reused from an earlier PHI?
+
+ // Insert a register to register copy at the top of the current block (but
+ // after any remaining phi nodes) which copies the new incoming register
+ // into the phi node destination.
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ if (isSourceDefinedByImplicitDef(MPhi, MRI))
+ // If all sources of a PHI node are implicit_def, just emit an
+ // implicit_def instead of a copy.
+ BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
+ else {
+ // Can we reuse an earlier PHI node? This only happens for critical edges,
+ // typically those created by tail duplication.
+ unsigned &entry = LoweredPHIs[MPhi];
+ if (entry) {
+ // An identical PHI node was already lowered. Reuse the incoming register.
+ IncomingReg = entry;
+ reusedIncoming = true;
+ ++NumReused;
+ DEBUG(dbgs() << "Reusing " << PrintReg(IncomingReg) << " for " << *MPhi);
+ } else {
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(DestReg);
+ entry = IncomingReg = MF.getRegInfo().createVirtualRegister(RC);
+ }
+ BuildMI(MBB, AfterPHIsIt, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), DestReg)
+ .addReg(IncomingReg);
+ }
+
+ // Update live variable information if there is any.
+ if (LV) {
+ MachineInstr &PHICopy = *std::prev(AfterPHIsIt);
+
+ if (IncomingReg) {
+ LiveVariables::VarInfo &VI = LV->getVarInfo(IncomingReg);
+
+ // Increment use count of the newly created virtual register.
+ LV->setPHIJoin(IncomingReg);
+
+ // When we are reusing the incoming register, it may already have been
+ // killed in this block. The old kill will also have been inserted at
+ // AfterPHIsIt, so it appears before the current PHICopy.
+ if (reusedIncoming)
+ if (MachineInstr *OldKill = VI.findKill(&MBB)) {
+ DEBUG(dbgs() << "Remove old kill from " << *OldKill);
+ LV->removeVirtualRegisterKilled(IncomingReg, *OldKill);
+ DEBUG(MBB.dump());
+ }
+
+ // Add information to LiveVariables to know that the incoming value is
+ // killed. Note that because the value is defined in several places (once
+ // for each incoming block), the "def" block and instruction fields for the
+ // VarInfo are not filled in.
+ LV->addVirtualRegisterKilled(IncomingReg, PHICopy);
+ }
+
+ // Since we are going to be deleting the PHI node, if it is the last use of
+ // any registers, or if the value itself is dead, we need to move this
+ // information over to the new copy we just inserted.
+ LV->removeVirtualRegistersKilled(*MPhi);
+
+ // If the result is dead, update LV.
+ if (isDead) {
+ LV->addVirtualRegisterDead(DestReg, PHICopy);
+ LV->removeVirtualRegisterDead(DestReg, *MPhi);
+ }
+ }
+
+ // Update LiveIntervals for the new copy or implicit def.
+ if (LIS) {
+ SlotIndex DestCopyIndex =
+ LIS->InsertMachineInstrInMaps(*std::prev(AfterPHIsIt));
+
+ SlotIndex MBBStartIndex = LIS->getMBBStartIdx(&MBB);
+ if (IncomingReg) {
+ // Add the region from the beginning of MBB to the copy instruction to
+ // IncomingReg's live interval.
+ LiveInterval &IncomingLI = LIS->createEmptyInterval(IncomingReg);
+ VNInfo *IncomingVNI = IncomingLI.getVNInfoAt(MBBStartIndex);
+ if (!IncomingVNI)
+ IncomingVNI = IncomingLI.getNextValue(MBBStartIndex,
+ LIS->getVNInfoAllocator());
+ IncomingLI.addSegment(LiveInterval::Segment(MBBStartIndex,
+ DestCopyIndex.getRegSlot(),
+ IncomingVNI));
+ }
+
+ LiveInterval &DestLI = LIS->getInterval(DestReg);
+ assert(DestLI.begin() != DestLI.end() &&
+ "PHIs should have nonempty LiveIntervals.");
+ if (DestLI.endIndex().isDead()) {
+ // A dead PHI's live range begins and ends at the start of the MBB, but
+ // the lowered copy, which will still be dead, needs to begin and end at
+ // the copy instruction.
+ VNInfo *OrigDestVNI = DestLI.getVNInfoAt(MBBStartIndex);
+ assert(OrigDestVNI && "PHI destination should be live at block entry.");
+ DestLI.removeSegment(MBBStartIndex, MBBStartIndex.getDeadSlot());
+ DestLI.createDeadDef(DestCopyIndex.getRegSlot(),
+ LIS->getVNInfoAllocator());
+ DestLI.removeValNo(OrigDestVNI);
+ } else {
+ // Otherwise, remove the region from the beginning of MBB to the copy
+ // instruction from DestReg's live interval.
+ DestLI.removeSegment(MBBStartIndex, DestCopyIndex.getRegSlot());
+ VNInfo *DestVNI = DestLI.getVNInfoAt(DestCopyIndex.getRegSlot());
+ assert(DestVNI && "PHI destination should be live at its definition.");
+ DestVNI->def = DestCopyIndex.getRegSlot();
+ }
+ }
+
+ // Adjust the VRegPHIUseCount map to account for the removal of this PHI node.
+ for (unsigned i = 1; i != MPhi->getNumOperands(); i += 2)
+ --VRegPHIUseCount[BBVRegPair(MPhi->getOperand(i+1).getMBB()->getNumber(),
+ MPhi->getOperand(i).getReg())];
+
+ // Now loop over all of the incoming arguments, changing them to copy into the
+ // IncomingReg register in the corresponding predecessor basic block.
+ SmallPtrSet<MachineBasicBlock*, 8> MBBsInsertedInto;
+ for (int i = NumSrcs - 1; i >= 0; --i) {
+ unsigned SrcReg = MPhi->getOperand(i*2+1).getReg();
+ unsigned SrcSubReg = MPhi->getOperand(i*2+1).getSubReg();
+ bool SrcUndef = MPhi->getOperand(i*2+1).isUndef() ||
+ isImplicitlyDefined(SrcReg, MRI);
+ assert(TargetRegisterInfo::isVirtualRegister(SrcReg) &&
+ "Machine PHI Operands must all be virtual registers!");
+
+ // Get the MachineBasicBlock that is the source of this PHI operand, i.e.
+ // the predecessor block the incoming value comes from.
+ MachineBasicBlock &opBlock = *MPhi->getOperand(i*2+2).getMBB();
+
+ // Check to make sure we haven't already emitted the copy for this block.
+ // This can happen because PHI nodes may have multiple entries for the same
+ // basic block.
+ if (!MBBsInsertedInto.insert(&opBlock).second)
+ continue; // If the copy has already been emitted, we're done.
+
+ // Find a safe location to insert the copy; this may be the first terminator
+ // in the block (or end()).
+ MachineBasicBlock::iterator InsertPos =
+ findPHICopyInsertPoint(&opBlock, &MBB, SrcReg);
+
+ // Insert the copy.
+ MachineInstr *NewSrcInstr = nullptr;
+ if (!reusedIncoming && IncomingReg) {
+ if (SrcUndef) {
+ // The source register is undefined, so there is no need for a real
+ // COPY, but we still need to ensure joint dominance by defs.
+ // Insert an IMPLICIT_DEF instruction.
+ NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF),
+ IncomingReg);
+
+ // Clean up the old implicit-def, if there even was one.
+ if (MachineInstr *DefMI = MRI->getVRegDef(SrcReg))
+ if (DefMI->isImplicitDef())
+ ImpDefs.insert(DefMI);
+ } else {
+ NewSrcInstr = BuildMI(opBlock, InsertPos, MPhi->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), IncomingReg)
+ .addReg(SrcReg, 0, SrcSubReg);
+ }
+ }
+
+ // We only need to update the LiveVariables kill of SrcReg if this was the
+ // last PHI use of SrcReg to be lowered on this CFG edge and it is not live
+ // out of the predecessor. We can also ignore undef sources.
+ if (LV && !SrcUndef &&
+ !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)] &&
+ !LV->isLiveOut(SrcReg, opBlock)) {
+ // We want to be able to insert a kill of the register if this PHI (aka,
+ // the copy we just inserted) is the last use of the source value. Live
+ // variable analysis conservatively handles this by saying that the value
+ // is live until the end of the block the PHI entry lives in. If the value
+ // really is dead at the PHI copy, there will be no successor blocks which
+ // have the value live-in.
+
+ // Okay, if we now know that the value is not live out of the block, we
+ // can add a kill marker in this block saying that it kills the incoming
+ // value!
+
+ // In our final twist, we have to decide which instruction kills the
+ // register. In most cases this is the copy, however, terminator
+ // instructions at the end of the block may also use the value. In this
+ // case, we should mark the last such terminator as being the killing
+ // block, not the copy.
+ MachineBasicBlock::iterator KillInst = opBlock.end();
+ MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+ for (MachineBasicBlock::iterator Term = FirstTerm;
+ Term != opBlock.end(); ++Term) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+ }
+
+ if (KillInst == opBlock.end()) {
+ // No terminator uses the register.
+
+ if (reusedIncoming || !IncomingReg) {
+ // We may have to rewind a bit if we didn't insert a copy this time.
+ KillInst = FirstTerm;
+ while (KillInst != opBlock.begin()) {
+ --KillInst;
+ if (KillInst->isDebugValue())
+ continue;
+ if (KillInst->readsRegister(SrcReg))
+ break;
+ }
+ } else {
+ // We just inserted this copy.
+ KillInst = std::prev(InsertPos);
+ }
+ }
+ assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction");
+
+ // Finally, mark it killed.
+ LV->addVirtualRegisterKilled(SrcReg, *KillInst);
+
+ // This vreg no longer lives all of the way through opBlock.
+ unsigned opBlockNum = opBlock.getNumber();
+ LV->getVarInfo(SrcReg).AliveBlocks.reset(opBlockNum);
+ }
+
+ if (LIS) {
+ if (NewSrcInstr) {
+ LIS->InsertMachineInstrInMaps(*NewSrcInstr);
+ LIS->addSegmentToEndOfBlock(IncomingReg, *NewSrcInstr);
+ }
+
+ if (!SrcUndef &&
+ !VRegPHIUseCount[BBVRegPair(opBlock.getNumber(), SrcReg)]) {
+ LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+
+ bool isLiveOut = false;
+ for (MachineBasicBlock::succ_iterator SI = opBlock.succ_begin(),
+ SE = opBlock.succ_end(); SI != SE; ++SI) {
+ SlotIndex startIdx = LIS->getMBBStartIdx(*SI);
+ VNInfo *VNI = SrcLI.getVNInfoAt(startIdx);
+
+ // Definitions by other PHIs are not truly live-in for our purposes.
+ if (VNI && VNI->def != startIdx) {
+ isLiveOut = true;
+ break;
+ }
+ }
+
+ if (!isLiveOut) {
+ MachineBasicBlock::iterator KillInst = opBlock.end();
+ MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator();
+ for (MachineBasicBlock::iterator Term = FirstTerm;
+ Term != opBlock.end(); ++Term) {
+ if (Term->readsRegister(SrcReg))
+ KillInst = Term;
+ }
+
+ if (KillInst == opBlock.end()) {
+ // No terminator uses the register.
+
+ if (reusedIncoming || !IncomingReg) {
+ // We may have to rewind a bit if we didn't just insert a copy.
+ KillInst = FirstTerm;
+ while (KillInst != opBlock.begin()) {
+ --KillInst;
+ if (KillInst->isDebugValue())
+ continue;
+ if (KillInst->readsRegister(SrcReg))
+ break;
+ }
+ } else {
+ // We just inserted this copy.
+ KillInst = std::prev(InsertPos);
+ }
+ }
+ assert(KillInst->readsRegister(SrcReg) &&
+ "Cannot find kill instruction");
+
+ SlotIndex LastUseIndex = LIS->getInstructionIndex(*KillInst);
+ SrcLI.removeSegment(LastUseIndex.getRegSlot(),
+ LIS->getMBBEndIdx(&opBlock));
+ }
+ }
+ }
+ }
+
+ // Really delete the PHI instruction now, if it is not in the LoweredPHIs map.
+ if (reusedIncoming || !IncomingReg) {
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MPhi);
+ MF.DeleteMachineInstr(MPhi);
+ }
+}
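Setting the liveness bookkeeping above aside, the structural effect of lowering a single PHI can be sketched on a toy representation: one copy from a fresh join register at the top of the PHI's block, plus one copy into that join register per predecessor. Names, types and register numbers below are hypothetical.

    #include <cstdio>
    #include <vector>

    struct Incoming { unsigned SrcReg; unsigned PredBB; };

    // Print the copies that replace  %dst = PHI %src0,<bb0>, %src1,<bb1>, ...
    void lowerToyPHI(unsigned DstReg, unsigned JoinReg,
                     const std::vector<Incoming> &Srcs, unsigned PHIBlock) {
      // Copy at the top of the PHI's block, after any remaining PHIs.
      std::printf("BB#%u:  %%vreg%u = COPY %%vreg%u\n", PHIBlock, DstReg, JoinReg);
      // One copy per predecessor, placed before that block's terminator.
      for (const Incoming &In : Srcs)
        std::printf("BB#%u:  %%vreg%u = COPY %%vreg%u\n", In.PredBB, JoinReg,
                    In.SrcReg);
    }

    int main() {
      // %vreg7 = PHI %vreg4, <BB#1>, %vreg5, <BB#2>  becomes three copies
      // routed through the join register %vreg8.
      lowerToyPHI(/*DstReg=*/7, /*JoinReg=*/8, {{4, 1}, {5, 2}}, /*PHIBlock=*/3);
    }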
+
+/// analyzePHINodes - Gather information about the PHI nodes in the function.
+/// In particular, for each virtual register used in a PHI node we count the
+/// uses coming from each predecessor BB. This is used later to determine when
+/// the vreg is killed in that BB.
+///
+void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
+ for (const auto &MBB : MF)
+ for (const auto &BBI : MBB) {
+ if (!BBI.isPHI())
+ break;
+ for (unsigned i = 1, e = BBI.getNumOperands(); i != e; i += 2)
+ ++VRegPHIUseCount[BBVRegPair(BBI.getOperand(i+1).getMBB()->getNumber(),
+ BBI.getOperand(i).getReg())];
+ }
+}
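A minimal illustration of what the (predecessor BB#, vreg) -> count map built above is for: LowerPHINode decrements it as it lowers each PHI, and only when the count for an edge drops to zero can the copy on that edge be treated as the last PHI use of the register there. Toy containers below, not the pass's DenseMap.

    #include <map>
    #include <utility>

    using BBVRegPair = std::pair<unsigned, unsigned>; // (pred BB#, vreg)

    int main() {
      std::map<BBVRegPair, unsigned> VRegPHIUseCount;

      // Two PHIs in the function use %vreg4 coming from BB#1.
      ++VRegPHIUseCount[{1, 4}];
      ++VRegPHIUseCount[{1, 4}];

      // Lowering the first PHI decrements the count; a PHI use of %vreg4 from
      // BB#1 remains, so no kill may be placed on that edge yet.
      --VRegPHIUseCount[{1, 4}];
      bool LastUseOnEdge = (VRegPHIUseCount[{1, 4}] == 0); // false
      return LastUseOnEdge ? 1 : 0;
    }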
+
+bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineLoopInfo *MLI) {
+ if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad())
+ return false; // Quick exit for basic blocks without PHIs.
+
+ const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : nullptr;
+ bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader();
+
+ bool Changed = false;
+ for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end();
+ BBI != BBE && BBI->isPHI(); ++BBI) {
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) {
+ unsigned Reg = BBI->getOperand(i).getReg();
+ MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB();
+ // Is there a critical edge from PreMBB to MBB?
+ if (PreMBB->succ_size() == 1)
+ continue;
+
+ // Avoid splitting backedges of loops. It would introduce small
+ // out-of-line blocks into the loop which is very bad for code placement.
+ if (PreMBB == &MBB && !SplitAllCriticalEdges)
+ continue;
+ const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : nullptr;
+ if (IsLoopHeader && PreLoop == CurLoop && !SplitAllCriticalEdges)
+ continue;
+
+ // LV doesn't consider a phi use live-out, so isLiveOut only returns true
+ // when the source register is live-out for some other reason than a phi
+ // use. That means the copy we will insert in PreMBB won't be a kill, and
+ // there is a risk it may not be coalesced away.
+ //
+ // If the copy would be a kill, there is no need to split the edge.
+ bool ShouldSplit = isLiveOutPastPHIs(Reg, PreMBB);
+ if (!ShouldSplit && !NoPhiElimLiveOutEarlyExit)
+ continue;
+ if (ShouldSplit) {
+ DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#"
+ << PreMBB->getNumber() << " -> BB#" << MBB.getNumber()
+ << ": " << *BBI);
+ }
+
+ // If Reg is not live-in to MBB, it means it must be live-in to some
+ // other PreMBB successor, and we can avoid the interference by splitting
+ // the edge.
+ //
+ // If Reg *is* live-in to MBB, the interference is inevitable and a copy
+ // is likely to be left after coalescing. If we are looking at a loop
+ // exiting edge, split it so we won't insert code in the loop, otherwise
+ // don't bother.
+ ShouldSplit = ShouldSplit && !isLiveIn(Reg, &MBB);
+
+ // Check for a loop exiting edge.
+ if (!ShouldSplit && CurLoop != PreLoop) {
+ DEBUG({
+ dbgs() << "Split wouldn't help, maybe avoid loop copies?\n";
+ if (PreLoop) dbgs() << "PreLoop: " << *PreLoop;
+ if (CurLoop) dbgs() << "CurLoop: " << *CurLoop;
+ });
+ // This edge could be entering a loop, exiting a loop, or it could be
+ // both: jumping directly from one loop to the header of a sibling
+ // loop.
+ // Split unless this edge is entering CurLoop from an outer loop.
+ ShouldSplit = PreLoop && !PreLoop->contains(CurLoop);
+ }
+ if (!ShouldSplit && !SplitAllCriticalEdges)
+ continue;
+ if (!PreMBB->SplitCriticalEdge(&MBB, *this)) {
+ DEBUG(dbgs() << "Failed to split critical edge.\n");
+ continue;
+ }
+ Changed = true;
+ ++NumCriticalEdgesSplit;
+ }
+ }
+ return Changed;
+}
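For reference, the edge PreMBB -> MBB is critical exactly when PreMBB has more than one successor and MBB has more than one predecessor; the succ_size() == 1 test above is the cheap half of that check, since a block containing a PHI normally has several predecessors. A toy restatement with hypothetical types:

    struct ToyBlock {
      unsigned NumSuccs;
      unsigned NumPreds;
    };

    // Copies inserted on a non-critical edge can simply be placed in one of
    // its end blocks, so only critical edges are candidates for splitting.
    bool isCriticalEdge(const ToyBlock &Pred, const ToyBlock &Succ) {
      return Pred.NumSuccs > 1 && Succ.NumPreds > 1;
    }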
+
+bool PHIElimination::isLiveIn(unsigned Reg, const MachineBasicBlock *MBB) {
+ assert((LV || LIS) &&
+ "isLiveIn() requires either LiveVariables or LiveIntervals");
+ if (LIS)
+ return LIS->isLiveInToMBB(LIS->getInterval(Reg), MBB);
+ else
+ return LV->isLiveIn(Reg, *MBB);
+}
+
+bool PHIElimination::isLiveOutPastPHIs(unsigned Reg,
+ const MachineBasicBlock *MBB) {
+ assert((LV || LIS) &&
+ "isLiveOutPastPHIs() requires either LiveVariables or LiveIntervals");
+ // LiveVariables considers uses in PHIs to be in the predecessor basic block,
+ // so that a register used only in a PHI is not live out of the block. In
+ // contrast, LiveIntervals considers uses in PHIs to be on the edge rather than
+ // in the predecessor basic block, so that a register used only in a PHI is live
+ // out of the block.
+ if (LIS) {
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ for (const MachineBasicBlock *SI : MBB->successors())
+ if (LI.liveAt(LIS->getMBBStartIdx(SI)))
+ return true;
+ return false;
+ } else {
+ return LV->isLiveOut(Reg, *MBB);
+ }
+}
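A hedged restatement of the LiveIntervals branch above on a toy structure: a register counts as live out past PHIs when some successor has it live at its start with a value that is not produced by that successor's own PHI. Types and names are hypothetical.

    #include <vector>

    struct SuccLiveIn {
      bool LiveAtStart;          // register live at the successor's start index
      bool DefinedByPHIAtStart;  // that value comes from the successor's PHI
    };

    bool liveOutPastPHIs(const std::vector<SuccLiveIn> &Succs) {
      for (const SuccLiveIn &S : Succs)
        if (S.LiveAtStart && !S.DefinedByPHIAtStart)
          return true;
      return false;
    }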
diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp
new file mode 100644
index 000000000000..4cabc3a8c1fd
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.cpp
@@ -0,0 +1,59 @@
+//===-- PHIEliminationUtils.cpp - Helper functions for PHI elimination ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PHIEliminationUtils.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+using namespace llvm;
+
+// findPHICopyInsertPoint - Find a safe place in MBB to insert a copy from SrcReg
+// when following the CFG edge to SuccMBB. This needs to be after any def of
+// SrcReg, but before any subsequent point where control flow might jump out of
+// the basic block.
+MachineBasicBlock::iterator
+llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+ unsigned SrcReg) {
+ // Handle the trivial case trivially.
+ if (MBB->empty())
+ return MBB->begin();
+
+ // Usually, we just want to insert the copy before the first terminator
+ // instruction. However, for the edge going to a landing pad, we must insert
+ // the copy before the call/invoke instruction.
+ if (!SuccMBB->isEHPad())
+ return MBB->getFirstTerminator();
+
+ // Discover any defs/uses in this basic block.
+ SmallPtrSet<MachineInstr*, 8> DefUsesInMBB;
+ MachineRegisterInfo& MRI = MBB->getParent()->getRegInfo();
+ for (MachineInstr &RI : MRI.reg_instructions(SrcReg)) {
+ if (RI.getParent() == MBB)
+ DefUsesInMBB.insert(&RI);
+ }
+
+ MachineBasicBlock::iterator InsertPoint;
+ if (DefUsesInMBB.empty()) {
+ // No defs. Insert the copy at the start of the basic block.
+ InsertPoint = MBB->begin();
+ } else if (DefUsesInMBB.size() == 1) {
+ // Insert the copy immediately after the def/use.
+ InsertPoint = *DefUsesInMBB.begin();
+ ++InsertPoint;
+ } else {
+ // Insert the copy immediately after the last def/use.
+ InsertPoint = MBB->end();
+ while (!DefUsesInMBB.count(&*--InsertPoint)) {}
+ ++InsertPoint;
+ }
+
+ // Make sure the copy goes after any phi nodes however.
+ return MBB->SkipPHIsAndLabels(InsertPoint);
+}
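A toy restatement of the landing-pad branch of the decision above: with the defs and uses of SrcReg given by their positions in the block, the copy goes at the start if there are none, otherwise just after the last of them, and never before the leading PHIs and labels. This is an illustrative simplification, not the function's real interface.

    #include <algorithm>
    #include <vector>

    // DefUsePositions: positions of SrcReg's defs/uses inside the block.
    // FirstNonPHIPos: position of the first non-PHI, non-label instruction.
    unsigned toyPHICopyInsertPos(const std::vector<unsigned> &DefUsePositions,
                                 unsigned FirstNonPHIPos) {
      unsigned Pos = 0;
      if (!DefUsePositions.empty())
        Pos = *std::max_element(DefUsePositions.begin(),
                                DefUsePositions.end()) + 1;
      return std::max(Pos, FirstNonPHIPos);  // never before PHIs/labels
    }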
diff --git a/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h
new file mode 100644
index 000000000000..b997d7ac5f4f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PHIEliminationUtils.h
@@ -0,0 +1,25 @@
+//=- PHIEliminationUtils.h - Helper functions for PHI elimination -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
+#define LLVM_LIB_CODEGEN_PHIELIMINATIONUTILS_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+
+namespace llvm {
+ /// findPHICopyInsertPoint - Find a safe place in MBB to insert a copy from
+ /// SrcReg when following the CFG edge to SuccMBB. This needs to be after
+ /// any def of SrcReg, but before any subsequent point where control flow
+ /// might jump out of the basic block.
+ MachineBasicBlock::iterator
+ findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
+ unsigned SrcReg);
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/ParallelCG.cpp b/contrib/llvm/lib/CodeGen/ParallelCG.cpp
new file mode 100644
index 000000000000..ccdaec1bc180
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ParallelCG.cpp
@@ -0,0 +1,99 @@
+//===-- ParallelCG.cpp ----------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that can be used for parallel code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ParallelCG.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SplitModule.h"
+
+using namespace llvm;
+
+static void codegen(Module *M, llvm::raw_pwrite_stream &OS,
+ function_ref<std::unique_ptr<TargetMachine>()> TMFactory,
+ TargetMachine::CodeGenFileType FileType) {
+ std::unique_ptr<TargetMachine> TM = TMFactory();
+ legacy::PassManager CodeGenPasses;
+ if (TM->addPassesToEmitFile(CodeGenPasses, OS, FileType))
+ report_fatal_error("Failed to setup codegen");
+ CodeGenPasses.run(*M);
+}
+
+std::unique_ptr<Module> llvm::splitCodeGen(
+ std::unique_ptr<Module> M, ArrayRef<llvm::raw_pwrite_stream *> OSs,
+ ArrayRef<llvm::raw_pwrite_stream *> BCOSs,
+ const std::function<std::unique_ptr<TargetMachine>()> &TMFactory,
+ TargetMachine::CodeGenFileType FileType, bool PreserveLocals) {
+ assert(BCOSs.empty() || BCOSs.size() == OSs.size());
+
+ if (OSs.size() == 1) {
+ if (!BCOSs.empty())
+ WriteBitcodeToFile(M.get(), *BCOSs[0]);
+ codegen(M.get(), *OSs[0], TMFactory, FileType);
+ return M;
+ }
+
+ // Create ThreadPool in nested scope so that threads will be joined
+ // on destruction.
+ {
+ ThreadPool CodegenThreadPool(OSs.size());
+ int ThreadCount = 0;
+
+ SplitModule(
+ std::move(M), OSs.size(),
+ [&](std::unique_ptr<Module> MPart) {
+ // We want to clone the module in a new context to multi-thread the
+ // codegen. We do it by serializing partition modules to bitcode
+ // (while still on the main thread, in order to avoid data races) and
+ // spinning up new threads which deserialize the partitions into
+ // separate contexts.
+ // FIXME: Provide a more direct way to do this in LLVM.
+ SmallString<0> BC;
+ raw_svector_ostream BCOS(BC);
+ WriteBitcodeToFile(MPart.get(), BCOS);
+
+ if (!BCOSs.empty()) {
+ BCOSs[ThreadCount]->write(BC.begin(), BC.size());
+ BCOSs[ThreadCount]->flush();
+ }
+
+ llvm::raw_pwrite_stream *ThreadOS = OSs[ThreadCount++];
+ // Enqueue the task
+ CodegenThreadPool.async(
+ [TMFactory, FileType, ThreadOS](const SmallString<0> &BC) {
+ LLVMContext Ctx;
+ ErrorOr<std::unique_ptr<Module>> MOrErr = parseBitcodeFile(
+ MemoryBufferRef(StringRef(BC.data(), BC.size()),
+ "<split-module>"),
+ Ctx);
+ if (!MOrErr)
+ report_fatal_error("Failed to read bitcode");
+ std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
+
+ codegen(MPartInCtx.get(), *ThreadOS, TMFactory, FileType);
+ },
+ // Pass BC using std::move to ensure that it gets moved rather than
+ // copied into the thread's context.
+ std::move(BC));
+ },
+ PreserveLocals);
+ }
+
+ return {};
+}
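A hedged sketch of a call site for splitCodeGen(), matching the signature defined above: one partition is code-generated per output stream, and no bitcode copies are requested. The wrapper name, the stream setup and the TargetMachine factory are assumptions of this sketch, not part of the patch.

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/CodeGen/ParallelCG.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    #include "llvm/Target/TargetMachine.h"
    #include <functional>
    #include <memory>

    std::unique_ptr<llvm::Module>
    emitInParallel(std::unique_ptr<llvm::Module> M,
                   llvm::ArrayRef<llvm::raw_pwrite_stream *> ObjStreams,
                   std::function<std::unique_ptr<llvm::TargetMachine>()> MakeTM) {
      // Emit an object file into each stream; partitions run on a thread pool.
      return llvm::splitCodeGen(std::move(M), ObjStreams, /*BCOSs=*/{}, MakeTM,
                                llvm::TargetMachine::CGFT_ObjectFile,
                                /*PreserveLocals=*/false);
    }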
diff --git a/contrib/llvm/lib/CodeGen/PatchableFunction.cpp b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
new file mode 100644
index 000000000000..32468c90b864
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PatchableFunction.cpp
@@ -0,0 +1,88 @@
+//===-- PatchableFunction.cpp - Patchable prologues for LLVM -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file edits function bodies in place to support the
+// "patchable-function" attribute.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct PatchableFunction : public MachineFunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ PatchableFunction() : MachineFunctionPass(ID) {
+ initializePatchableFunctionPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &F) override;
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+};
+}
+
+/// Returns true if instruction \p MI will not result in actual machine code
+/// instructions.
+static bool doesNotGeneratecode(const MachineInstr &MI) {
+ // TODO: Introduce an MCInstrDesc flag for this
+ switch (MI.getOpcode()) {
+ default: return false;
+ case TargetOpcode::IMPLICIT_DEF:
+ case TargetOpcode::KILL:
+ case TargetOpcode::CFI_INSTRUCTION:
+ case TargetOpcode::EH_LABEL:
+ case TargetOpcode::GC_LABEL:
+ case TargetOpcode::DBG_VALUE:
+ return true;
+ }
+}
+
+bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getFunction()->hasFnAttribute("patchable-function"))
+ return false;
+
+#ifndef NDEBUG
+ Attribute PatchAttr = MF.getFunction()->getFnAttribute("patchable-function");
+ StringRef PatchType = PatchAttr.getValueAsString();
+ assert(PatchType == "prologue-short-redirect" && "Only possibility today!");
+#endif
+
+ auto &FirstMBB = *MF.begin();
+ MachineBasicBlock::iterator FirstActualI = FirstMBB.begin();
+ for (; doesNotGeneratecode(*FirstActualI); ++FirstActualI)
+ assert(FirstActualI != FirstMBB.end());
+
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ auto MIB = BuildMI(FirstMBB, FirstActualI, FirstActualI->getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_OP))
+ .addImm(2)
+ .addImm(FirstActualI->getOpcode());
+
+ for (auto &MO : FirstActualI->operands())
+ MIB.addOperand(MO);
+
+ FirstActualI->eraseFromParent();
+ MF.ensureAlignment(4);
+ return true;
+}
+
+char PatchableFunction::ID = 0;
+char &llvm::PatchableFunctionID = PatchableFunction::ID;
+INITIALIZE_PASS(PatchableFunction, "patchable-function",
+ "Implement the 'patchable-function' attribute", false, false)
diff --git a/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
new file mode 100644
index 000000000000..60b27dd75a89
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -0,0 +1,1947 @@
+//===-- PeepholeOptimizer.cpp - Peephole Optimizations --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Perform peephole optimizations on the machine code:
+//
+// - Optimize Extensions
+//
+// Optimization of sign / zero extension instructions. It may be extended to
+// handle other instructions with similar properties.
+//
+// On some targets, some instructions, e.g. X86 sign / zero extension, may
+// leave the source value in the lower part of the result. This optimization
+// will replace some uses of the pre-extension value with uses of the
+// sub-register of the results.
+//
+// - Optimize Comparisons
+//
+// Optimization of comparison instructions. For instance, in this code:
+//
+// sub r1, 1
+// cmp r1, 0
+// bz L1
+//
+// If the "sub" instruction already sets (or could be modified to set) the
+// same flag that the "cmp" instruction sets and that "bz" uses, then we can
+// eliminate the "cmp" instruction.
+//
+// Another instance, in this code:
+//
+// sub r1, r3 | sub r1, imm
+// cmp r3, r1 or cmp r1, r3 | cmp r1, imm
+// bge L1
+//
+// If the branch instruction can use the flag from "sub", then we can replace
+// "sub" with "subs" and eliminate the "cmp" instruction.
+//
+// - Optimize Loads:
+//
+// Loads that can be folded into a later instruction. A load is foldable
+// if it loads to a virtual register and the virtual register defined has
+// a single use.
+//
+// - Optimize Copies and Bitcast (more generally, target specific copies):
+//
+// Rewrite copies and bitcasts to avoid cross register bank copies
+// when possible.
+// E.g., consider the following example, where capital and lower-case
+// letters denote different register files:
+// b = copy A <-- cross-bank copy
+// C = copy b <-- cross-bank copy
+// =>
+// b = copy A <-- cross-bank copy
+// C = copy A <-- same-bank copy
+//
+// E.g., for bitcast:
+// b = bitcast A <-- cross-bank copy
+// C = bitcast b <-- cross-bank copy
+// =>
+// b = bitcast A <-- cross-bank copy
+// C = copy A <-- same-bank copy
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "peephole-opt"
+
+// Optimize Extensions
+static cl::opt<bool>
+Aggressive("aggressive-ext-opt", cl::Hidden,
+ cl::desc("Aggressive extension optimization"));
+
+static cl::opt<bool>
+DisablePeephole("disable-peephole", cl::Hidden, cl::init(false),
+ cl::desc("Disable the peephole optimizer"));
+
+static cl::opt<bool>
+DisableAdvCopyOpt("disable-adv-copy-opt", cl::Hidden, cl::init(false),
+ cl::desc("Disable advanced copy optimization"));
+
+static cl::opt<bool> DisableNAPhysCopyOpt(
+ "disable-non-allocatable-phys-copy-opt", cl::Hidden, cl::init(false),
+ cl::desc("Disable non-allocatable physical register copy optimization"));
+
+// Limit the number of PHI instructions to process
+// in PeepholeOptimizer::getNextSource.
+static cl::opt<unsigned> RewritePHILimit(
+ "rewrite-phi-limit", cl::Hidden, cl::init(10),
+ cl::desc("Limit the length of PHI chains to lookup"));
+
+STATISTIC(NumReuse, "Number of extension results reused");
+STATISTIC(NumCmps, "Number of compares eliminated");
+STATISTIC(NumImmFold, "Number of move immediate folded");
+STATISTIC(NumLoadFold, "Number of loads folded");
+STATISTIC(NumSelects, "Number of selects optimized");
+STATISTIC(NumUncoalescableCopies, "Number of uncoalescable copies optimized");
+STATISTIC(NumRewrittenCopies, "Number of copies rewritten");
+STATISTIC(NumNAPhysCopies, "Number of non-allocatable physical copies removed");
+
+namespace {
+ class ValueTrackerResult;
+
+ class PeepholeOptimizer : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ MachineDominatorTree *DT; // Machine dominator tree
+
+ public:
+ static char ID; // Pass identification
+ PeepholeOptimizer() : MachineFunctionPass(ID) {
+ initializePeepholeOptimizerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ if (Aggressive) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ }
+
+ /// \brief Track Def -> Use info used for rewriting copies.
+ typedef SmallDenseMap<TargetInstrInfo::RegSubRegPair, ValueTrackerResult>
+ RewriteMapTy;
+
+ private:
+ bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB);
+ bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ SmallPtrSetImpl<MachineInstr*> &LocalMIs);
+ bool optimizeSelect(MachineInstr *MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool optimizeCondBranch(MachineInstr *MI);
+ bool optimizeCoalescableCopy(MachineInstr *MI);
+ bool optimizeUncoalescableCopy(MachineInstr *MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs);
+ bool findNextSource(unsigned Reg, unsigned SubReg,
+ RewriteMapTy &RewriteMap);
+ bool isMoveImmediate(MachineInstr *MI,
+ SmallSet<unsigned, 4> &ImmDefRegs,
+ DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+ bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB,
+ SmallSet<unsigned, 4> &ImmDefRegs,
+ DenseMap<unsigned, MachineInstr*> &ImmDefMIs);
+
+ /// \brief If copy instruction \p MI is a virtual register copy, track it in
+ /// the set \p CopySrcRegs and \p CopyMIs. If this virtual register was
+ /// previously seen as a copy, replace the uses of this copy with the
+ /// previously seen copy's destination register.
+ bool foldRedundantCopy(MachineInstr *MI,
+ SmallSet<unsigned, 4> &CopySrcRegs,
+ DenseMap<unsigned, MachineInstr *> &CopyMIs);
+
+ /// \brief Is the register \p Reg a non-allocatable physical register?
+ bool isNAPhysCopy(unsigned Reg);
+
+ /// \brief If copy instruction \p MI is a non-allocatable virtual<->physical
+ /// register copy, track it in the \p NAPhysToVirtMIs map. If this
+ /// non-allocatable physical register was previously copied to a virtual
+ /// register and hasn't been clobbered, the virt->phys copy can be
+ /// deleted.
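+ /// For example (illustrative; %NA_PHYSREG stands in for any non-allocatable
+ /// physical register and is not a name used in this file):
+ ///   %v0 = COPY %NA_PHYSREG
+ ///   ...                          ; no clobber of %NA_PHYSREG in between
+ ///   %NA_PHYSREG = COPY %v0       ; <-- this copy can be deleted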
+ bool foldRedundantNAPhysCopy(
+ MachineInstr *MI,
+ DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs);
+
+ bool isLoadFoldable(MachineInstr *MI,
+ SmallSet<unsigned, 16> &FoldAsLoadDefCandidates);
+
+ /// \brief Check whether \p MI is understood by the register coalescer
+ /// but may require some rewriting.
+ bool isCoalescableCopy(const MachineInstr &MI) {
+ // SubregToRegs are not interesting, because they are already register
+ // coalescer friendly.
+ return MI.isCopy() || (!DisableAdvCopyOpt &&
+ (MI.isRegSequence() || MI.isInsertSubreg() ||
+ MI.isExtractSubreg()));
+ }
+
+ /// \brief Check whether \p MI is a copy like instruction that is
+ /// not recognized by the register coalescer.
+ bool isUncoalescableCopy(const MachineInstr &MI) {
+ return MI.isBitcast() ||
+ (!DisableAdvCopyOpt &&
+ (MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
+ MI.isExtractSubregLike()));
+ }
+ };
+
+ /// \brief Helper class to hold a reply for ValueTracker queries. Contains the
+ /// returned sources for a given search and the instructions where the sources
+ /// were tracked from.
+ class ValueTrackerResult {
+ private:
+ /// Track all sources found by one ValueTracker query.
+ SmallVector<TargetInstrInfo::RegSubRegPair, 2> RegSrcs;
+
+ /// Instruction using the sources in 'RegSrcs'.
+ const MachineInstr *Inst;
+
+ public:
+ ValueTrackerResult() : Inst(nullptr) {}
+ ValueTrackerResult(unsigned Reg, unsigned SubReg) : Inst(nullptr) {
+ addSource(Reg, SubReg);
+ }
+
+ bool isValid() const { return getNumSources() > 0; }
+
+ void setInst(const MachineInstr *I) { Inst = I; }
+ const MachineInstr *getInst() const { return Inst; }
+
+ void clear() {
+ RegSrcs.clear();
+ Inst = nullptr;
+ }
+
+ void addSource(unsigned SrcReg, unsigned SrcSubReg) {
+ RegSrcs.push_back(TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg));
+ }
+
+ void setSource(int Idx, unsigned SrcReg, unsigned SrcSubReg) {
+ assert(Idx < getNumSources() && "Reg pair source out of index");
+ RegSrcs[Idx] = TargetInstrInfo::RegSubRegPair(SrcReg, SrcSubReg);
+ }
+
+ int getNumSources() const { return RegSrcs.size(); }
+
+ unsigned getSrcReg(int Idx) const {
+ assert(Idx < getNumSources() && "Reg source out of index");
+ return RegSrcs[Idx].Reg;
+ }
+
+ unsigned getSrcSubReg(int Idx) const {
+ assert(Idx < getNumSources() && "SubReg source out of index");
+ return RegSrcs[Idx].SubReg;
+ }
+
+ bool operator==(const ValueTrackerResult &Other) {
+ if (Other.getInst() != getInst())
+ return false;
+
+ if (Other.getNumSources() != getNumSources())
+ return false;
+
+ for (int i = 0, e = Other.getNumSources(); i != e; ++i)
+ if (Other.getSrcReg(i) != getSrcReg(i) ||
+ Other.getSrcSubReg(i) != getSrcSubReg(i))
+ return false;
+ return true;
+ }
+ };
+
+ /// \brief Helper class to track the possible sources of a value defined by
+ /// a (chain of) copy related instructions.
+ /// Given a definition (instruction and definition index), this class
+ /// follows the use-def chain to find successive suitable sources.
+ /// The given source can be used to rewrite the definition into
+ /// def = COPY src.
+ ///
+ /// For instance, let us consider the following snippet:
+ /// v0 =
+ /// v2 = INSERT_SUBREG v1, v0, sub0
+ /// def = COPY v2.sub0
+ ///
+ /// Using a ValueTracker for def = COPY v2.sub0 will give the following
+ /// suitable sources:
+ /// v2.sub0 and v0.
+ /// Then, def can be rewritten into def = COPY v0.
+ class ValueTracker {
+ private:
+ /// The current point into the use-def chain.
+ const MachineInstr *Def;
+ /// The index of the definition in Def.
+ unsigned DefIdx;
+ /// The sub register index of the definition.
+ unsigned DefSubReg;
+ /// The register where the value can be found.
+ unsigned Reg;
+ /// Specify whether or not the value tracking looks through
+ /// complex instructions. When this is false, the value tracker
+ /// bails on everything that is not a copy or a bitcast.
+ ///
+ /// Note: This could have been implemented as a specialized version of
+ /// the ValueTracker class but that would have complicated the code of
+ /// the users of this class.
+ bool UseAdvancedTracking;
+ /// MachineRegisterInfo used to perform tracking.
+ const MachineRegisterInfo &MRI;
+ /// Optional TargetInstrInfo used to perform some complex
+ /// tracking.
+ const TargetInstrInfo *TII;
+
+ /// \brief Dispatcher to the right underlying implementation of
+ /// getNextSource.
+ ValueTrackerResult getNextSourceImpl();
+ /// \brief Specialized version of getNextSource for Copy instructions.
+ ValueTrackerResult getNextSourceFromCopy();
+ /// \brief Specialized version of getNextSource for Bitcast instructions.
+ ValueTrackerResult getNextSourceFromBitcast();
+ /// \brief Specialized version of getNextSource for RegSequence
+ /// instructions.
+ ValueTrackerResult getNextSourceFromRegSequence();
+ /// \brief Specialized version of getNextSource for InsertSubreg
+ /// instructions.
+ ValueTrackerResult getNextSourceFromInsertSubreg();
+ /// \brief Specialized version of getNextSource for ExtractSubreg
+ /// instructions.
+ ValueTrackerResult getNextSourceFromExtractSubreg();
+ /// \brief Specialized version of getNextSource for SubregToReg
+ /// instructions.
+ ValueTrackerResult getNextSourceFromSubregToReg();
+ /// \brief Specialized version of getNextSource for PHI instructions.
+ ValueTrackerResult getNextSourceFromPHI();
+
+ public:
+ /// \brief Create a ValueTracker instance for the value defined by \p Reg.
+ /// \p DefSubReg represents the sub register index the value tracker will
+ /// track. It does not need to match the sub register index used in the
+ /// definition of \p Reg.
+ /// \p UseAdvancedTracking specifies whether or not the value tracker looks
+ /// through complex instructions. By default (false), it handles only copy
+ /// and bitcast instructions.
+ /// If \p Reg is a physical register, a value tracker constructed with
+ /// this constructor will not find any alternative source.
+ /// Indeed, when \p Reg is a physical register that constructor does not
+ /// know which definition of \p Reg it should track.
+ /// Use the next constructor to track a physical register.
+ ValueTracker(unsigned Reg, unsigned DefSubReg,
+ const MachineRegisterInfo &MRI,
+ bool UseAdvancedTracking = false,
+ const TargetInstrInfo *TII = nullptr)
+ : Def(nullptr), DefIdx(0), DefSubReg(DefSubReg), Reg(Reg),
+ UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ Def = MRI.getVRegDef(Reg);
+ DefIdx = MRI.def_begin(Reg).getOperandNo();
+ }
+ }
+
+ /// \brief Create a ValueTracker instance for the value defined by
+ /// the pair \p MI, \p DefIdx.
+ /// Unlike the other constructor, the value tracker produced by this one
+ /// may be able to find a new source when the definition is a physical
+ /// register.
+ /// This could be useful to rewrite target specific instructions into
+ /// generic copy instructions.
+ ValueTracker(const MachineInstr &MI, unsigned DefIdx, unsigned DefSubReg,
+ const MachineRegisterInfo &MRI,
+ bool UseAdvancedTracking = false,
+ const TargetInstrInfo *TII = nullptr)
+ : Def(&MI), DefIdx(DefIdx), DefSubReg(DefSubReg),
+ UseAdvancedTracking(UseAdvancedTracking), MRI(MRI), TII(TII) {
+ assert(DefIdx < Def->getDesc().getNumDefs() &&
+ Def->getOperand(DefIdx).isReg() && "Invalid definition");
+ Reg = Def->getOperand(DefIdx).getReg();
+ }
+
+ /// \brief Following the use-def chain, get the next available source
+ /// for the tracked value.
+ /// \return A ValueTrackerResult containing a set of registers
+ /// and sub registers with tracked values. A ValueTrackerResult with
+ /// an empty set of registers means no source was found.
+ ValueTrackerResult getNextSource();
+
+ /// \brief Get the last register where the initial value can be found.
+ /// Initially this is the register of the definition.
+ /// Then, after each successful call to getNextSource, this is the
+ /// register of the last source.
+ unsigned getReg() const { return Reg; }
+ };
+}
+
+char PeepholeOptimizer::ID = 0;
+char &llvm::PeepholeOptimizerID = PeepholeOptimizer::ID;
+INITIALIZE_PASS_BEGIN(PeepholeOptimizer, DEBUG_TYPE,
+ "Peephole Optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
+ "Peephole Optimizations", false, false)
+
+/// If the instruction is a copy-like instruction, i.e. it reads a single register
+/// and writes a single register and it does not modify the source, and if the
+/// source value is preserved as a sub-register of the result, then replace all
+/// reachable uses of the source with the subreg of the result.
+///
+/// Do not generate an EXTRACT that is used only in a debug use, as this changes
+/// the code. Since this code does not currently share EXTRACTs, just ignore all
+/// debug uses.
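+///
+/// For example (an illustrative, X86-flavoured sketch; the opcode and
+/// sub-register names below are assumptions, not taken from this file):
+///   %v1 = MOVSX64rr32 %v0     ; %v0 is still available as %v1.sub_32bit
+///   ...  = use %v0
+/// can become
+///   %v2 = COPY %v1.sub_32bit
+///   ...  = use %v2
+/// so uses of the pre-extension value can be coalesced with the extension
+/// result.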
+bool PeepholeOptimizer::
+optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+ SmallPtrSetImpl<MachineInstr*> &LocalMIs) {
+ unsigned SrcReg, DstReg, SubIdx;
+ if (!TII->isCoalescableExtInstr(*MI, SrcReg, DstReg, SubIdx))
+ return false;
+
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ return false;
+
+ if (MRI->hasOneNonDBGUse(SrcReg))
+ // No other uses.
+ return false;
+
+ // Ensure DstReg can get a register class that actually supports
+ // sub-registers. Don't change the class until we commit.
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ DstRC = TRI->getSubClassWithSubReg(DstRC, SubIdx);
+ if (!DstRC)
+ return false;
+
+ // The ext instr may be operating on a sub-register of SrcReg as well.
+ // PPC::EXTSW is a 32 -> 64-bit sign extension, but it reads a 64-bit
+ // register.
+ // If UseSrcSubIdx is Set, SubIdx also applies to SrcReg, and only uses of
+ // SrcReg:SubIdx should be replaced.
+ bool UseSrcSubIdx =
+ TRI->getSubClassWithSubReg(MRI->getRegClass(SrcReg), SubIdx) != nullptr;
+
+ // The source has other uses. See if we can replace the other uses with uses
+ // of the result of the extension.
+ SmallPtrSet<MachineBasicBlock*, 4> ReachedBBs;
+ for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
+ ReachedBBs.insert(UI.getParent());
+
+ // Uses that are in the same BB as uses of the result of the instruction.
+ SmallVector<MachineOperand*, 8> Uses;
+
+ // Uses that the result of the instruction can reach.
+ SmallVector<MachineOperand*, 8> ExtendedUses;
+
+ bool ExtendLife = true;
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(SrcReg)) {
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI == MI)
+ continue;
+
+ if (UseMI->isPHI()) {
+ ExtendLife = false;
+ continue;
+ }
+
+ // Only accept uses of SrcReg:SubIdx.
+ if (UseSrcSubIdx && UseMO.getSubReg() != SubIdx)
+ continue;
+
+ // It's an error to translate this:
+ //
+ // %reg1025 = <sext> %reg1024
+ // ...
+ // %reg1026 = SUBREG_TO_REG 0, %reg1024, 4
+ //
+ // into this:
+ //
+ // %reg1025 = <sext> %reg1024
+ // ...
+ // %reg1027 = COPY %reg1025:4
+ // %reg1026 = SUBREG_TO_REG 0, %reg1027, 4
+ //
+ // The problem here is that SUBREG_TO_REG is there to assert that an
+ // implicit zext occurs. It doesn't insert a zext instruction. If we allow
+ // the COPY here, it will give us the value after the <sext>, not the
+ // original value of %reg1024 before <sext>.
+ if (UseMI->getOpcode() == TargetOpcode::SUBREG_TO_REG)
+ continue;
+
+ MachineBasicBlock *UseMBB = UseMI->getParent();
+ if (UseMBB == MBB) {
+ // Local uses that come after the extension.
+ if (!LocalMIs.count(UseMI))
+ Uses.push_back(&UseMO);
+ } else if (ReachedBBs.count(UseMBB)) {
+ // Non-local uses where the result of the extension is used. Always
+ // replace these unless it's a PHI.
+ Uses.push_back(&UseMO);
+ } else if (Aggressive && DT->dominates(MBB, UseMBB)) {
+ // We may want to extend the live range of the extension result in order
+ // to replace these uses.
+ ExtendedUses.push_back(&UseMO);
+ } else {
+ // Both will be live out of the def MBB anyway. Don't extend live range of
+ // the extension result.
+ ExtendLife = false;
+ break;
+ }
+ }
+
+ if (ExtendLife && !ExtendedUses.empty())
+ // Extend the liveness of the extension result.
+ Uses.append(ExtendedUses.begin(), ExtendedUses.end());
+
+ // Now replace all uses.
+ bool Changed = false;
+ if (!Uses.empty()) {
+ SmallPtrSet<MachineBasicBlock*, 4> PHIBBs;
+
+ // Look for PHI uses of the extended result; we don't want to extend the
+ // liveness of a PHI input. It breaks all kinds of assumptions
+ // downstream. A PHI use is expected to be the kill of its source values.
+ for (MachineInstr &UI : MRI->use_nodbg_instructions(DstReg))
+ if (UI.isPHI())
+ PHIBBs.insert(UI.getParent());
+
+ const TargetRegisterClass *RC = MRI->getRegClass(SrcReg);
+ for (unsigned i = 0, e = Uses.size(); i != e; ++i) {
+ MachineOperand *UseMO = Uses[i];
+ MachineInstr *UseMI = UseMO->getParent();
+ MachineBasicBlock *UseMBB = UseMI->getParent();
+ if (PHIBBs.count(UseMBB))
+ continue;
+
+ // About to add uses of DstReg, clear DstReg's kill flags.
+ if (!Changed) {
+ MRI->clearKillFlags(DstReg);
+ MRI->constrainRegClass(DstReg, DstRC);
+ }
+
+ unsigned NewVR = MRI->createVirtualRegister(RC);
+ MachineInstr *Copy = BuildMI(*UseMBB, UseMI, UseMI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVR)
+ .addReg(DstReg, 0, SubIdx);
+ // SubIdx applies to both SrcReg and DstReg when UseSrcSubIdx is set.
+ if (UseSrcSubIdx) {
+ Copy->getOperand(0).setSubReg(SubIdx);
+ Copy->getOperand(0).setIsUndef();
+ }
+ UseMO->setReg(NewVR);
+ ++NumReuse;
+ Changed = true;
+ }
+ }
+
+ return Changed;
+}
+
+/// If the instruction is a compare and the previous instruction it's comparing
+/// against already sets (or could be modified to set) the same flag as the
+/// compare, then we can remove the comparison and use the flag from the
+/// previous instruction.
+bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI,
+ MachineBasicBlock *MBB) {
+ // If this instruction is a comparison against zero and isn't comparing a
+ // physical register, we can try to optimize it.
+ unsigned SrcReg, SrcReg2;
+ int CmpMask, CmpValue;
+ if (!TII->analyzeCompare(*MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
+ TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+ (SrcReg2 != 0 && TargetRegisterInfo::isPhysicalRegister(SrcReg2)))
+ return false;
+
+ // Attempt to optimize the comparison instruction.
+ if (TII->optimizeCompareInstr(*MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
+ ++NumCmps;
+ return true;
+ }
+
+ return false;
+}
+
+/// Optimize a select instruction.
+bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI,
+ SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+ unsigned TrueOp = 0;
+ unsigned FalseOp = 0;
+ bool Optimizable = false;
+ SmallVector<MachineOperand, 4> Cond;
+ if (TII->analyzeSelect(*MI, Cond, TrueOp, FalseOp, Optimizable))
+ return false;
+ if (!Optimizable)
+ return false;
+ if (!TII->optimizeSelect(*MI, LocalMIs))
+ return false;
+ MI->eraseFromParent();
+ ++NumSelects;
+ return true;
+}
+
+/// \brief Check if a simpler conditional branch can be
+/// generated.
+bool PeepholeOptimizer::optimizeCondBranch(MachineInstr *MI) {
+ return TII->optimizeCondBranch(*MI);
+}
+
+/// \brief Try to find the next source that shares the same register file
+/// for the value defined by \p Reg and \p SubReg.
+/// When true is returned, the \p RewriteMap can be used by the client to
+/// retrieve all Def -> Use entries along the way up to the next source. Any
+/// found Use that is not itself a key for another entry is the next source to
+/// use. During the search for the next source, multiple sources can be found,
+/// given multiple incoming sources of a PHI instruction. In this case, we
+/// look in each PHI source for the next source; all found next sources must
+/// share the same register file as \p Reg and \p SubReg. The client should
+/// then be capable of rewriting all intermediate PHIs to get the next source.
+/// \return False if no alternative sources are available. True otherwise.
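+///
+/// For example (illustrative), given the chain:
+///   v0 = ...
+///   v1 = COPY v0
+///   def = COPY v1
+/// a query for def fills \p RewriteMap with def -> v1 and v1 -> v0; v0 is not
+/// itself a key, so it is the next source, assuming its register class makes
+/// the rewrite profitable (see TargetRegisterInfo::shouldRewriteCopySrc).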
+bool PeepholeOptimizer::findNextSource(unsigned Reg, unsigned SubReg,
+ RewriteMapTy &RewriteMap) {
+ // Do not try to find a new source for a physical register.
+ // So far we do not have any motivating example for doing that.
+ // Thus, instead of maintaining untested code, we will revisit that if
+ // that changes at some point.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return false;
+ const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
+
+ SmallVector<TargetInstrInfo::RegSubRegPair, 4> SrcToLook;
+ TargetInstrInfo::RegSubRegPair CurSrcPair(Reg, SubReg);
+ SrcToLook.push_back(CurSrcPair);
+
+ unsigned PHICount = 0;
+ while (!SrcToLook.empty() && PHICount < RewritePHILimit) {
+ TargetInstrInfo::RegSubRegPair Pair = SrcToLook.pop_back_val();
+ // As explained above, do not handle physical registers
+ if (TargetRegisterInfo::isPhysicalRegister(Pair.Reg))
+ return false;
+
+ CurSrcPair = Pair;
+ ValueTracker ValTracker(CurSrcPair.Reg, CurSrcPair.SubReg, *MRI,
+ !DisableAdvCopyOpt, TII);
+ ValueTrackerResult Res;
+ bool ShouldRewrite = false;
+
+ do {
+ // Follow the chain of copies until we reach the top of the use-def chain
+ // or find a more suitable source.
+ Res = ValTracker.getNextSource();
+ if (!Res.isValid())
+ break;
+
+ // Insert the Def -> Use entry for the recently found source.
+ ValueTrackerResult CurSrcRes = RewriteMap.lookup(CurSrcPair);
+ if (CurSrcRes.isValid()) {
+ assert(CurSrcRes == Res && "ValueTrackerResult found must match");
+ // An existing entry with multiple sources is a PHI cycle we must avoid.
+ // Otherwise it's an entry with a valid next source we already found.
+ if (CurSrcRes.getNumSources() > 1) {
+ DEBUG(dbgs() << "findNextSource: found PHI cycle, aborting...\n");
+ return false;
+ }
+ break;
+ }
+ RewriteMap.insert(std::make_pair(CurSrcPair, Res));
+
+ // A ValueTrackerResult usually has one source unless it's the result of
+ // a PHI instruction. Add the found PHI edges to be looked up further.
+ unsigned NumSrcs = Res.getNumSources();
+ if (NumSrcs > 1) {
+ PHICount++;
+ for (unsigned i = 0; i < NumSrcs; ++i)
+ SrcToLook.push_back(TargetInstrInfo::RegSubRegPair(
+ Res.getSrcReg(i), Res.getSrcSubReg(i)));
+ break;
+ }
+
+ CurSrcPair.Reg = Res.getSrcReg(0);
+ CurSrcPair.SubReg = Res.getSrcSubReg(0);
+ // Do not extend the live-ranges of physical registers as they add
+ // constraints to the register allocator. Moreover, if we want to extend
+ // the live-range of a physical register, unlike an SSA virtual register,
+ // we would have to check that it isn't redefined before the related use.
+ if (TargetRegisterInfo::isPhysicalRegister(CurSrcPair.Reg))
+ return false;
+
+ const TargetRegisterClass *SrcRC = MRI->getRegClass(CurSrcPair.Reg);
+ ShouldRewrite = TRI->shouldRewriteCopySrc(DefRC, SubReg, SrcRC,
+ CurSrcPair.SubReg);
+ } while (!ShouldRewrite);
+
+ // Continue looking for new sources...
+ if (Res.isValid())
+ continue;
+
+ // Do not continue searching for a new source if there's at least
+ // one use-def which cannot be rewritten.
+ if (!ShouldRewrite)
+ return false;
+ }
+
+ if (PHICount >= RewritePHILimit) {
+ DEBUG(dbgs() << "findNextSource: PHI limit reached\n");
+ return false;
+ }
+
+ // If we did not find a more suitable source, there is nothing to optimize.
+ return CurSrcPair.Reg != Reg;
+}
+
+/// \brief Insert a PHI instruction with incoming edges \p SrcRegs that are
+/// guaranteed to have the same register class. This is necessary whenever we
+/// successfully traverse a PHI instruction and find suitable sources coming
+/// from its edges. By inserting a new PHI, we provide a rewritten PHI def
+/// suitable to be used in a new COPY instruction.
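+///
+/// For example (illustrative), if \p OrigPHI is
+///   %v3 = PHI %v1, <BB#0>, %v2, <BB#1>
+/// and the rewritten sources are %a (for the <BB#0> edge) and %b (for the
+/// <BB#1> edge), the new instruction
+///   %new = PHI %a, <BB#0>, %b, <BB#1>
+/// is inserted right before \p OrigPHI and %new can then feed a COPY.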
+static MachineInstr *
+insertPHI(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+ const SmallVectorImpl<TargetInstrInfo::RegSubRegPair> &SrcRegs,
+ MachineInstr *OrigPHI) {
+ assert(!SrcRegs.empty() && "No sources to create a PHI instruction?");
+
+ const TargetRegisterClass *NewRC = MRI->getRegClass(SrcRegs[0].Reg);
+ unsigned NewVR = MRI->createVirtualRegister(NewRC);
+ MachineBasicBlock *MBB = OrigPHI->getParent();
+ MachineInstrBuilder MIB = BuildMI(*MBB, OrigPHI, OrigPHI->getDebugLoc(),
+ TII->get(TargetOpcode::PHI), NewVR);
+
+ unsigned MBBOpIdx = 2;
+ for (auto RegPair : SrcRegs) {
+ MIB.addReg(RegPair.Reg, 0, RegPair.SubReg);
+ MIB.addMBB(OrigPHI->getOperand(MBBOpIdx).getMBB());
+ // Since we're extending the lifetime of RegPair.Reg, clear the
+ // kill flags to account for that and to make sure RegPair.Reg reaches
+ // the new PHI.
+ MRI->clearKillFlags(RegPair.Reg);
+ MBBOpIdx += 2;
+ }
+
+ return MIB;
+}
+
+namespace {
+/// \brief Helper class to rewrite the arguments of a copy-like instruction.
+class CopyRewriter {
+protected:
+ /// The copy-like instruction.
+ MachineInstr &CopyLike;
+ /// The index of the source being rewritten.
+ unsigned CurrentSrcIdx;
+
+public:
+ CopyRewriter(MachineInstr &MI) : CopyLike(MI), CurrentSrcIdx(0) {}
+
+ virtual ~CopyRewriter() {}
+
+ /// \brief Get the next rewritable source (SrcReg, SrcSubReg) and
+ /// the related value that it affects (TrackReg, TrackSubReg).
+ /// A source is considered rewritable if its register class and the
+ /// register class of the related TrackReg may not be register
+ /// coalescer friendly. In other words, given a copy-like instruction,
+ /// not all of its arguments may be returned as rewritable sources, since
+ /// some arguments are known to be register coalescer friendly.
+ ///
+ /// Each call of this method moves the current source to the next
+ /// rewritable source.
+ /// For instance, let CopyLike be the instruction to rewrite.
+ /// CopyLike has one definition and one source:
+ /// dst.dstSubIdx = CopyLike src.srcSubIdx.
+ ///
+ /// The first call will give the first rewritable source, i.e.,
+ /// the only source this instruction has:
+ /// (SrcReg, SrcSubReg) = (src, srcSubIdx).
+ /// This source defines the whole definition, i.e.,
+ /// (TrackReg, TrackSubReg) = (dst, dstSubIdx).
+ ///
+ /// The second and subsequent calls will return false, as there is only one
+ /// rewritable source.
+ ///
+ /// \return True if a rewritable source has been found, false otherwise.
+ /// The output arguments are valid if and only if true is returned.
+ virtual bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+ unsigned &TrackReg,
+ unsigned &TrackSubReg) {
+ // If CurrentSrcIdx == 1, this means this function has already been called
+ // once. CopyLike has one definition and one argument, thus, there is
+ // nothing else to rewrite.
+ if (!CopyLike.isCopy() || CurrentSrcIdx == 1)
+ return false;
+ // This is the first call to getNextRewritableSource.
+ // Move the CurrentSrcIdx to remember that we made that call.
+ CurrentSrcIdx = 1;
+ // The rewritable source is the argument.
+ const MachineOperand &MOSrc = CopyLike.getOperand(1);
+ SrcReg = MOSrc.getReg();
+ SrcSubReg = MOSrc.getSubReg();
+ // What we track are the alternative sources of the definition.
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ TrackReg = MODef.getReg();
+ TrackSubReg = MODef.getSubReg();
+ return true;
+ }
+
+ /// \brief Rewrite the current source with \p NewReg and \p NewSubReg
+ /// if possible.
+ /// \return True if the rewriting was possible, false otherwise.
+ virtual bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) {
+ if (!CopyLike.isCopy() || CurrentSrcIdx != 1)
+ return false;
+ MachineOperand &MOSrc = CopyLike.getOperand(CurrentSrcIdx);
+ MOSrc.setReg(NewReg);
+ MOSrc.setSubReg(NewSubReg);
+ return true;
+ }
+
+ /// \brief Given a \p Def.Reg and Def.SubReg pair, use \p RewriteMap to find
+ /// the new source to use for rewrite. If \p HandleMultipleSources is true and
+ /// multiple sources for a given \p Def are found along the way, this means a
+ /// PHI instruction needs to be rewritten.
+ /// TODO: HandleMultipleSources should be removed once we test PHI handling
+ /// with coalescable copies.
+ TargetInstrInfo::RegSubRegPair
+ getNewSource(MachineRegisterInfo *MRI, const TargetInstrInfo *TII,
+ TargetInstrInfo::RegSubRegPair Def,
+ PeepholeOptimizer::RewriteMapTy &RewriteMap,
+ bool HandleMultipleSources = true) {
+
+ TargetInstrInfo::RegSubRegPair LookupSrc(Def.Reg, Def.SubReg);
+ do {
+ ValueTrackerResult Res = RewriteMap.lookup(LookupSrc);
+ // If there are no entries on the map, LookupSrc is the new source.
+ if (!Res.isValid())
+ return LookupSrc;
+
+ // There's only one source for this definition, keep searching...
+ unsigned NumSrcs = Res.getNumSources();
+ if (NumSrcs == 1) {
+ LookupSrc.Reg = Res.getSrcReg(0);
+ LookupSrc.SubReg = Res.getSrcSubReg(0);
+ continue;
+ }
+
+ // TODO: Remove once multiple srcs w/ coalescable copies are supported.
+ if (!HandleMultipleSources)
+ break;
+
+ // Multiple sources, recurse into each source to find a new source
+ // for it. Then, rewrite the PHI accordingly to its new edges.
+ SmallVector<TargetInstrInfo::RegSubRegPair, 4> NewPHISrcs;
+ for (unsigned i = 0; i < NumSrcs; ++i) {
+ TargetInstrInfo::RegSubRegPair PHISrc(Res.getSrcReg(i),
+ Res.getSrcSubReg(i));
+ NewPHISrcs.push_back(
+ getNewSource(MRI, TII, PHISrc, RewriteMap, HandleMultipleSources));
+ }
+
+ // Build the new PHI node and return its def register as the new source.
+ MachineInstr *OrigPHI = const_cast<MachineInstr *>(Res.getInst());
+ MachineInstr *NewPHI = insertPHI(MRI, TII, NewPHISrcs, OrigPHI);
+ DEBUG(dbgs() << "-- getNewSource\n");
+ DEBUG(dbgs() << " Replacing: " << *OrigPHI);
+ DEBUG(dbgs() << " With: " << *NewPHI);
+ const MachineOperand &MODef = NewPHI->getOperand(0);
+ return TargetInstrInfo::RegSubRegPair(MODef.getReg(), MODef.getSubReg());
+
+ } while (1);
+
+ return TargetInstrInfo::RegSubRegPair(0, 0);
+ }
+
+ /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap
+ /// and create a new COPY instruction. More info about RewriteMap in
+ /// PeepholeOptimizer::findNextSource. Right now this is only used to handle
+ /// uncoalescable copies, since they are copy-like instructions that aren't
+ /// recognized by the register allocator.
+ virtual MachineInstr *
+ RewriteSource(TargetInstrInfo::RegSubRegPair Def,
+ PeepholeOptimizer::RewriteMapTy &RewriteMap) {
+ return nullptr;
+ }
+};
+
+/// \brief Helper class to rewrite uncoalescable copy like instructions
+/// into new COPY (coalescable friendly) instructions.
+class UncoalescableRewriter : public CopyRewriter {
+protected:
+ const TargetInstrInfo &TII;
+ MachineRegisterInfo &MRI;
+ /// The number of defs in the bitcast
+ unsigned NumDefs;
+
+public:
+ UncoalescableRewriter(MachineInstr &MI, const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI)
+ : CopyRewriter(MI), TII(TII), MRI(MRI) {
+ NumDefs = MI.getDesc().getNumDefs();
+ }
+
+ /// \brief Get the next rewritable def source (TrackReg, TrackSubReg).
+ /// All such sources need to be considered rewritable in order to
+ /// rewrite an uncoalescable copy-like instruction. This method returns
+ /// each definition that must be checked if it is rewritable.
+ ///
+ bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+ unsigned &TrackReg,
+ unsigned &TrackSubReg) override {
+ // Find the next non-dead definition and continue from there.
+ if (CurrentSrcIdx == NumDefs)
+ return false;
+
+ while (CopyLike.getOperand(CurrentSrcIdx).isDead()) {
+ ++CurrentSrcIdx;
+ if (CurrentSrcIdx == NumDefs)
+ return false;
+ }
+
+ // What we track are the alternative sources of the definition.
+ const MachineOperand &MODef = CopyLike.getOperand(CurrentSrcIdx);
+ TrackReg = MODef.getReg();
+ TrackSubReg = MODef.getSubReg();
+
+ CurrentSrcIdx++;
+ return true;
+ }
+
+ /// \brief Rewrite the source found through \p Def, by using the \p RewriteMap
+ /// and create a new COPY instruction. More info about RewriteMap in
+ /// PeepholeOptimizer::findNextSource. Right now this is only used to handle
+ /// uncoalescable copies, since they are copy-like instructions that aren't
+ /// recognized by the register allocator.
+ MachineInstr *
+ RewriteSource(TargetInstrInfo::RegSubRegPair Def,
+ PeepholeOptimizer::RewriteMapTy &RewriteMap) override {
+ assert(!TargetRegisterInfo::isPhysicalRegister(Def.Reg) &&
+ "We do not rewrite physical registers");
+
+ // Find the new source to use in the COPY rewrite.
+ TargetInstrInfo::RegSubRegPair NewSrc =
+ getNewSource(&MRI, &TII, Def, RewriteMap);
+
+ // Insert the COPY.
+ const TargetRegisterClass *DefRC = MRI.getRegClass(Def.Reg);
+ unsigned NewVR = MRI.createVirtualRegister(DefRC);
+
+ MachineInstr *NewCopy =
+ BuildMI(*CopyLike.getParent(), &CopyLike, CopyLike.getDebugLoc(),
+ TII.get(TargetOpcode::COPY), NewVR)
+ .addReg(NewSrc.Reg, 0, NewSrc.SubReg);
+
+ NewCopy->getOperand(0).setSubReg(Def.SubReg);
+ if (Def.SubReg)
+ NewCopy->getOperand(0).setIsUndef();
+
+ DEBUG(dbgs() << "-- RewriteSource\n");
+ DEBUG(dbgs() << " Replacing: " << CopyLike);
+ DEBUG(dbgs() << " With: " << *NewCopy);
+ MRI.replaceRegWith(Def.Reg, NewVR);
+ MRI.clearKillFlags(NewVR);
+
+ // We extended the lifetime of NewSrc.Reg, clear the kill flags to
+ // account for that.
+ MRI.clearKillFlags(NewSrc.Reg);
+
+ return NewCopy;
+ }
+};
+
+/// \brief Specialized rewriter for INSERT_SUBREG instruction.
+class InsertSubregRewriter : public CopyRewriter {
+public:
+ InsertSubregRewriter(MachineInstr &MI) : CopyRewriter(MI) {
+ assert(MI.isInsertSubreg() && "Invalid instruction");
+ }
+
+ /// \brief See CopyRewriter::getNextRewritableSource.
+ /// Here CopyLike has the following form:
+ /// dst = INSERT_SUBREG Src1, Src2.src2SubIdx, subIdx.
+ /// Src1 has the same register class as dst, hence, there is
+ /// nothing to rewrite.
+ /// Src2.src2SubIdx may not be register coalescer friendly.
+ /// Therefore, the first call to this method returns:
+ /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+ /// (TrackReg, TrackSubReg) = (dst, subIdx).
+ ///
+ /// Subsequent calls will return false.
+ bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+ unsigned &TrackReg,
+ unsigned &TrackSubReg) override {
+ // If we already get the only source we can rewrite, return false.
+ if (CurrentSrcIdx == 2)
+ return false;
+ // We are looking at v2 = INSERT_SUBREG v0, v1, sub0.
+ CurrentSrcIdx = 2;
+ const MachineOperand &MOInsertedReg = CopyLike.getOperand(2);
+ SrcReg = MOInsertedReg.getReg();
+ SrcSubReg = MOInsertedReg.getSubReg();
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+
+ // We want to track something that is compatible with the
+ // partial definition.
+ TrackReg = MODef.getReg();
+ if (MODef.getSubReg())
+ // Bail if we have to compose sub-register indices.
+ return false;
+ TrackSubReg = (unsigned)CopyLike.getOperand(3).getImm();
+ return true;
+ }
+ bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+ if (CurrentSrcIdx != 2)
+ return false;
+ // We are rewriting the inserted reg.
+ MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+ MO.setReg(NewReg);
+ MO.setSubReg(NewSubReg);
+ return true;
+ }
+};
+
+/// \brief Specialized rewriter for EXTRACT_SUBREG instruction.
+class ExtractSubregRewriter : public CopyRewriter {
+ const TargetInstrInfo &TII;
+
+public:
+ ExtractSubregRewriter(MachineInstr &MI, const TargetInstrInfo &TII)
+ : CopyRewriter(MI), TII(TII) {
+ assert(MI.isExtractSubreg() && "Invalid instruction");
+ }
+
+ /// \brief See CopyRewriter::getNextRewritableSource.
+ /// Here CopyLike has the following form:
+ /// dst.dstSubIdx = EXTRACT_SUBREG Src, subIdx.
+ /// There is only one rewritable source: Src.subIdx,
+ /// which defines dst.dstSubIdx.
+ bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+ unsigned &TrackReg,
+ unsigned &TrackSubReg) override {
+ // If we already get the only source we can rewrite, return false.
+ if (CurrentSrcIdx == 1)
+ return false;
+ // We are looking at v1 = EXTRACT_SUBREG v0, sub0.
+ CurrentSrcIdx = 1;
+ const MachineOperand &MOExtractedReg = CopyLike.getOperand(1);
+ SrcReg = MOExtractedReg.getReg();
+ // If we have to compose sub-register indices, bail out.
+ if (MOExtractedReg.getSubReg())
+ return false;
+
+ SrcSubReg = CopyLike.getOperand(2).getImm();
+
+ // We want to track something that is compatible with the definition.
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ TrackReg = MODef.getReg();
+ TrackSubReg = MODef.getSubReg();
+ return true;
+ }
+
+ bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+ // The only source we can rewrite is the input register.
+ if (CurrentSrcIdx != 1)
+ return false;
+
+ CopyLike.getOperand(CurrentSrcIdx).setReg(NewReg);
+
+ // If we find a source that does not require extracting something,
+ // rewrite the operation with a copy.
+ if (!NewSubReg) {
+ // Move the current index to an invalid position.
+ // We do not want another call to this method to be able
+ // to do any change.
+ CurrentSrcIdx = -1;
+ // Rewrite the operation as a COPY.
+ // Get rid of the sub-register index.
+ CopyLike.RemoveOperand(2);
+ // Morph the operation into a COPY.
+ CopyLike.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
+ CopyLike.getOperand(CurrentSrcIdx + 1).setImm(NewSubReg);
+ return true;
+ }
+};
+
+/// \brief Specialized rewriter for REG_SEQUENCE instruction.
+class RegSequenceRewriter : public CopyRewriter {
+public:
+ RegSequenceRewriter(MachineInstr &MI) : CopyRewriter(MI) {
+ assert(MI.isRegSequence() && "Invalid instruction");
+ }
+
+ /// \brief See CopyRewriter::getNextRewritableSource.
+ /// Here CopyLike has the following form:
+ /// dst = REG_SEQUENCE Src1.src1SubIdx, subIdx1, Src2.src2SubIdx, subIdx2.
+ /// Each call will return a different source, walking all the available
+ /// sources.
+ ///
+ /// The first call returns:
+ /// (SrcReg, SrcSubReg) = (Src1, src1SubIdx).
+ /// (TrackReg, TrackSubReg) = (dst, subIdx1).
+ ///
+ /// The second call returns:
+ /// (SrcReg, SrcSubReg) = (Src2, src2SubIdx).
+ /// (TrackReg, TrackSubReg) = (dst, subIdx2).
+ ///
+ /// And so on, until all the sources have been traversed, then
+ /// it returns false.
+ bool getNextRewritableSource(unsigned &SrcReg, unsigned &SrcSubReg,
+ unsigned &TrackReg,
+ unsigned &TrackSubReg) override {
+ // We are looking at v0 = REG_SEQUENCE v1, sub1, v2, sub2, etc.
+
+ // If this is the first call, move to the first argument.
+ if (CurrentSrcIdx == 0) {
+ CurrentSrcIdx = 1;
+ } else {
+ // Otherwise, move to the next argument and check that it is valid.
+ CurrentSrcIdx += 2;
+ if (CurrentSrcIdx >= CopyLike.getNumOperands())
+ return false;
+ }
+ const MachineOperand &MOInsertedReg = CopyLike.getOperand(CurrentSrcIdx);
+ SrcReg = MOInsertedReg.getReg();
+ // If we have to compose sub-register indices, bail out.
+ if ((SrcSubReg = MOInsertedReg.getSubReg()))
+ return false;
+
+ // We want to track something that is compatible with the related
+ // partial definition.
+ TrackSubReg = CopyLike.getOperand(CurrentSrcIdx + 1).getImm();
+
+ const MachineOperand &MODef = CopyLike.getOperand(0);
+ TrackReg = MODef.getReg();
+ // If we have to compose sub-registers, bail.
+ return MODef.getSubReg() == 0;
+ }
+
+ bool RewriteCurrentSource(unsigned NewReg, unsigned NewSubReg) override {
+ // We cannot rewrite out of bound operands.
+ // Moreover, rewritable sources are at odd positions.
+ if ((CurrentSrcIdx & 1) != 1 || CurrentSrcIdx > CopyLike.getNumOperands())
+ return false;
+
+ MachineOperand &MO = CopyLike.getOperand(CurrentSrcIdx);
+ MO.setReg(NewReg);
+ MO.setSubReg(NewSubReg);
+ return true;
+ }
+};
+} // End namespace.
+
+/// \brief Get the appropriate CopyRewriter for \p MI.
+/// \return A pointer to a dynamically allocated CopyRewriter or nullptr
+/// if no rewriter works for \p MI.
+static CopyRewriter *getCopyRewriter(MachineInstr &MI,
+ const TargetInstrInfo &TII,
+ MachineRegisterInfo &MRI) {
+ // Handle uncoalescable copy-like instructions.
+ if (MI.isBitcast() || (MI.isRegSequenceLike() || MI.isInsertSubregLike() ||
+ MI.isExtractSubregLike()))
+ return new UncoalescableRewriter(MI, TII, MRI);
+
+ switch (MI.getOpcode()) {
+ default:
+ return nullptr;
+ case TargetOpcode::COPY:
+ return new CopyRewriter(MI);
+ case TargetOpcode::INSERT_SUBREG:
+ return new InsertSubregRewriter(MI);
+ case TargetOpcode::EXTRACT_SUBREG:
+ return new ExtractSubregRewriter(MI, TII);
+ case TargetOpcode::REG_SEQUENCE:
+ return new RegSequenceRewriter(MI);
+ }
+ llvm_unreachable(nullptr);
+}
+
+/// \brief Optimize generic copy instructions to avoid cross
+/// register bank copy. The optimization looks through a chain of
+/// copies and tries to find a source that has a compatible register
+/// class.
+/// Two register classes are considered to be compatible if they share
+/// the same register bank.
+/// New copies issued by this optimization are register allocator
+/// friendly. This optimization does not remove any copy as it may
+/// overconstrain the register allocator, but replaces some operands
+/// when possible.
+/// \pre isCoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been rewritten. False otherwise.
+bool PeepholeOptimizer::optimizeCoalescableCopy(MachineInstr *MI) {
+ assert(MI && isCoalescableCopy(*MI) && "Invalid argument");
+ assert(MI->getDesc().getNumDefs() == 1 &&
+ "Coalescer can understand multiple defs?!");
+ const MachineOperand &MODef = MI->getOperand(0);
+ // Do not rewrite physical definitions.
+ if (TargetRegisterInfo::isPhysicalRegister(MODef.getReg()))
+ return false;
+
+ bool Changed = false;
+ // Get the right rewriter for the current copy.
+ std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI));
+ // If none exists, bail out.
+ if (!CpyRewriter)
+ return false;
+ // Rewrite each rewritable source.
+ unsigned SrcReg, SrcSubReg, TrackReg, TrackSubReg;
+ while (CpyRewriter->getNextRewritableSource(SrcReg, SrcSubReg, TrackReg,
+ TrackSubReg)) {
+ // Keep track of PHI nodes and their incoming edges when looking for sources.
+ RewriteMapTy RewriteMap;
+ // Try to find a more suitable source. If we fail to do so, or if we end up
+ // with the original source, move on to the next source.
+ if (!findNextSource(TrackReg, TrackSubReg, RewriteMap))
+ continue;
+
+ // Get the new source to rewrite. TODO: Only enable handling of multiple
+ // sources (PHIs) once we have a motivating example and testcases for it.
+ TargetInstrInfo::RegSubRegPair TrackPair(TrackReg, TrackSubReg);
+ TargetInstrInfo::RegSubRegPair NewSrc = CpyRewriter->getNewSource(
+ MRI, TII, TrackPair, RewriteMap, false /* multiple sources */);
+ if (SrcReg == NewSrc.Reg || NewSrc.Reg == 0)
+ continue;
+
+ // Rewrite source.
+ if (CpyRewriter->RewriteCurrentSource(NewSrc.Reg, NewSrc.SubReg)) {
+ // We may have extended the live-range of NewSrc, account for that.
+ MRI->clearKillFlags(NewSrc.Reg);
+ Changed = true;
+ }
+ }
+ // TODO: We could have a clean-up method to tidy the instruction.
+ // E.g., v0 = INSERT_SUBREG v1, v1.sub0, sub0
+ // => v0 = COPY v1
+ // Currently we haven't seen a motivating example for that and we
+ // want to avoid untested code.
+ NumRewrittenCopies += Changed;
+ return Changed;
+}
+
+/// \brief Optimize copy-like instructions to create
+/// register coalescer friendly instruction.
+/// The optimization tries to kill off \p MI by looking
+/// through a chain of copies to find a source that has a compatible
+/// register class.
+/// If such a source is found, it replaces \p MI with a generic COPY
+/// operation.
+/// \pre isUncoalescableCopy(*MI) is true.
+/// \return True, when \p MI has been optimized. In that case, \p MI has
+/// been removed from its parent.
+/// All COPY instructions created are inserted in \p LocalMIs.
+bool PeepholeOptimizer::optimizeUncoalescableCopy(
+ MachineInstr *MI, SmallPtrSetImpl<MachineInstr *> &LocalMIs) {
+ assert(MI && isUncoalescableCopy(*MI) && "Invalid argument");
+
+ // Check if we can rewrite all the values defined by this instruction.
+ SmallVector<TargetInstrInfo::RegSubRegPair, 4> RewritePairs;
+ // Get the right rewriter for the current copy.
+ std::unique_ptr<CopyRewriter> CpyRewriter(getCopyRewriter(*MI, *TII, *MRI));
+ // If none exists, bail out.
+ if (!CpyRewriter)
+ return false;
+
+ // Rewrite each rewritable source by generating new COPYs. This works
+ // differently from optimizeCoalescableCopy since it first makes sure that all
+ // definitions can be rewritten.
+ RewriteMapTy RewriteMap;
+ unsigned Reg, SubReg, CopyDefReg, CopyDefSubReg;
+ while (CpyRewriter->getNextRewritableSource(Reg, SubReg, CopyDefReg,
+ CopyDefSubReg)) {
+ // If a physical register is here, this is probably for a good reason.
+ // Do not rewrite that.
+ if (TargetRegisterInfo::isPhysicalRegister(CopyDefReg))
+ return false;
+
+ // If we do not know how to rewrite this definition, there is no point
+ // in trying to kill this instruction.
+ TargetInstrInfo::RegSubRegPair Def(CopyDefReg, CopyDefSubReg);
+ if (!findNextSource(Def.Reg, Def.SubReg, RewriteMap))
+ return false;
+
+ RewritePairs.push_back(Def);
+ }
+
+ // The change is possible for all defs, do it.
+ for (const auto &Def : RewritePairs) {
+ // Rewrite the "copy" in a way the register coalescer understands.
+ MachineInstr *NewCopy = CpyRewriter->RewriteSource(Def, RewriteMap);
+ assert(NewCopy && "Should be able to always generate a new copy");
+ LocalMIs.insert(NewCopy);
+ }
+
+ // MI is now dead.
+ MI->eraseFromParent();
+ ++NumUncoalescableCopies;
+ return true;
+}
+
+/// Check whether MI is a candidate for folding into a later instruction.
+/// We only fold loads into virtual registers, and only when the defined
+/// virtual register has a single use.
+bool PeepholeOptimizer::isLoadFoldable(
+ MachineInstr *MI, SmallSet<unsigned, 16> &FoldAsLoadDefCandidates) {
+ if (!MI->canFoldAsLoad() || !MI->mayLoad())
+ return false;
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (MCID.getNumDefs() != 1)
+ return false;
+
+ unsigned Reg = MI->getOperand(0).getReg();
+ // To reduce compilation time, we check MRI->hasOneNonDBGUse when inserting
+ // loads. It should be checked when processing uses of the load, since
+ // uses can be removed during the peephole pass.
+ if (!MI->getOperand(0).getSubReg() &&
+ TargetRegisterInfo::isVirtualRegister(Reg) &&
+ MRI->hasOneNonDBGUse(Reg)) {
+ FoldAsLoadDefCandidates.insert(Reg);
+ return true;
+ }
+ return false;
+}
+
+bool PeepholeOptimizer::isMoveImmediate(
+ MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs,
+ DenseMap<unsigned, MachineInstr *> &ImmDefMIs) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ if (!MI->isMoveImmediate())
+ return false;
+ if (MCID.getNumDefs() != 1)
+ return false;
+ unsigned Reg = MI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ ImmDefMIs.insert(std::make_pair(Reg, MI));
+ ImmDefRegs.insert(Reg);
+ return true;
+ }
+
+ return false;
+}
+
+/// Try folding register operands that are defined by move immediate
+/// instructions, i.e. a trivial constant folding optimization, if
+/// and only if the def and use are in the same BB.
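+///
+/// For example (illustrative pseudo-opcodes, not taken from this file):
+///   %v0 = MOVi 42
+///   %v1 = ADDrr %v2, %v0
+/// may be rewritten by the target's FoldImmediate hook into
+///   %v1 = ADDri %v2, 42
+/// when both instructions live in the same basic block.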
+bool PeepholeOptimizer::foldImmediate(
+ MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs,
+ DenseMap<unsigned, MachineInstr *> &ImmDefMIs) {
+ for (unsigned i = 0, e = MI->getDesc().getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ // Ignore dead implicit defs.
+ if (MO.isImplicit() && MO.isDead())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (ImmDefRegs.count(Reg) == 0)
+ continue;
+ DenseMap<unsigned, MachineInstr*>::iterator II = ImmDefMIs.find(Reg);
+ assert(II != ImmDefMIs.end() && "couldn't find immediate definition");
+ if (TII->FoldImmediate(*MI, *II->second, Reg, MRI)) {
+ ++NumImmFold;
+ return true;
+ }
+ }
+ return false;
+}
+
+// FIXME: This is very simple and misses some cases which should be handled when
+// motivating examples are found.
+//
+// The copy rewriting logic should look at uses as well as defs and be able to
+// eliminate copies across blocks.
+//
+// Later copies that are subregister extracts will also not be eliminated since
+// only the first copy is considered.
+//
+// e.g.
+// %vreg1 = COPY %vreg0
+// %vreg2 = COPY %vreg0:sub1
+//
+// Should replace %vreg2 uses with %vreg1:sub1
+bool PeepholeOptimizer::foldRedundantCopy(
+ MachineInstr *MI, SmallSet<unsigned, 4> &CopySrcRegs,
+ DenseMap<unsigned, MachineInstr *> &CopyMIs) {
+ assert(MI->isCopy() && "expected a COPY machine instruction");
+
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return false;
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DstReg))
+ return false;
+
+ if (CopySrcRegs.insert(SrcReg).second) {
+ // First copy of this reg seen.
+ CopyMIs.insert(std::make_pair(SrcReg, MI));
+ return false;
+ }
+
+ MachineInstr *PrevCopy = CopyMIs.find(SrcReg)->second;
+
+ unsigned SrcSubReg = MI->getOperand(1).getSubReg();
+ unsigned PrevSrcSubReg = PrevCopy->getOperand(1).getSubReg();
+
+ // Can't replace different subregister extracts.
+ if (SrcSubReg != PrevSrcSubReg)
+ return false;
+
+ unsigned PrevDstReg = PrevCopy->getOperand(0).getReg();
+
+ // Only replace if the copy register class is the same.
+ //
+ // TODO: If we have multiple copies to different register classes, we may want
+ // to track multiple copies of the same source register.
+ if (MRI->getRegClass(DstReg) != MRI->getRegClass(PrevDstReg))
+ return false;
+
+ MRI->replaceRegWith(DstReg, PrevDstReg);
+
+ // Lifetime of the previous copy has been extended.
+ MRI->clearKillFlags(PrevDstReg);
+ return true;
+}
+
+bool PeepholeOptimizer::isNAPhysCopy(unsigned Reg) {
+ return TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ !MRI->isAllocatable(Reg);
+}
+
+bool PeepholeOptimizer::foldRedundantNAPhysCopy(
+ MachineInstr *MI, DenseMap<unsigned, MachineInstr *> &NAPhysToVirtMIs) {
+ assert(MI->isCopy() && "expected a COPY machine instruction");
+
+ if (DisableNAPhysCopyOpt)
+ return false;
+
+ unsigned DstReg = MI->getOperand(0).getReg();
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (isNAPhysCopy(SrcReg) && TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ // %vreg = COPY %PHYSREG
+ // Avoid using a datastructure which can track multiple live non-allocatable
+ // phys->virt copies since LLVM doesn't seem to do this.
+ NAPhysToVirtMIs.insert({SrcReg, MI});
+ return false;
+ }
+
+ if (!(TargetRegisterInfo::isVirtualRegister(SrcReg) && isNAPhysCopy(DstReg)))
+ return false;
+
+ // %PHYSREG = COPY %vreg
+ auto PrevCopy = NAPhysToVirtMIs.find(DstReg);
+ if (PrevCopy == NAPhysToVirtMIs.end()) {
+ // We can't remove the copy: there was an intervening clobber of the
+ // non-allocatable physical register after the copy to virtual.
+ DEBUG(dbgs() << "NAPhysCopy: intervening clobber forbids erasing " << *MI
+ << '\n');
+ return false;
+ }
+
+ unsigned PrevDstReg = PrevCopy->second->getOperand(0).getReg();
+ if (PrevDstReg == SrcReg) {
+ // Remove the virt->phys copy: we saw the virtual register definition, and
+ // the non-allocatable physical register's state hasn't changed since then.
+ DEBUG(dbgs() << "NAPhysCopy: erasing " << *MI << '\n');
+ ++NumNAPhysCopies;
+ return true;
+ }
+
+ // Potential missed optimization opportunity: we saw a different virtual
+ // register get a copy of the non-allocatable physical register, and we only
+ // track one such copy. Avoid getting confused by this new non-allocatable
+ // physical register definition, and remove it from the tracked copies.
+ DEBUG(dbgs() << "NAPhysCopy: missed opportunity " << *MI << '\n');
+ NAPhysToVirtMIs.erase(PrevCopy);
+ return false;
+}
+
+bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n");
+ DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n');
+
+ if (DisablePeephole)
+ return false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ DT = Aggressive ? &getAnalysis<MachineDominatorTree>() : nullptr;
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ bool SeenMoveImm = false;
+
+ // During this forward scan, at some point it needs to answer the question
+ // "given a pointer to an MI in the current BB, is it located before or
+ // after the current instruction".
+ // To perform this, the following set keeps track of the MIs already seen
+ // during the scan; if an MI is not in the set, it is assumed to be located
+ // after. Newly created MIs have to be inserted in the set as well.
+ SmallPtrSet<MachineInstr*, 16> LocalMIs;
+ SmallSet<unsigned, 4> ImmDefRegs;
+ DenseMap<unsigned, MachineInstr*> ImmDefMIs;
+ SmallSet<unsigned, 16> FoldAsLoadDefCandidates;
+
+ // Track when a non-allocatable physical register is copied to a virtual
+ // register so that useless moves can be removed.
+ //
+ // %PHYSREG is the map index; MI is the last valid `%vreg = COPY %PHYSREG`
+ // without any intervening re-definition of %PHYSREG.
+ DenseMap<unsigned, MachineInstr *> NAPhysToVirtMIs;
+
+ // Set of virtual registers that are copied from.
+ SmallSet<unsigned, 4> CopySrcRegs;
+ DenseMap<unsigned, MachineInstr *> CopySrcMIs;
+
+ for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+ MII != MIE; ) {
+ MachineInstr *MI = &*MII;
+ // We may be erasing MI below, increment MII now.
+ ++MII;
+ LocalMIs.insert(MI);
+
+ // Skip debug values. They should not affect this peephole optimization.
+ if (MI->isDebugValue())
+ continue;
+
+ // If we run into an instruction we can't fold across, discard
+ // the load candidates.
+ if (MI->isLoadFoldBarrier())
+ FoldAsLoadDefCandidates.clear();
+
+ if (MI->isPosition() || MI->isPHI())
+ continue;
+
+ if (!MI->isCopy()) {
+ for (const auto &Op : MI->operands()) {
+ // Visit all operands: definitions can be implicit or explicit.
+ if (Op.isReg()) {
+ unsigned Reg = Op.getReg();
+ if (Op.isDef() && isNAPhysCopy(Reg)) {
+ const auto &Def = NAPhysToVirtMIs.find(Reg);
+ if (Def != NAPhysToVirtMIs.end()) {
+ // A new definition of the non-allocatable physical register
+ // invalidates previous copies.
+ DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+ << '\n');
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ } else if (Op.isRegMask()) {
+ const uint32_t *RegMask = Op.getRegMask();
+ for (auto &RegMI : NAPhysToVirtMIs) {
+ unsigned Def = RegMI.first;
+ if (MachineOperand::clobbersPhysReg(RegMask, Def)) {
+ DEBUG(dbgs() << "NAPhysCopy: invalidating because of " << *MI
+ << '\n');
+ NAPhysToVirtMIs.erase(Def);
+ }
+ }
+ }
+ }
+ }
+
+ if (MI->isImplicitDef() || MI->isKill())
+ continue;
+
+ if (MI->isInlineAsm() || MI->hasUnmodeledSideEffects()) {
+        // Discard all knowledge of non-allocatable physical registers, since
+        // we no longer know what is correct.
+ //
+ // FIXME: handle explicit asm clobbers.
+ DEBUG(dbgs() << "NAPhysCopy: blowing away all info due to " << *MI
+ << '\n');
+ NAPhysToVirtMIs.clear();
+ continue;
+ }
+
+ if ((isUncoalescableCopy(*MI) &&
+ optimizeUncoalescableCopy(MI, LocalMIs)) ||
+ (MI->isCompare() && optimizeCmpInstr(MI, &MBB)) ||
+ (MI->isSelect() && optimizeSelect(MI, LocalMIs))) {
+ // MI is deleted.
+ LocalMIs.erase(MI);
+ Changed = true;
+ continue;
+ }
+
+ if (MI->isConditionalBranch() && optimizeCondBranch(MI)) {
+ Changed = true;
+ continue;
+ }
+
+ if (isCoalescableCopy(*MI) && optimizeCoalescableCopy(MI)) {
+ // MI is just rewritten.
+ Changed = true;
+ continue;
+ }
+
+ if (MI->isCopy() &&
+ (foldRedundantCopy(MI, CopySrcRegs, CopySrcMIs) ||
+ foldRedundantNAPhysCopy(MI, NAPhysToVirtMIs))) {
+ LocalMIs.erase(MI);
+ MI->eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
+ if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) {
+ SeenMoveImm = true;
+ } else {
+ Changed |= optimizeExtInstr(MI, &MBB, LocalMIs);
+ // optimizeExtInstr might have created new instructions after MI
+ // and before the already incremented MII. Adjust MII so that the
+ // next iteration sees the new instructions.
+ MII = MI;
+ ++MII;
+ if (SeenMoveImm)
+ Changed |= foldImmediate(MI, &MBB, ImmDefRegs, ImmDefMIs);
+ }
+
+ // Check whether MI is a load candidate for folding into a later
+ // instruction. If MI is not a candidate, check whether we can fold an
+ // earlier load into MI.
+ if (!isLoadFoldable(MI, FoldAsLoadDefCandidates) &&
+ !FoldAsLoadDefCandidates.empty()) {
+ const MCInstrDesc &MIDesc = MI->getDesc();
+ for (unsigned i = MIDesc.getNumDefs(); i != MIDesc.getNumOperands();
+ ++i) {
+ const MachineOperand &MOp = MI->getOperand(i);
+ if (!MOp.isReg())
+ continue;
+ unsigned FoldAsLoadDefReg = MOp.getReg();
+ if (FoldAsLoadDefCandidates.count(FoldAsLoadDefReg)) {
+ // We need to fold load after optimizeCmpInstr, since
+ // optimizeCmpInstr can enable folding by converting SUB to CMP.
+ // Save FoldAsLoadDefReg because optimizeLoadInstr() resets it and
+ // we need it for markUsesInDebugValueAsUndef().
+ unsigned FoldedReg = FoldAsLoadDefReg;
+ MachineInstr *DefMI = nullptr;
+ if (MachineInstr *FoldMI =
+ TII->optimizeLoadInstr(*MI, MRI, FoldAsLoadDefReg, DefMI)) {
+ // Update LocalMIs since we replaced MI with FoldMI and deleted
+ // DefMI.
+ DEBUG(dbgs() << "Replacing: " << *MI);
+ DEBUG(dbgs() << " With: " << *FoldMI);
+ LocalMIs.erase(MI);
+ LocalMIs.erase(DefMI);
+ LocalMIs.insert(FoldMI);
+ MI->eraseFromParent();
+ DefMI->eraseFromParent();
+ MRI->markUsesInDebugValueAsUndef(FoldedReg);
+ FoldAsLoadDefCandidates.erase(FoldedReg);
+ ++NumLoadFold;
+ // MI is replaced with FoldMI.
+ Changed = true;
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+
+ return Changed;
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromCopy() {
+ assert(Def->isCopy() && "Invalid definition");
+  // Copy instructions are supposed to be: Def = Src.
+ // If someone breaks this assumption, bad things will happen everywhere.
+ assert(Def->getNumOperands() == 2 && "Invalid number of operands");
+
+ if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+    // If we look for a different subreg, it means we want a subreg of the src.
+    // Bail out, as we do not support composing subregs yet.
+ return ValueTrackerResult();
+ // Otherwise, we want the whole source.
+ const MachineOperand &Src = Def->getOperand(1);
+ return ValueTrackerResult(Src.getReg(), Src.getSubReg());
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromBitcast() {
+ assert(Def->isBitcast() && "Invalid definition");
+
+ // Bail if there are effects that a plain copy will not expose.
+ if (Def->hasUnmodeledSideEffects())
+ return ValueTrackerResult();
+
+ // Bitcasts with more than one def are not supported.
+ if (Def->getDesc().getNumDefs() != 1)
+ return ValueTrackerResult();
+ if (Def->getOperand(DefIdx).getSubReg() != DefSubReg)
+ // If we look for a different subreg, it means we want a subreg of the src.
+    // Bail out, as we do not support composing subregs yet.
+ return ValueTrackerResult();
+
+ unsigned SrcIdx = Def->getNumOperands();
+ for (unsigned OpIdx = DefIdx + 1, EndOpIdx = SrcIdx; OpIdx != EndOpIdx;
+ ++OpIdx) {
+ const MachineOperand &MO = Def->getOperand(OpIdx);
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ // Ignore dead implicit defs.
+ if (MO.isImplicit() && MO.isDead())
+ continue;
+ assert(!MO.isDef() && "We should have skipped all the definitions by now");
+ if (SrcIdx != EndOpIdx)
+ // Multiple sources?
+ return ValueTrackerResult();
+ SrcIdx = OpIdx;
+ }
+ const MachineOperand &Src = Def->getOperand(SrcIdx);
+ return ValueTrackerResult(Src.getReg(), Src.getSubReg());
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromRegSequence() {
+ assert((Def->isRegSequence() || Def->isRegSequenceLike()) &&
+ "Invalid definition");
+
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subregs, bail out.
+ // The case we are checking is Def.<subreg> = REG_SEQUENCE.
+ // This should almost never happen as the SSA property is tracked at
+ // the register level (as opposed to the subreg level).
+ // I.e.,
+ // Def.sub0 =
+ // Def.sub1 =
+ // is a valid SSA representation for Def.sub0 and Def.sub1, but not for
+ // Def. Thus, it must not be generated.
+    // However, some code could theoretically generate a single
+    // Def.sub0 (i.e., not defining the other subregs) and we would
+ // have this case.
+ // If we can ascertain (or force) that this never happens, we could
+ // turn that into an assertion.
+ return ValueTrackerResult();
+
+ if (!TII)
+ // We could handle the REG_SEQUENCE here, but we do not want to
+ // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ SmallVector<TargetInstrInfo::RegSubRegPairAndIdx, 8> RegSeqInputRegs;
+ if (!TII->getRegSequenceInputs(*Def, DefIdx, RegSeqInputRegs))
+ return ValueTrackerResult();
+
+ // We are looking at:
+ // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
+ // Check if one of the operand defines the subreg we are interested in.
+ for (auto &RegSeqInput : RegSeqInputRegs) {
+ if (RegSeqInput.SubIdx == DefSubReg) {
+ if (RegSeqInput.SubReg)
+ // Bail if we have to compose sub registers.
+ return ValueTrackerResult();
+
+ return ValueTrackerResult(RegSeqInput.Reg, RegSeqInput.SubReg);
+ }
+ }
+
+ // If the subreg we are tracking is super-defined by another subreg,
+ // we could follow this value. However, this would require to compose
+ // the subreg and we do not do that for now.
+ return ValueTrackerResult();
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromInsertSubreg() {
+ assert((Def->isInsertSubreg() || Def->isInsertSubregLike()) &&
+ "Invalid definition");
+
+ if (Def->getOperand(DefIdx).getSubReg())
+ // If we are composing subreg, bail out.
+ // Same remark as getNextSourceFromRegSequence.
+ // I.e., this may be turned into an assert.
+ return ValueTrackerResult();
+
+ if (!TII)
+    // We could handle the INSERT_SUBREG here, but we do not want to
+    // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ TargetInstrInfo::RegSubRegPair BaseReg;
+ TargetInstrInfo::RegSubRegPairAndIdx InsertedReg;
+ if (!TII->getInsertSubregInputs(*Def, DefIdx, BaseReg, InsertedReg))
+ return ValueTrackerResult();
+
+ // We are looking at:
+ // Def = INSERT_SUBREG v0, v1, sub1
+ // There are two cases:
+ // 1. DefSubReg == sub1, get v1.
+ // 2. DefSubReg != sub1, the value may be available through v0.
+
+ // #1 Check if the inserted register matches the required sub index.
+ if (InsertedReg.SubIdx == DefSubReg) {
+ return ValueTrackerResult(InsertedReg.Reg, InsertedReg.SubReg);
+ }
+  // #2 Otherwise, if the sub register we are looking for is not partially
+  // defined by the inserted element, we can look through the main
+ // register (v0).
+ const MachineOperand &MODef = Def->getOperand(DefIdx);
+ // If the result register (Def) and the base register (v0) do not
+ // have the same register class or if we have to compose
+ // subregisters, bail out.
+ if (MRI.getRegClass(MODef.getReg()) != MRI.getRegClass(BaseReg.Reg) ||
+ BaseReg.SubReg)
+ return ValueTrackerResult();
+
+ // Get the TRI and check if the inserted sub-register overlaps with the
+ // sub-register we are tracking.
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ if (!TRI ||
+ (TRI->getSubRegIndexLaneMask(DefSubReg) &
+ TRI->getSubRegIndexLaneMask(InsertedReg.SubIdx)) != 0)
+ return ValueTrackerResult();
+ // At this point, the value is available in v0 via the same subreg
+ // we used for Def.
+ return ValueTrackerResult(BaseReg.Reg, DefSubReg);
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromExtractSubreg() {
+ assert((Def->isExtractSubreg() ||
+ Def->isExtractSubregLike()) && "Invalid definition");
+ // We are looking at:
+ // Def = EXTRACT_SUBREG v0, sub0
+
+ // Bail if we have to compose sub registers.
+ // Indeed, if DefSubReg != 0, we would have to compose it with sub0.
+ if (DefSubReg)
+ return ValueTrackerResult();
+
+ if (!TII)
+ // We could handle the EXTRACT_SUBREG here, but we do not want to
+ // duplicate the code from the generic TII.
+ return ValueTrackerResult();
+
+ TargetInstrInfo::RegSubRegPairAndIdx ExtractSubregInputReg;
+ if (!TII->getExtractSubregInputs(*Def, DefIdx, ExtractSubregInputReg))
+ return ValueTrackerResult();
+
+ // Bail if we have to compose sub registers.
+ // Likewise, if v0.subreg != 0, we would have to compose v0.subreg with sub0.
+ if (ExtractSubregInputReg.SubReg)
+ return ValueTrackerResult();
+  // Otherwise, the value is available in v0.sub0.
+ return ValueTrackerResult(ExtractSubregInputReg.Reg,
+ ExtractSubregInputReg.SubIdx);
+}
+
+ValueTrackerResult ValueTracker::getNextSourceFromSubregToReg() {
+ assert(Def->isSubregToReg() && "Invalid definition");
+ // We are looking at:
+ // Def = SUBREG_TO_REG Imm, v0, sub0
+
+ // Bail if we have to compose sub registers.
+ // If DefSubReg != sub0, we would have to check that all the bits
+ // we track are included in sub0 and if yes, we would have to
+ // determine the right subreg in v0.
+ if (DefSubReg != Def->getOperand(3).getImm())
+ return ValueTrackerResult();
+ // Bail if we have to compose sub registers.
+ // Likewise, if v0.subreg != 0, we would have to compose it with sub0.
+ if (Def->getOperand(2).getSubReg())
+ return ValueTrackerResult();
+
+ return ValueTrackerResult(Def->getOperand(2).getReg(),
+ Def->getOperand(3).getImm());
+}
+
+/// \brief Explore each PHI incoming operand and return its sources
+ValueTrackerResult ValueTracker::getNextSourceFromPHI() {
+ assert(Def->isPHI() && "Invalid definition");
+ ValueTrackerResult Res;
+
+ // If we look for a different subreg, bail as we do not support composing
+ // subregs yet.
+ if (Def->getOperand(0).getSubReg() != DefSubReg)
+ return ValueTrackerResult();
+
+ // Return all register sources for PHI instructions.
+ for (unsigned i = 1, e = Def->getNumOperands(); i < e; i += 2) {
+ auto &MO = Def->getOperand(i);
+ assert(MO.isReg() && "Invalid PHI instruction");
+ Res.addSource(MO.getReg(), MO.getSubReg());
+ }
+
+ return Res;
+}
+
+ValueTrackerResult ValueTracker::getNextSourceImpl() {
+ assert(Def && "This method needs a valid definition");
+
+ assert(((Def->getOperand(DefIdx).isDef() &&
+ (DefIdx < Def->getDesc().getNumDefs() ||
+ Def->getDesc().isVariadic())) ||
+ Def->getOperand(DefIdx).isImplicit()) &&
+ "Invalid DefIdx");
+ if (Def->isCopy())
+ return getNextSourceFromCopy();
+ if (Def->isBitcast())
+ return getNextSourceFromBitcast();
+ // All the remaining cases involve "complex" instructions.
+ // Bail if we did not ask for the advanced tracking.
+ if (!UseAdvancedTracking)
+ return ValueTrackerResult();
+ if (Def->isRegSequence() || Def->isRegSequenceLike())
+ return getNextSourceFromRegSequence();
+ if (Def->isInsertSubreg() || Def->isInsertSubregLike())
+ return getNextSourceFromInsertSubreg();
+ if (Def->isExtractSubreg() || Def->isExtractSubregLike())
+ return getNextSourceFromExtractSubreg();
+ if (Def->isSubregToReg())
+ return getNextSourceFromSubregToReg();
+ if (Def->isPHI())
+ return getNextSourceFromPHI();
+ return ValueTrackerResult();
+}
+
+ValueTrackerResult ValueTracker::getNextSource() {
+ // If we reach a point where we cannot move up in the use-def chain,
+ // there is nothing we can get.
+ if (!Def)
+ return ValueTrackerResult();
+
+ ValueTrackerResult Res = getNextSourceImpl();
+ if (Res.isValid()) {
+ // Update definition, definition index, and subregister for the
+ // next call of getNextSource.
+ // Update the current register.
+ bool OneRegSrc = Res.getNumSources() == 1;
+ if (OneRegSrc)
+ Reg = Res.getSrcReg(0);
+ // Update the result before moving up in the use-def chain
+ // with the instruction containing the last found sources.
+ Res.setInst(Def);
+
+ // If we can still move up in the use-def chain, move to the next
+ // definition.
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg) && OneRegSrc) {
+ Def = MRI.getVRegDef(Reg);
+ DefIdx = MRI.def_begin(Reg).getOperandNo();
+ DefSubReg = Res.getSrcSubReg(0);
+ return Res;
+ }
+ }
+ // If we end up here, this means we will not be able to find another source
+ // for the next iteration. Make sure any new call to getNextSource bails out
+ // early by cutting the use-def chain.
+ Def = nullptr;
+ return Res;
+}
diff --git a/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
new file mode 100644
index 000000000000..5bc5f7524dbf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PostRAHazardRecognizer.cpp
@@ -0,0 +1,98 @@
+//===----- PostRAHazardRecognizer.cpp - hazard recognizer -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This runs the hazard recognizer and emits noops when necessary. This
+/// gives targets a way to run the hazard recognizer without running one of
+/// the schedulers. Example use cases for this pass would be:
+///
+/// - Targets that need the hazard recognizer to be run at -O0.
+/// - Targets that want to guarantee that hazards at the beginning of
+/// scheduling regions are handled correctly. The post-RA scheduler is
+/// a top-down scheduler, but when there are multiple scheduling regions
+/// in a basic block, it visits the regions in bottom-up order. This
+///    makes it impossible for the scheduler to guarantee it can correctly
+/// handle hazards at the beginning of scheduling regions.
+///
+/// This pass traverses all the instructions in a program in top-down order.
+/// In contrast to the instruction scheduling passes, this pass never resets
+/// the hazard recognizer, to ensure it can correctly handle noop hazards at
+/// the beginning of blocks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-hazard-rec"
+
+STATISTIC(NumNoops, "Number of noops inserted");
+
+namespace {
+ class PostRAHazardRecognizer : public MachineFunctionPass {
+
+ public:
+ static char ID;
+ PostRAHazardRecognizer() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ };
+ char PostRAHazardRecognizer::ID = 0;
+
+}
+
+char &llvm::PostRAHazardRecognizerID = PostRAHazardRecognizer::ID;
+
+INITIALIZE_PASS(PostRAHazardRecognizer, DEBUG_TYPE,
+ "Post RA hazard recognizer", false, false)
+
+bool PostRAHazardRecognizer::runOnMachineFunction(MachineFunction &Fn) {
+ const TargetInstrInfo *TII = Fn.getSubtarget().getInstrInfo();
+ std::unique_ptr<ScheduleHazardRecognizer> HazardRec(
+ TII->CreateTargetPostRAHazardRecognizer(Fn));
+
+ // Return if the target has not implemented a hazard recognizer.
+ if (!HazardRec.get())
+ return false;
+
+ // Loop over all of the basic blocks
+ for (auto &MBB : Fn) {
+ // We do not call HazardRec->reset() here to make sure we are handling noop
+ // hazards at the start of basic blocks.
+ for (MachineInstr &MI : MBB) {
+ // If we need to emit noops prior to this instruction, then do so.
+ unsigned NumPreNoops = HazardRec->PreEmitNoops(&MI);
+ for (unsigned i = 0; i != NumPreNoops; ++i) {
+ HazardRec->EmitNoop();
+ TII->insertNoop(MBB, MachineBasicBlock::iterator(MI));
+ ++NumNoops;
+ }
+
+ HazardRec->EmitInstruction(&MI);
+ if (HazardRec->atIssueLimit()) {
+ HazardRec->AdvanceCycle();
+ }
+ }
+ }
+ return true;
+}
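[Editorial illustration, not part of the patch.] The file comment above explains why the recognizer state is deliberately carried across basic-block boundaries. The sketch below mirrors that loop with an invented ToyHazardRec standing in for ScheduleHazardRecognizer, assuming a one-idle-cycle-after-load rule purely for illustration: noops are emitted before any instruction the model flags, and the state is never reset between blocks.

#include <cstdio>
#include <string>
#include <vector>

struct ToyHazardRec {
  bool LastWasLoad = false;
  // Assumed rule for the example: one idle cycle is required after a load.
  unsigned preEmitNoops(const std::string &Op) const {
    return (LastWasLoad && Op != "nop") ? 1 : 0;
  }
  void emitInstruction(const std::string &Op) { LastWasLoad = (Op == "load"); }
  void emitNoop() { LastWasLoad = false; }
};

static std::vector<std::string>
insertNoops(const std::vector<std::vector<std::string>> &Blocks) {
  ToyHazardRec HR;                  // deliberately not reset per block
  std::vector<std::string> Out;
  for (const auto &MBB : Blocks) {
    for (const std::string &Op : MBB) {
      // Emit any noops required before this instruction, then issue it.
      for (unsigned I = 0, N = HR.preEmitNoops(Op); I != N; ++I) {
        HR.emitNoop();
        Out.push_back("nop");
      }
      HR.emitInstruction(Op);
      Out.push_back(Op);
    }
  }
  return Out;
}

int main() {
  // The load ends the first block; the hazard still forces a noop at the start
  // of the next block because the recognizer state is carried across.
  for (const std::string &Op : insertNoops({{"add", "load"}, {"mul", "add"}}))
    std::printf("%s\n", Op.c_str());
}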
diff --git a/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
new file mode 100644
index 000000000000..3fce307f3dd4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -0,0 +1,705 @@
+//===----- SchedulePostRAList.cpp - list scheduler ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AggressiveAntiDepBreaker.h"
+#include "AntiDepBreaker.h"
+#include "CriticalAntiDepBreaker.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "post-RA-sched"
+
+STATISTIC(NumNoops, "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+STATISTIC(NumFixedAnti, "Number of fixed anti-dependencies");
+
+// Post-RA scheduling is enabled with
+// TargetSubtargetInfo.enablePostRAScheduler(). This flag can be used to
+// override the target.
+static cl::opt<bool>
+EnablePostRAScheduler("post-RA-scheduler",
+ cl::desc("Enable scheduling after register allocation"),
+ cl::init(false), cl::Hidden);
+static cl::opt<std::string>
+EnableAntiDepBreaking("break-anti-dependencies",
+ cl::desc("Break post-RA scheduling anti-dependencies: "
+ "\"critical\", \"all\", or \"none\""),
+ cl::init("none"), cl::Hidden);
+
+// If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+static cl::opt<int>
+DebugDiv("postra-sched-debugdiv",
+ cl::desc("Debug control MBBs that are scheduled"),
+ cl::init(0), cl::Hidden);
+static cl::opt<int>
+DebugMod("postra-sched-debugmod",
+ cl::desc("Debug control MBBs that are scheduled"),
+ cl::init(0), cl::Hidden);
+
+AntiDepBreaker::~AntiDepBreaker() { }
+
+namespace {
+ class PostRAScheduler : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ RegisterClassInfo RegClassInfo;
+
+ public:
+ static char ID;
+ PostRAScheduler() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ private:
+ bool enablePostRAScheduler(
+ const TargetSubtargetInfo &ST, CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const;
+ };
+ char PostRAScheduler::ID = 0;
+
+ class SchedulePostRATDList : public ScheduleDAGInstrs {
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ LatencyPriorityQueue AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+    /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// AntiDepBreak - Anti-dependence breaking object, or NULL if none
+ AntiDepBreaker *AntiDepBreak;
+
+ /// AA - AliasAnalysis for making memory reference queries.
+ AliasAnalysis *AA;
+
+ /// The schedule. Null SUnit*'s represent noop instructions.
+ std::vector<SUnit*> Sequence;
+
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> Mutations;
+
+ /// The index in BB of RegionEnd.
+ ///
+ /// This is the instruction number from the top of the current block, not
+ /// the SlotIndex. It is only used by the AntiDepBreaker.
+ unsigned EndIndex;
+
+ public:
+ SchedulePostRATDList(
+ MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const RegisterClassInfo &,
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+ SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs);
+
+ ~SchedulePostRATDList() override;
+
+ /// startBlock - Initialize register live-range state for scheduling in
+ /// this block.
+ ///
+ void startBlock(MachineBasicBlock *BB) override;
+
+ // Set the index of RegionEnd within the current BB.
+ void setEndIndex(unsigned EndIdx) { EndIndex = EndIdx; }
+
+ /// Initialize the scheduler state for the next scheduling region.
+ void enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) override;
+
+ /// Notify that the scheduler has finished scheduling the current region.
+ void exitRegion() override;
+
+ /// Schedule - Schedule the instruction range using list scheduling.
+ ///
+ void schedule() override;
+
+ void EmitSchedule();
+
+ /// Observe - Update liveness information to account for the current
+ /// instruction, which will not be scheduled.
+ ///
+ void Observe(MachineInstr &MI, unsigned Count);
+
+ /// finishBlock - Clean up register live-range state.
+ ///
+ void finishBlock() override;
+
+ private:
+ /// Apply each ScheduleDAGMutation step in order.
+ void postprocessDAG();
+
+ void ReleaseSucc(SUnit *SU, SDep *SuccEdge);
+ void ReleaseSuccessors(SUnit *SU);
+ void ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void ListScheduleTopDown();
+
+ void dumpSchedule() const;
+ void emitNoop(unsigned CurCycle);
+ };
+}
+
+char &llvm::PostRASchedulerID = PostRAScheduler::ID;
+
+INITIALIZE_PASS(PostRAScheduler, "post-RA-sched",
+ "Post RA top-down list latency scheduler", false, false)
+
+SchedulePostRATDList::SchedulePostRATDList(
+ MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA,
+ const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode,
+ SmallVectorImpl<const TargetRegisterClass *> &CriticalPathRCs)
+ : ScheduleDAGInstrs(MF, &MLI), AA(AA), EndIndex(0) {
+
+ const InstrItineraryData *InstrItins =
+ MF.getSubtarget().getInstrItineraryData();
+ HazardRec =
+ MF.getSubtarget().getInstrInfo()->CreateTargetPostRAHazardRecognizer(
+ InstrItins, this);
+ MF.getSubtarget().getPostRAMutations(Mutations);
+
+ assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
+ MRI.tracksLiveness()) &&
+ "Live-ins must be accurate for anti-dependency breaking");
+ AntiDepBreak =
+ ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ?
+ (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) :
+ ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL) ?
+ (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : nullptr));
+}
+
+SchedulePostRATDList::~SchedulePostRATDList() {
+ delete HazardRec;
+ delete AntiDepBreak;
+}
+
+/// Initialize state associated with the next scheduling region.
+void SchedulePostRATDList::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) {
+ ScheduleDAGInstrs::enterRegion(bb, begin, end, regioninstrs);
+ Sequence.clear();
+}
+
+/// Print the schedule before exiting the region.
+void SchedulePostRATDList::exitRegion() {
+ DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+ ScheduleDAGInstrs::exitRegion();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// dumpSchedule - dump the scheduled Sequence.
+void SchedulePostRATDList::dumpSchedule() const {
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ if (SUnit *SU = Sequence[i])
+ SU->dump(this);
+ else
+ dbgs() << "**** NOOP ****\n";
+ }
+}
+#endif
+
+bool PostRAScheduler::enablePostRAScheduler(
+ const TargetSubtargetInfo &ST,
+ CodeGenOpt::Level OptLevel,
+ TargetSubtargetInfo::AntiDepBreakMode &Mode,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) const {
+ Mode = ST.getAntiDepBreakMode();
+ ST.getCriticalPathRCs(CriticalPathRCs);
+
+ // Check for explicit enable/disable of post-ra scheduling.
+ if (EnablePostRAScheduler.getPosition() > 0)
+ return EnablePostRAScheduler;
+
+ return ST.enablePostRAScheduler() &&
+ OptLevel >= ST.getOptLevelToEnablePostRAScheduler();
+}
+
+bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) {
+ if (skipFunction(*Fn.getFunction()))
+ return false;
+
+ TII = Fn.getSubtarget().getInstrInfo();
+ MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+ AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+
+ RegClassInfo.runOnMachineFunction(Fn);
+
+ TargetSubtargetInfo::AntiDepBreakMode AntiDepMode =
+ TargetSubtargetInfo::ANTIDEP_NONE;
+ SmallVector<const TargetRegisterClass*, 4> CriticalPathRCs;
+
+ // Check that post-RA scheduling is enabled for this target.
+ // This may upgrade the AntiDepMode.
+ if (!enablePostRAScheduler(Fn.getSubtarget(), PassConfig->getOptLevel(),
+ AntiDepMode, CriticalPathRCs))
+ return false;
+
+ // Check for antidep breaking override...
+ if (EnableAntiDepBreaking.getPosition() > 0) {
+ AntiDepMode = (EnableAntiDepBreaking == "all")
+ ? TargetSubtargetInfo::ANTIDEP_ALL
+ : ((EnableAntiDepBreaking == "critical")
+ ? TargetSubtargetInfo::ANTIDEP_CRITICAL
+ : TargetSubtargetInfo::ANTIDEP_NONE);
+ }
+
+ DEBUG(dbgs() << "PostRAScheduler\n");
+
+ SchedulePostRATDList Scheduler(Fn, MLI, AA, RegClassInfo, AntiDepMode,
+ CriticalPathRCs);
+
+ // Loop over all of the basic blocks
+ for (auto &MBB : Fn) {
+#ifndef NDEBUG
+ // If DebugDiv > 0 then only schedule MBB with (ID % DebugDiv) == DebugMod
+ if (DebugDiv > 0) {
+ static int bbcnt = 0;
+ if (bbcnt++ % DebugDiv != DebugMod)
+ continue;
+ dbgs() << "*** DEBUG scheduling " << Fn.getName()
+ << ":BB#" << MBB.getNumber() << " ***\n";
+ }
+#endif
+
+ // Initialize register live-range state for scheduling in this block.
+ Scheduler.startBlock(&MBB);
+
+ // Schedule each sequence of instructions not interrupted by a label
+ // or anything else that effectively needs to shut down scheduling.
+ MachineBasicBlock::iterator Current = MBB.end();
+ unsigned Count = MBB.size(), CurrentCount = Count;
+ for (MachineBasicBlock::iterator I = Current; I != MBB.begin();) {
+ MachineInstr &MI = *std::prev(I);
+ --Count;
+ // Calls are not scheduling boundaries before register allocation, but
+ // post-ra we don't gain anything by scheduling across calls since we
+ // don't need to worry about register pressure.
+ if (MI.isCall() || TII->isSchedulingBoundary(MI, &MBB, Fn)) {
+ Scheduler.enterRegion(&MBB, I, Current, CurrentCount - Count);
+ Scheduler.setEndIndex(CurrentCount);
+ Scheduler.schedule();
+ Scheduler.exitRegion();
+ Scheduler.EmitSchedule();
+ Current = &MI;
+ CurrentCount = Count;
+ Scheduler.Observe(MI, CurrentCount);
+ }
+ I = MI;
+ if (MI.isBundle())
+ Count -= MI.getBundleSize();
+ }
+ assert(Count == 0 && "Instruction count mismatch!");
+ assert((MBB.begin() == Current || CurrentCount != 0) &&
+ "Instruction count mismatch!");
+ Scheduler.enterRegion(&MBB, MBB.begin(), Current, CurrentCount);
+ Scheduler.setEndIndex(CurrentCount);
+ Scheduler.schedule();
+ Scheduler.exitRegion();
+ Scheduler.EmitSchedule();
+
+ // Clean up register live-range state.
+ Scheduler.finishBlock();
+
+ // Update register kills
+ Scheduler.fixupKills(&MBB);
+ }
+
+ return true;
+}
+
+/// StartBlock - Initialize register live-range state for scheduling in
+/// this block.
+///
+void SchedulePostRATDList::startBlock(MachineBasicBlock *BB) {
+ // Call the superclass.
+ ScheduleDAGInstrs::startBlock(BB);
+
+ // Reset the hazard recognizer and anti-dep breaker.
+ HazardRec->Reset();
+ if (AntiDepBreak)
+ AntiDepBreak->StartBlock(BB);
+}
+
+/// Schedule - Schedule the instruction range using list scheduling.
+///
+void SchedulePostRATDList::schedule() {
+ // Build the scheduling graph.
+ buildSchedGraph(AA);
+
+ if (AntiDepBreak) {
+ unsigned Broken =
+ AntiDepBreak->BreakAntiDependencies(SUnits, RegionBegin, RegionEnd,
+ EndIndex, DbgValues);
+
+ if (Broken != 0) {
+ // We made changes. Update the dependency graph.
+ // Theoretically we could update the graph in place:
+ // When a live range is changed to use a different register, remove
+ // the def's anti-dependence *and* output-dependence edges due to
+ // that register, and add new anti-dependence and output-dependence
+ // edges based on the next live range of the register.
+ ScheduleDAG::clearDAG();
+ buildSchedGraph(AA);
+
+ NumFixedAnti += Broken;
+ }
+ }
+
+ postprocessDAG();
+
+ DEBUG(dbgs() << "********** List Scheduling **********\n");
+ DEBUG(
+ for (const SUnit &SU : SUnits) {
+ SU.dumpAll(this);
+ dbgs() << '\n';
+ }
+ );
+
+ AvailableQueue.initNodes(SUnits);
+ ListScheduleTopDown();
+ AvailableQueue.releaseState();
+}
+
+/// Observe - Update liveness information to account for the current
+/// instruction, which will not be scheduled.
+///
+void SchedulePostRATDList::Observe(MachineInstr &MI, unsigned Count) {
+ if (AntiDepBreak)
+ AntiDepBreak->Observe(MI, Count, EndIndex);
+}
+
+/// FinishBlock - Clean up register live-range state.
+///
+void SchedulePostRATDList::finishBlock() {
+ if (AntiDepBreak)
+ AntiDepBreak->FinishBlock();
+
+ // Call the superclass.
+ ScheduleDAGInstrs::finishBlock();
+}
+
+/// Apply each ScheduleDAGMutation step in order.
+void SchedulePostRATDList::postprocessDAG() {
+ for (auto &M : Mutations)
+ M->apply(this);
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero.
+void SchedulePostRATDList::ReleaseSucc(SUnit *SU, SDep *SuccEdge) {
+ SUnit *SuccSU = SuccEdge->getSUnit();
+
+ if (SuccEdge->isWeak()) {
+ --SuccSU->WeakPredsLeft;
+ return;
+ }
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --SuccSU->NumPredsLeft;
+
+ // Standard scheduler algorithms will recompute the depth of the successor
+ // here as such:
+ // SuccSU->setDepthToAtLeast(SU->getDepth() + SuccEdge->getLatency());
+ //
+ // However, we lazily compute node depth instead. Note that
+ // ScheduleNodeTopDown has already updated the depth of this node which causes
+  // all descendants to be marked dirty. Setting the successor depth explicitly
+ // here would cause depth to be recomputed for all its ancestors. If the
+ // successor is not yet ready (because of a transitively redundant edge) then
+ // this causes depth computation to be quadratic in the size of the DAG.
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU)
+ PendingQueue.push_back(SuccSU);
+}
+
+/// ReleaseSuccessors - Call ReleaseSucc on each of SU's successors.
+void SchedulePostRATDList::ReleaseSuccessors(SUnit *SU) {
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ ReleaseSucc(SU, &*I);
+ }
+}
+
+/// ScheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void SchedulePostRATDList::ScheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ DEBUG(SU->dump(this));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() &&
+ "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ ReleaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue.scheduledNode(SU);
+}
+
+/// emitNoop - Add a noop to the current instruction sequence.
+void SchedulePostRATDList::emitNoop(unsigned CurCycle) {
+ DEBUG(dbgs() << "*** Emitting noop in cycle " << CurCycle << '\n');
+ HazardRec->EmitNoop();
+ Sequence.push_back(nullptr); // NULL here means noop
+ ++NumNoops;
+}
+
+/// ListScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void SchedulePostRATDList::ListScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // We're scheduling top-down but we're visiting the regions in
+ // bottom-up order, so we don't know the hazards at the start of a
+ // region. So assume no hazards (this should usually be ok as most
+ // blocks are a single region).
+ HazardRec->Reset();
+
+ // Release any successors of the special Entry node.
+ ReleaseSuccessors(&EntrySU);
+
+ // Add all leaves to Available queue.
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ // It is available if it has no predecessors.
+ if (!SUnits[i].NumPredsLeft && !SUnits[i].isAvailable) {
+ AvailableQueue.push(&SUnits[i]);
+ SUnits[i].isAvailable = true;
+ }
+ }
+
+ // In any cycle where we can't schedule any instructions, we must
+ // stall or emit a noop, depending on the target.
+ bool CycleHasInsts = false;
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ unsigned MinDepth = ~0u;
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() <= CurCycle) {
+ AvailableQueue.push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ } else if (PendingQueue[i]->getDepth() < MinDepth)
+ MinDepth = PendingQueue[i]->getDepth();
+ }
+
+ DEBUG(dbgs() << "\n*** Examining Available\n"; AvailableQueue.dump(this));
+
+ SUnit *FoundSUnit = nullptr, *NotPreferredSUnit = nullptr;
+ bool HasNoopHazards = false;
+ while (!AvailableQueue.empty()) {
+ SUnit *CurSUnit = AvailableQueue.pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ if (HazardRec->ShouldPreferAnother(CurSUnit)) {
+ if (!NotPreferredSUnit) {
+ // If this is the first non-preferred node for this cycle, then
+ // record it and continue searching for a preferred node. If this
+ // is not the first non-preferred node, then treat it as though
+ // there had been a hazard.
+ NotPreferredSUnit = CurSUnit;
+ continue;
+ }
+ } else {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // If we have a non-preferred node, push it back onto the available list.
+ // If we did not find a preferred node, then schedule this first
+ // non-preferred node.
+ if (NotPreferredSUnit) {
+ if (!FoundSUnit) {
+ DEBUG(dbgs() << "*** Will schedule a non-preferred instruction...\n");
+ FoundSUnit = NotPreferredSUnit;
+ } else {
+ AvailableQueue.push(NotPreferredSUnit);
+ }
+
+ NotPreferredSUnit = nullptr;
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue.push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule...
+ if (FoundSUnit) {
+ // If we need to emit noops prior to this instruction, then do so.
+ unsigned NumPreNoops = HazardRec->PreEmitNoops(FoundSUnit);
+ for (unsigned i = 0; i != NumPreNoops; ++i)
+ emitNoop(CurCycle);
+
+ // ... schedule the node...
+ ScheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+ CycleHasInsts = true;
+ if (HazardRec->atIssueLimit()) {
+ DEBUG(dbgs() << "*** Max instructions per cycle " << CurCycle << '\n');
+ HazardRec->AdvanceCycle();
+ ++CurCycle;
+ CycleHasInsts = false;
+ }
+ } else {
+ if (CycleHasInsts) {
+ DEBUG(dbgs() << "*** Finished cycle " << CurCycle << '\n');
+ HazardRec->AdvanceCycle();
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall, but no other problem,
+ // just advance the current cycle and try again.
+ DEBUG(dbgs() << "*** Stall in cycle " << CurCycle << '\n');
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ emitNoop(CurCycle);
+ }
+
+ ++CurCycle;
+ CycleHasInsts = false;
+ }
+ }
+
+#ifndef NDEBUG
+ unsigned ScheduledNodes = VerifyScheduledDAG(/*isBottomUp=*/false);
+ unsigned Noops = 0;
+ for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+ if (!Sequence[i])
+ ++Noops;
+ assert(Sequence.size() - Noops == ScheduledNodes &&
+ "The number of nodes scheduled doesn't match the expected number!");
+#endif // NDEBUG
+}
+
+// EmitSchedule - Emit the machine code in scheduled order.
+void SchedulePostRATDList::EmitSchedule() {
+ RegionBegin = RegionEnd;
+
+ // If first instruction was a DBG_VALUE then put it back.
+ if (FirstDbgValue)
+ BB->splice(RegionEnd, BB, FirstDbgValue);
+
+ // Then re-insert them according to the given schedule.
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ if (SUnit *SU = Sequence[i])
+ BB->splice(RegionEnd, BB, SU->getInstr());
+ else
+ // Null SUnit* is a noop.
+ TII->insertNoop(*BB, RegionEnd);
+
+ // Update the Begin iterator, as the first instruction in the block
+ // may have been scheduled later.
+ if (i == 0)
+ RegionBegin = std::prev(RegionEnd);
+ }
+
+ // Reinsert any remaining debug_values.
+ for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+ DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
+ std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
+ MachineInstr *DbgValue = P.first;
+    MachineBasicBlock::iterator OrigPrevMI = P.second;
+    BB->splice(++OrigPrevMI, BB, DbgValue);
+ }
+ DbgValues.clear();
+ FirstDbgValue = nullptr;
+}
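[Editorial illustration, not part of the patch.] As a rough sketch of the list-scheduling loop implemented by ListScheduleTopDown, minus hazard recognition, anti-dependency breaking, and the LatencyPriorityQueue heuristics, the standalone code below schedules an invented dependence DAG top-down: nodes whose predecessor counts reach zero move to a pending list, become available once their ready cycle is reached, and the clock advances when nothing can issue (a stall). The Node type and latencies are made up for the example.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Node {
  std::vector<int> Succs;   // successor indices
  unsigned Latency = 1;     // cycles before successors may start
  unsigned NumPredsLeft = 0;
  unsigned ReadyCycle = 0;
};

static std::vector<int> listScheduleTopDown(std::vector<Node> &DAG) {
  std::vector<int> Sequence, Pending, Available;
  for (int I = 0, E = (int)DAG.size(); I != E; ++I)
    if (DAG[I].NumPredsLeft == 0)
      Available.push_back(I);       // leaves are immediately schedulable

  unsigned CurCycle = 0;
  while (!Available.empty() || !Pending.empty()) {
    // Move pending nodes whose operands are now ready.
    for (size_t I = 0; I != Pending.size();) {
      if (DAG[Pending[I]].ReadyCycle <= CurCycle) {
        Available.push_back(Pending[I]);
        Pending[I] = Pending.back();
        Pending.pop_back();
      } else
        ++I;
    }
    if (Available.empty()) { ++CurCycle; continue; } // pipeline stall
    int SU = Available.back();
    Available.pop_back();
    Sequence.push_back(SU);
    // Release successors: decrement pred counts, record earliest start cycle.
    for (int Succ : DAG[SU].Succs) {
      Node &S = DAG[Succ];
      S.ReadyCycle = std::max(S.ReadyCycle, CurCycle + DAG[SU].Latency);
      if (--S.NumPredsLeft == 0)
        Pending.push_back(Succ);
    }
    ++CurCycle;
  }
  return Sequence;
}

int main() {
  // Diamond DAG: 0 -> {1, 2} -> 3, where node 1 has a two-cycle latency.
  std::vector<Node> DAG(4);
  DAG[0].Succs = {1, 2};
  DAG[1].Succs = {3}; DAG[1].Latency = 2; DAG[1].NumPredsLeft = 1;
  DAG[2].Succs = {3}; DAG[2].NumPredsLeft = 1;
  DAG[3].NumPredsLeft = 2;
  for (int SU : listScheduleTopDown(DAG))
    std::printf("SU%d\n", SU);
}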
diff --git a/contrib/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/contrib/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
new file mode 100644
index 000000000000..fbc2bc64f425
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -0,0 +1,94 @@
+//===-- PreISelIntrinsicLowering.cpp - Pre-ISel intrinsic lowering pass ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR lowering for the llvm.load.relative intrinsic.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+namespace {
+
+bool lowerLoadRelative(Function &F) {
+ if (F.use_empty())
+ return false;
+
+ bool Changed = false;
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+ Type *Int32PtrTy = Int32Ty->getPointerTo();
+ Type *Int8Ty = Type::getInt8Ty(F.getContext());
+
+ for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
+ auto CI = dyn_cast<CallInst>(I->getUser());
+ ++I;
+ if (!CI || CI->getCalledValue() != &F)
+ continue;
+
+ IRBuilder<> B(CI);
+ Value *OffsetPtr =
+ B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
+ Value *OffsetPtrI32 = B.CreateBitCast(OffsetPtr, Int32PtrTy);
+ Value *OffsetI32 = B.CreateAlignedLoad(OffsetPtrI32, 4);
+
+ Value *ResultPtr = B.CreateGEP(Int8Ty, CI->getArgOperand(0), OffsetI32);
+
+ CI->replaceAllUsesWith(ResultPtr);
+ CI->eraseFromParent();
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool lowerIntrinsics(Module &M) {
+ bool Changed = false;
+ for (Function &F : M) {
+ if (F.getName().startswith("llvm.load.relative."))
+ Changed |= lowerLoadRelative(F);
+ }
+ return Changed;
+}
+
+class PreISelIntrinsicLoweringLegacyPass : public ModulePass {
+public:
+ static char ID;
+ PreISelIntrinsicLoweringLegacyPass() : ModulePass(ID) {}
+
+ bool runOnModule(Module &M) { return lowerIntrinsics(M); }
+};
+
+char PreISelIntrinsicLoweringLegacyPass::ID;
+}
+
+INITIALIZE_PASS(PreISelIntrinsicLoweringLegacyPass,
+ "pre-isel-intrinsic-lowering", "Pre-ISel Intrinsic Lowering",
+ false, false)
+
+namespace llvm {
+ModulePass *createPreISelIntrinsicLoweringPass() {
+ return new PreISelIntrinsicLoweringLegacyPass;
+}
+
+PreservedAnalyses PreISelIntrinsicLoweringPass::run(Module &M,
+ ModuleAnalysisManager &AM) {
+ if (!lowerIntrinsics(M))
+ return PreservedAnalyses::all();
+ else
+ return PreservedAnalyses::none();
+}
+} // End llvm namespace
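[Editorial illustration, not part of the patch.] lowerLoadRelative above rewrites each llvm.load.relative call into a GEP, a 4-byte aligned i32 load, and a second GEP. The plain C++ sketch below, with invented names and operating on an ordinary byte buffer, shows the arithmetic those three instructions perform, which may make the intent of the generated IR easier to follow.

#include <cstdint>
#include <cstdio>
#include <cstring>

// Equivalent of: %off = load i32 at (%base + %offset); result = %base + %off
static const uint8_t *loadRelative(const uint8_t *Base, int64_t Offset) {
  int32_t Off;
  std::memcpy(&Off, Base + Offset, sizeof(Off)); // the 4-byte aligned load
  return Base + Off;                             // GEP of the loaded offset
}

int main() {
  // A tiny "relative table": entry 0 stores the distance to the payload byte.
  alignas(4) uint8_t Buf[8] = {0};
  int32_t Delta = 6;
  std::memcpy(Buf, &Delta, sizeof(Delta));
  Buf[6] = 42;
  std::printf("%d\n", (int)*loadRelative(Buf, 0)); // prints 42
}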
diff --git a/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
new file mode 100644
index 000000000000..d27ea2f51867
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ProcessImplicitDefs.cpp
@@ -0,0 +1,168 @@
+//===---------------------- ProcessImplicitDefs.cpp -----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "processimplicitdefs"
+
+namespace {
+/// Process IMPLICIT_DEF instructions and make sure there is one implicit_def
+/// for each use. Add isUndef marker to implicit_def defs and their uses.
+class ProcessImplicitDefs : public MachineFunctionPass {
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+
+ SmallSetVector<MachineInstr*, 16> WorkList;
+
+ void processImplicitDef(MachineInstr *MI);
+ bool canTurnIntoImplicitDef(MachineInstr *MI);
+
+public:
+ static char ID;
+
+ ProcessImplicitDefs() : MachineFunctionPass(ID) {
+ initializeProcessImplicitDefsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &au) const override;
+
+ bool runOnMachineFunction(MachineFunction &fn) override;
+};
+} // end anonymous namespace
+
+char ProcessImplicitDefs::ID = 0;
+char &llvm::ProcessImplicitDefsID = ProcessImplicitDefs::ID;
+
+INITIALIZE_PASS_BEGIN(ProcessImplicitDefs, "processimpdefs",
+ "Process Implicit Definitions", false, false)
+INITIALIZE_PASS_END(ProcessImplicitDefs, "processimpdefs",
+ "Process Implicit Definitions", false, false)
+
+void ProcessImplicitDefs::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool ProcessImplicitDefs::canTurnIntoImplicitDef(MachineInstr *MI) {
+ if (!MI->isCopyLike() &&
+ !MI->isInsertSubreg() &&
+ !MI->isRegSequence() &&
+ !MI->isPHI())
+ return false;
+ for (const MachineOperand &MO : MI->operands())
+ if (MO.isReg() && MO.isUse() && MO.readsReg())
+ return false;
+ return true;
+}
+
+void ProcessImplicitDefs::processImplicitDef(MachineInstr *MI) {
+ DEBUG(dbgs() << "Processing " << *MI);
+ unsigned Reg = MI->getOperand(0).getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ // For virtual registers, mark all uses as <undef>, and convert users to
+ // implicit-def when possible.
+ for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
+ MO.setIsUndef();
+ MachineInstr *UserMI = MO.getParent();
+ if (!canTurnIntoImplicitDef(UserMI))
+ continue;
+ DEBUG(dbgs() << "Converting to IMPLICIT_DEF: " << *UserMI);
+ UserMI->setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ WorkList.insert(UserMI);
+ }
+ MI->eraseFromParent();
+ return;
+ }
+
+ // This is a physreg implicit-def.
+ // Look for the first instruction to use or define an alias.
+ MachineBasicBlock::instr_iterator UserMI = MI->getIterator();
+ MachineBasicBlock::instr_iterator UserE = MI->getParent()->instr_end();
+ bool Found = false;
+ for (++UserMI; UserMI != UserE; ++UserMI) {
+ for (MachineOperand &MO : UserMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned UserReg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(UserReg) ||
+ !TRI->regsOverlap(Reg, UserReg))
+ continue;
+ // UserMI uses or redefines Reg. Set <undef> flags on all uses.
+ Found = true;
+ if (MO.isUse())
+ MO.setIsUndef();
+ }
+ if (Found)
+ break;
+ }
+
+ // If we found the using MI, we can erase the IMPLICIT_DEF.
+ if (Found) {
+ DEBUG(dbgs() << "Physreg user: " << *UserMI);
+ MI->eraseFromParent();
+ return;
+ }
+
+  // The using instruction wasn't found; it could be in another block.
+ // Leave the physreg IMPLICIT_DEF, but trim any extra operands.
+ for (unsigned i = MI->getNumOperands() - 1; i; --i)
+ MI->RemoveOperand(i);
+ DEBUG(dbgs() << "Keeping physreg: " << *MI);
+}
+
+/// processImplicitDefs - Process IMPLICIT_DEF instructions and turn them into
+/// <undef> operands.
+bool ProcessImplicitDefs::runOnMachineFunction(MachineFunction &MF) {
+
+ DEBUG(dbgs() << "********** PROCESS IMPLICIT DEFS **********\n"
+ << "********** Function: " << MF.getName() << '\n');
+
+ bool Changed = false;
+
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "ProcessImplicitDefs only works on SSA form.");
+ assert(WorkList.empty() && "Inconsistent worklist state");
+
+ for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end();
+ MFI != MFE; ++MFI) {
+ // Scan the basic block for implicit defs.
+ for (MachineBasicBlock::instr_iterator MBBI = MFI->instr_begin(),
+ MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI)
+ if (MBBI->isImplicitDef())
+ WorkList.insert(&*MBBI);
+
+ if (WorkList.empty())
+ continue;
+
+ DEBUG(dbgs() << "BB#" << MFI->getNumber() << " has " << WorkList.size()
+ << " implicit defs.\n");
+ Changed = true;
+
+ // Drain the WorkList to recursively process any new implicit defs.
+ do processImplicitDef(WorkList.pop_back_val());
+ while (!WorkList.empty());
+ }
+ return Changed;
+}
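[Editorial illustration, not part of the patch.] The worklist in runOnMachineFunction drains recursively: converting a copy-like user into IMPLICIT_DEF can expose further users that also become implicit defs. The sketch below reproduces only that propagation shape on an invented value graph; it is not the MachineInstr-based logic, just an illustration of why newly converted users are pushed back onto the worklist.

#include <cstdio>
#include <vector>

struct Value {
  std::vector<int> CopyLikeUsers; // indices of values copied from this one
  bool Undef = false;
};

static void propagateImplicitDefs(std::vector<Value> &Values,
                                  std::vector<int> Worklist) {
  while (!Worklist.empty()) {
    int V = Worklist.back();
    Worklist.pop_back();
    for (int User : Values[V].CopyLikeUsers) {
      if (Values[User].Undef)
        continue;                  // already converted
      Values[User].Undef = true;   // becomes an implicit def itself
      Worklist.push_back(User);    // process the new implicit def recursively
    }
  }
}

int main() {
  // Value 0 is an implicit def; 1 copies 0; 2 copies 1; 3 is unrelated.
  std::vector<Value> Values(4);
  Values[0].CopyLikeUsers = {1};
  Values[1].CopyLikeUsers = {2};
  Values[0].Undef = true;
  propagateImplicitDefs(Values, {0});
  for (int I = 0; I != 4; ++I)
    std::printf("value %d undef: %d\n", I, (int)Values[I].Undef);
}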
diff --git a/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
new file mode 100644
index 000000000000..20a9a394ebd0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -0,0 +1,1234 @@
+//===-- PrologEpilogInserter.cpp - Insert Prolog/Epilog code in function --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is responsible for finalizing the function's frame layout, saving
+// callee saved registers, and for emitting prolog & epilog code for the
+// function.
+//
+// This pass must be run after register allocation. After this pass is
+// executed, it is illegal to construct MO_FrameIndex operands.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <climits>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "pei"
+
+typedef SmallVector<MachineBasicBlock *, 4> MBBVector;
+static void doSpillCalleeSavedRegs(MachineFunction &MF, RegScavenger *RS,
+ unsigned &MinCSFrameIndex,
+                                   unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks);
+
+static void doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS);
+
+namespace {
+class PEI : public MachineFunctionPass {
+public:
+ static char ID;
+ explicit PEI(const TargetMachine *TM = nullptr) : MachineFunctionPass(ID) {
+ initializePEIPass(*PassRegistry::getPassRegistry());
+
+ if (TM && (!TM->usesPhysRegsForPEI())) {
+ SpillCalleeSavedRegisters = [](MachineFunction &, RegScavenger *,
+ unsigned &, unsigned &, const MBBVector &,
+ const MBBVector &) {};
+ ScavengeFrameVirtualRegs = [](MachineFunction &, RegScavenger *) {};
+ } else {
+ SpillCalleeSavedRegisters = doSpillCalleeSavedRegs;
+ ScavengeFrameVirtualRegs = doScavengeFrameVirtualRegs;
+ UsesCalleeSaves = true;
+ }
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ MachineFunctionProperties MFP;
+ if (UsesCalleeSaves)
+ MFP.set(MachineFunctionProperties::Property::AllVRegsAllocated);
+ return MFP;
+ }
+
+ /// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+ /// frame indexes with appropriate references.
+ ///
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+private:
+ std::function<void(MachineFunction &MF, RegScavenger *RS,
+ unsigned &MinCSFrameIndex, unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks)>
+ SpillCalleeSavedRegisters;
+ std::function<void(MachineFunction &MF, RegScavenger *RS)>
+ ScavengeFrameVirtualRegs;
+
+ bool UsesCalleeSaves = false;
+
+ RegScavenger *RS;
+
+ // MinCSFrameIndex, MaxCSFrameIndex - Keeps the range of callee saved
+ // stack frame indexes.
+ unsigned MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ unsigned MaxCSFrameIndex = 0;
+
+ // Save and Restore blocks of the current function. Typically there is a
+ // single save block, unless Windows EH funclets are involved.
+ MBBVector SaveBlocks;
+ MBBVector RestoreBlocks;
+
+ // Flag to control whether to use the register scavenger to resolve
+ // frame index materialization registers. Set according to
+ // TRI->requiresFrameIndexScavenging() for the current function.
+ bool FrameIndexVirtualScavenging;
+
+ void calculateCallFrameInfo(MachineFunction &Fn);
+ void calculateSaveRestoreBlocks(MachineFunction &Fn);
+
+ void calculateFrameObjectOffsets(MachineFunction &Fn);
+ void replaceFrameIndices(MachineFunction &Fn);
+ void replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+ int &SPAdj);
+ void insertPrologEpilogCode(MachineFunction &Fn);
+};
+} // namespace
+
+char PEI::ID = 0;
+char &llvm::PrologEpilogCodeInserterID = PEI::ID;
+
+static cl::opt<unsigned>
+WarnStackSize("warn-stack-size", cl::Hidden, cl::init((unsigned)-1),
+ cl::desc("Warn for stack size bigger than the given"
+ " number"));
+
+INITIALIZE_TM_PASS_BEGIN(PEI, "prologepilog", "Prologue/Epilogue Insertion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(StackProtector)
+INITIALIZE_TM_PASS_END(PEI, "prologepilog",
+ "Prologue/Epilogue Insertion & Frame Finalization",
+ false, false)
+
+MachineFunctionPass *
+llvm::createPrologEpilogInserterPass(const TargetMachine *TM) {
+ return new PEI(TM);
+}
+
+STATISTIC(NumScavengedRegs, "Number of frame index regs scavenged");
+STATISTIC(NumBytesStackSpace,
+ "Number of bytes used for stack in all functions");
+
+void PEI::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<StackProtector>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+
+/// StackObjSet - A set of stack object indexes
+typedef SmallSetVector<int, 8> StackObjSet;
+
+/// runOnMachineFunction - Insert prolog/epilog code and replace abstract
+/// frame indexes with appropriate references.
+///
+bool PEI::runOnMachineFunction(MachineFunction &Fn) {
+ const Function* F = Fn.getFunction();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+
+ RS = TRI->requiresRegisterScavenging(Fn) ? new RegScavenger() : nullptr;
+ FrameIndexVirtualScavenging = TRI->requiresFrameIndexScavenging(Fn);
+
+ // Calculate the MaxCallFrameSize and AdjustsStack variables for the
+ // function's frame information. Also eliminates call frame pseudo
+ // instructions.
+ calculateCallFrameInfo(Fn);
+
+ // Determine placement of CSR spill/restore code and prolog/epilog code:
+ // place all spills in the entry block, all restores in return blocks.
+ calculateSaveRestoreBlocks(Fn);
+
+ // Handle CSR spilling and restoring, for targets that need it.
+ SpillCalleeSavedRegisters(Fn, RS, MinCSFrameIndex, MaxCSFrameIndex,
+ SaveBlocks, RestoreBlocks);
+
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TFI->processFunctionBeforeFrameFinalized(Fn, RS);
+
+ // Calculate actual frame offsets for all abstract stack objects...
+ calculateFrameObjectOffsets(Fn);
+
+ // Add prolog and epilog code to the function. This function is required
+ // to align the stack frame as necessary for any stack variables or
+  // called functions. Because of this, calculateCallFrameInfo() must be
+  // called before this function in order to set the AdjustsStack and
+  // MaxCallFrameSize variables.
+ if (!F->hasFnAttribute(Attribute::Naked))
+ insertPrologEpilogCode(Fn);
+
+ // Replace all MO_FrameIndex operands with physical register references
+ // and actual offsets.
+ //
+ replaceFrameIndices(Fn);
+
+  // If register scavenging is needed and we've enabled doing it as a
+  // post-pass, scavenge the virtual registers that frame index elimination
+  // inserted.
+ if (TRI->requiresRegisterScavenging(Fn) && FrameIndexVirtualScavenging) {
+ ScavengeFrameVirtualRegs(Fn, RS);
+
+ // Clear any vregs created by virtual scavenging.
+ Fn.getRegInfo().clearVirtRegs();
+ }
+
+  // Warn on stack size when it exceeds the given limit.
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ uint64_t StackSize = MFI->getStackSize();
+ if (WarnStackSize.getNumOccurrences() > 0 && WarnStackSize < StackSize) {
+ DiagnosticInfoStackSize DiagStackSize(*F, StackSize);
+ F->getContext().diagnose(DiagStackSize);
+ }
+
+ delete RS;
+ SaveBlocks.clear();
+ RestoreBlocks.clear();
+ MFI->setSavePoint(nullptr);
+ MFI->setRestorePoint(nullptr);
+ return true;
+}
+
+/// Calculate the MaxCallFrameSize and AdjustsStack
+/// variables for the function's frame information and eliminate call frame
+/// pseudo instructions.
+void PEI::calculateCallFrameInfo(MachineFunction &Fn) {
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ unsigned MaxCallFrameSize = 0;
+ bool AdjustsStack = MFI->adjustsStack();
+
+  // Get the function call frame set-up and tear-down instruction opcodes.
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+ // Early exit for targets which have no call frame setup/destroy pseudo
+ // instructions.
+ if (FrameSetupOpcode == ~0u && FrameDestroyOpcode == ~0u)
+ return;
+
+ std::vector<MachineBasicBlock::iterator> FrameSDOps;
+ for (MachineFunction::iterator BB = Fn.begin(), E = Fn.end(); BB != E; ++BB)
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ++I)
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ assert(I->getNumOperands() >= 1 && "Call Frame Setup/Destroy Pseudo"
+ " instructions should have a single immediate argument!");
+ unsigned Size = I->getOperand(0).getImm();
+ if (Size > MaxCallFrameSize) MaxCallFrameSize = Size;
+ AdjustsStack = true;
+ FrameSDOps.push_back(I);
+ } else if (I->isInlineAsm()) {
+        // Some inline asms need a stack frame, as indicated by operand 1.
+ unsigned ExtraInfo = I->getOperand(InlineAsm::MIOp_ExtraInfo).getImm();
+ if (ExtraInfo & InlineAsm::Extra_IsAlignStack)
+ AdjustsStack = true;
+ }
+
+ MFI->setAdjustsStack(AdjustsStack);
+ MFI->setMaxCallFrameSize(MaxCallFrameSize);
+
+ for (std::vector<MachineBasicBlock::iterator>::iterator
+ i = FrameSDOps.begin(), e = FrameSDOps.end(); i != e; ++i) {
+ MachineBasicBlock::iterator I = *i;
+
+ // If call frames are not being included as part of the stack frame, and
+ // the target doesn't indicate otherwise, remove the call frame pseudos
+ // here. The sub/add sp instruction pairs are still inserted, but we don't
+ // need to track the SP adjustment for frame index elimination.
+ if (TFI->canSimplifyCallFramePseudos(Fn))
+ TFI->eliminateCallFramePseudoInstr(Fn, *I->getParent(), I);
+ }
+}
+
+/// Compute the sets of entry and return blocks for saving and restoring
+/// callee-saved registers, and placing prolog and epilog code.
+void PEI::calculateSaveRestoreBlocks(MachineFunction &Fn) {
+ const MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Even when we do not change any CSR, we still want to insert the
+ // prologue and epilogue of the function.
+ // So set the save points for those.
+
+ // Use the points found by shrink-wrapping, if any.
+ if (MFI->getSavePoint()) {
+ SaveBlocks.push_back(MFI->getSavePoint());
+ assert(MFI->getRestorePoint() && "Both restore and save must be set");
+ MachineBasicBlock *RestoreBlock = MFI->getRestorePoint();
+ // If RestoreBlock does not have any successor and is not a return block
+ // then the end point is unreachable and we do not need to insert any
+ // epilogue.
+ if (!RestoreBlock->succ_empty() || RestoreBlock->isReturnBlock())
+ RestoreBlocks.push_back(RestoreBlock);
+ return;
+ }
+
+ // Save refs to entry and return blocks.
+ SaveBlocks.push_back(&Fn.front());
+ for (MachineBasicBlock &MBB : Fn) {
+ if (MBB.isEHFuncletEntry())
+ SaveBlocks.push_back(&MBB);
+ if (MBB.isReturnBlock())
+ RestoreBlocks.push_back(&MBB);
+ }
+}
+
+static void assignCalleeSavedSpillSlots(MachineFunction &F,
+ const BitVector &SavedRegs,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex) {
+ if (SavedRegs.empty())
+ return;
+
+ const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
+ const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+
+ std::vector<CalleeSavedInfo> CSI;
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (SavedRegs.test(Reg))
+ CSI.push_back(CalleeSavedInfo(Reg));
+ }
+
+ const TargetFrameLowering *TFI = F.getSubtarget().getFrameLowering();
+ MachineFrameInfo *MFI = F.getFrameInfo();
+ if (!TFI->assignCalleeSavedSpillSlots(F, RegInfo, CSI)) {
+    // If the target doesn't implement this, use generic code.
+
+ if (CSI.empty())
+ return; // Early exit if no callee saved registers are modified!
+
+ unsigned NumFixedSpillSlots;
+ const TargetFrameLowering::SpillSlot *FixedSpillSlots =
+ TFI->getCalleeSavedSpillSlots(NumFixedSpillSlots);
+
+ // Now that we know which registers need to be saved and restored, allocate
+ // stack slots for them.
+ for (auto &CS : CSI) {
+ unsigned Reg = CS.getReg();
+ const TargetRegisterClass *RC = RegInfo->getMinimalPhysRegClass(Reg);
+
+ int FrameIdx;
+ if (RegInfo->hasReservedSpillSlot(F, Reg, FrameIdx)) {
+ CS.setFrameIdx(FrameIdx);
+ continue;
+ }
+
+ // Check to see if this physreg must be spilled to a particular stack slot
+ // on this target.
+ const TargetFrameLowering::SpillSlot *FixedSlot = FixedSpillSlots;
+ while (FixedSlot != FixedSpillSlots + NumFixedSpillSlots &&
+ FixedSlot->Reg != Reg)
+ ++FixedSlot;
+
+ if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
+ // Nope, just spill it anywhere convenient.
+ unsigned Align = RC->getAlignment();
+ unsigned StackAlign = TFI->getStackAlignment();
+
+ // We may not be able to satisfy the desired alignment specification of
+ // the TargetRegisterClass if the stack alignment is smaller. Use the
+ // min.
+ Align = std::min(Align, StackAlign);
+ FrameIdx = MFI->CreateStackObject(RC->getSize(), Align, true);
+ if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
+ if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
+ } else {
+ // Spill it to the stack where we must.
+ FrameIdx =
+ MFI->CreateFixedSpillStackObject(RC->getSize(), FixedSlot->Offset);
+ }
+
+ CS.setFrameIdx(FrameIdx);
+ }
+ }
+
+ MFI->setCalleeSavedInfo(CSI);
+}
+
+/// Helper function to update the liveness information for the callee-saved
+/// registers.
+static void updateLiveness(MachineFunction &MF) {
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ // Visited will contain all the basic blocks that are in the region
+ // where the callee saved registers are alive:
+ // - Anything that is not Save or Restore -> LiveThrough.
+ // - Save -> LiveIn.
+ // - Restore -> LiveOut.
+ // The live-out is not attached to the block, so no need to keep
+ // Restore in this set.
+ SmallPtrSet<MachineBasicBlock *, 8> Visited;
+ SmallVector<MachineBasicBlock *, 8> WorkList;
+ MachineBasicBlock *Entry = &MF.front();
+ MachineBasicBlock *Save = MFI->getSavePoint();
+
+ if (!Save)
+ Save = Entry;
+
+ if (Entry != Save) {
+ WorkList.push_back(Entry);
+ Visited.insert(Entry);
+ }
+ Visited.insert(Save);
+
+ MachineBasicBlock *Restore = MFI->getRestorePoint();
+ if (Restore)
+ // By construction Restore cannot be visited, otherwise it
+ // means there exists a path to Restore that does not go
+ // through Save.
+ WorkList.push_back(Restore);
+
+ while (!WorkList.empty()) {
+ const MachineBasicBlock *CurBB = WorkList.pop_back_val();
+ // By construction, the region that is after the save point is
+ // dominated by the Save and post-dominated by the Restore.
+ if (CurBB == Save && Save != Restore)
+ continue;
+ // Enqueue all the successors not already visited.
+ // Those are by construction either before Save or after Restore.
+ for (MachineBasicBlock *SuccBB : CurBB->successors())
+ if (Visited.insert(SuccBB).second)
+ WorkList.push_back(SuccBB);
+ }
+
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ for (MachineBasicBlock *MBB : Visited) {
+ MCPhysReg Reg = CSI[i].getReg();
+ // Add the callee-saved register as live-in.
+ // It's killed at the spill.
+ if (!MBB->isLiveIn(Reg))
+ MBB->addLiveIn(Reg);
+ }
+ }
+}
+
+/// insertCSRSpillsAndRestores - Insert spill and restore code for
+/// callee saved registers used in the function.
+///
+static void insertCSRSpillsAndRestores(MachineFunction &Fn,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks) {
+ // Get callee saved register information.
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+ const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
+
+ MFI->setCalleeSavedInfoValid(true);
+
+ // Early exit if no callee saved registers are modified!
+ if (CSI.empty())
+ return;
+
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ const TargetRegisterInfo *TRI = Fn.getSubtarget().getRegisterInfo();
+ MachineBasicBlock::iterator I;
+
+ // Spill using target interface.
+ for (MachineBasicBlock *SaveBlock : SaveBlocks) {
+ I = SaveBlock->begin();
+ if (!TFI->spillCalleeSavedRegisters(*SaveBlock, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Insert the spill to the stack frame.
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(*SaveBlock, I, Reg, true, CSI[i].getFrameIdx(),
+ RC, TRI);
+ }
+ }
+ // Update the live-in information of all the blocks up to the save point.
+ updateLiveness(Fn);
+ }
+
+ // Restore using target interface.
+ for (MachineBasicBlock *MBB : RestoreBlocks) {
+ I = MBB->end();
+
+ // Skip over all terminator instructions, which are part of the return
+ // sequence.
+ MachineBasicBlock::iterator I2 = I;
+ while (I2 != MBB->begin() && (--I2)->isTerminator())
+ I = I2;
+
+ bool AtStart = I == MBB->begin();
+ MachineBasicBlock::iterator BeforeI = I;
+ if (!AtStart)
+ --BeforeI;
+
+ // Restore all registers immediately before the return and any
+ // terminators that precede it.
+ if (!TFI->restoreCalleeSavedRegisters(*MBB, I, CSI, TRI)) {
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(*MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+ assert(I != MBB->begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ // Insert in reverse order. loadRegFromStackSlot can insert
+ // multiple instructions.
+ if (AtStart)
+ I = MBB->begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+ }
+ }
+}
+
+static void doSpillCalleeSavedRegs(MachineFunction &Fn, RegScavenger *RS,
+ unsigned &MinCSFrameIndex,
+ unsigned &MaxCSFrameIndex,
+ const MBBVector &SaveBlocks,
+ const MBBVector &RestoreBlocks) {
+ const Function *F = Fn.getFunction();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ MinCSFrameIndex = std::numeric_limits<unsigned>::max();
+ MaxCSFrameIndex = 0;
+
+ // Determine which of the registers in the callee save list should be saved.
+ BitVector SavedRegs;
+ TFI->determineCalleeSaves(Fn, SavedRegs, RS);
+
+ // Assign stack slots for any callee-saved registers that must be spilled.
+ assignCalleeSavedSpillSlots(Fn, SavedRegs, MinCSFrameIndex, MaxCSFrameIndex);
+
+ // Add the code to save and restore the callee saved registers.
+ if (!F->hasFnAttribute(Attribute::Naked))
+ insertCSRSpillsAndRestores(Fn, SaveBlocks, RestoreBlocks);
+}
+
+/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
+static inline void
+AdjustStackOffset(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ unsigned &MaxAlign, unsigned Skew) {
+ // If the stack grows down, add the object size to find the lowest address.
+ if (StackGrowsDown)
+ Offset += MFI->getObjectSize(FrameIdx);
+
+ unsigned Align = MFI->getObjectAlignment(FrameIdx);
+
+ // If the alignment of this object is greater than that of the stack, then
+ // increase the stack alignment to match.
+ MaxAlign = std::max(MaxAlign, Align);
+
+ // Adjust to alignment boundary.
+ Offset = alignTo(Offset, Align, Skew);
+
+ if (StackGrowsDown) {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, -Offset); // Set the computed offset
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(FrameIdx, Offset);
+ Offset += MFI->getObjectSize(FrameIdx);
+ }
+}
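To make the arithmetic above concrete, here is a standalone sketch, separate from the patch itself, that replays the down-growing-stack branch of AdjustStackOffset on made-up object sizes and alignments; the local alignTo helper is assumed to match the behavior of llvm::alignTo, including the Skew parameter.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

// Assumed equivalent of llvm::alignTo(Value, Align, Skew).
static uint64_t alignTo(uint64_t Value, uint64_t Align, uint64_t Skew = 0) {
  assert(Align != 0 && "Align can't be 0.");
  Skew %= Align;
  return (Value + Align - 1 - Skew) / Align * Align + Skew;
}

int main() {
  int64_t Offset = 16;    // pretend the fixed/callee-save area already uses 16 bytes
  unsigned MaxAlign = 8;
  const unsigned Skew = 0;

  // Hypothetical objects: {size, alignment}.
  struct Obj { int64_t Size; unsigned Align; };
  const Obj Objects[] = {{4, 4}, {8, 8}, {1, 16}};

  for (const Obj &O : Objects) {
    Offset += O.Size;                        // stack grows down: step past the object
    MaxAlign = std::max(MaxAlign, O.Align);  // the frame may need a larger alignment
    Offset = alignTo(Offset, O.Align, Skew); // snap to the object's alignment
    std::printf("size %lld align %u -> SP[%lld]\n",
                (long long)O.Size, O.Align, (long long)-Offset);
  }
  std::printf("running offset %lld, MaxAlign %u\n", (long long)Offset, MaxAlign);
  return 0;
}

With these numbers the three objects land at SP[-20], SP[-32] and SP[-48], and the running offset ends at 48 with MaxAlign raised to 16.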
+
+/// Compute which bytes of fixed and callee-save stack area are unused and keep
+/// track of them in StackBytesFree.
+///
+static inline void
+computeFreeStackSlots(MachineFrameInfo *MFI, bool StackGrowsDown,
+ unsigned MinCSFrameIndex, unsigned MaxCSFrameIndex,
+ int64_t FixedCSEnd, BitVector &StackBytesFree) {
+ // Avoid undefined int64_t -> int conversion below in extreme case.
+ if (FixedCSEnd > std::numeric_limits<int>::max())
+ return;
+
+ StackBytesFree.resize(FixedCSEnd, true);
+
+ SmallVector<int, 16> AllocatedFrameSlots;
+ // Add fixed objects.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i)
+ AllocatedFrameSlots.push_back(i);
+ // Add callee-save objects.
+ for (int i = MinCSFrameIndex; i <= (int)MaxCSFrameIndex; ++i)
+ AllocatedFrameSlots.push_back(i);
+
+ for (int i : AllocatedFrameSlots) {
+ // These are converted from int64_t, but they should always fit in int
+ // because of the FixedCSEnd check above.
+ int ObjOffset = MFI->getObjectOffset(i);
+ int ObjSize = MFI->getObjectSize(i);
+ int ObjStart, ObjEnd;
+ if (StackGrowsDown) {
+ // ObjOffset is negative when StackGrowsDown is true.
+ ObjStart = -ObjOffset - ObjSize;
+ ObjEnd = -ObjOffset;
+ } else {
+ ObjStart = ObjOffset;
+ ObjEnd = ObjOffset + ObjSize;
+ }
+ // Ignore fixed holes that are in the previous stack frame.
+ if (ObjEnd > 0)
+ StackBytesFree.reset(ObjStart, ObjEnd);
+ }
+}
+
+/// Assign frame object to an unused portion of the stack in the fixed stack
+/// object range. Return true if the allocation was successful.
+///
+static inline bool scavengeStackSlot(MachineFrameInfo *MFI, int FrameIdx,
+ bool StackGrowsDown, unsigned MaxAlign,
+ BitVector &StackBytesFree) {
+ if (MFI->isVariableSizedObjectIndex(FrameIdx))
+ return false;
+
+ if (StackBytesFree.none()) {
+    // Clear it to speed up later scavengeStackSlot calls to
+    // StackBytesFree.none().
+ StackBytesFree.clear();
+ return false;
+ }
+
+ unsigned ObjAlign = MFI->getObjectAlignment(FrameIdx);
+ if (ObjAlign > MaxAlign)
+ return false;
+
+ int64_t ObjSize = MFI->getObjectSize(FrameIdx);
+ int FreeStart;
+ for (FreeStart = StackBytesFree.find_first(); FreeStart != -1;
+ FreeStart = StackBytesFree.find_next(FreeStart)) {
+
+ // Check that free space has suitable alignment.
+ unsigned ObjStart = StackGrowsDown ? FreeStart + ObjSize : FreeStart;
+ if (alignTo(ObjStart, ObjAlign) != ObjStart)
+ continue;
+
+ if (FreeStart + ObjSize > StackBytesFree.size())
+ return false;
+
+ bool AllBytesFree = true;
+ for (unsigned Byte = 0; Byte < ObjSize; ++Byte)
+ if (!StackBytesFree.test(FreeStart + Byte)) {
+ AllBytesFree = false;
+ break;
+ }
+ if (AllBytesFree)
+ break;
+ }
+
+ if (FreeStart == -1)
+ return false;
+
+ if (StackGrowsDown) {
+ int ObjStart = -(FreeStart + ObjSize);
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << ObjStart
+ << "]\n");
+ MFI->setObjectOffset(FrameIdx, ObjStart);
+ } else {
+ DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") scavenged at SP[" << FreeStart
+ << "]\n");
+ MFI->setObjectOffset(FrameIdx, FreeStart);
+ }
+
+ StackBytesFree.reset(FreeStart, FreeStart + ObjSize);
+ return true;
+}
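The hole search in scavengeStackSlot can also be read in isolation. Below is a simplified, self-contained version, separate from the patch, with std::vector<bool> standing in for llvm::BitVector and no alignment skew; it returns the object offset the function would assign, or a sentinel when no suitable hole exists.

#include <cstdint>
#include <vector>

// Returns the object offset for a run of ObjSize free bytes whose resulting
// object offset satisfies ObjAlign, or INT64_MIN when no such hole exists.
static int64_t findScavengedSlot(const std::vector<bool> &StackBytesFree,
                                 int64_t ObjSize, unsigned ObjAlign,
                                 bool StackGrowsDown) {
  for (uint64_t FreeStart = 0;
       FreeStart + ObjSize <= StackBytesFree.size(); ++FreeStart) {
    if (!StackBytesFree[FreeStart])
      continue;
    // The object's own offset, not the byte index, must be aligned.
    uint64_t ObjStart = StackGrowsDown ? FreeStart + ObjSize : FreeStart;
    if (ObjStart % ObjAlign != 0)
      continue;
    // Every byte of the candidate run must be free.
    bool AllBytesFree = true;
    for (int64_t Byte = 0; Byte < ObjSize; ++Byte)
      if (!StackBytesFree[FreeStart + Byte]) {
        AllBytesFree = false;
        break;
      }
    if (AllBytesFree)
      return StackGrowsDown ? -(int64_t)(FreeStart + ObjSize)
                            : (int64_t)FreeStart;
  }
  return INT64_MIN;
}

For example, with a 24-byte free map in which only bytes 8..23 are free, an 8-byte object with 8-byte alignment on a down-growing stack is placed at offset -16: byte 8 is the first free index whose object offset (16) is aligned and whose whole run is free.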
+
+/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
+/// those required to be close to the Stack Protector) to stack offsets.
+static void
+AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo *MFI, bool StackGrowsDown,
+ int64_t &Offset, unsigned &MaxAlign, unsigned Skew) {
+
+ for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
+ E = UnassignedObjs.end(); I != E; ++I) {
+ int i = *I;
+ AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+ ProtectedObjs.insert(i);
+ }
+}
+
+/// calculateFrameObjectOffsets - Calculate actual frame offsets for all of the
+/// abstract stack objects.
+///
+void PEI::calculateFrameObjectOffsets(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ StackProtector *SP = &getAnalysis<StackProtector>();
+
+ bool StackGrowsDown =
+ TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ // Loop over all of the stack objects, assigning sequential addresses...
+ MachineFrameInfo *MFI = Fn.getFrameInfo();
+
+ // Start at the beginning of the local area.
+ // The Offset is the distance from the stack top in the direction
+ // of stack growth -- so it's always nonnegative.
+ int LocalAreaOffset = TFI.getOffsetOfLocalArea();
+ if (StackGrowsDown)
+ LocalAreaOffset = -LocalAreaOffset;
+ assert(LocalAreaOffset >= 0
+ && "Local area offset should be in direction of stack growth");
+ int64_t Offset = LocalAreaOffset;
+
+ // Skew to be applied to alignment.
+ unsigned Skew = TFI.getStackAlignmentSkew(Fn);
+
+  // If there are fixed-sized objects that are preallocated in the local area,
+  // non-fixed objects can't be allocated right at the start of the local area.
+  // Adjust 'Offset' to point to the end of the last fixed-sized preallocated
+  // object.
+ for (int i = MFI->getObjectIndexBegin(); i != 0; ++i) {
+ int64_t FixedOff;
+ if (StackGrowsDown) {
+      // The maximum distance from the stack pointer is at the lower address of
+      // the object -- which is given by the offset. For a down-growing stack
+      // the offset is negative, so we negate it to get the distance.
+ FixedOff = -MFI->getObjectOffset(i);
+ } else {
+      // The maximum distance from the stack pointer is at the upper
+      // address of the object.
+ FixedOff = MFI->getObjectOffset(i) + MFI->getObjectSize(i);
+ }
+ if (FixedOff > Offset) Offset = FixedOff;
+ }
+
+ // First assign frame offsets to stack objects that are used to spill
+ // callee saved registers.
+ if (StackGrowsDown) {
+ for (unsigned i = MinCSFrameIndex; i <= MaxCSFrameIndex; ++i) {
+ // If the stack grows down, we need to add the size to find the lowest
+ // address of the object.
+ Offset += MFI->getObjectSize(i);
+
+ unsigned Align = MFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = alignTo(Offset, Align, Skew);
+
+ DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n");
+ MFI->setObjectOffset(i, -Offset); // Set the computed offset
+ }
+ } else if (MaxCSFrameIndex >= MinCSFrameIndex) {
+    // Be careful about underflow in comparisons against MinCSFrameIndex.
+ for (unsigned i = MaxCSFrameIndex; i != MinCSFrameIndex - 1; --i) {
+ unsigned Align = MFI->getObjectAlignment(i);
+ // Adjust to alignment boundary
+ Offset = alignTo(Offset, Align, Skew);
+
+ DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n");
+ MFI->setObjectOffset(i, Offset);
+ Offset += MFI->getObjectSize(i);
+ }
+ }
+
+ // FixedCSEnd is the stack offset to the end of the fixed and callee-save
+ // stack area.
+ int64_t FixedCSEnd = Offset;
+ unsigned MaxAlign = MFI->getMaxAlignment();
+
+  // Make sure the special register scavenging spill slot is closest to the
+  // incoming stack pointer if a frame pointer is required and the FP is
+  // closer to the incoming than to the final stack pointer.
+ const TargetRegisterInfo *RegInfo = Fn.getSubtarget().getRegisterInfo();
+ bool EarlyScavengingSlots = (TFI.hasFP(Fn) &&
+ TFI.isFPCloseToIncomingSP() &&
+ RegInfo->useFPForScavengingIndex(Fn) &&
+ !RegInfo->needsStackRealignment(Fn));
+ if (RS && EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+ IE = SFIs.end(); I != IE; ++I)
+ AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
+ }
+
+ // FIXME: Once this is working, then enable flag will change to a target
+ // check for whether the frame is large enough to want to use virtual
+ // frame index registers. Functions which don't want/need this optimization
+ // will continue to use the existing code path.
+ if (MFI->getUseLocalStackAllocationBlock()) {
+ unsigned Align = MFI->getLocalFrameMaxAlign();
+
+ // Adjust to alignment boundary.
+ Offset = alignTo(Offset, Align, Skew);
+
+ DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
+
+ // Resolve offsets for objects in the local block.
+ for (unsigned i = 0, e = MFI->getLocalFrameObjectCount(); i != e; ++i) {
+ std::pair<int, int64_t> Entry = MFI->getLocalFrameObjectMap(i);
+ int64_t FIOffset = (StackGrowsDown ? -Offset : Offset) + Entry.second;
+ DEBUG(dbgs() << "alloc FI(" << Entry.first << ") at SP[" <<
+ FIOffset << "]\n");
+ MFI->setObjectOffset(Entry.first, FIOffset);
+ }
+ // Allocate the local block
+ Offset += MFI->getLocalFrameSize();
+
+ MaxAlign = std::max(Align, MaxAlign);
+ }
+
+ // Retrieve the Exception Handler registration node.
+ int EHRegNodeFrameIndex = INT_MAX;
+ if (const WinEHFuncInfo *FuncInfo = Fn.getWinEHFuncInfo())
+ EHRegNodeFrameIndex = FuncInfo->EHRegNodeFrameIndex;
+
+ // Make sure that the stack protector comes before the local variables on the
+ // stack.
+ SmallSet<int, 16> ProtectedObjs;
+ if (MFI->getStackProtectorIndex() >= 0) {
+ StackObjSet LargeArrayObjs;
+ StackObjSet SmallArrayObjs;
+ StackObjSet AddrOfObjs;
+
+ AdjustStackOffset(MFI, MFI->getStackProtectorIndex(), StackGrowsDown,
+ Offset, MaxAlign, Skew);
+
+ // Assign large stack objects first.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i ||
+ EHRegNodeFrameIndex == (int)i)
+ continue;
+
+ switch (SP->getSSPLayout(MFI->getObjectAllocation(i))) {
+ case StackProtector::SSPLK_None:
+ continue;
+ case StackProtector::SSPLK_SmallArray:
+ SmallArrayObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_AddrOf:
+ AddrOfObjs.insert(i);
+ continue;
+ case StackProtector::SSPLK_LargeArray:
+ LargeArrayObjs.insert(i);
+ continue;
+ }
+ llvm_unreachable("Unexpected SSPLayoutKind.");
+ }
+
+ AssignProtectedObjSet(LargeArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ AssignProtectedObjSet(SmallArrayObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ AssignProtectedObjSet(AddrOfObjs, ProtectedObjs, MFI, StackGrowsDown,
+ Offset, MaxAlign, Skew);
+ }
+
+ SmallVector<int, 8> ObjectsToAllocate;
+
+ // Then prepare to assign frame offsets to stack objects that are not used to
+ // spill callee saved registers.
+ for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
+ if (MFI->isObjectPreAllocated(i) &&
+ MFI->getUseLocalStackAllocationBlock())
+ continue;
+ if (i >= MinCSFrameIndex && i <= MaxCSFrameIndex)
+ continue;
+ if (RS && RS->isScavengingFrameIndex((int)i))
+ continue;
+ if (MFI->isDeadObjectIndex(i))
+ continue;
+ if (MFI->getStackProtectorIndex() == (int)i ||
+ EHRegNodeFrameIndex == (int)i)
+ continue;
+ if (ProtectedObjs.count(i))
+ continue;
+
+ // Add the objects that we need to allocate to our working set.
+ ObjectsToAllocate.push_back(i);
+ }
+
+ // Allocate the EH registration node first if one is present.
+ if (EHRegNodeFrameIndex != INT_MAX)
+ AdjustStackOffset(MFI, EHRegNodeFrameIndex, StackGrowsDown, Offset,
+ MaxAlign, Skew);
+
+  // Give the targets a chance to order the objects the way they like.
+ if (Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+ Fn.getTarget().Options.StackSymbolOrdering)
+ TFI.orderFrameObjects(Fn, ObjectsToAllocate);
+
+ // Keep track of which bytes in the fixed and callee-save range are used so we
+  // can use the holes when allocating later stack objects. Only do this if the
+  // stack protector isn't being used, the target requests it, and we're
+  // optimizing.
+ BitVector StackBytesFree;
+ if (!ObjectsToAllocate.empty() &&
+ Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+ MFI->getStackProtectorIndex() < 0 && TFI.enableStackSlotScavenging(Fn))
+ computeFreeStackSlots(MFI, StackGrowsDown, MinCSFrameIndex, MaxCSFrameIndex,
+ FixedCSEnd, StackBytesFree);
+
+ // Now walk the objects and actually assign base offsets to them.
+ for (auto &Object : ObjectsToAllocate)
+ if (!scavengeStackSlot(MFI, Object, StackGrowsDown, MaxAlign,
+ StackBytesFree))
+ AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew);
+
+ // Make sure the special register scavenging spill slot is closest to the
+ // stack pointer.
+ if (RS && !EarlyScavengingSlots) {
+ SmallVector<int, 2> SFIs;
+ RS->getScavengingFrameIndices(SFIs);
+ for (SmallVectorImpl<int>::iterator I = SFIs.begin(),
+ IE = SFIs.end(); I != IE; ++I)
+ AdjustStackOffset(MFI, *I, StackGrowsDown, Offset, MaxAlign, Skew);
+ }
+
+ if (!TFI.targetHandlesStackFrameRounding()) {
+    // If we have reserved argument space for call sites immediately on entry
+    // to the current function, count it as part of the overall stack
+    // size.
+ if (MFI->adjustsStack() && TFI.hasReservedCallFrame(Fn))
+ Offset += MFI->getMaxCallFrameSize();
+
+ // Round up the size to a multiple of the alignment. If the function has
+    // any calls or allocas, align to the target's StackAlignment value to
+ // ensure that the callee's frame or the alloca data is suitably aligned;
+ // otherwise, for leaf functions, align to the TransientStackAlignment
+ // value.
+ unsigned StackAlign;
+ if (MFI->adjustsStack() || MFI->hasVarSizedObjects() ||
+ (RegInfo->needsStackRealignment(Fn) && MFI->getObjectIndexEnd() != 0))
+ StackAlign = TFI.getStackAlignment();
+ else
+ StackAlign = TFI.getTransientStackAlignment();
+
+ // If the frame pointer is eliminated, all frame offsets will be relative to
+ // SP not FP. Align to MaxAlign so this works.
+ StackAlign = std::max(StackAlign, MaxAlign);
+ Offset = alignTo(Offset, StackAlign, Skew);
+ }
+
+ // Update frame info to pretend that this is part of the stack...
+ int64_t StackSize = Offset - LocalAreaOffset;
+ MFI->setStackSize(StackSize);
+ NumBytesStackSpace += StackSize;
+}
+
+/// insertPrologEpilogCode - Scan the function for modified callee saved
+/// registers, insert spill code for these callee saved registers, then add
+/// prolog and epilog code to the function.
+///
+void PEI::insertPrologEpilogCode(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+
+ // Add prologue to the function...
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.emitPrologue(Fn, *SaveBlock);
+
+ // Add epilogue to restore the callee-save registers in each exiting block.
+ for (MachineBasicBlock *RestoreBlock : RestoreBlocks)
+ TFI.emitEpilogue(Fn, *RestoreBlock);
+
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.inlineStackProbe(Fn, *SaveBlock);
+
+ // Emit additional code that is required to support segmented stacks, if
+ // we've been asked for it. This, when linked with a runtime with support
+ // for segmented stacks (libgcc is one), will result in allocating stack
+ // space in small chunks instead of one large contiguous block.
+ if (Fn.shouldSplitStack()) {
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForSegmentedStacks(Fn, *SaveBlock);
+ }
+
+ // Emit additional code that is required to explicitly handle the stack in
+ // HiPE native code (if needed) when loaded in the Erlang/OTP runtime. The
+ // approach is rather similar to that of Segmented Stacks, but it uses a
+ // different conditional check and another BIF for allocating more stack
+ // space.
+ if (Fn.getFunction()->getCallingConv() == CallingConv::HiPE)
+ for (MachineBasicBlock *SaveBlock : SaveBlocks)
+ TFI.adjustForHiPEPrologue(Fn, *SaveBlock);
+}
+
+/// replaceFrameIndices - Replace all MO_FrameIndex operands with physical
+/// register references and actual offsets.
+///
+void PEI::replaceFrameIndices(MachineFunction &Fn) {
+ const TargetFrameLowering &TFI = *Fn.getSubtarget().getFrameLowering();
+ if (!TFI.needsFrameIndexResolution(Fn)) return;
+
+ // Store SPAdj at exit of a basic block.
+ SmallVector<int, 8> SPState;
+ SPState.resize(Fn.getNumBlockIDs());
+ SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+
+ // Iterate over the reachable blocks in DFS order.
+ for (auto DFI = df_ext_begin(&Fn, Reachable), DFE = df_ext_end(&Fn, Reachable);
+ DFI != DFE; ++DFI) {
+ int SPAdj = 0;
+ // Check the exit state of the DFS stack predecessor.
+ if (DFI.getPathLength() >= 2) {
+ MachineBasicBlock *StackPred = DFI.getPath(DFI.getPathLength() - 2);
+ assert(Reachable.count(StackPred) &&
+ "DFS stack predecessor is already visited.\n");
+ SPAdj = SPState[StackPred->getNumber()];
+ }
+ MachineBasicBlock *BB = *DFI;
+ replaceFrameIndices(BB, Fn, SPAdj);
+ SPState[BB->getNumber()] = SPAdj;
+ }
+
+ // Handle the unreachable blocks.
+ for (auto &BB : Fn) {
+ if (Reachable.count(&BB))
+ // Already handled in DFS traversal.
+ continue;
+ int SPAdj = 0;
+ replaceFrameIndices(&BB, Fn, SPAdj);
+ }
+}
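The SPAdj bookkeeping in the walk above is the subtle part: each block starts from the stack-pointer adjustment left over at the exit of its DFS-stack predecessor, so a call sequence that spans block boundaries is still accounted for. A toy, self-contained model of that propagation, separate from the patch and using an invented three-block CFG, looks like this:

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // Toy straight-line CFG: BB0 -> BB1 -> BB2. BB0 sets up a 16-byte call
  // frame, BB2 tears it down; the numbers are invented for illustration.
  std::vector<std::vector<int>> Succs = {{1}, {2}, {}};
  std::vector<int> AdjustInBlock = {+16, 0, -16};
  std::vector<int> SPStateAtExit(Succs.size(), 0);
  std::vector<bool> Visited(Succs.size(), false);

  // Explicit DFS stack of (block, SPAdj inherited from the DFS predecessor).
  std::vector<std::pair<int, int>> Stack = {{0, 0}};
  while (!Stack.empty()) {
    int BB = Stack.back().first;
    int SPAdj = Stack.back().second;
    Stack.pop_back();
    if (Visited[BB])
      continue;                       // each reachable block is processed once
    Visited[BB] = true;
    SPAdj += AdjustInBlock[BB];       // net effect of the pseudos seen in BB
    SPStateAtExit[BB] = SPAdj;
    std::printf("BB%d exits with SPAdj %d\n", BB, SPAdj);
    for (int S : Succs[BB])
      Stack.push_back({S, SPAdj});    // successors start from this exit state
  }
  return 0;
}

Here BB0 and BB1 exit with SPAdj 16 and BB2 returns to 0, mirroring how the pass seeds each block from SPState of its DFS-stack predecessor.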
+
+void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &Fn,
+ int &SPAdj) {
+ assert(Fn.getSubtarget().getRegisterInfo() &&
+ "getRegisterInfo() must be implemented!");
+ const TargetInstrInfo &TII = *Fn.getSubtarget().getInstrInfo();
+ const TargetRegisterInfo &TRI = *Fn.getSubtarget().getRegisterInfo();
+ const TargetFrameLowering *TFI = Fn.getSubtarget().getFrameLowering();
+ unsigned FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+
+ if (RS && !FrameIndexVirtualScavenging) RS->enterBasicBlock(*BB);
+
+ bool InsideCallSequence = false;
+
+ for (MachineBasicBlock::iterator I = BB->begin(); I != BB->end(); ) {
+
+ if (I->getOpcode() == FrameSetupOpcode ||
+ I->getOpcode() == FrameDestroyOpcode) {
+ InsideCallSequence = (I->getOpcode() == FrameSetupOpcode);
+ SPAdj += TII.getSPAdjust(*I);
+
+ I = TFI->eliminateCallFramePseudoInstr(Fn, *BB, I);
+ continue;
+ }
+
+ MachineInstr &MI = *I;
+ bool DoIncr = true;
+ bool DidFinishLoop = true;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (!MI.getOperand(i).isFI())
+ continue;
+
+      // Frame indices in debug values are encoded in a target-independent
+      // way, using just the frame index and offset rather than any
+      // target-specific addressing mode.
+ if (MI.isDebugValue()) {
+ assert(i == 0 && "Frame indices can only appear as the first "
+ "operand of a DBG_VALUE machine instruction");
+ unsigned Reg;
+ MachineOperand &Offset = MI.getOperand(1);
+ Offset.setImm(
+ Offset.getImm() +
+ TFI->getFrameIndexReference(Fn, MI.getOperand(0).getIndex(), Reg));
+ MI.getOperand(0).ChangeToRegister(Reg, false /*isDef*/);
+ continue;
+ }
+
+ // TODO: This code should be commoned with the code for
+ // PATCHPOINT. There's no good reason for the difference in
+ // implementation other than historical accident. The only
+ // remaining difference is the unconditional use of the stack
+ // pointer as the base register.
+ if (MI.getOpcode() == TargetOpcode::STATEPOINT) {
+        assert((!MI.isDebugValue() || i == 0) &&
+               "Frame indices can only appear as the first operand of a "
+               "DBG_VALUE machine instruction");
+ unsigned Reg;
+ MachineOperand &Offset = MI.getOperand(i + 1);
+ int refOffset = TFI->getFrameIndexReferencePreferSP(
+ Fn, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
+ Offset.setImm(Offset.getImm() + refOffset);
+ MI.getOperand(i).ChangeToRegister(Reg, false /*isDef*/);
+ continue;
+ }
+
+ // Some instructions (e.g. inline asm instructions) can have
+ // multiple frame indices and/or cause eliminateFrameIndex
+ // to insert more than one instruction. We need the register
+ // scavenger to go through all of these instructions so that
+ // it can update its register information. We keep the
+ // iterator at the point before insertion so that we can
+ // revisit them in full.
+ bool AtBeginning = (I == BB->begin());
+ if (!AtBeginning) --I;
+
+ // If this instruction has a FrameIndex operand, we need to
+ // use that target machine register info object to eliminate
+ // it.
+ TRI.eliminateFrameIndex(MI, SPAdj, i,
+ FrameIndexVirtualScavenging ? nullptr : RS);
+
+ // Reset the iterator if we were at the beginning of the BB.
+ if (AtBeginning) {
+ I = BB->begin();
+ DoIncr = false;
+ }
+
+ DidFinishLoop = false;
+ break;
+ }
+
+ // If we are looking at a call sequence, we need to keep track of
+ // the SP adjustment made by each instruction in the sequence.
+ // This includes both the frame setup/destroy pseudos (handled above),
+ // as well as other instructions that have side effects w.r.t the SP.
+ // Note that this must come after eliminateFrameIndex, because
+ // if I itself referred to a frame index, we shouldn't count its own
+ // adjustment.
+ if (DidFinishLoop && InsideCallSequence)
+ SPAdj += TII.getSPAdjust(MI);
+
+ if (DoIncr && I != BB->end()) ++I;
+
+ // Update register states.
+ if (RS && !FrameIndexVirtualScavenging && DidFinishLoop)
+ RS->forward(MI);
+ }
+}
+
+/// doScavengeFrameVirtualRegs - Replace all frame index virtual registers
+/// with physical registers. Use the register scavenger to find an
+/// appropriate register to use.
+///
+/// FIXME: Iterating over the instruction stream is unnecessary. We can simply
+/// iterate over the vreg use list, which at this point only contains machine
+/// operands for which eliminateFrameIndex needs a new scratch reg.
+static void
+doScavengeFrameVirtualRegs(MachineFunction &MF, RegScavenger *RS) {
+ // Run through the instructions and find any virtual registers.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ RS->enterBasicBlock(MBB);
+
+ int SPAdj = 0;
+
+ // The instruction stream may change in the loop, so check MBB.end()
+ // directly.
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ) {
+      // We might end up here again with a NULL iterator if we scavenged a
+      // register for which spill code was inserted for a definition by what
+      // was originally the first instruction in MBB.
+ if (I == MachineBasicBlock::iterator(nullptr))
+ I = MBB.begin();
+
+ const MachineInstr &MI = *I;
+ MachineBasicBlock::iterator J = std::next(I);
+ MachineBasicBlock::iterator P =
+ I == MBB.begin() ? MachineBasicBlock::iterator(nullptr)
+ : std::prev(I);
+
+ // RS should process this instruction before we might scavenge at this
+ // location. This is because we might be replacing a virtual register
+ // defined by this instruction, and if so, registers killed by this
+ // instruction are available, and defined registers are not.
+ RS->forward(I);
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // When we first encounter a new virtual register, it
+ // must be a definition.
+ assert(MO.isDef() && "frame index virtual missing def!");
+ // Scavenge a new scratch register
+ const TargetRegisterClass *RC = MRI.getRegClass(Reg);
+ unsigned ScratchReg = RS->scavengeRegister(RC, J, SPAdj);
+
+ ++NumScavengedRegs;
+
+ // Replace this reference to the virtual register with the
+ // scratch register.
+ assert(ScratchReg && "Missing scratch register!");
+ MRI.replaceRegWith(Reg, ScratchReg);
+
+ // Because this instruction was processed by the RS before this
+ // register was allocated, make sure that the RS now records the
+ // register as being used.
+ RS->setRegUsed(ScratchReg);
+ }
+
+ // If the scavenger needed to use one of its spill slots, the
+ // spill code will have been inserted in between I and J. This is a
+ // problem because we need the spill code before I: Move I to just
+ // prior to J.
+ if (I != std::prev(J)) {
+ MBB.splice(J, &MBB, I);
+
+ // Before we move I, we need to prepare the RS to visit I again.
+ // Specifically, RS will assert if it sees uses of registers that
+ // it believes are undefined. Because we have already processed
+ // register kills in I, when it visits I again, it will believe that
+ // those registers are undefined. To avoid this situation, unprocess
+ // the instruction I.
+ assert(RS->getCurrentPosition() == I &&
+ "The register scavenger has an unexpected position");
+ I = P;
+ RS->unprocess(P);
+ } else
+ ++I;
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
new file mode 100644
index 000000000000..804a4c3dad66
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/PseudoSourceValue.cpp
@@ -0,0 +1,139 @@
+//===-- llvm/CodeGen/PseudoSourceValue.cpp ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the PseudoSourceValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+static const char *const PSVNames[] = {
+ "Stack", "GOT", "JumpTable", "ConstantPool", "FixedStack",
+ "GlobalValueCallEntry", "ExternalSymbolCallEntry"};
+
+PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {}
+
+PseudoSourceValue::~PseudoSourceValue() {}
+
+void PseudoSourceValue::printCustom(raw_ostream &O) const {
+ O << PSVNames[Kind];
+}
+
+bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+ if (isStack())
+ return false;
+ if (isGOT() || isConstantPool() || isJumpTable())
+ return true;
+ llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::isAliased(const MachineFrameInfo *) const {
+ if (isStack() || isGOT() || isConstantPool() || isJumpTable())
+ return false;
+ llvm_unreachable("Unknown PseudoSourceValue!");
+}
+
+bool PseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
+ return !(isGOT() || isConstantPool() || isJumpTable());
+}
+
+bool FixedStackPseudoSourceValue::isConstant(
+ const MachineFrameInfo *MFI) const {
+ return MFI && MFI->isImmutableObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::isAliased(const MachineFrameInfo *MFI) const {
+ if (!MFI)
+ return true;
+ return MFI->isAliasedObjectIndex(FI);
+}
+
+bool FixedStackPseudoSourceValue::mayAlias(const MachineFrameInfo *MFI) const {
+ if (!MFI)
+ return true;
+ // Spill slots will not alias any LLVM IR value.
+ return !MFI->isSpillSlotObjectIndex(FI);
+}
+
+void FixedStackPseudoSourceValue::printCustom(raw_ostream &OS) const {
+ OS << "FixedStack" << FI;
+}
+
+CallEntryPseudoSourceValue::CallEntryPseudoSourceValue(PSVKind Kind)
+ : PseudoSourceValue(Kind) {}
+
+bool CallEntryPseudoSourceValue::isConstant(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool CallEntryPseudoSourceValue::isAliased(const MachineFrameInfo *) const {
+ return false;
+}
+
+bool CallEntryPseudoSourceValue::mayAlias(const MachineFrameInfo *) const {
+ return false;
+}
+
+GlobalValuePseudoSourceValue::GlobalValuePseudoSourceValue(
+ const GlobalValue *GV)
+ : CallEntryPseudoSourceValue(GlobalValueCallEntry), GV(GV) {}
+
+ExternalSymbolPseudoSourceValue::ExternalSymbolPseudoSourceValue(const char *ES)
+ : CallEntryPseudoSourceValue(ExternalSymbolCallEntry), ES(ES) {}
+
+PseudoSourceValueManager::PseudoSourceValueManager()
+ : StackPSV(PseudoSourceValue::Stack), GOTPSV(PseudoSourceValue::GOT),
+ JumpTablePSV(PseudoSourceValue::JumpTable),
+ ConstantPoolPSV(PseudoSourceValue::ConstantPool) {}
+
+const PseudoSourceValue *PseudoSourceValueManager::getStack() {
+ return &StackPSV;
+}
+
+const PseudoSourceValue *PseudoSourceValueManager::getGOT() { return &GOTPSV; }
+
+const PseudoSourceValue *PseudoSourceValueManager::getConstantPool() {
+ return &ConstantPoolPSV;
+}
+
+const PseudoSourceValue *PseudoSourceValueManager::getJumpTable() {
+ return &JumpTablePSV;
+}
+
+const PseudoSourceValue *PseudoSourceValueManager::getFixedStack(int FI) {
+ std::unique_ptr<FixedStackPseudoSourceValue> &V = FSValues[FI];
+ if (!V)
+ V = llvm::make_unique<FixedStackPseudoSourceValue>(FI);
+ return V.get();
+}
+
+const PseudoSourceValue *
+PseudoSourceValueManager::getGlobalValueCallEntry(const GlobalValue *GV) {
+ std::unique_ptr<const GlobalValuePseudoSourceValue> &E =
+ GlobalCallEntries[GV];
+ if (!E)
+ E = llvm::make_unique<GlobalValuePseudoSourceValue>(GV);
+ return E.get();
+}
+
+const PseudoSourceValue *
+PseudoSourceValueManager::getExternalSymbolCallEntry(const char *ES) {
+ std::unique_ptr<const ExternalSymbolPseudoSourceValue> &E =
+ ExternalCallEntries[ES];
+ if (!E)
+ E = llvm::make_unique<ExternalSymbolPseudoSourceValue>(ES);
+ return E.get();
+}
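The manager above leans on a common lazy-caching idiom: indexing a map of std::unique_ptr default-constructs an empty slot, which is then filled on first use so every later lookup returns the same object. Below is a self-contained sketch of that idiom, separate from the patch; FixedStackValue here is an invented stand-in, not an LLVM class, and std::make_unique (C++14) plays the role of llvm::make_unique.

#include <cassert>
#include <map>
#include <memory>

// Invented stand-in for a per-frame-index value cached by the manager below.
struct FixedStackValue {
  int FI;
  explicit FixedStackValue(int FI) : FI(FI) {}
};

class FixedStackCache {
  std::map<int, std::unique_ptr<FixedStackValue>> Values;

public:
  const FixedStackValue *get(int FI) {
    // operator[] creates an empty unique_ptr slot the first time FI is seen.
    std::unique_ptr<FixedStackValue> &V = Values[FI];
    if (!V)
      V = std::make_unique<FixedStackValue>(FI);
    return V.get();
  }
};

int main() {
  FixedStackCache Cache;
  const FixedStackValue *A = Cache.get(-1);
  assert(A == Cache.get(-1) && "same index must yield the same cached object");
  (void)A;
  return 0;
}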
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.cpp b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
new file mode 100644
index 000000000000..93eeb9cba457
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -0,0 +1,161 @@
+//===-- RegAllocBase.cpp - Register Allocator Base Class ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class which provides common functionality
+// for LiveIntervalUnion-based register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegAllocBase.h"
+#include "Spiller.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/Timer.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumNewQueued , "Number of new live ranges queued");
+
+// Temporary verification option until we can put verification inside
+// MachineVerifier.
+static cl::opt<bool, true>
+VerifyRegAlloc("verify-regalloc", cl::location(RegAllocBase::VerifyEnabled),
+ cl::desc("Verify during register allocation"));
+
+const char RegAllocBase::TimerGroupName[] = "Register Allocation";
+bool RegAllocBase::VerifyEnabled = false;
+
+//===----------------------------------------------------------------------===//
+// RegAllocBase Implementation
+//===----------------------------------------------------------------------===//
+
+// Pin the vtable to this file.
+void RegAllocBase::anchor() {}
+
+void RegAllocBase::init(VirtRegMap &vrm,
+ LiveIntervals &lis,
+ LiveRegMatrix &mat) {
+ TRI = &vrm.getTargetRegInfo();
+ MRI = &vrm.getRegInfo();
+ VRM = &vrm;
+ LIS = &lis;
+ Matrix = &mat;
+ MRI->freezeReservedRegs(vrm.getMachineFunction());
+ RegClassInfo.runOnMachineFunction(vrm.getMachineFunction());
+}
+
+// Visit all the live registers. If they are already assigned to a physical
+// register, unify them with the corresponding LiveIntervalUnion; otherwise,
+// push them on the priority queue for later assignment.
+void RegAllocBase::seedLiveRegs() {
+ NamedRegionTimer T("Seed Live Regs", TimerGroupName, TimePassesIsEnabled);
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ enqueue(&LIS->getInterval(Reg));
+ }
+}
+
+// Top-level driver to manage the queue of unassigned VirtRegs and call the
+// selectOrSplit implementation.
+void RegAllocBase::allocatePhysRegs() {
+ seedLiveRegs();
+
+ // Continue assigning vregs one at a time to available physical registers.
+ while (LiveInterval *VirtReg = dequeue()) {
+ assert(!VRM->hasPhys(VirtReg->reg) && "Register already assigned");
+
+ // Unused registers can appear when the spiller coalesces snippets.
+ if (MRI->reg_nodbg_empty(VirtReg->reg)) {
+ DEBUG(dbgs() << "Dropping unused " << *VirtReg << '\n');
+ aboutToRemoveInterval(*VirtReg);
+ LIS->removeInterval(VirtReg->reg);
+ continue;
+ }
+
+ // Invalidate all interference queries, live ranges could have changed.
+ Matrix->invalidateVirtRegs();
+
+ // selectOrSplit requests the allocator to return an available physical
+ // register if possible and populate a list of new live intervals that
+ // result from splitting.
+ DEBUG(dbgs() << "\nselectOrSplit "
+ << TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
+ << ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
+ typedef SmallVector<unsigned, 4> VirtRegVec;
+ VirtRegVec SplitVRegs;
+ unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
+
+ if (AvailablePhysReg == ~0u) {
+ // selectOrSplit failed to find a register!
+ // Probably caused by an inline asm.
+ MachineInstr *MI = nullptr;
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = MRI->reg_instr_begin(VirtReg->reg), E = MRI->reg_instr_end();
+ I != E; ) {
+ MachineInstr *TmpMI = &*(I++);
+ if (TmpMI->isInlineAsm()) {
+ MI = TmpMI;
+ break;
+ }
+ }
+ if (MI)
+ MI->emitError("inline assembly requires more registers than available");
+ else
+ report_fatal_error("ran out of registers during register allocation");
+ // Keep going after reporting the error.
+ VRM->assignVirt2Phys(VirtReg->reg,
+ RegClassInfo.getOrder(MRI->getRegClass(VirtReg->reg)).front());
+ continue;
+ }
+
+ if (AvailablePhysReg)
+ Matrix->assign(*VirtReg, AvailablePhysReg);
+
+ for (VirtRegVec::iterator I = SplitVRegs.begin(), E = SplitVRegs.end();
+ I != E; ++I) {
+ LiveInterval *SplitVirtReg = &LIS->getInterval(*I);
+ assert(!VRM->hasPhys(SplitVirtReg->reg) && "Register already assigned");
+ if (MRI->reg_nodbg_empty(SplitVirtReg->reg)) {
+ DEBUG(dbgs() << "not queueing unused " << *SplitVirtReg << '\n');
+ aboutToRemoveInterval(*SplitVirtReg);
+ LIS->removeInterval(SplitVirtReg->reg);
+ continue;
+ }
+ DEBUG(dbgs() << "queuing new interval: " << *SplitVirtReg << "\n");
+ assert(TargetRegisterInfo::isVirtualRegister(SplitVirtReg->reg) &&
+ "expect split value in virtual register");
+ enqueue(SplitVirtReg);
+ ++NumNewQueued;
+ }
+ }
+}
+
+void RegAllocBase::postOptimization() {
+ spiller().postOptimization();
+ for (auto DeadInst : DeadRemats) {
+ LIS->RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBase.h b/contrib/llvm/lib/CodeGen/RegAllocBase.h
new file mode 100644
index 000000000000..296ffe8692c6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBase.h
@@ -0,0 +1,122 @@
+//===-- RegAllocBase.h - basic regalloc interface and driver --*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RegAllocBase class, which is the skeleton of a basic
+// register allocation algorithm and interface for extending it. It provides the
+// building blocks on which to construct other experimental allocators and test
+// the validity of two principles:
+//
+// - If virtual and physical register liveness is modeled using intervals, then
+// on-the-fly interference checking is cheap. Furthermore, interferences can be
+// lazily cached and reused.
+//
+// - Register allocation complexity and generated code performance are
+// determined by the effectiveness of live range splitting rather than optimal
+// coloring.
+//
+// Following the first principle, interference checking revolves around the
+// LiveIntervalUnion data structure.
+//
+// To fulfill the second principle, the basic allocator provides a driver for
+// incremental splitting. It essentially punts on the problem of register
+// coloring, instead driving the assignment of virtual to physical registers by
+// the cost of splitting. The basic allocator allows for heuristic reassignment
+// of registers, if a more sophisticated allocator chooses to do that.
+//
+// This framework provides a way to engineer the compile time vs. code
+// quality trade-off without relying on a particular theoretical solver.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_REGALLOCBASE_H
+#define LLVM_LIB_CODEGEN_REGALLOCBASE_H
+
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+
+namespace llvm {
+
+template<typename T> class SmallVectorImpl;
+class TargetRegisterInfo;
+class VirtRegMap;
+class LiveIntervals;
+class LiveRegMatrix;
+class Spiller;
+
+/// RegAllocBase provides the register allocation driver and interface that can
+/// be extended to add interesting heuristics.
+///
+/// Register allocators must override the selectOrSplit() method to implement
+/// live range splitting. They must also override enqueue/dequeue to provide an
+/// assignment order.
+class RegAllocBase {
+ virtual void anchor();
+protected:
+ const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI;
+ VirtRegMap *VRM;
+ LiveIntervals *LIS;
+ LiveRegMatrix *Matrix;
+ RegisterClassInfo RegClassInfo;
+
+  /// An instruction that defines an original reg, and whose defs are already
+  /// all dead after remat, is saved in DeadRemats. Deleting such instructions
+  /// is postponed until all allocations are done, so their remat expressions
+  /// stay available for rematerializing all siblings of the original reg.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
+ RegAllocBase()
+ : TRI(nullptr), MRI(nullptr), VRM(nullptr), LIS(nullptr), Matrix(nullptr) {}
+
+ virtual ~RegAllocBase() {}
+
+ // A RegAlloc pass should call this before allocatePhysRegs.
+ void init(VirtRegMap &vrm, LiveIntervals &lis, LiveRegMatrix &mat);
+
+  // The top-level driver. The output is a VirtRegMap that is updated with
+ // physical register assignments.
+ void allocatePhysRegs();
+
+  // Run the spiller's post-optimization and remove dead defs left behind by
+  // rematerialization.
+ virtual void postOptimization();
+
+ // Get a temporary reference to a Spiller instance.
+ virtual Spiller &spiller() = 0;
+
+ /// enqueue - Add VirtReg to the priority queue of unassigned registers.
+ virtual void enqueue(LiveInterval *LI) = 0;
+
+ /// dequeue - Return the next unassigned register, or NULL.
+ virtual LiveInterval *dequeue() = 0;
+
+ // A RegAlloc pass should override this to provide the allocation heuristics.
+  // Each call must guarantee forward progress by returning an available PhysReg
+  // or a new set of split live virtual registers. It is up to the splitter to
+ // converge quickly toward fully spilled live ranges.
+ virtual unsigned selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<unsigned> &splitLVRs) = 0;
+
+ // Use this group name for NamedRegionTimer.
+ static const char TimerGroupName[];
+
+ /// Method called when the allocator is about to remove a LiveInterval.
+ virtual void aboutToRemoveInterval(LiveInterval &LI) {}
+
+public:
+ /// VerifyEnabled - True when -verify-regalloc is given.
+ static bool VerifyEnabled;
+
+private:
+ void seedLiveRegs();
+};
+
+} // end namespace llvm
+
+#endif
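Since this header only declares the extension points, it may help to see the shape of a subclass in miniature. RegAllocBasic.cpp, the next file in this commit, is the real example; the toy below is a self-contained analogue, separate from the patch, with an invented Interval type standing in for LiveInterval and a spill-weight-ordered queue like RABasic's.

#include <cstdio>
#include <queue>
#include <vector>

// Invented stand-in for LiveInterval: just a vreg number and a spill weight.
struct Interval {
  unsigned Reg;
  float Weight;
};

// Analogue of RegAllocBase: the driver pops work from a subclass-provided
// queue and asks the subclass to assign or split each interval.
class ToyAllocBase {
protected:
  virtual void enqueue(Interval *LI) = 0;
  virtual Interval *dequeue() = 0;  // nullptr when the queue is empty
  virtual unsigned selectOrSplit(Interval &LI,
                                 std::vector<Interval *> &NewIntervals) = 0;

  void allocate() {
    while (Interval *LI = dequeue()) {
      std::vector<Interval *> NewIntervals;
      unsigned PhysReg = selectOrSplit(*LI, NewIntervals);
      if (PhysReg)
        std::printf("vreg%u -> preg%u\n", LI->Reg, PhysReg);
      for (Interval *Split : NewIntervals)  // requeue anything produced by splitting
        enqueue(Split);
    }
  }

public:
  virtual ~ToyAllocBase() = default;
};

// Minimal subclass: highest spill weight first, and it always "assigns" preg 1.
class ToyBasic : public ToyAllocBase {
  struct ByWeight {
    bool operator()(const Interval *A, const Interval *B) const {
      return A->Weight < B->Weight;
    }
  };
  std::priority_queue<Interval *, std::vector<Interval *>, ByWeight> Queue;

  void enqueue(Interval *LI) override { Queue.push(LI); }
  Interval *dequeue() override {
    if (Queue.empty())
      return nullptr;
    Interval *LI = Queue.top();
    Queue.pop();
    return LI;
  }
  unsigned selectOrSplit(Interval &, std::vector<Interval *> &) override {
    return 1;
  }

public:
  void run(std::vector<Interval> &Regs) {
    for (Interval &LI : Regs)
      enqueue(&LI);
    allocate();
  }
};

int main() {
  std::vector<Interval> Regs = {{10, 2.0f}, {11, 5.0f}, {12, 1.0f}};
  ToyBasic().run(Regs);  // processed in weight order: vreg11, vreg10, vreg12
  return 0;
}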
diff --git a/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
new file mode 100644
index 000000000000..11dfda67377f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -0,0 +1,298 @@
+//===-- RegAllocBasic.cpp - Basic Register Allocator ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RABasic function pass, which provides a minimal
+// implementation of the basic register allocator.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "AllocationOrder.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "Spiller.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cstdlib>
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static RegisterRegAlloc basicRegAlloc("basic", "basic register allocator",
+ createBasicRegisterAllocator);
+
+namespace {
+ struct CompSpillWeight {
+ bool operator()(LiveInterval *A, LiveInterval *B) const {
+ return A->weight < B->weight;
+ }
+ };
+}
+
+namespace {
+/// RABasic provides a minimal implementation of the basic register allocation
+/// algorithm. It prioritizes live virtual registers by spill weight and spills
+/// whenever a register is unavailable. This is not practical in production but
+/// provides a useful baseline both for measuring other allocators and comparing
+/// the speed of the basic algorithm against other styles of allocators.
+class RABasic : public MachineFunctionPass, public RegAllocBase
+{
+ // context
+ MachineFunction *MF;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ std::priority_queue<LiveInterval*, std::vector<LiveInterval*>,
+ CompSpillWeight> Queue;
+
+ // Scratch space. Allocated here to avoid repeated malloc calls in
+ // selectOrSplit().
+ BitVector UsableRegs;
+
+public:
+ RABasic();
+
+ /// Return the pass name.
+ const char* getPassName() const override {
+ return "Basic Register Allocator";
+ }
+
+ /// RABasic analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ void releaseMemory() override;
+
+ Spiller &spiller() override { return *SpillerInstance; }
+
+ void enqueue(LiveInterval *LI) override {
+ Queue.push(LI);
+ }
+
+ LiveInterval *dequeue() override {
+ if (Queue.empty())
+ return nullptr;
+ LiveInterval *LI = Queue.top();
+ Queue.pop();
+ return LI;
+ }
+
+ unsigned selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<unsigned> &SplitVRegs) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+  // Helper for spilling all live virtual registers currently unified under
+  // PhysReg that interfere with VirtReg. Return true if spilling was
+  // successful, and append any new spilled/split intervals to SplitVRegs.
+ bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
+ SmallVectorImpl<unsigned> &SplitVRegs);
+
+ static char ID;
+};
+
+char RABasic::ID = 0;
+
+} // end anonymous namespace
+
+RABasic::RABasic(): MachineFunctionPass(ID) {
+ initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+ initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+ initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+ initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+ initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+ initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+ initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+ initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
+}
+
+void RABasic::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequiredID(MachineDominatorsID);
+ AU.addPreservedID(MachineDominatorsID);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RABasic::releaseMemory() {
+ SpillerInstance.reset();
+}
+
+
+// Spill or split all live virtual registers currently unified under PhysReg
+// that interfere with VirtReg. The newly spilled or split live intervals are
+// returned by appending them to SplitVRegs.
+bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
+ SmallVectorImpl<unsigned> &SplitVRegs) {
+ // Record each interference and determine if all are spillable before mutating
+ // either the union or live intervals.
+ SmallVector<LiveInterval*, 8> Intfs;
+
+ // Collect interferences assigned to any alias of the physical register.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+ Q.collectInterferingVRegs();
+ if (Q.seenUnspillableVReg())
+ return false;
+ for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+ LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+ if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
+ return false;
+ Intfs.push_back(Intf);
+ }
+ }
+ DEBUG(dbgs() << "spilling " << TRI->getName(PhysReg) <<
+ " interferences with " << VirtReg << "\n");
+ assert(!Intfs.empty() && "expected interference");
+
+ // Spill each interfering vreg allocated to PhysReg or an alias.
+ for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+ LiveInterval &Spill = *Intfs[i];
+
+ // Skip duplicates.
+ if (!VRM->hasPhys(Spill.reg))
+ continue;
+
+ // Deallocate the interfering vreg by removing it from the union.
+ // A LiveInterval instance may not be in a union during modification!
+ Matrix->unassign(Spill);
+
+ // Spill the extracted interval.
+ LiveRangeEdit LRE(&Spill, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ spiller().spill(LRE);
+ }
+ return true;
+}
+
+// Driver for the register assignment and splitting heuristics.
+// Manages iteration over the LiveIntervalUnions.
+//
+// This is a minimal implementation of register assignment and splitting that
+// spills whenever we run out of registers.
+//
+// selectOrSplit can only be called once per live virtual register. We then do a
+// single interference test for each register in the correct class until we find an
+// available register. So, the number of interference tests in the worst case is
+// |vregs| * |machineregs|. And since the number of interference tests is
+// minimal, there is no value in caching them outside the scope of
+// selectOrSplit().
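+//
+// The return value follows the selectOrSplit contract: a physical register
+// number asks the driver to assign VirtReg to that register, 0 means VirtReg
+// was spilled and nothing should be assigned this round, and ~0u means the
+// range is unspillable. A minimal sketch of a caller honoring that contract
+// (illustrative only; the actual driver is RegAllocBase's allocation loop):
+//
+//   SmallVector<unsigned, 4> NewVRegs;
+//   unsigned Reg = selectOrSplit(*VirtReg, NewVRegs);
+//   if (Reg != 0 && Reg != ~0u)
+//     Matrix->assign(*VirtReg, Reg);   // take the suggested physreg
+//   // ...otherwise requeue the intervals appended to NewVRegs and move on.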
+unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<unsigned> &SplitVRegs) {
+ // Populate a list of physical register spill candidates.
+ SmallVector<unsigned, 8> PhysRegSpillCands;
+
+ // Check for an available register in this class.
+ AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix);
+ while (unsigned PhysReg = Order.next()) {
+ // Check for interference in PhysReg
+ switch (Matrix->checkInterference(VirtReg, PhysReg)) {
+ case LiveRegMatrix::IK_Free:
+ // PhysReg is available, allocate it.
+ return PhysReg;
+
+ case LiveRegMatrix::IK_VirtReg:
+ // Only virtual registers in the way, we may be able to spill them.
+ PhysRegSpillCands.push_back(PhysReg);
+ continue;
+
+ default:
+ // RegMask or RegUnit interference.
+ continue;
+ }
+ }
+
+ // Try to spill another interfering reg with less spill weight.
+ for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(),
+ PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) {
+ if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs))
+ continue;
+
+ assert(!Matrix->checkInterference(VirtReg, *PhysRegI) &&
+ "Interference after spill.");
+ // Tell the caller to allocate to this newly freed physical register.
+ return *PhysRegI;
+ }
+
+ // No other spill candidates were found, so spill the current VirtReg.
+ DEBUG(dbgs() << "spilling: " << VirtReg << '\n');
+ if (!VirtReg.isSpillable())
+ return ~0u;
+ LiveRangeEdit LRE(&VirtReg, SplitVRegs, *MF, *LIS, VRM, nullptr, &DeadRemats);
+ spiller().spill(LRE);
+
+ // The live virtual register requesting allocation was spilled, so tell
+ // the caller not to allocate anything during this round.
+ return 0;
+}
+
+bool RABasic::runOnMachineFunction(MachineFunction &mf) {
+ DEBUG(dbgs() << "********** BASIC REGISTER ALLOCATION **********\n"
+ << "********** Function: "
+ << mf.getName() << '\n');
+
+ MF = &mf;
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
+
+ calculateSpillWeightsAndHints(*LIS, *MF, VRM,
+ getAnalysis<MachineLoopInfo>(),
+ getAnalysis<MachineBlockFrequencyInfo>());
+
+ SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+
+ allocatePhysRegs();
+ postOptimization();
+
+ // Diagnostic output before rewriting
+ DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << *VRM << "\n");
+
+ releaseMemory();
+ return true;
+}
+
+FunctionPass* llvm::createBasicRegisterAllocator()
+{
+ return new RABasic();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocFast.cpp b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
new file mode 100644
index 000000000000..55fb33edd720
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -0,0 +1,1122 @@
+//===-- RegAllocFast.cpp - A fast register allocator for debug code -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This register allocator allocates registers one basic block at a time,
+// attempting to keep values in registers and reusing registers as appropriate.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/IndexedMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumStores, "Number of stores added");
+STATISTIC(NumLoads , "Number of loads added");
+STATISTIC(NumCopies, "Number of copies coalesced");
+
+static RegisterRegAlloc
+ fastRegAlloc("fast", "fast register allocator", createFastRegisterAllocator);
+
+namespace {
+ class RAFast : public MachineFunctionPass {
+ public:
+ static char ID;
+ RAFast() : MachineFunctionPass(ID), StackSlotForVirtReg(-1),
+ isBulkSpilling(false) {}
+
+ private:
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ RegisterClassInfo RegClassInfo;
+
+ // Basic block currently being allocated.
+ MachineBasicBlock *MBB;
+
+ // StackSlotForVirtReg - Maps virtual regs to the frame index where these
+ // values are spilled.
+ IndexedMap<int, VirtReg2IndexFunctor> StackSlotForVirtReg;
+
+ // Everything we know about a live virtual register.
+ struct LiveReg {
+ MachineInstr *LastUse; // Last instr to use reg.
+ unsigned VirtReg; // Virtual register number.
+ unsigned PhysReg; // Currently held here.
+ unsigned short LastOpNum; // OpNum on LastUse.
+ bool Dirty; // Register needs spill.
+
+ explicit LiveReg(unsigned v)
+ : LastUse(nullptr), VirtReg(v), PhysReg(0), LastOpNum(0), Dirty(false){}
+
+ unsigned getSparseSetIndex() const {
+ return TargetRegisterInfo::virtReg2Index(VirtReg);
+ }
+ };
+
+ typedef SparseSet<LiveReg> LiveRegMap;
+
+ // LiveVirtRegs - This map contains entries for each virtual register
+ // that is currently available in a physical register.
+ LiveRegMap LiveVirtRegs;
+
+ DenseMap<unsigned, SmallVector<MachineInstr *, 4> > LiveDbgValueMap;
+
+ // RegState - Track the state of a physical register.
+ enum RegState {
+ // A disabled register is not available for allocation, but an alias may
+ // be in use. A register can only be moved out of the disabled state if
+ // all aliases are disabled.
+ regDisabled,
+
+ // A free register is not currently in use and can be allocated
+ // immediately without checking aliases.
+ regFree,
+
+ // A reserved register has been assigned explicitly (e.g., setting up a
+ // call parameter), and it remains reserved until it is used.
+ regReserved
+
+    // A register state may also be a virtual register number, indicating that
+ // the physical register is currently allocated to a virtual register. In
+ // that case, LiveVirtRegs contains the inverse mapping.
+ };
+
+ // PhysRegState - One of the RegState enums, or a virtreg.
+ std::vector<unsigned> PhysRegState;
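+
+    // For illustration: virtual register numbers never collide with the small
+    // RegState values above (they have the high bit set), so one unsigned per
+    // physreg encodes the whole state. With hypothetical registers:
+    //   PhysRegState[AX] == regFree     - AX is free for allocation.
+    //   PhysRegState[BX] == regReserved - BX holds a value we must not clobber.
+    //   PhysRegState[CX] == <vreg42>    - CX holds vreg42, and the LiveVirtRegs
+    //                                     entry for vreg42 records PhysReg == CX.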
+
+ // Set of register units.
+ typedef SparseSet<unsigned> UsedInInstrSet;
+
+ // Set of register units that are used in the current instruction, and so
+ // cannot be allocated.
+ UsedInInstrSet UsedInInstr;
+
+ // Mark a physreg as used in this instruction.
+ void markRegUsedInInstr(unsigned PhysReg) {
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+ UsedInInstr.insert(*Units);
+ }
+
+ // Check if a physreg or any of its aliases are used in this instruction.
+ bool isRegUsedInInstr(unsigned PhysReg) const {
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
+ if (UsedInInstr.count(*Units))
+ return true;
+ return false;
+ }
+
+ // SkippedInstrs - Descriptors of instructions whose clobber list was
+ // ignored because all registers were spilled. It is still necessary to
+ // mark all the clobbered registers as used by the function.
+ SmallPtrSet<const MCInstrDesc*, 4> SkippedInstrs;
+
+ // isBulkSpilling - This flag is set when LiveRegMap will be cleared
+ // completely after spilling all live registers. LiveRegMap entries should
+ // not be erased.
+ bool isBulkSpilling;
+
+ enum : unsigned {
+ spillClean = 1,
+ spillDirty = 100,
+ spillImpossible = ~0u
+ };
+ public:
+ const char *getPassName() const override {
+ return "Fast Register Allocator";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ MachineFunctionProperties getSetProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ private:
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+ void AllocateBasicBlock();
+ void handleThroughOperands(MachineInstr *MI,
+ SmallVectorImpl<unsigned> &VirtDead);
+ int getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC);
+ bool isLastUseOfLocalReg(MachineOperand&);
+
+ void addKillFlag(const LiveReg&);
+ void killVirtReg(LiveRegMap::iterator);
+ void killVirtReg(unsigned VirtReg);
+ void spillVirtReg(MachineBasicBlock::iterator MI, LiveRegMap::iterator);
+ void spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg);
+
+ void usePhysReg(MachineOperand&);
+ void definePhysReg(MachineInstr &MI, unsigned PhysReg, RegState NewState);
+ unsigned calcSpillCost(unsigned PhysReg) const;
+ void assignVirtToPhysReg(LiveReg&, unsigned PhysReg);
+ LiveRegMap::iterator findLiveVirtReg(unsigned VirtReg) {
+ return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+ }
+ LiveRegMap::const_iterator findLiveVirtReg(unsigned VirtReg) const {
+ return LiveVirtRegs.find(TargetRegisterInfo::virtReg2Index(VirtReg));
+ }
+ LiveRegMap::iterator assignVirtToPhysReg(unsigned VReg, unsigned PhysReg);
+ LiveRegMap::iterator allocVirtReg(MachineInstr &MI, LiveRegMap::iterator,
+ unsigned Hint);
+ LiveRegMap::iterator defineVirtReg(MachineInstr &MI, unsigned OpNum,
+ unsigned VirtReg, unsigned Hint);
+ LiveRegMap::iterator reloadVirtReg(MachineInstr &MI, unsigned OpNum,
+ unsigned VirtReg, unsigned Hint);
+ void spillAll(MachineBasicBlock::iterator MI);
+ bool setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg);
+ };
+ char RAFast::ID = 0;
+}
+
+/// getStackSpaceFor - This allocates space for the specified virtual register
+/// to be held on the stack.
+int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) {
+ // Find the location Reg would belong...
+ int SS = StackSlotForVirtReg[VirtReg];
+ if (SS != -1)
+ return SS; // Already has space allocated?
+
+ // Allocate a new stack object for this spill location...
+ int FrameIdx = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+ RC->getAlignment());
+
+ // Assign the slot.
+ StackSlotForVirtReg[VirtReg] = FrameIdx;
+ return FrameIdx;
+}
+
+/// isLastUseOfLocalReg - Return true if MO is the only remaining reference to
+/// its virtual register, and it is guaranteed to be a block-local register.
+///
+bool RAFast::isLastUseOfLocalReg(MachineOperand &MO) {
+ // If the register has ever been spilled or reloaded, we conservatively assume
+ // it is a global register used in multiple blocks.
+ if (StackSlotForVirtReg[MO.getReg()] != -1)
+ return false;
+
+ // Check that the use/def chain has exactly one operand - MO.
+ MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(MO.getReg());
+ if (&*I != &MO)
+ return false;
+ return ++I == MRI->reg_nodbg_end();
+}
+
+/// addKillFlag - Set kill flags on last use of a virtual register.
+void RAFast::addKillFlag(const LiveReg &LR) {
+ if (!LR.LastUse) return;
+ MachineOperand &MO = LR.LastUse->getOperand(LR.LastOpNum);
+ if (MO.isUse() && !LR.LastUse->isRegTiedToDefOperand(LR.LastOpNum)) {
+ if (MO.getReg() == LR.PhysReg)
+ MO.setIsKill();
+ else
+ LR.LastUse->addRegisterKilled(LR.PhysReg, TRI, true);
+ }
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(LiveRegMap::iterator LRI) {
+ addKillFlag(*LRI);
+ assert(PhysRegState[LRI->PhysReg] == LRI->VirtReg &&
+ "Broken RegState mapping");
+ PhysRegState[LRI->PhysReg] = regFree;
+ // Erase from LiveVirtRegs unless we're spilling in bulk.
+ if (!isBulkSpilling)
+ LiveVirtRegs.erase(LRI);
+}
+
+/// killVirtReg - Mark virtreg as no longer available.
+void RAFast::killVirtReg(unsigned VirtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "killVirtReg needs a virtual register");
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ if (LRI != LiveVirtRegs.end())
+ killVirtReg(LRI);
+}
+
+/// spillVirtReg - This method spills the value specified by VirtReg into the
+/// corresponding stack slot if needed.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI, unsigned VirtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "Spilling a physical register is illegal!");
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ assert(LRI != LiveVirtRegs.end() && "Spilling unmapped virtual register");
+ spillVirtReg(MI, LRI);
+}
+
+/// spillVirtReg - Do the actual work of spilling.
+void RAFast::spillVirtReg(MachineBasicBlock::iterator MI,
+ LiveRegMap::iterator LRI) {
+ LiveReg &LR = *LRI;
+ assert(PhysRegState[LR.PhysReg] == LRI->VirtReg && "Broken RegState mapping");
+
+ if (LR.Dirty) {
+ // If this physreg is used by the instruction, we want to kill it on the
+ // instruction, not on the spill.
+ bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
+ LR.Dirty = false;
+ DEBUG(dbgs() << "Spilling " << PrintReg(LRI->VirtReg, TRI)
+ << " in " << PrintReg(LR.PhysReg, TRI));
+ const TargetRegisterClass *RC = MRI->getRegClass(LRI->VirtReg);
+ int FI = getStackSpaceFor(LRI->VirtReg, RC);
+ DEBUG(dbgs() << " to stack slot #" << FI << "\n");
+ TII->storeRegToStackSlot(*MBB, MI, LR.PhysReg, SpillKill, FI, RC, TRI);
+ ++NumStores; // Update statistics
+
+    // If this register is used by a DBG_VALUE, insert a new DBG_VALUE that
+    // identifies the spilled location as the place to find the corresponding
+    // variable's value.
+ SmallVectorImpl<MachineInstr *> &LRIDbgValues =
+ LiveDbgValueMap[LRI->VirtReg];
+ for (unsigned li = 0, le = LRIDbgValues.size(); li != le; ++li) {
+ MachineInstr *DBG = LRIDbgValues[li];
+ const MDNode *Var = DBG->getDebugVariable();
+ const MDNode *Expr = DBG->getDebugExpression();
+ bool IsIndirect = DBG->isIndirectDebugValue();
+ uint64_t Offset = IsIndirect ? DBG->getOperand(1).getImm() : 0;
+ DebugLoc DL = DBG->getDebugLoc();
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ MachineInstr *NewDV =
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::DBG_VALUE))
+ .addFrameIndex(FI)
+ .addImm(Offset)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ assert(NewDV->getParent() == MBB && "dangling parent pointer");
+ (void)NewDV;
+ DEBUG(dbgs() << "Inserting debug info due to spill:" << "\n" << *NewDV);
+ }
+    // Now that this register has been spilled, there should not be any
+    // DBG_VALUE pointing to it; they all point to the spilled value now.
+ LRIDbgValues.clear();
+ if (SpillKill)
+ LR.LastUse = nullptr; // Don't kill register again
+ }
+ killVirtReg(LRI);
+}
+
+/// spillAll - Spill all dirty virtregs without killing them.
+void RAFast::spillAll(MachineBasicBlock::iterator MI) {
+ if (LiveVirtRegs.empty()) return;
+ isBulkSpilling = true;
+ // The LiveRegMap is keyed by an unsigned (the virtreg number), so the order
+ // of spilling here is deterministic, if arbitrary.
+ for (LiveRegMap::iterator i = LiveVirtRegs.begin(), e = LiveVirtRegs.end();
+ i != e; ++i)
+ spillVirtReg(MI, i);
+ LiveVirtRegs.clear();
+ isBulkSpilling = false;
+}
+
+/// usePhysReg - Handle the direct use of a physical register.
+/// Check that the register is not used by a virtreg.
+/// Kill the physreg, marking it free.
+/// This may add implicit kills to MO->getParent() and invalidate MO.
+void RAFast::usePhysReg(MachineOperand &MO) {
+ unsigned PhysReg = MO.getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+ "Bad usePhysReg operand");
+
+ // Ignore undef uses.
+ if (MO.isUndef())
+ return;
+
+ markRegUsedInInstr(PhysReg);
+ switch (PhysRegState[PhysReg]) {
+ case regDisabled:
+ break;
+ case regReserved:
+ PhysRegState[PhysReg] = regFree;
+ // Fall through
+ case regFree:
+ MO.setIsKill();
+ return;
+ default:
+ // The physreg was allocated to a virtual register. That means the value we
+ // wanted has been clobbered.
+ llvm_unreachable("Instruction uses an allocated register");
+ }
+
+ // Maybe a superregister is reserved?
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
+ switch (PhysRegState[Alias]) {
+ case regDisabled:
+ break;
+ case regReserved:
+ // Either PhysReg is a subregister of Alias and we mark the
+ // whole register as free, or PhysReg is the superregister of
+ // Alias and we mark all the aliases as disabled before freeing
+ // PhysReg.
+ // In the latter case, since PhysReg was disabled, this means that
+ // its value is defined only by physical sub-registers. This check
+ // is performed by the assert of the default case in this loop.
+      // Note: The value of the superregister may only be partially
+      // defined; that is why regDisabled is a valid state for aliases.
+ assert((TRI->isSuperRegister(PhysReg, Alias) ||
+ TRI->isSuperRegister(Alias, PhysReg)) &&
+ "Instruction is not using a subregister of a reserved register");
+ // Fall through.
+ case regFree:
+ if (TRI->isSuperRegister(PhysReg, Alias)) {
+ // Leave the superregister in the working set.
+ PhysRegState[Alias] = regFree;
+ MO.getParent()->addRegisterKilled(Alias, TRI, true);
+ return;
+ }
+ // Some other alias was in the working set - clear it.
+ PhysRegState[Alias] = regDisabled;
+ break;
+ default:
+ llvm_unreachable("Instruction uses an alias of an allocated register");
+ }
+ }
+
+ // All aliases are disabled, bring register into working set.
+ PhysRegState[PhysReg] = regFree;
+ MO.setIsKill();
+}
+
+/// definePhysReg - Mark PhysReg as reserved or free after spilling any
+/// virtregs. This is very similar to defineVirtReg except the physreg is
+/// reserved instead of allocated.
+void RAFast::definePhysReg(MachineInstr &MI, unsigned PhysReg,
+ RegState NewState) {
+ markRegUsedInInstr(PhysReg);
+ switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+ case regDisabled:
+ break;
+ default:
+ spillVirtReg(MI, VirtReg);
+ // Fall through.
+ case regFree:
+ case regReserved:
+ PhysRegState[PhysReg] = NewState;
+ return;
+ }
+
+ // This is a disabled register, disable all aliases.
+ PhysRegState[PhysReg] = NewState;
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
+ switch (unsigned VirtReg = PhysRegState[Alias]) {
+ case regDisabled:
+ break;
+ default:
+ spillVirtReg(MI, VirtReg);
+ // Fall through.
+ case regFree:
+ case regReserved:
+ PhysRegState[Alias] = regDisabled;
+ if (TRI->isSuperRegister(PhysReg, Alias))
+ return;
+ break;
+ }
+ }
+}
+
+
+// calcSpillCost - Return the cost of spilling and clearing out PhysReg and its
+// aliases so it is free for allocation.
+// Returns 0 when PhysReg is free or disabled with all aliases disabled - it
+// can be allocated directly.
+// Returns spillImpossible when PhysReg or an alias can't be spilled.
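+// For illustration: if PhysReg is disabled because two of its sub-registers
+// currently hold virtual registers, one clean and one dirty, the alias loop
+// below reports spillClean + spillDirty = 1 + 100 = 101, steering
+// allocVirtReg() toward registers whose contents are cheaper to evict; any
+// reserved alias makes the register unusable (spillImpossible).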
+unsigned RAFast::calcSpillCost(unsigned PhysReg) const {
+ if (isRegUsedInInstr(PhysReg)) {
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is already used in instr.\n");
+ return spillImpossible;
+ }
+ switch (unsigned VirtReg = PhysRegState[PhysReg]) {
+ case regDisabled:
+ break;
+ case regFree:
+ return 0;
+ case regReserved:
+ DEBUG(dbgs() << PrintReg(VirtReg, TRI) << " corresponding "
+ << PrintReg(PhysReg, TRI) << " is reserved already.\n");
+ return spillImpossible;
+ default: {
+ LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+ assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+ return I->Dirty ? spillDirty : spillClean;
+ }
+ }
+
+ // This is a disabled register, add up cost of aliases.
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is disabled.\n");
+ unsigned Cost = 0;
+ for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
+ unsigned Alias = *AI;
+ switch (unsigned VirtReg = PhysRegState[Alias]) {
+ case regDisabled:
+ break;
+ case regFree:
+ ++Cost;
+ break;
+ case regReserved:
+ return spillImpossible;
+ default: {
+ LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+ assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+ Cost += I->Dirty ? spillDirty : spillClean;
+ break;
+ }
+ }
+ }
+ return Cost;
+}
+
+
+/// assignVirtToPhysReg - This method updates local state so that we know
+/// that PhysReg is the proper container for VirtReg now. The physical
+/// register must not be used for anything else when this is called.
+///
+void RAFast::assignVirtToPhysReg(LiveReg &LR, unsigned PhysReg) {
+ DEBUG(dbgs() << "Assigning " << PrintReg(LR.VirtReg, TRI) << " to "
+ << PrintReg(PhysReg, TRI) << "\n");
+ PhysRegState[PhysReg] = LR.VirtReg;
+ assert(!LR.PhysReg && "Already assigned a physreg");
+ LR.PhysReg = PhysReg;
+}
+
+RAFast::LiveRegMap::iterator
+RAFast::assignVirtToPhysReg(unsigned VirtReg, unsigned PhysReg) {
+ LiveRegMap::iterator LRI = findLiveVirtReg(VirtReg);
+ assert(LRI != LiveVirtRegs.end() && "VirtReg disappeared");
+ assignVirtToPhysReg(*LRI, PhysReg);
+ return LRI;
+}
+
+/// allocVirtReg - Allocate a physical register for VirtReg.
+RAFast::LiveRegMap::iterator RAFast::allocVirtReg(MachineInstr &MI,
+ LiveRegMap::iterator LRI,
+ unsigned Hint) {
+ const unsigned VirtReg = LRI->VirtReg;
+
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "Can only allocate virtual registers");
+
+ const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+
+ // Ignore invalid hints.
+ if (Hint && (!TargetRegisterInfo::isPhysicalRegister(Hint) ||
+ !RC->contains(Hint) || !MRI->isAllocatable(Hint)))
+ Hint = 0;
+
+ // Take hint when possible.
+ if (Hint) {
+ // Ignore the hint if we would have to spill a dirty register.
+ unsigned Cost = calcSpillCost(Hint);
+ if (Cost < spillDirty) {
+ if (Cost)
+ definePhysReg(MI, Hint, regFree);
+ // definePhysReg may kill virtual registers and modify LiveVirtRegs.
+ // That invalidates LRI, so run a new lookup for VirtReg.
+ return assignVirtToPhysReg(VirtReg, Hint);
+ }
+ }
+
+ ArrayRef<MCPhysReg> AO = RegClassInfo.getOrder(RC);
+
+ // First try to find a completely free register.
+ for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){
+ unsigned PhysReg = *I;
+ if (PhysRegState[PhysReg] == regFree && !isRegUsedInInstr(PhysReg)) {
+ assignVirtToPhysReg(*LRI, PhysReg);
+ return LRI;
+ }
+ }
+
+ DEBUG(dbgs() << "Allocating " << PrintReg(VirtReg) << " from "
+ << TRI->getRegClassName(RC) << "\n");
+
+ unsigned BestReg = 0, BestCost = spillImpossible;
+ for (ArrayRef<MCPhysReg>::iterator I = AO.begin(), E = AO.end(); I != E; ++I){
+ unsigned Cost = calcSpillCost(*I);
+ DEBUG(dbgs() << "\tRegister: " << PrintReg(*I, TRI) << "\n");
+ DEBUG(dbgs() << "\tCost: " << Cost << "\n");
+ DEBUG(dbgs() << "\tBestCost: " << BestCost << "\n");
+ // Cost is 0 when all aliases are already disabled.
+ if (Cost == 0) {
+ assignVirtToPhysReg(*LRI, *I);
+ return LRI;
+ }
+ if (Cost < BestCost)
+ BestReg = *I, BestCost = Cost;
+ }
+
+ if (BestReg) {
+ definePhysReg(MI, BestReg, regFree);
+ // definePhysReg may kill virtual registers and modify LiveVirtRegs.
+ // That invalidates LRI, so run a new lookup for VirtReg.
+ return assignVirtToPhysReg(VirtReg, BestReg);
+ }
+
+ // Nothing we can do. Report an error and keep going with a bad allocation.
+ if (MI.isInlineAsm())
+ MI.emitError("inline assembly requires more registers than available");
+ else
+ MI.emitError("ran out of registers during register allocation");
+ definePhysReg(MI, *AO.begin(), regFree);
+ return assignVirtToPhysReg(VirtReg, *AO.begin());
+}
+
+/// defineVirtReg - Allocate a register for VirtReg and mark it as dirty.
+RAFast::LiveRegMap::iterator RAFast::defineVirtReg(MachineInstr &MI,
+ unsigned OpNum,
+ unsigned VirtReg,
+ unsigned Hint) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "Not a virtual register");
+ LiveRegMap::iterator LRI;
+ bool New;
+ std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+ if (New) {
+ // If there is no hint, peek at the only use of this register.
+ if ((!Hint || !TargetRegisterInfo::isPhysicalRegister(Hint)) &&
+ MRI->hasOneNonDBGUse(VirtReg)) {
+ const MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(VirtReg);
+ // It's a copy, use the destination register as a hint.
+ if (UseMI.isCopyLike())
+ Hint = UseMI.getOperand(0).getReg();
+ }
+ LRI = allocVirtReg(MI, LRI, Hint);
+ } else if (LRI->LastUse) {
+ // Redefining a live register - kill at the last use, unless it is this
+ // instruction defining VirtReg multiple times.
+ if (LRI->LastUse != &MI || LRI->LastUse->getOperand(LRI->LastOpNum).isUse())
+ addKillFlag(*LRI);
+ }
+ assert(LRI->PhysReg && "Register not assigned");
+ LRI->LastUse = &MI;
+ LRI->LastOpNum = OpNum;
+ LRI->Dirty = true;
+ markRegUsedInInstr(LRI->PhysReg);
+ return LRI;
+}
+
+/// reloadVirtReg - Make sure VirtReg is available in a physreg and return it.
+RAFast::LiveRegMap::iterator RAFast::reloadVirtReg(MachineInstr &MI,
+ unsigned OpNum,
+ unsigned VirtReg,
+ unsigned Hint) {
+ assert(TargetRegisterInfo::isVirtualRegister(VirtReg) &&
+ "Not a virtual register");
+ LiveRegMap::iterator LRI;
+ bool New;
+ std::tie(LRI, New) = LiveVirtRegs.insert(LiveReg(VirtReg));
+ MachineOperand &MO = MI.getOperand(OpNum);
+ if (New) {
+ LRI = allocVirtReg(MI, LRI, Hint);
+ const TargetRegisterClass *RC = MRI->getRegClass(VirtReg);
+ int FrameIndex = getStackSpaceFor(VirtReg, RC);
+ DEBUG(dbgs() << "Reloading " << PrintReg(VirtReg, TRI) << " into "
+ << PrintReg(LRI->PhysReg, TRI) << "\n");
+ TII->loadRegFromStackSlot(*MBB, MI, LRI->PhysReg, FrameIndex, RC, TRI);
+ ++NumLoads;
+ } else if (LRI->Dirty) {
+ if (isLastUseOfLocalReg(MO)) {
+ DEBUG(dbgs() << "Killing last use: " << MO << "\n");
+ if (MO.isUse())
+ MO.setIsKill();
+ else
+ MO.setIsDead();
+ } else if (MO.isKill()) {
+ DEBUG(dbgs() << "Clearing dubious kill: " << MO << "\n");
+ MO.setIsKill(false);
+ } else if (MO.isDead()) {
+ DEBUG(dbgs() << "Clearing dubious dead: " << MO << "\n");
+ MO.setIsDead(false);
+ }
+ } else if (MO.isKill()) {
+ // We must remove kill flags from uses of reloaded registers because the
+ // register would be killed immediately, and there might be a second use:
+ // %foo = OR %x<kill>, %x
+ // This would cause a second reload of %x into a different register.
+ DEBUG(dbgs() << "Clearing clean kill: " << MO << "\n");
+ MO.setIsKill(false);
+ } else if (MO.isDead()) {
+ DEBUG(dbgs() << "Clearing clean dead: " << MO << "\n");
+ MO.setIsDead(false);
+ }
+ assert(LRI->PhysReg && "Register not assigned");
+ LRI->LastUse = &MI;
+ LRI->LastOpNum = OpNum;
+ markRegUsedInInstr(LRI->PhysReg);
+ return LRI;
+}
+
+// setPhysReg - Change operand OpNum in MI to refer to PhysReg, considering
+// subregs. This may invalidate any operand pointers.
+// Return true if the operand kills its register.
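+// For illustration, with hypothetical target registers: if the operand is
+// %vreg7:sub_16 and vreg7 was assigned EAX, the operand is rewritten to AX
+// via TRI->getSubReg(EAX, sub_16) and its subregister index is cleared; a
+// kill flag on such an operand is widened into a kill of the full
+// super-register EAX.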
+bool RAFast::setPhysReg(MachineInstr *MI, unsigned OpNum, unsigned PhysReg) {
+ MachineOperand &MO = MI->getOperand(OpNum);
+ bool Dead = MO.isDead();
+ if (!MO.getSubReg()) {
+ MO.setReg(PhysReg);
+ return MO.isKill() || Dead;
+ }
+
+ // Handle subregister index.
+ MO.setReg(PhysReg ? TRI->getSubReg(PhysReg, MO.getSubReg()) : 0);
+ MO.setSubReg(0);
+
+ // A kill flag implies killing the full register. Add corresponding super
+ // register kill.
+ if (MO.isKill()) {
+ MI->addRegisterKilled(PhysReg, TRI, true);
+ return true;
+ }
+
+ // A <def,read-undef> of a sub-register requires an implicit def of the full
+ // register.
+ if (MO.isDef() && MO.isUndef())
+ MI->addRegisterDefined(PhysReg, TRI);
+
+ return Dead;
+}
+
+// Handle special instruction operands like early clobbers and tied ops when
+// there are additional physreg defines.
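+//
+// A "through" register is a virtual register that must stay in the same
+// physical register across the instruction: tied use/def pairs, early-clobber
+// operands, and partial redefinitions that also read the register. For
+// illustration with a hypothetical two-address instruction
+//   %vreg5<def,tied1> = ADD %vreg5<tied0>, %vreg6
+// vreg5 is a through register, so it must not end up in a physreg that the
+// instruction also defines explicitly.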
+void RAFast::handleThroughOperands(MachineInstr *MI,
+ SmallVectorImpl<unsigned> &VirtDead) {
+ DEBUG(dbgs() << "Scanning for through registers:");
+ SmallSet<unsigned, 8> ThroughRegs;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (MO.isEarlyClobber() || MI->isRegTiedToDefOperand(i) ||
+ (MO.getSubReg() && MI->readsVirtualRegister(Reg))) {
+ if (ThroughRegs.insert(Reg).second)
+ DEBUG(dbgs() << ' ' << PrintReg(Reg));
+ }
+ }
+
+ // If any physreg defines collide with preallocated through registers,
+ // we must spill and reallocate.
+ DEBUG(dbgs() << "\nChecking for physdef collisions.\n");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ markRegUsedInInstr(Reg);
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ if (ThroughRegs.count(PhysRegState[*AI]))
+ definePhysReg(*MI, *AI, regFree);
+ }
+ }
+
+ SmallVector<unsigned, 8> PartialDefs;
+ DEBUG(dbgs() << "Allocating tied uses.\n");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ if (MO.isUse()) {
+ unsigned DefIdx = 0;
+ if (!MI->isRegTiedToDefOperand(i, &DefIdx)) continue;
+ DEBUG(dbgs() << "Operand " << i << "("<< MO << ") is tied to operand "
+ << DefIdx << ".\n");
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, 0);
+ unsigned PhysReg = LRI->PhysReg;
+ setPhysReg(MI, i, PhysReg);
+ // Note: we don't update the def operand yet. That would cause the normal
+ // def-scan to attempt spilling.
+ } else if (MO.getSubReg() && MI->readsVirtualRegister(Reg)) {
+ DEBUG(dbgs() << "Partial redefine: " << MO << "\n");
+ // Reload the register, but don't assign to the operand just yet.
+ // That would confuse the later phys-def processing pass.
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, 0);
+ PartialDefs.push_back(LRI->PhysReg);
+ }
+ }
+
+ DEBUG(dbgs() << "Allocating early clobbers.\n");
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ if (!MO.isEarlyClobber())
+ continue;
+ // Note: defineVirtReg may invalidate MO.
+ LiveRegMap::iterator LRI = defineVirtReg(*MI, i, Reg, 0);
+ unsigned PhysReg = LRI->PhysReg;
+ if (setPhysReg(MI, i, PhysReg))
+ VirtDead.push_back(Reg);
+ }
+
+ // Restore UsedInInstr to a state usable for allocating normal virtual uses.
+ UsedInInstr.clear();
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || (MO.isDef() && !MO.isEarlyClobber())) continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ DEBUG(dbgs() << "\tSetting " << PrintReg(Reg, TRI)
+ << " as used in instr\n");
+ markRegUsedInInstr(Reg);
+ }
+
+ // Also mark PartialDefs as used to avoid reallocation.
+ for (unsigned i = 0, e = PartialDefs.size(); i != e; ++i)
+ markRegUsedInInstr(PartialDefs[i]);
+}
+
+void RAFast::AllocateBasicBlock() {
+ DEBUG(dbgs() << "\nAllocating " << *MBB);
+
+ PhysRegState.assign(TRI->getNumRegs(), regDisabled);
+ assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
+
+ MachineBasicBlock::iterator MII = MBB->begin();
+
+ // Add live-in registers as live.
+ for (const auto &LI : MBB->liveins())
+ if (MRI->isAllocatable(LI.PhysReg))
+ definePhysReg(*MII, LI.PhysReg, regReserved);
+
+ SmallVector<unsigned, 8> VirtDead;
+ SmallVector<MachineInstr*, 32> Coalesced;
+
+  // Sequentially allocate each instruction in the MBB.
+ while (MII != MBB->end()) {
+ MachineInstr *MI = &*MII++;
+ const MCInstrDesc &MCID = MI->getDesc();
+ DEBUG({
+ dbgs() << "\n>> " << *MI << "Regs:";
+ for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) {
+ if (PhysRegState[Reg] == regDisabled) continue;
+ dbgs() << " " << TRI->getName(Reg);
+ switch(PhysRegState[Reg]) {
+ case regFree:
+ break;
+ case regReserved:
+ dbgs() << "*";
+ break;
+ default: {
+ dbgs() << '=' << PrintReg(PhysRegState[Reg]);
+ LiveRegMap::iterator I = findLiveVirtReg(PhysRegState[Reg]);
+ assert(I != LiveVirtRegs.end() && "Missing VirtReg entry");
+ if (I->Dirty)
+ dbgs() << "*";
+ assert(I->PhysReg == Reg && "Bad inverse map");
+ break;
+ }
+ }
+ }
+ dbgs() << '\n';
+ // Check that LiveVirtRegs is the inverse.
+ for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
+ e = LiveVirtRegs.end(); i != e; ++i) {
+ assert(TargetRegisterInfo::isVirtualRegister(i->VirtReg) &&
+ "Bad map key");
+ assert(TargetRegisterInfo::isPhysicalRegister(i->PhysReg) &&
+ "Bad map value");
+ assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map");
+ }
+ });
+
+ // Debug values are not allowed to change codegen in any way.
+ if (MI->isDebugValue()) {
+ bool ScanDbgValue = true;
+ while (ScanDbgValue) {
+ ScanDbgValue = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ LiveRegMap::iterator LRI = findLiveVirtReg(Reg);
+ if (LRI != LiveVirtRegs.end())
+ setPhysReg(MI, i, LRI->PhysReg);
+ else {
+ int SS = StackSlotForVirtReg[Reg];
+ if (SS == -1) {
+ // We can't allocate a physreg for a DebugValue, sorry!
+ DEBUG(dbgs() << "Unable to allocate vreg used by DBG_VALUE");
+ MO.setReg(0);
+ }
+ else {
+ // Modify DBG_VALUE now that the value is in a spill slot.
+ bool IsIndirect = MI->isIndirectDebugValue();
+ uint64_t Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
+ const MDNode *Var = MI->getDebugVariable();
+ const MDNode *Expr = MI->getDebugExpression();
+ DebugLoc DL = MI->getDebugLoc();
+ MachineBasicBlock *MBB = MI->getParent();
+ assert(
+ cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ MachineInstr *NewDV = BuildMI(*MBB, MBB->erase(MI), DL,
+ TII->get(TargetOpcode::DBG_VALUE))
+ .addFrameIndex(SS)
+ .addImm(Offset)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ DEBUG(dbgs() << "Modifying debug info due to spill:"
+ << "\t" << *NewDV);
+ // Scan NewDV operands from the beginning.
+ MI = NewDV;
+ ScanDbgValue = true;
+ break;
+ }
+ }
+ LiveDbgValueMap[Reg].push_back(MI);
+ }
+ }
+ // Next instruction.
+ continue;
+ }
+
+ // If this is a copy, we may be able to coalesce.
+ unsigned CopySrc = 0, CopyDst = 0, CopySrcSub = 0, CopyDstSub = 0;
+ if (MI->isCopy()) {
+ CopyDst = MI->getOperand(0).getReg();
+ CopySrc = MI->getOperand(1).getReg();
+ CopyDstSub = MI->getOperand(0).getSubReg();
+ CopySrcSub = MI->getOperand(1).getSubReg();
+ }
+
+ // Track registers used by instruction.
+ UsedInInstr.clear();
+
+ // First scan.
+ // Mark physreg uses and early clobbers as used.
+ // Find the end of the virtreg operands
+ unsigned VirtOpEnd = 0;
+ bool hasTiedOps = false;
+ bool hasEarlyClobbers = false;
+ bool hasPartialRedefs = false;
+ bool hasPhysDefs = false;
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ // Make sure MRI knows about registers clobbered by regmasks.
+ if (MO.isRegMask()) {
+ MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+ continue;
+ }
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg) continue;
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ VirtOpEnd = i+1;
+ if (MO.isUse()) {
+ hasTiedOps = hasTiedOps ||
+ MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1;
+ } else {
+ if (MO.isEarlyClobber())
+ hasEarlyClobbers = true;
+ if (MO.getSubReg() && MI->readsVirtualRegister(Reg))
+ hasPartialRedefs = true;
+ }
+ continue;
+ }
+ if (!MRI->isAllocatable(Reg)) continue;
+ if (MO.isUse()) {
+ usePhysReg(MO);
+ } else if (MO.isEarlyClobber()) {
+ definePhysReg(*MI, Reg,
+ (MO.isImplicit() || MO.isDead()) ? regFree : regReserved);
+ hasEarlyClobbers = true;
+ } else
+ hasPhysDefs = true;
+ }
+
+ // The instruction may have virtual register operands that must be allocated
+ // the same register at use-time and def-time: early clobbers and tied
+ // operands. If there are also physical defs, these registers must avoid
+ // both physical defs and uses, making them more constrained than normal
+ // operands.
+ // Similarly, if there are multiple defs and tied operands, we must make
+ // sure the same register is allocated to uses and defs.
+ // We didn't detect inline asm tied operands above, so just make this extra
+ // pass for all inline asm.
+ if (MI->isInlineAsm() || hasEarlyClobbers || hasPartialRedefs ||
+ (hasTiedOps && (hasPhysDefs || MCID.getNumDefs() > 1))) {
+ handleThroughOperands(MI, VirtDead);
+ // Don't attempt coalescing when we have funny stuff going on.
+ CopyDst = 0;
+ // Pretend we have early clobbers so the use operands get marked below.
+ // This is not necessary for the common case of a single tied use.
+ hasEarlyClobbers = true;
+ }
+
+ // Second scan.
+ // Allocate virtreg uses.
+ for (unsigned i = 0; i != VirtOpEnd; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue;
+ if (MO.isUse()) {
+ LiveRegMap::iterator LRI = reloadVirtReg(*MI, i, Reg, CopyDst);
+ unsigned PhysReg = LRI->PhysReg;
+ CopySrc = (CopySrc == Reg || CopySrc == PhysReg) ? PhysReg : 0;
+ if (setPhysReg(MI, i, PhysReg))
+ killVirtReg(LRI);
+ }
+ }
+
+ // Track registers defined by instruction - early clobbers and tied uses at
+ // this point.
+ UsedInInstr.clear();
+ if (hasEarlyClobbers) {
+ for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (!Reg || !TargetRegisterInfo::isPhysicalRegister(Reg)) continue;
+ // Look for physreg defs and tied uses.
+ if (!MO.isDef() && !MI->isRegTiedToDefOperand(i)) continue;
+ markRegUsedInInstr(Reg);
+ }
+ }
+
+ unsigned DefOpEnd = MI->getNumOperands();
+ if (MI->isCall()) {
+ // Spill all virtregs before a call. This serves one purpose: If an
+ // exception is thrown, the landing pad is going to expect to find
+ // registers in their spill slots.
+      // Note: although it is appealing to just consider all definitions
+      // as call-clobbered, this is not correct because some of those
+      // definitions may be used later on, and we do not want to reuse
+      // those registers for virtual registers in between.
+ DEBUG(dbgs() << " Spilling remaining registers before call.\n");
+ spillAll(MI);
+
+ // The imp-defs are skipped below, but we still need to mark those
+ // registers as used by the function.
+ SkippedInstrs.insert(&MCID);
+ }
+
+ // Third scan.
+ // Allocate defs and collect dead defs.
+ for (unsigned i = 0; i != DefOpEnd; ++i) {
+ MachineOperand &MO = MI->getOperand(i);
+ if (!MO.isReg() || !MO.isDef() || !MO.getReg() || MO.isEarlyClobber())
+ continue;
+ unsigned Reg = MO.getReg();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (!MRI->isAllocatable(Reg)) continue;
+ definePhysReg(*MI, Reg, MO.isDead() ? regFree : regReserved);
+ continue;
+ }
+ LiveRegMap::iterator LRI = defineVirtReg(*MI, i, Reg, CopySrc);
+ unsigned PhysReg = LRI->PhysReg;
+ if (setPhysReg(MI, i, PhysReg)) {
+ VirtDead.push_back(Reg);
+ CopyDst = 0; // cancel coalescing;
+ } else
+ CopyDst = (CopyDst == Reg || CopyDst == PhysReg) ? PhysReg : 0;
+ }
+
+ // Kill dead defs after the scan to ensure that multiple defs of the same
+ // register are allocated identically. We didn't need to do this for uses
+    // because we are creating our own kill flags, and they are always at the
+ // last use.
+ for (unsigned i = 0, e = VirtDead.size(); i != e; ++i)
+ killVirtReg(VirtDead[i]);
+ VirtDead.clear();
+
+ if (CopyDst && CopyDst == CopySrc && CopyDstSub == CopySrcSub) {
+ DEBUG(dbgs() << "-- coalescing: " << *MI);
+ Coalesced.push_back(MI);
+ } else {
+ DEBUG(dbgs() << "<< " << *MI);
+ }
+ }
+
+ // Spill all physical registers holding virtual registers now.
+ DEBUG(dbgs() << "Spilling live registers at end of block.\n");
+ spillAll(MBB->getFirstTerminator());
+
+ // Erase all the coalesced copies. We are delaying it until now because
+ // LiveVirtRegs might refer to the instrs.
+ for (unsigned i = 0, e = Coalesced.size(); i != e; ++i)
+ MBB->erase(Coalesced[i]);
+ NumCopies += Coalesced.size();
+
+ DEBUG(MBB->dump());
+}
+
+/// runOnMachineFunction - Register allocate the whole function
+///
+bool RAFast::runOnMachineFunction(MachineFunction &Fn) {
+ DEBUG(dbgs() << "********** FAST REGISTER ALLOCATION **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+ MF = &Fn;
+ MRI = &MF->getRegInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ MRI->freezeReservedRegs(Fn);
+ RegClassInfo.runOnMachineFunction(Fn);
+ UsedInInstr.clear();
+ UsedInInstr.setUniverse(TRI->getNumRegUnits());
+
+ assert(!MRI->isSSA() && "regalloc requires leaving SSA");
+
+ // initialize the virtual->physical register map to have a 'null'
+ // mapping for all virtual registers
+ StackSlotForVirtReg.resize(MRI->getNumVirtRegs());
+ LiveVirtRegs.setUniverse(MRI->getNumVirtRegs());
+
+ // Loop over all of the basic blocks, eliminating virtual register references
+ for (MachineFunction::iterator MBBi = Fn.begin(), MBBe = Fn.end();
+ MBBi != MBBe; ++MBBi) {
+ MBB = &*MBBi;
+ AllocateBasicBlock();
+ }
+
+ // All machine operands and other references to virtual registers have been
+ // replaced. Remove the virtual registers.
+ MRI->clearVirtRegs();
+
+ SkippedInstrs.clear();
+ StackSlotForVirtReg.clear();
+ LiveDbgValueMap.clear();
+ return true;
+}
+
+FunctionPass *llvm::createFastRegisterAllocator() {
+ return new RAFast();
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
new file mode 100644
index 000000000000..c4d4b1eadf3e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -0,0 +1,2619 @@
+//===-- RegAllocGreedy.cpp - greedy register allocator --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the RAGreedy function pass for register allocation in
+// optimized builds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AllocationOrder.h"
+#include "InterferenceCache.h"
+#include "LiveDebugVariables.h"
+#include "RegAllocBase.h"
+#include "SpillPlacement.h"
+#include "Spiller.h"
+#include "SplitKit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <queue>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumGlobalSplits, "Number of split global live ranges");
+STATISTIC(NumLocalSplits, "Number of split local live ranges");
+STATISTIC(NumEvicted, "Number of interferences evicted");
+
+static cl::opt<SplitEditor::ComplementSpillMode> SplitSpillMode(
+ "split-spill-mode", cl::Hidden,
+ cl::desc("Spill mode for splitting live ranges"),
+ cl::values(clEnumValN(SplitEditor::SM_Partition, "default", "Default"),
+ clEnumValN(SplitEditor::SM_Size, "size", "Optimize for size"),
+ clEnumValN(SplitEditor::SM_Speed, "speed", "Optimize for speed"),
+ clEnumValEnd),
+ cl::init(SplitEditor::SM_Speed));
+
+static cl::opt<unsigned>
+LastChanceRecoloringMaxDepth("lcr-max-depth", cl::Hidden,
+ cl::desc("Last chance recoloring max depth"),
+ cl::init(5));
+
+static cl::opt<unsigned> LastChanceRecoloringMaxInterference(
+ "lcr-max-interf", cl::Hidden,
+ cl::desc("Last chance recoloring maximum number of considered"
+ " interference at a time"),
+ cl::init(8));
+
+static cl::opt<bool>
+ExhaustiveSearch("exhaustive-register-search", cl::NotHidden,
+ cl::desc("Exhaustive Search for registers bypassing the depth "
+ "and interference cutoffs of last chance recoloring"));
+
+static cl::opt<bool> EnableLocalReassignment(
+ "enable-local-reassign", cl::Hidden,
+ cl::desc("Local reassignment can yield better allocation decisions, but "
+ "may be compile time intensive"),
+ cl::init(false));
+
+static cl::opt<bool> EnableDeferredSpilling(
+ "enable-deferred-spilling", cl::Hidden,
+ cl::desc("Instead of spilling a variable right away, defer the actual "
+ "code insertion to the end of the allocation. That way the "
+ "allocator might still find a suitable coloring for this "
+ "variable because of other evicted variables."),
+ cl::init(false));
+
+// FIXME: Find a good default for this flag and remove the flag.
+static cl::opt<unsigned>
+CSRFirstTimeCost("regalloc-csr-first-time-cost",
+ cl::desc("Cost for first time use of callee-saved register."),
+ cl::init(0), cl::Hidden);
+
+static RegisterRegAlloc greedyRegAlloc("greedy", "greedy register allocator",
+ createGreedyRegisterAllocator);
+
+namespace {
+class RAGreedy : public MachineFunctionPass,
+ public RegAllocBase,
+ private LiveRangeEdit::Delegate {
+ // Convenient shortcuts.
+ typedef std::priority_queue<std::pair<unsigned, unsigned> > PQueue;
+ typedef SmallPtrSet<LiveInterval *, 4> SmallLISet;
+ typedef SmallSet<unsigned, 16> SmallVirtRegSet;
+
+ // context
+ MachineFunction *MF;
+
+ // Shortcuts to some useful interface.
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ RegisterClassInfo RCI;
+
+ // analyses
+ SlotIndexes *Indexes;
+ MachineBlockFrequencyInfo *MBFI;
+ MachineDominatorTree *DomTree;
+ MachineLoopInfo *Loops;
+ EdgeBundles *Bundles;
+ SpillPlacement *SpillPlacer;
+ LiveDebugVariables *DebugVars;
+ AliasAnalysis *AA;
+
+ // state
+ std::unique_ptr<Spiller> SpillerInstance;
+ PQueue Queue;
+ unsigned NextCascade;
+
+ // Live ranges pass through a number of stages as we try to allocate them.
+ // Some of the stages may also create new live ranges:
+ //
+ // - Region splitting.
+ // - Per-block splitting.
+ // - Local splitting.
+ // - Spilling.
+ //
+ // Ranges produced by one of the stages skip the previous stages when they are
+ // dequeued. This improves performance because we can skip interference checks
+ // that are unlikely to give any results. It also guarantees that the live
+ // range splitting algorithm terminates, something that is otherwise hard to
+ // ensure.
+ enum LiveRangeStage {
+ /// Newly created live range that has never been queued.
+ RS_New,
+
+ /// Only attempt assignment and eviction. Then requeue as RS_Split.
+ RS_Assign,
+
+ /// Attempt live range splitting if assignment is impossible.
+ RS_Split,
+
+ /// Attempt more aggressive live range splitting that is guaranteed to make
+ /// progress. This is used for split products that may not be making
+ /// progress.
+ RS_Split2,
+
+ /// Live range will be spilled. No more splitting will be attempted.
+ RS_Spill,
+
+
+ /// Live range is in memory. Because of other evictions, it might get moved
+ /// in a register in the end.
+ RS_Memory,
+
+ /// There is nothing more we can do to this live range. Abort compilation
+ /// if it can't be assigned.
+ RS_Done
+ };
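+
+  // For illustration, a worst-case progression for a single original range is
+  // RS_Assign -> RS_Split -> RS_Split2 -> RS_Spill -> RS_Done, with split
+  // products entering the queue as RS_New and being promoted by setStage();
+  // a dequeued range only ever moves forward through these stages, which is
+  // how splitting is guaranteed to terminate (see the comment above the enum).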
+
+  // Enum CutOffStage to keep track of whether the register allocation failed
+  // because of the cutoffs encountered in last chance recoloring.
+  // Note: This is used as a bitmask. Any new value should be the next power of 2.
+ enum CutOffStage {
+ // No cutoffs encountered
+ CO_None = 0,
+
+ // lcr-max-depth cutoff encountered
+ CO_Depth = 1,
+
+ // lcr-max-interf cutoff encountered
+ CO_Interf = 2
+ };
+
+ uint8_t CutOffInfo;
+
+#ifndef NDEBUG
+ static const char *const StageName[];
+#endif
+
+ // RegInfo - Keep additional information about each live range.
+ struct RegInfo {
+ LiveRangeStage Stage;
+
+ // Cascade - Eviction loop prevention. See canEvictInterference().
+ unsigned Cascade;
+
+ RegInfo() : Stage(RS_New), Cascade(0) {}
+ };
+
+ IndexedMap<RegInfo, VirtReg2IndexFunctor> ExtraRegInfo;
+
+ LiveRangeStage getStage(const LiveInterval &VirtReg) const {
+ return ExtraRegInfo[VirtReg.reg].Stage;
+ }
+
+ void setStage(const LiveInterval &VirtReg, LiveRangeStage Stage) {
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+ ExtraRegInfo[VirtReg.reg].Stage = Stage;
+ }
+
+ template<typename Iterator>
+ void setStage(Iterator Begin, Iterator End, LiveRangeStage NewStage) {
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+ for (;Begin != End; ++Begin) {
+ unsigned Reg = *Begin;
+ if (ExtraRegInfo[Reg].Stage == RS_New)
+ ExtraRegInfo[Reg].Stage = NewStage;
+ }
+ }
+
+ /// Cost of evicting interference.
+ struct EvictionCost {
+ unsigned BrokenHints; ///< Total number of broken hints.
+ float MaxWeight; ///< Maximum spill weight evicted.
+
+ EvictionCost(): BrokenHints(0), MaxWeight(0) {}
+
+ bool isMax() const { return BrokenHints == ~0u; }
+
+ void setMax() { BrokenHints = ~0u; }
+
+ void setBrokenHints(unsigned NHints) { BrokenHints = NHints; }
+
+ bool operator<(const EvictionCost &O) const {
+ return std::tie(BrokenHints, MaxWeight) <
+ std::tie(O.BrokenHints, O.MaxWeight);
+ }
+ };
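+
+  // For illustration: the comparison is lexicographic on (BrokenHints,
+  // MaxWeight), so an eviction breaking one hint while evicting weight 2.0 is
+  // cheaper than one breaking two hints regardless of weight; when the number
+  // of broken hints ties, the smaller maximum evicted spill weight wins.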
+
+ // splitting state.
+ std::unique_ptr<SplitAnalysis> SA;
+ std::unique_ptr<SplitEditor> SE;
+
+ /// Cached per-block interference maps
+ InterferenceCache IntfCache;
+
+ /// All basic blocks where the current register has uses.
+ SmallVector<SpillPlacement::BlockConstraint, 8> SplitConstraints;
+
+ /// Global live range splitting candidate info.
+ struct GlobalSplitCandidate {
+ // Register intended for assignment, or 0.
+ unsigned PhysReg;
+
+ // SplitKit interval index for this candidate.
+ unsigned IntvIdx;
+
+ // Interference for PhysReg.
+ InterferenceCache::Cursor Intf;
+
+ // Bundles where this candidate should be live.
+ BitVector LiveBundles;
+ SmallVector<unsigned, 8> ActiveBlocks;
+
+ void reset(InterferenceCache &Cache, unsigned Reg) {
+ PhysReg = Reg;
+ IntvIdx = 0;
+ Intf.setPhysReg(Cache, Reg);
+ LiveBundles.clear();
+ ActiveBlocks.clear();
+ }
+
+ // Set B[i] = C for every live bundle where B[i] was NoCand.
+ unsigned getBundles(SmallVectorImpl<unsigned> &B, unsigned C) {
+ unsigned Count = 0;
+ for (int i = LiveBundles.find_first(); i >= 0;
+ i = LiveBundles.find_next(i))
+ if (B[i] == NoCand) {
+ B[i] = C;
+ Count++;
+ }
+ return Count;
+ }
+ };
+
+ /// Candidate info for each PhysReg in AllocationOrder.
+ /// This vector never shrinks, but grows to the size of the largest register
+ /// class.
+ SmallVector<GlobalSplitCandidate, 32> GlobalCand;
+
+ enum : unsigned { NoCand = ~0u };
+
+ /// Candidate map. Each edge bundle is assigned to a GlobalCand entry, or to
+ /// NoCand which indicates the stack interval.
+ SmallVector<unsigned, 32> BundleCand;
+
+ /// Callee-save register cost, calculated once per machine function.
+ BlockFrequency CSRCost;
+
+ /// Whether to run the local reassignment heuristic. This information is
+ /// obtained from the TargetSubtargetInfo.
+ bool EnableLocalReassign;
+
+ /// Set of broken hints that may be reconciled later because of eviction.
+ SmallSetVector<LiveInterval *, 8> SetOfBrokenHints;
+
+public:
+ RAGreedy();
+
+ /// Return the pass name.
+ const char* getPassName() const override {
+ return "Greedy Register Allocator";
+ }
+
+ /// RAGreedy analysis usage.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ void releaseMemory() override;
+ Spiller &spiller() override { return *SpillerInstance; }
+ void enqueue(LiveInterval *LI) override;
+ LiveInterval *dequeue() override;
+ unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override;
+ void aboutToRemoveInterval(LiveInterval &) override;
+
+ /// Perform register allocation.
+ bool runOnMachineFunction(MachineFunction &mf) override;
+
+ static char ID;
+
+private:
+ unsigned selectOrSplitImpl(LiveInterval &, SmallVectorImpl<unsigned> &,
+ SmallVirtRegSet &, unsigned = 0);
+
+ bool LRE_CanEraseVirtReg(unsigned) override;
+ void LRE_WillShrinkVirtReg(unsigned) override;
+ void LRE_DidCloneVirtReg(unsigned, unsigned) override;
+ void enqueue(PQueue &CurQueue, LiveInterval *LI);
+ LiveInterval *dequeue(PQueue &CurQueue);
+
+ BlockFrequency calcSpillCost();
+ bool addSplitConstraints(InterferenceCache::Cursor, BlockFrequency&);
+ void addThroughConstraints(InterferenceCache::Cursor, ArrayRef<unsigned>);
+ void growRegion(GlobalSplitCandidate &Cand);
+ BlockFrequency calcGlobalSplitCost(GlobalSplitCandidate&);
+ bool calcCompactRegion(GlobalSplitCandidate&);
+ void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
+ void calcGapWeights(unsigned, SmallVectorImpl<float>&);
+ unsigned canReassign(LiveInterval &VirtReg, unsigned PhysReg);
+ bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
+ bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&);
+ void evictInterference(LiveInterval&, unsigned,
+ SmallVectorImpl<unsigned>&);
+ bool mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
+ SmallLISet &RecoloringCandidates,
+ const SmallVirtRegSet &FixedRegisters);
+
+ unsigned tryAssign(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ unsigned tryEvict(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&, unsigned = ~0u);
+ unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ /// Calculate cost of region splitting.
+ unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ BlockFrequency &BestCost,
+ unsigned &NumCands, bool IgnoreCSR);
+ /// Perform region splitting.
+ unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
+ bool HasCompact,
+ SmallVectorImpl<unsigned> &NewVRegs);
+ /// Check other options before using a callee-saved register for the first
+ /// time.
+ unsigned tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order,
+ unsigned PhysReg, unsigned &CostPerUseLimit,
+ SmallVectorImpl<unsigned> &NewVRegs);
+ void initializeCSRCost();
+ unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ unsigned trySplit(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<unsigned>&);
+ unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &,
+ SmallVectorImpl<unsigned> &,
+ SmallVirtRegSet &, unsigned);
+ bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &,
+ SmallVirtRegSet &, unsigned);
+ void tryHintRecoloring(LiveInterval &);
+ void tryHintsRecoloring();
+
+ /// Model the information carried by one end of a copy.
+ struct HintInfo {
+ /// The frequency of the copy.
+ BlockFrequency Freq;
+ /// The virtual register or physical register.
+ unsigned Reg;
+ /// Its currently assigned register.
+ /// In case of a physical register Reg == PhysReg.
+ unsigned PhysReg;
+ HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
+ : Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
+ };
+ typedef SmallVector<HintInfo, 4> HintsInfo;
+ BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
+ void collectHintInfo(unsigned, HintsInfo &);
+
+ bool isUnusedCalleeSavedReg(unsigned PhysReg) const;
+};
+} // end anonymous namespace
+
+char RAGreedy::ID = 0;
+
+#ifndef NDEBUG
+const char *const RAGreedy::StageName[] = {
+ "RS_New",
+ "RS_Assign",
+ "RS_Split",
+ "RS_Split2",
+ "RS_Spill",
+ "RS_Memory",
+ "RS_Done"
+};
+#endif
+
+// Hysteresis to use when comparing floats.
+// This helps stabilize decisions based on float comparisons.
+const float Hysteresis = (2007 / 2048.0f); // 0.97998046875
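+// For example, tryLocalSplit only accepts a new split candidate when its
+// estimated weight beats the interference it must evict by roughly 2%
+// (EstWeight * Hysteresis >= MaxGap).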
+
+
+FunctionPass* llvm::createGreedyRegisterAllocator() {
+ return new RAGreedy();
+}
+
+RAGreedy::RAGreedy(): MachineFunctionPass(ID) {
+ initializeLiveDebugVariablesPass(*PassRegistry::getPassRegistry());
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+ initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+ initializeMachineSchedulerPass(*PassRegistry::getPassRegistry());
+ initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+ initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
+ initializeMachineLoopInfoPass(*PassRegistry::getPassRegistry());
+ initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+ initializeLiveRegMatrixPass(*PassRegistry::getPassRegistry());
+ initializeEdgeBundlesPass(*PassRegistry::getPassRegistry());
+ initializeSpillPlacementPass(*PassRegistry::getPassRegistry());
+}
+
+void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addPreserved<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addRequired<VirtRegMap>();
+ AU.addPreserved<VirtRegMap>();
+ AU.addRequired<LiveRegMatrix>();
+ AU.addPreserved<LiveRegMatrix>();
+ AU.addRequired<EdgeBundles>();
+ AU.addRequired<SpillPlacement>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+
+//===----------------------------------------------------------------------===//
+// LiveRangeEdit delegate methods
+//===----------------------------------------------------------------------===//
+
+bool RAGreedy::LRE_CanEraseVirtReg(unsigned VirtReg) {
+ if (VRM->hasPhys(VirtReg)) {
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ aboutToRemoveInterval(LI);
+ return true;
+ }
+ // Unassigned virtreg is probably in the priority queue.
+ // RegAllocBase will erase it after dequeueing.
+ return false;
+}
+
+void RAGreedy::LRE_WillShrinkVirtReg(unsigned VirtReg) {
+ if (!VRM->hasPhys(VirtReg))
+ return;
+
+ // Register is assigned, put it back on the queue for reassignment.
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ Matrix->unassign(LI);
+ enqueue(&LI);
+}
+
+void RAGreedy::LRE_DidCloneVirtReg(unsigned New, unsigned Old) {
+ // Cloning a register we haven't even heard about yet? Just ignore it.
+ if (!ExtraRegInfo.inBounds(Old))
+ return;
+
+ // LRE may clone a virtual register because dead code elimination causes it to
+ // be split into connected components. The new components are much smaller
+ // than the original, so they should get a new chance at being assigned: reset
+ // the stage to RS_Assign and let the clone inherit the rest from the parent.
+ ExtraRegInfo[Old].Stage = RS_Assign;
+ ExtraRegInfo.grow(New);
+ ExtraRegInfo[New] = ExtraRegInfo[Old];
+}
+
+void RAGreedy::releaseMemory() {
+ SpillerInstance.reset();
+ ExtraRegInfo.clear();
+ GlobalCand.clear();
+}
+
+void RAGreedy::enqueue(LiveInterval *LI) { enqueue(Queue, LI); }
+
+void RAGreedy::enqueue(PQueue &CurQueue, LiveInterval *LI) {
+ // Prioritize live ranges by size, assigning larger ranges first.
+ // The queue holds (priority, ~reg) pairs.
+ const unsigned Size = LI->getSize();
+ const unsigned Reg = LI->reg;
+ assert(TargetRegisterInfo::isVirtualRegister(Reg) &&
+ "Can only enqueue virtual registers");
+ unsigned Prio;
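+ // The priority is roughly laid out as follows (higher values are dequeued
+ // first):
+ //   bit 31: set for everything except RS_Split and RS_Memory ranges.
+ //   bit 30: set when the range has a known physreg preference (hint).
+ //   bit 29: set for global ranges so they are allocated before local ones.
+ //   bits 24+: the register class AllocationPriority, for local ranges.
+ //   low bits: instruction distance for local ranges, size otherwise.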
+
+ ExtraRegInfo.grow(Reg);
+ if (ExtraRegInfo[Reg].Stage == RS_New)
+ ExtraRegInfo[Reg].Stage = RS_Assign;
+
+ if (ExtraRegInfo[Reg].Stage == RS_Split) {
+ // Unsplit ranges that couldn't be allocated immediately are deferred until
+ // everything else has been allocated.
+ Prio = Size;
+ } else if (ExtraRegInfo[Reg].Stage == RS_Memory) {
+ // Memory operands should be considered last.
+ // Change the priority such that memory operands are assigned in
+ // the reverse order that they came in.
+ // TODO: Make this a member variable and probably do something about hints.
+ static unsigned MemOp = 0;
+ Prio = MemOp++;
+ } else {
+ // Giant live ranges fall back to the global assignment heuristic, which
+ // prevents excessive spilling in pathological cases.
+ bool ReverseLocal = TRI->reverseLocalAssignment();
+ const TargetRegisterClass &RC = *MRI->getRegClass(Reg);
+ bool ForceGlobal = !ReverseLocal &&
+ (Size / SlotIndex::InstrDist) > (2 * RC.getNumRegs());
+
+ if (ExtraRegInfo[Reg].Stage == RS_Assign && !ForceGlobal && !LI->empty() &&
+ LIS->intervalIsInOneMBB(*LI)) {
+ // Allocate original local ranges in linear instruction order. Since they
+ // are singly defined, this produces optimal coloring in the absence of
+ // global interference and other constraints.
+ if (!ReverseLocal)
+ Prio = LI->beginIndex().getInstrDistance(Indexes->getLastIndex());
+ else {
+ // Allocating bottom up may allow many short live ranges to be assigned
+ // first to one of the cheap registers. This could be much faster for very
+ // large blocks on targets with many physical registers.
+ Prio = Indexes->getZeroIndex().getInstrDistance(LI->endIndex());
+ }
+ Prio |= RC.AllocationPriority << 24;
+ } else {
+ // Allocate global and split ranges in long->short order. Long ranges that
+ // don't fit should be spilled (or split) ASAP so they don't create
+ // interference. Mark a bit to prioritize global above local ranges.
+ Prio = (1u << 29) + Size;
+ }
+ // Mark a higher bit to prioritize global and local above RS_Split.
+ Prio |= (1u << 31);
+
+ // Boost ranges that have a physical register hint.
+ if (VRM->hasKnownPreference(Reg))
+ Prio |= (1u << 30);
+ }
+ // The virtual register number is a tie breaker for same-sized ranges.
+ // Give lower vreg numbers higher priority to assign them first.
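+ // Complementing the register number makes lower vreg numbers win the tie
+ // break; dequeue() undoes the complement.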
+ CurQueue.push(std::make_pair(Prio, ~Reg));
+}
+
+LiveInterval *RAGreedy::dequeue() { return dequeue(Queue); }
+
+LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
+ if (CurQueue.empty())
+ return nullptr;
+ LiveInterval *LI = &LIS->getInterval(~CurQueue.top().second);
+ CurQueue.pop();
+ return LI;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Direct Assignment
+//===----------------------------------------------------------------------===//
+
+/// tryAssign - Try to assign VirtReg to an available register.
+unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ Order.rewind();
+ unsigned PhysReg;
+ while ((PhysReg = Order.next()))
+ if (!Matrix->checkInterference(VirtReg, PhysReg))
+ break;
+ if (!PhysReg || Order.isHint())
+ return PhysReg;
+
+ // PhysReg is available, but there may be a better choice.
+
+ // If we missed a simple hint, try to cheaply evict interference from the
+ // preferred register.
+ if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg))
+ if (Order.isHint(Hint)) {
+ DEBUG(dbgs() << "missed hint " << PrintReg(Hint, TRI) << '\n');
+ EvictionCost MaxCost;
+ MaxCost.setBrokenHints(1);
+ if (canEvictInterference(VirtReg, Hint, true, MaxCost)) {
+ evictInterference(VirtReg, Hint, NewVRegs);
+ return Hint;
+ }
+ }
+
+ // Try to evict interference from a cheaper alternative.
+ unsigned Cost = TRI->getCostPerUse(PhysReg);
+
+ // Most registers have 0 additional cost.
+ if (!Cost)
+ return PhysReg;
+
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " is available at cost " << Cost
+ << '\n');
+ unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost);
+ return CheapReg ? CheapReg : PhysReg;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Interference eviction
+//===----------------------------------------------------------------------===//
+
+unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
+ AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix);
+ unsigned PhysReg;
+ while ((PhysReg = Order.next())) {
+ if (PhysReg == PrevReg)
+ continue;
+
+ MCRegUnitIterator Units(PhysReg, TRI);
+ for (; Units.isValid(); ++Units) {
+ // Instantiate a "subquery", not to be confused with the Queries array.
+ LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+ if (subQ.checkInterference())
+ break;
+ }
+ // If no units have interference, break out with the current PhysReg.
+ if (!Units.isValid())
+ break;
+ }
+ if (PhysReg)
+ DEBUG(dbgs() << "can reassign: " << VirtReg << " from "
+ << PrintReg(PrevReg, TRI) << " to " << PrintReg(PhysReg, TRI)
+ << '\n');
+ return PhysReg;
+}
+
+/// shouldEvict - determine if A should evict the assigned live range B. The
+/// eviction policy defined by this function together with the allocation order
+/// defined by enqueue() decides which registers ultimately end up being split
+/// and spilled.
+///
+/// Cascade numbers are used to prevent infinite loops if this function is a
+/// cyclic relation.
+///
+/// @param A The live range to be assigned.
+/// @param IsHint True when A is about to be assigned to its preferred
+/// register.
+/// @param B The live range to be evicted.
+/// @param BreaksHint True when B is already assigned to its preferred register.
+bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
+ LiveInterval &B, bool BreaksHint) {
+ bool CanSplit = getStage(B) < RS_Spill;
+
+ // Be fairly aggressive about following hints as long as the evictee can be
+ // split.
+ if (CanSplit && IsHint && !BreaksHint)
+ return true;
+
+ if (A.weight > B.weight) {
+ DEBUG(dbgs() << "should evict: " << B << " w= " << B.weight << '\n');
+ return true;
+ }
+ return false;
+}
+
+/// canEvictInterference - Return true if all interferences between VirtReg and
+/// PhysReg can be evicted.
+///
+/// @param VirtReg Live range that is about to be assigned.
+/// @param PhysReg Desired register for assignment.
+/// @param IsHint True when PhysReg is VirtReg's preferred register.
+/// @param MaxCost Only look for cheaper candidates and update with new cost
+/// when returning true.
+/// @returns True when interference can be evicted cheaper than MaxCost.
+bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
+ bool IsHint, EvictionCost &MaxCost) {
+ // It is only possible to evict virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) > LiveRegMatrix::IK_VirtReg)
+ return false;
+
+ bool IsLocal = LIS->intervalIsInOneMBB(VirtReg);
+
+ // Find VirtReg's cascade number. This will be unassigned if VirtReg was never
+ // involved in an eviction before. If a cascade number was assigned, deny
+ // evicting anything with the same or a newer cascade number. This prevents
+ // infinite eviction loops.
+ //
+ // This works out so a register without a cascade number is allowed to evict
+ // anything, and it can be evicted by anything.
+ unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade;
+ if (!Cascade)
+ Cascade = NextCascade;
+
+ EvictionCost Cost;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+ // If there are 10 or more interferences, chances are one is heavier.
+ if (Q.collectInterferingVRegs(10) >= 10)
+ return false;
+
+ // Check if any interfering live range is heavier than MaxWeight.
+ for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+ LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+ assert(TargetRegisterInfo::isVirtualRegister(Intf->reg) &&
+ "Only expecting virtual register interference from query");
+ // Never evict spill products. They cannot split or spill.
+ if (getStage(*Intf) == RS_Done)
+ return false;
+ // Once a live range becomes small enough, it is urgent that we find a
+ // register for it. This is indicated by an infinite spill weight. These
+ // urgent live ranges get to evict almost anything.
+ //
+ // Also allow urgent evictions of unspillable ranges from a strictly
+ // larger allocation order.
+ bool Urgent = !VirtReg.isSpillable() &&
+ (Intf->isSpillable() ||
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(VirtReg.reg)) <
+ RegClassInfo.getNumAllocatableRegs(MRI->getRegClass(Intf->reg)));
+ // Only evict older cascades or live ranges without a cascade.
+ unsigned IntfCascade = ExtraRegInfo[Intf->reg].Cascade;
+ if (Cascade <= IntfCascade) {
+ if (!Urgent)
+ return false;
+ // We permit breaking cascades for urgent evictions. It should be the
+ // last resort, though, so make it really expensive.
+ Cost.BrokenHints += 10;
+ }
+ // Would this break a satisfied hint?
+ bool BreaksHint = VRM->hasPreferredPhys(Intf->reg);
+ // Update eviction cost.
+ Cost.BrokenHints += BreaksHint;
+ Cost.MaxWeight = std::max(Cost.MaxWeight, Intf->weight);
+ // Abort if this would be too expensive.
+ if (!(Cost < MaxCost))
+ return false;
+ if (Urgent)
+ continue;
+ // Apply the eviction policy for non-urgent evictions.
+ if (!shouldEvict(VirtReg, IsHint, *Intf, BreaksHint))
+ return false;
+ // If !MaxCost.isMax(), then we're just looking for a cheap register.
+ // Evicting another local live range in this case could lead to suboptimal
+ // coloring.
+ if (!MaxCost.isMax() && IsLocal && LIS->intervalIsInOneMBB(*Intf) &&
+ (!EnableLocalReassign || !canReassign(*Intf, PhysReg))) {
+ return false;
+ }
+ }
+ }
+ MaxCost = Cost;
+ return true;
+}
+
+/// evictInterference - Evict any interfering registers that prevent VirtReg
+/// from being assigned to PhysReg. This assumes that canEvictInterference
+/// returned true.
+void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ // Make sure that VirtReg has a cascade number, and assign that cascade
+ // number to every evicted register. These live ranges can then only be
+ // evicted by a newer cascade, preventing infinite loops.
+ unsigned Cascade = ExtraRegInfo[VirtReg.reg].Cascade;
+ if (!Cascade)
+ Cascade = ExtraRegInfo[VirtReg.reg].Cascade = NextCascade++;
+
+ DEBUG(dbgs() << "evicting " << PrintReg(PhysReg, TRI)
+ << " interference: Cascade " << Cascade << '\n');
+
+ // Collect all interfering virtregs first.
+ SmallVector<LiveInterval*, 8> Intfs;
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+ assert(Q.seenAllInterferences() && "Didn't check all interferences.");
+ ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
+ Intfs.append(IVR.begin(), IVR.end());
+ }
+
+ // Evict them second. This will invalidate the queries.
+ for (unsigned i = 0, e = Intfs.size(); i != e; ++i) {
+ LiveInterval *Intf = Intfs[i];
+ // The same VirtReg may be present in multiple RegUnits. Skip duplicates.
+ if (!VRM->hasPhys(Intf->reg))
+ continue;
+ Matrix->unassign(*Intf);
+ assert((ExtraRegInfo[Intf->reg].Cascade < Cascade ||
+ VirtReg.isSpillable() < Intf->isSpillable()) &&
+ "Cannot decrease cascade number, illegal eviction");
+ ExtraRegInfo[Intf->reg].Cascade = Cascade;
+ ++NumEvicted;
+ NewVRegs.push_back(Intf->reg);
+ }
+}
+
+/// Returns true if the given \p PhysReg is a callee-saved register and has not
+/// been used for allocation yet.
+bool RAGreedy::isUnusedCalleeSavedReg(unsigned PhysReg) const {
+ unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
+ if (CSR == 0)
+ return false;
+
+ return !Matrix->isPhysRegUsed(PhysReg);
+}
+
+/// tryEvict - Try to evict all interferences for a physreg.
+/// @param VirtReg Currently unassigned virtual register.
+/// @param Order Physregs to try.
+/// @return Physreg to assign VirtReg, or 0.
+unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs,
+ unsigned CostPerUseLimit) {
+ NamedRegionTimer T("Evict", TimerGroupName, TimePassesIsEnabled);
+
+ // Keep track of the cheapest interference seen so far.
+ EvictionCost BestCost;
+ BestCost.setMax();
+ unsigned BestPhys = 0;
+ unsigned OrderLimit = Order.getOrder().size();
+
+ // When we are just looking for a reduced cost per use, don't break any
+ // hints, and only evict smaller spill weights.
+ if (CostPerUseLimit < ~0u) {
+ BestCost.BrokenHints = 0;
+ BestCost.MaxWeight = VirtReg.weight;
+
+ // Check if any registers in RC are below CostPerUseLimit.
+ const TargetRegisterClass *RC = MRI->getRegClass(VirtReg.reg);
+ unsigned MinCost = RegClassInfo.getMinCost(RC);
+ if (MinCost >= CostPerUseLimit) {
+ DEBUG(dbgs() << TRI->getRegClassName(RC) << " minimum cost = " << MinCost
+ << ", no cheaper registers to be found.\n");
+ return 0;
+ }
+
+ // It is normal for register classes to have a long tail of registers with
+ // the same cost. We don't need to look at them if they're too expensive.
+ if (TRI->getCostPerUse(Order.getOrder().back()) >= CostPerUseLimit) {
+ OrderLimit = RegClassInfo.getLastCostChange(RC);
+ DEBUG(dbgs() << "Only trying the first " << OrderLimit << " regs.\n");
+ }
+ }
+
+ Order.rewind();
+ while (unsigned PhysReg = Order.next(OrderLimit)) {
+ if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit)
+ continue;
+ // The first use of a callee-saved register in a function has cost 1.
+ // Don't start using a CSR when the CostPerUseLimit is low.
+ if (CostPerUseLimit == 1 && isUnusedCalleeSavedReg(PhysReg)) {
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << " would clobber CSR "
+ << PrintReg(RegClassInfo.getLastCalleeSavedAlias(PhysReg), TRI)
+ << '\n');
+ continue;
+ }
+
+ if (!canEvictInterference(VirtReg, PhysReg, false, BestCost))
+ continue;
+
+ // Best so far.
+ BestPhys = PhysReg;
+
+ // Stop if the hint can be used.
+ if (Order.isHint())
+ break;
+ }
+
+ if (!BestPhys)
+ return 0;
+
+ evictInterference(VirtReg, BestPhys, NewVRegs);
+ return BestPhys;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Region Splitting
+//===----------------------------------------------------------------------===//
+
+/// addSplitConstraints - Fill out the SplitConstraints vector based on the
+/// interference pattern in Physreg and its aliases. Add the constraints to
+/// SpillPlacement and return the static cost of this split in Cost, assuming
+/// that all preferences in SplitConstraints are met.
+/// Return false if there are no bundles with positive bias.
+bool RAGreedy::addSplitConstraints(InterferenceCache::Cursor Intf,
+ BlockFrequency &Cost) {
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+
+ // Reset interference dependent info.
+ SplitConstraints.resize(UseBlocks.size());
+ BlockFrequency StaticCost = 0;
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
+
+ BC.Number = BI.MBB->getNumber();
+ Intf.moveToBlock(BC.Number);
+ BC.Entry = BI.LiveIn ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+ BC.Exit = BI.LiveOut ? SpillPlacement::PrefReg : SpillPlacement::DontCare;
+ BC.ChangesValue = BI.FirstDef.isValid();
+
+ if (!Intf.hasInterference())
+ continue;
+
+ // Number of spill code instructions to insert.
+ unsigned Ins = 0;
+
+ // Interference for the live-in value.
+ if (BI.LiveIn) {
+ if (Intf.first() <= Indexes->getMBBStartIdx(BC.Number)) {
+ BC.Entry = SpillPlacement::MustSpill;
+ ++Ins;
+ } else if (Intf.first() < BI.FirstInstr) {
+ BC.Entry = SpillPlacement::PrefSpill;
+ ++Ins;
+ } else if (Intf.first() < BI.LastInstr) {
+ ++Ins;
+ }
+ }
+
+ // Interference for the live-out value.
+ if (BI.LiveOut) {
+ if (Intf.last() >= SA->getLastSplitPoint(BC.Number)) {
+ BC.Exit = SpillPlacement::MustSpill;
+ ++Ins;
+ } else if (Intf.last() > BI.LastInstr) {
+ BC.Exit = SpillPlacement::PrefSpill;
+ ++Ins;
+ } else if (Intf.last() > BI.FirstInstr) {
+ ++Ins;
+ }
+ }
+
+ // Accumulate the total frequency of inserted spill code.
+ while (Ins--)
+ StaticCost += SpillPlacer->getBlockFrequency(BC.Number);
+ }
+ Cost = StaticCost;
+
+ // Add constraints for use-blocks. Note that these are the only constraints
+ // that may add a positive bias; it is downhill from here.
+ SpillPlacer->addConstraints(SplitConstraints);
+ return SpillPlacer->scanActiveBundles();
+}
+
+
+/// addThroughConstraints - Add constraints and links to SpillPlacer from the
+/// live-through blocks in Blocks.
+void RAGreedy::addThroughConstraints(InterferenceCache::Cursor Intf,
+ ArrayRef<unsigned> Blocks) {
+ const unsigned GroupSize = 8;
+ SpillPlacement::BlockConstraint BCS[GroupSize];
+ unsigned TBS[GroupSize];
+ unsigned B = 0, T = 0;
+
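+ // Constraints and links are accumulated in the fixed-size arrays above and
+ // flushed to SpillPlacer whenever a batch of GroupSize entries fills up.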
+ for (unsigned i = 0; i != Blocks.size(); ++i) {
+ unsigned Number = Blocks[i];
+ Intf.moveToBlock(Number);
+
+ if (!Intf.hasInterference()) {
+ assert(T < GroupSize && "Array overflow");
+ TBS[T] = Number;
+ if (++T == GroupSize) {
+ SpillPlacer->addLinks(makeArrayRef(TBS, T));
+ T = 0;
+ }
+ continue;
+ }
+
+ assert(B < GroupSize && "Array overflow");
+ BCS[B].Number = Number;
+
+ // Interference for the live-in value.
+ if (Intf.first() <= Indexes->getMBBStartIdx(Number))
+ BCS[B].Entry = SpillPlacement::MustSpill;
+ else
+ BCS[B].Entry = SpillPlacement::PrefSpill;
+
+ // Interference for the live-out value.
+ if (Intf.last() >= SA->getLastSplitPoint(Number))
+ BCS[B].Exit = SpillPlacement::MustSpill;
+ else
+ BCS[B].Exit = SpillPlacement::PrefSpill;
+
+ if (++B == GroupSize) {
+ SpillPlacer->addConstraints(makeArrayRef(BCS, B));
+ B = 0;
+ }
+ }
+
+ SpillPlacer->addConstraints(makeArrayRef(BCS, B));
+ SpillPlacer->addLinks(makeArrayRef(TBS, T));
+}
+
+void RAGreedy::growRegion(GlobalSplitCandidate &Cand) {
+ // Keep track of through blocks that have not been added to SpillPlacer.
+ BitVector Todo = SA->getThroughBlocks();
+ SmallVectorImpl<unsigned> &ActiveBlocks = Cand.ActiveBlocks;
+ unsigned AddedTo = 0;
+#ifndef NDEBUG
+ unsigned Visited = 0;
+#endif
+
+ for (;;) {
+ ArrayRef<unsigned> NewBundles = SpillPlacer->getRecentPositive();
+ // Find new through blocks in the periphery of PrefRegBundles.
+ for (int i = 0, e = NewBundles.size(); i != e; ++i) {
+ unsigned Bundle = NewBundles[i];
+ // Look at all blocks connected to Bundle in the full graph.
+ ArrayRef<unsigned> Blocks = Bundles->getBlocks(Bundle);
+ for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I) {
+ unsigned Block = *I;
+ if (!Todo.test(Block))
+ continue;
+ Todo.reset(Block);
+ // This is a new through block. Add it to SpillPlacer later.
+ ActiveBlocks.push_back(Block);
+#ifndef NDEBUG
+ ++Visited;
+#endif
+ }
+ }
+ // Any new blocks to add?
+ if (ActiveBlocks.size() == AddedTo)
+ break;
+
+ // Compute through constraints from the interference, or assume that all
+ // through blocks prefer spilling when forming compact regions.
+ auto NewBlocks = makeArrayRef(ActiveBlocks).slice(AddedTo);
+ if (Cand.PhysReg)
+ addThroughConstraints(Cand.Intf, NewBlocks);
+ else
+ // Provide a strong negative bias on through blocks to prevent unwanted
+ // liveness on loop backedges.
+ SpillPlacer->addPrefSpill(NewBlocks, /* Strong= */ true);
+ AddedTo = ActiveBlocks.size();
+
+ // Perhaps iterating can enable more bundles?
+ SpillPlacer->iterate();
+ }
+ DEBUG(dbgs() << ", v=" << Visited);
+}
+
+/// calcCompactRegion - Compute the set of edge bundles that should be live
+/// when splitting the current live range into compact regions. Compact
+/// regions can be computed without looking at interference. They are the
+/// regions formed by removing all the live-through blocks from the live range.
+///
+/// Returns false if the current live range is already compact, or if the
+/// compact regions would form single block regions anyway.
+bool RAGreedy::calcCompactRegion(GlobalSplitCandidate &Cand) {
+ // Without any through blocks, the live range is already compact.
+ if (!SA->getNumThroughBlocks())
+ return false;
+
+ // Compact regions don't correspond to any physreg.
+ Cand.reset(IntfCache, 0);
+
+ DEBUG(dbgs() << "Compact region bundles");
+
+ // Use the spill placer to determine the live bundles. GrowRegion pretends
+ // that all the through blocks have interference when PhysReg is unset.
+ SpillPlacer->prepare(Cand.LiveBundles);
+
+ // The static split cost will be zero since Cand.Intf reports no interference.
+ BlockFrequency Cost;
+ if (!addSplitConstraints(Cand.Intf, Cost)) {
+ DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ growRegion(Cand);
+ SpillPlacer->finish();
+
+ if (!Cand.LiveBundles.any()) {
+ DEBUG(dbgs() << ", none.\n");
+ return false;
+ }
+
+ DEBUG({
+ for (int i = Cand.LiveBundles.find_first(); i>=0;
+ i = Cand.LiveBundles.find_next(i))
+ dbgs() << " EB#" << i;
+ dbgs() << ".\n";
+ });
+ return true;
+}
+
+/// calcSpillCost - Compute how expensive it would be to split the live range in
+/// SA around all use blocks instead of forming bundle regions.
+BlockFrequency RAGreedy::calcSpillCost() {
+ BlockFrequency Cost = 0;
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ unsigned Number = BI.MBB->getNumber();
+ // We normally only need one spill instruction - a load or a store.
+ Cost += SpillPlacer->getBlockFrequency(Number);
+
+ // Unless the value is redefined in the block.
+ if (BI.LiveIn && BI.LiveOut && BI.FirstDef)
+ Cost += SpillPlacer->getBlockFrequency(Number);
+ }
+ return Cost;
+}
+
+/// calcGlobalSplitCost - Return the global split cost of following the split
+/// pattern in LiveBundles. This cost should be added to the local cost of the
+/// interference pattern in SplitConstraints.
+///
+BlockFrequency RAGreedy::calcGlobalSplitCost(GlobalSplitCandidate &Cand) {
+ BlockFrequency GlobalCost = 0;
+ const BitVector &LiveBundles = Cand.LiveBundles;
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ SpillPlacement::BlockConstraint &BC = SplitConstraints[i];
+ bool RegIn = LiveBundles[Bundles->getBundle(BC.Number, 0)];
+ bool RegOut = LiveBundles[Bundles->getBundle(BC.Number, 1)];
+ unsigned Ins = 0;
+
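+ // One spill code instruction is needed at a block boundary whenever the
+ // chosen bundle placement (RegIn/RegOut) disagrees with the preference
+ // recorded in SplitConstraints.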
+ if (BI.LiveIn)
+ Ins += RegIn != (BC.Entry == SpillPlacement::PrefReg);
+ if (BI.LiveOut)
+ Ins += RegOut != (BC.Exit == SpillPlacement::PrefReg);
+ while (Ins--)
+ GlobalCost += SpillPlacer->getBlockFrequency(BC.Number);
+ }
+
+ for (unsigned i = 0, e = Cand.ActiveBlocks.size(); i != e; ++i) {
+ unsigned Number = Cand.ActiveBlocks[i];
+ bool RegIn = LiveBundles[Bundles->getBundle(Number, 0)];
+ bool RegOut = LiveBundles[Bundles->getBundle(Number, 1)];
+ if (!RegIn && !RegOut)
+ continue;
+ if (RegIn && RegOut) {
+ // We need double spill code if this block has interference.
+ Cand.Intf.moveToBlock(Number);
+ if (Cand.Intf.hasInterference()) {
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ }
+ continue;
+ }
+ // live-in / stack-out or stack-in live-out.
+ GlobalCost += SpillPlacer->getBlockFrequency(Number);
+ }
+ return GlobalCost;
+}
+
+/// splitAroundRegion - Split the current live range around the regions
+/// determined by BundleCand and GlobalCand.
+///
+/// Before calling this function, GlobalCand and BundleCand must be initialized
+/// so each bundle is assigned to a valid candidate, or NoCand for the
+/// stack-bound bundles. The shared SA/SE SplitAnalysis and SplitEditor
+/// objects must be initialized for the current live range, and intervals
+/// created for the used candidates.
+///
+/// @param LREdit The LiveRangeEdit object handling the current split.
+/// @param UsedCands List of used GlobalCand entries. Every BundleCand value
+/// must appear in this list.
+void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
+ ArrayRef<unsigned> UsedCands) {
+ // These are the intervals created for new global ranges. We may create more
+ // intervals for local ranges.
+ const unsigned NumGlobalIntvs = LREdit.size();
+ DEBUG(dbgs() << "splitAroundRegion with " << NumGlobalIntvs << " globals.\n");
+ assert(NumGlobalIntvs && "No global intervals configured");
+
+ // Isolate even single instructions when dealing with a proper sub-class.
+ // That guarantees register class inflation for the stack interval because it
+ // is all copies.
+ unsigned Reg = SA->getParent().reg;
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+
+ // First handle all the blocks with uses.
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ unsigned Number = BI.MBB->getNumber();
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+ if (BI.LiveIn) {
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+ }
+ if (BI.LiveOut) {
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ }
+
+ // Create separate intervals for isolated blocks with multiple uses.
+ if (!IntvIn && !IntvOut) {
+ DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " isolated.\n");
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+ SE->splitSingleBlock(BI);
+ continue;
+ }
+
+ if (IntvIn && IntvOut)
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ else if (IntvIn)
+ SE->splitRegInBlock(BI, IntvIn, IntfIn);
+ else
+ SE->splitRegOutBlock(BI, IntvOut, IntfOut);
+ }
+
+ // Handle live-through blocks. The relevant live-through blocks are stored in
+ // the ActiveBlocks list with each candidate. We need to filter out
+ // duplicates.
+ BitVector Todo = SA->getThroughBlocks();
+ for (unsigned c = 0; c != UsedCands.size(); ++c) {
+ ArrayRef<unsigned> Blocks = GlobalCand[UsedCands[c]].ActiveBlocks;
+ for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+ unsigned Number = Blocks[i];
+ if (!Todo.test(Number))
+ continue;
+ Todo.reset(Number);
+
+ unsigned IntvIn = 0, IntvOut = 0;
+ SlotIndex IntfIn, IntfOut;
+
+ unsigned CandIn = BundleCand[Bundles->getBundle(Number, 0)];
+ if (CandIn != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandIn];
+ IntvIn = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfIn = Cand.Intf.first();
+ }
+
+ unsigned CandOut = BundleCand[Bundles->getBundle(Number, 1)];
+ if (CandOut != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[CandOut];
+ IntvOut = Cand.IntvIdx;
+ Cand.Intf.moveToBlock(Number);
+ IntfOut = Cand.Intf.last();
+ }
+ if (!IntvIn && !IntvOut)
+ continue;
+ SE->splitLiveThroughBlock(Number, IntvIn, IntfIn, IntvOut, IntfOut);
+ }
+ }
+
+ ++NumGlobalSplits;
+
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
+
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+ unsigned OrigBlocks = SA->getNumLiveBlocks();
+
+ // Sort out the new intervals created by splitting. We get four kinds:
+ // - Remainder intervals should not be split again.
+ // - Candidate intervals can be assigned to Cand.PhysReg.
+ // - Block-local splits are candidates for local splitting.
+ // - DCE leftovers should go back on the queue.
+ for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+ LiveInterval &Reg = LIS->getInterval(LREdit.get(i));
+
+ // Ignore old intervals from DCE.
+ if (getStage(Reg) != RS_New)
+ continue;
+
+ // Remainder interval. Don't try splitting again, spill if it doesn't
+ // allocate.
+ if (IntvMap[i] == 0) {
+ setStage(Reg, RS_Spill);
+ continue;
+ }
+
+ // Global intervals. Allow repeated splitting as long as the number of live
+ // blocks is strictly decreasing.
+ if (IntvMap[i] < NumGlobalIntvs) {
+ if (SA->countLiveBlocks(&Reg) >= OrigBlocks) {
+ DEBUG(dbgs() << "Main interval covers the same " << OrigBlocks
+ << " blocks as original.\n");
+ // Don't allow repeated splitting as a safeguard against looping.
+ setStage(Reg, RS_Split2);
+ }
+ continue;
+ }
+
+ // Other intervals are treated as new. This includes local intervals created
+ // for blocks with multiple uses, and anything created by DCE.
+ }
+
+ if (VerifyEnabled)
+ MF->verify(this, "After splitting live range around region");
+}
+
+unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ unsigned NumCands = 0;
+ BlockFrequency BestCost;
+
+ // Check if we can split this live range around a compact region.
+ bool HasCompact = calcCompactRegion(GlobalCand.front());
+ if (HasCompact) {
+ // Yes, keep GlobalCand[0] as the compact region candidate.
+ NumCands = 1;
+ BestCost = BlockFrequency::getMaxFrequency();
+ } else {
+ // No benefit from the compact region, our fallback will be per-block
+ // splitting. Make sure we find a solution that is cheaper than spilling.
+ BestCost = calcSpillCost();
+ DEBUG(dbgs() << "Cost of isolating all blocks = ";
+ MBFI->printBlockFreq(dbgs(), BestCost) << '\n');
+ }
+
+ unsigned BestCand =
+ calculateRegionSplitCost(VirtReg, Order, BestCost, NumCands,
+ false/*IgnoreCSR*/);
+
+ // No solutions found, fall back to single block splitting.
+ if (!HasCompact && BestCand == NoCand)
+ return 0;
+
+ return doRegionSplit(VirtReg, BestCand, HasCompact, NewVRegs);
+}
+
+unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ BlockFrequency &BestCost,
+ unsigned &NumCands,
+ bool IgnoreCSR) {
+ unsigned BestCand = NoCand;
+ Order.rewind();
+ while (unsigned PhysReg = Order.next()) {
+ if (IgnoreCSR && isUnusedCalleeSavedReg(PhysReg))
+ continue;
+
+ // Discard bad candidates before we run out of interference cache cursors.
+ // This will only affect register classes with a lot of registers (>32).
+ if (NumCands == IntfCache.getMaxCursors()) {
+ unsigned WorstCount = ~0u;
+ unsigned Worst = 0;
+ for (unsigned i = 0; i != NumCands; ++i) {
+ if (i == BestCand || !GlobalCand[i].PhysReg)
+ continue;
+ unsigned Count = GlobalCand[i].LiveBundles.count();
+ if (Count < WorstCount) {
+ Worst = i;
+ WorstCount = Count;
+ }
+ }
+ --NumCands;
+ GlobalCand[Worst] = GlobalCand[NumCands];
+ if (BestCand == NumCands)
+ BestCand = Worst;
+ }
+
+ if (GlobalCand.size() <= NumCands)
+ GlobalCand.resize(NumCands+1);
+ GlobalSplitCandidate &Cand = GlobalCand[NumCands];
+ Cand.reset(IntfCache, PhysReg);
+
+ SpillPlacer->prepare(Cand.LiveBundles);
+ BlockFrequency Cost;
+ if (!addSplitConstraints(Cand.Intf, Cost)) {
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tno positive bundles\n");
+ continue;
+ }
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << "\tstatic = ";
+ MBFI->printBlockFreq(dbgs(), Cost));
+ if (Cost >= BestCost) {
+ DEBUG({
+ if (BestCand == NoCand)
+ dbgs() << " worse than no bundles\n";
+ else
+ dbgs() << " worse than "
+ << PrintReg(GlobalCand[BestCand].PhysReg, TRI) << '\n';
+ });
+ continue;
+ }
+ growRegion(Cand);
+
+ SpillPlacer->finish();
+
+ // No live bundles, defer to splitSingleBlocks().
+ if (!Cand.LiveBundles.any()) {
+ DEBUG(dbgs() << " no bundles.\n");
+ continue;
+ }
+
+ Cost += calcGlobalSplitCost(Cand);
+ DEBUG({
+ dbgs() << ", total = "; MBFI->printBlockFreq(dbgs(), Cost)
+ << " with bundles";
+ for (int i = Cand.LiveBundles.find_first(); i>=0;
+ i = Cand.LiveBundles.find_next(i))
+ dbgs() << " EB#" << i;
+ dbgs() << ".\n";
+ });
+ if (Cost < BestCost) {
+ BestCand = NumCands;
+ BestCost = Cost;
+ }
+ ++NumCands;
+ }
+ return BestCand;
+}
+
+unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
+ bool HasCompact,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVector<unsigned, 8> UsedCands;
+ // Prepare split editor.
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitSpillMode);
+
+ // Assign all edge bundles to the preferred candidate, or NoCand.
+ BundleCand.assign(Bundles->getNumBundles(), NoCand);
+
+ // Assign bundles for the best candidate region.
+ if (BestCand != NoCand) {
+ GlobalSplitCandidate &Cand = GlobalCand[BestCand];
+ if (unsigned B = Cand.getBundles(BundleCand, BestCand)) {
+ UsedCands.push_back(BestCand);
+ Cand.IntvIdx = SE->openIntv();
+ DEBUG(dbgs() << "Split for " << PrintReg(Cand.PhysReg, TRI) << " in "
+ << B << " bundles, intv " << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ // Assign bundles for the compact region.
+ if (HasCompact) {
+ GlobalSplitCandidate &Cand = GlobalCand.front();
+ assert(!Cand.PhysReg && "Compact region has no physreg");
+ if (unsigned B = Cand.getBundles(BundleCand, 0)) {
+ UsedCands.push_back(0);
+ Cand.IntvIdx = SE->openIntv();
+ DEBUG(dbgs() << "Split for compact region in " << B << " bundles, intv "
+ << Cand.IntvIdx << ".\n");
+ (void)B;
+ }
+ }
+
+ splitAroundRegion(LREdit, UsedCands);
+ return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Per-Block Splitting
+//===----------------------------------------------------------------------===//
+
+/// tryBlockSplit - Split a global live range around every block with uses. This
+/// creates a lot of local live ranges that will be split by tryLocalSplit if
+/// they don't allocate.
+unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
+ unsigned Reg = VirtReg.reg;
+ bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitSpillMode);
+ ArrayRef<SplitAnalysis::BlockInfo> UseBlocks = SA->getUseBlocks();
+ for (unsigned i = 0; i != UseBlocks.size(); ++i) {
+ const SplitAnalysis::BlockInfo &BI = UseBlocks[i];
+ if (SA->shouldSplitSingleBlock(BI, SingleInstrs))
+ SE->splitSingleBlock(BI);
+ }
+ // No blocks were split.
+ if (LREdit.empty())
+ return 0;
+
+ // We did split for some blocks.
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+
+ // Tell LiveDebugVariables about the new ranges.
+ DebugVars->splitRegister(Reg, LREdit.regs(), *LIS);
+
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+ // Sort out the new intervals created by splitting. The remainder interval
+ // goes straight to spilling, the new local ranges get to stay RS_New.
+ for (unsigned i = 0, e = LREdit.size(); i != e; ++i) {
+ LiveInterval &LI = LIS->getInterval(LREdit.get(i));
+ if (getStage(LI) == RS_New && IntvMap[i] == 0)
+ setStage(LI, RS_Spill);
+ }
+
+ if (VerifyEnabled)
+ MF->verify(this, "After splitting live range around basic blocks");
+ return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Per-Instruction Splitting
+//===----------------------------------------------------------------------===//
+
+/// Get the number of allocatable registers that match the constraints of \p Reg
+/// on \p MI and that are also in \p SuperRC.
+static unsigned getNumAllocatableRegsForConstraints(
+ const MachineInstr *MI, unsigned Reg, const TargetRegisterClass *SuperRC,
+ const TargetInstrInfo *TII, const TargetRegisterInfo *TRI,
+ const RegisterClassInfo &RCI) {
+ assert(SuperRC && "Invalid register class");
+
+ const TargetRegisterClass *ConstrainedRC =
+ MI->getRegClassConstraintEffectForVReg(Reg, SuperRC, TII, TRI,
+ /* ExploreBundle */ true);
+ if (!ConstrainedRC)
+ return 0;
+ return RCI.getNumAllocatableRegs(ConstrainedRC);
+}
+
+/// tryInstructionSplit - Split a live range around individual instructions.
+/// This is normally not worthwhile since the spiller is doing essentially the
+/// same thing. However, when the live range is in a constrained register
+/// class, it may help to insert copies such that parts of the live range can
+/// be moved to a larger register class.
+///
+/// This is similar to spilling to a larger register class.
+unsigned
+RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg);
+ // There is no point to this if there are no larger sub-classes.
+ if (!RegClassInfo.isProperSubClass(CurRC))
+ return 0;
+
+ // Always enable split spill mode, since we're effectively spilling to a
+ // register.
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit, SplitEditor::SM_Size);
+
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ if (Uses.size() <= 1)
+ return 0;
+
+ DEBUG(dbgs() << "Split around " << Uses.size() << " individual instrs.\n");
+
+ const TargetRegisterClass *SuperRC =
+ TRI->getLargestLegalSuperClass(CurRC, *MF);
+ unsigned SuperRCNumAllocatableRegs = RCI.getNumAllocatableRegs(SuperRC);
+ // Split around every non-copy instruction if this split will relax
+ // the constraints on the virtual register.
+ // Otherwise, splitting just inserts uncoalescable copies that do not help
+ // the allocation.
+ for (unsigned i = 0; i != Uses.size(); ++i) {
+ if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Uses[i]))
+ if (MI->isFullCopy() ||
+ SuperRCNumAllocatableRegs ==
+ getNumAllocatableRegsForConstraints(MI, VirtReg.reg, SuperRC, TII,
+ TRI, RCI)) {
+ DEBUG(dbgs() << " skip:\t" << Uses[i] << '\t' << *MI);
+ continue;
+ }
+ SE->openIntv();
+ SlotIndex SegStart = SE->enterIntvBefore(Uses[i]);
+ SlotIndex SegStop = SE->leaveIntvAfter(Uses[i]);
+ SE->useIntv(SegStart, SegStop);
+ }
+
+ if (LREdit.empty()) {
+ DEBUG(dbgs() << "All uses were copies.\n");
+ return 0;
+ }
+
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+
+ // Assign all new registers to RS_Spill. This was the last chance.
+ setStage(LREdit.begin(), LREdit.end(), RS_Spill);
+ return 0;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Local Splitting
+//===----------------------------------------------------------------------===//
+
+
+/// calcGapWeights - Compute the maximum spill weight that needs to be evicted
+/// in order to use PhysReg between two entries in SA->UseSlots.
+///
+/// GapWeight[i] represents the gap between UseSlots[i] and UseSlots[i+1].
+///
+void RAGreedy::calcGapWeights(unsigned PhysReg,
+ SmallVectorImpl<float> &GapWeight) {
+ assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
+ const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ const unsigned NumGaps = Uses.size()-1;
+
+ // Start and end points for the interference check.
+ SlotIndex StartIdx =
+ BI.LiveIn ? BI.FirstInstr.getBaseIndex() : BI.FirstInstr;
+ SlotIndex StopIdx =
+ BI.LiveOut ? BI.LastInstr.getBoundaryIndex() : BI.LastInstr;
+
+ GapWeight.assign(NumGaps, 0.0f);
+
+ // Add interference from each overlapping register.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ if (!Matrix->query(const_cast<LiveInterval&>(SA->getParent()), *Units)
+ .checkInterference())
+ continue;
+
+ // We know that VirtReg is a continuous interval from FirstInstr to
+ // LastInstr, so we don't need InterferenceQuery.
+ //
+ // Interference that overlaps an instruction is counted in both gaps
+ // surrounding the instruction. The exception is interference before
+ // StartIdx and after StopIdx.
+ //
+ LiveIntervalUnion::SegmentIter IntI =
+ Matrix->getLiveUnions()[*Units].find(StartIdx);
+ for (unsigned Gap = 0; IntI.valid() && IntI.start() < StopIdx; ++IntI) {
+ // Skip the gaps before IntI.
+ while (Uses[Gap+1].getBoundaryIndex() < IntI.start())
+ if (++Gap == NumGaps)
+ break;
+ if (Gap == NumGaps)
+ break;
+
+ // Update the gaps covered by IntI.
+ const float weight = IntI.value()->weight;
+ for (; Gap != NumGaps; ++Gap) {
+ GapWeight[Gap] = std::max(GapWeight[Gap], weight);
+ if (Uses[Gap+1].getBaseIndex() >= IntI.stop())
+ break;
+ }
+ if (Gap == NumGaps)
+ break;
+ }
+ }
+
+ // Add fixed interference.
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ const LiveRange &LR = LIS->getRegUnit(*Units);
+ LiveRange::const_iterator I = LR.find(StartIdx);
+ LiveRange::const_iterator E = LR.end();
+
+ // Same loop as above. Mark any overlapped gaps as HUGE_VALF.
+ for (unsigned Gap = 0; I != E && I->start < StopIdx; ++I) {
+ while (Uses[Gap+1].getBoundaryIndex() < I->start)
+ if (++Gap == NumGaps)
+ break;
+ if (Gap == NumGaps)
+ break;
+
+ for (; Gap != NumGaps; ++Gap) {
+ GapWeight[Gap] = llvm::huge_valf;
+ if (Uses[Gap+1].getBaseIndex() >= I->end)
+ break;
+ }
+ if (Gap == NumGaps)
+ break;
+ }
+ }
+}
+
+/// tryLocalSplit - Try to split VirtReg into smaller intervals inside its only
+/// basic block.
+///
+unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ assert(SA->getUseBlocks().size() == 1 && "Not a local interval");
+ const SplitAnalysis::BlockInfo &BI = SA->getUseBlocks().front();
+
+ // Note that it is possible to have an interval that is live-in or live-out
+ // while only covering a single block - A phi-def can use undef values from
+ // predecessors, and the block could be a single-block loop.
+ // We don't bother doing anything clever about such a case, we simply assume
+ // that the interval is continuous from FirstInstr to LastInstr. We should
+ // make sure that we don't do anything illegal to such an interval, though.
+
+ ArrayRef<SlotIndex> Uses = SA->getUseSlots();
+ if (Uses.size() <= 2)
+ return 0;
+ const unsigned NumGaps = Uses.size()-1;
+
+ DEBUG({
+ dbgs() << "tryLocalSplit: ";
+ for (unsigned i = 0, e = Uses.size(); i != e; ++i)
+ dbgs() << ' ' << Uses[i];
+ dbgs() << '\n';
+ });
+
+ // If VirtReg is live across any register mask operands, compute a list of
+ // gaps with register masks.
+ SmallVector<unsigned, 8> RegMaskGaps;
+ if (Matrix->checkRegMaskInterference(VirtReg)) {
+ // Get regmask slots for the whole block.
+ ArrayRef<SlotIndex> RMS = LIS->getRegMaskSlotsInBlock(BI.MBB->getNumber());
+ DEBUG(dbgs() << RMS.size() << " regmasks in block:");
+ // Constrain to VirtReg's live range.
+ unsigned ri = std::lower_bound(RMS.begin(), RMS.end(),
+ Uses.front().getRegSlot()) - RMS.begin();
+ unsigned re = RMS.size();
+ for (unsigned i = 0; i != NumGaps && ri != re; ++i) {
+ // Look for Uses[i] <= RMS <= Uses[i+1].
+ assert(!SlotIndex::isEarlierInstr(RMS[ri], Uses[i]));
+ if (SlotIndex::isEarlierInstr(Uses[i+1], RMS[ri]))
+ continue;
+ // Skip a regmask on the same instruction as the last use. It doesn't
+ // overlap the live range.
+ if (SlotIndex::isSameInstr(Uses[i+1], RMS[ri]) && i+1 == NumGaps)
+ break;
+ DEBUG(dbgs() << ' ' << RMS[ri] << ':' << Uses[i] << '-' << Uses[i+1]);
+ RegMaskGaps.push_back(i);
+ // Advance ri to the next gap. A regmask on one of the uses counts in
+ // both gaps.
+ while (ri != re && SlotIndex::isEarlierInstr(RMS[ri], Uses[i+1]))
+ ++ri;
+ }
+ DEBUG(dbgs() << '\n');
+ }
+
+ // Since we allow local split results to be split again, there is a risk of
+ // creating infinite loops. It is tempting to require that the new live
+ // ranges have fewer instructions than the original. That would guarantee
+ // convergence, but it is too strict. A live range with 3 instructions can be
+ // split 2+3 (including the COPY), and we want to allow that.
+ //
+ // Instead we use these rules:
+ //
+ // 1. Allow any split for ranges with getStage() < RS_Split2. (Except for the
+ // noop split, of course).
+ // 2. Require progress be made for ranges with getStage() == RS_Split2. All
+ // the new ranges must have fewer instructions than before the split.
+ // 3. New ranges with the same number of instructions are marked RS_Split2,
+ // smaller ranges are marked RS_New.
+ //
+ // These rules allow a 3 -> 2+3 split once, which we need. They also prevent
+ // excessive splitting and infinite loops.
+ //
+ bool ProgressRequired = getStage(VirtReg) >= RS_Split2;
+
+ // Best split candidate.
+ unsigned BestBefore = NumGaps;
+ unsigned BestAfter = 0;
+ float BestDiff = 0;
+
+ const float blockFreq =
+ SpillPlacer->getBlockFrequency(BI.MBB->getNumber()).getFrequency() *
+ (1.0f / MBFI->getEntryFreq());
+ SmallVector<float, 8> GapWeight;
+
+ Order.rewind();
+ while (unsigned PhysReg = Order.next()) {
+ // Keep track of the largest spill weight that would need to be evicted in
+ // order to make use of PhysReg between UseSlots[i] and UseSlots[i+1].
+ calcGapWeights(PhysReg, GapWeight);
+
+ // Remove any gaps with regmask clobbers.
+ if (Matrix->checkRegMaskInterference(VirtReg, PhysReg))
+ for (unsigned i = 0, e = RegMaskGaps.size(); i != e; ++i)
+ GapWeight[RegMaskGaps[i]] = llvm::huge_valf;
+
+ // Try to find the best sequence of gaps to close.
+ // The new spill weight must be larger than any gap interference.
+
+ // We will split before Uses[SplitBefore] and after Uses[SplitAfter].
+ unsigned SplitBefore = 0, SplitAfter = 1;
+
+ // MaxGap should always be max(GapWeight[SplitBefore..SplitAfter-1]).
+ // It is the spill weight that needs to be evicted.
+ float MaxGap = GapWeight[0];
+
+ for (;;) {
+ // Live before/after split?
+ const bool LiveBefore = SplitBefore != 0 || BI.LiveIn;
+ const bool LiveAfter = SplitAfter != NumGaps || BI.LiveOut;
+
+ DEBUG(dbgs() << PrintReg(PhysReg, TRI) << ' '
+ << Uses[SplitBefore] << '-' << Uses[SplitAfter]
+ << " i=" << MaxGap);
+
+ // Stop before the interval gets so big we wouldn't be making progress.
+ if (!LiveBefore && !LiveAfter) {
+ DEBUG(dbgs() << " all\n");
+ break;
+ }
+ // Should the interval be extended or shrunk?
+ bool Shrink = true;
+
+ // How many gaps would the new range have?
+ unsigned NewGaps = LiveBefore + SplitAfter - SplitBefore + LiveAfter;
+
+ // Legally, without causing looping?
+ bool Legal = !ProgressRequired || NewGaps < NumGaps;
+
+ if (Legal && MaxGap < llvm::huge_valf) {
+ // Estimate the new spill weight. Each instruction reads or writes the
+ // register. Conservatively assume there are no read-modify-write
+ // instructions.
+ //
+ // Try to guess the size of the new interval.
+ const float EstWeight = normalizeSpillWeight(
+ blockFreq * (NewGaps + 1),
+ Uses[SplitBefore].distance(Uses[SplitAfter]) +
+ (LiveBefore + LiveAfter) * SlotIndex::InstrDist,
+ 1);
+ // Would this split be possible to allocate?
+ // Never allocate all gaps, we wouldn't be making progress.
+ DEBUG(dbgs() << " w=" << EstWeight);
+ if (EstWeight * Hysteresis >= MaxGap) {
+ Shrink = false;
+ float Diff = EstWeight - MaxGap;
+ if (Diff > BestDiff) {
+ DEBUG(dbgs() << " (best)");
+ BestDiff = Hysteresis * Diff;
+ BestBefore = SplitBefore;
+ BestAfter = SplitAfter;
+ }
+ }
+ }
+
+ // Try to shrink.
+ if (Shrink) {
+ if (++SplitBefore < SplitAfter) {
+ DEBUG(dbgs() << " shrink\n");
+ // Recompute the max when necessary.
+ if (GapWeight[SplitBefore - 1] >= MaxGap) {
+ MaxGap = GapWeight[SplitBefore];
+ for (unsigned i = SplitBefore + 1; i != SplitAfter; ++i)
+ MaxGap = std::max(MaxGap, GapWeight[i]);
+ }
+ continue;
+ }
+ MaxGap = 0;
+ }
+
+ // Try to extend the interval.
+ if (SplitAfter >= NumGaps) {
+ DEBUG(dbgs() << " end\n");
+ break;
+ }
+
+ DEBUG(dbgs() << " extend\n");
+ MaxGap = std::max(MaxGap, GapWeight[SplitAfter++]);
+ }
+ }
+
+ // Didn't find any candidates?
+ if (BestBefore == NumGaps)
+ return 0;
+
+ DEBUG(dbgs() << "Best local split range: " << Uses[BestBefore]
+ << '-' << Uses[BestAfter] << ", " << BestDiff
+ << ", " << (BestAfter - BestBefore + 1) << " instrs\n");
+
+ LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ SE->reset(LREdit);
+
+ SE->openIntv();
+ SlotIndex SegStart = SE->enterIntvBefore(Uses[BestBefore]);
+ SlotIndex SegStop = SE->leaveIntvAfter(Uses[BestAfter]);
+ SE->useIntv(SegStart, SegStop);
+ SmallVector<unsigned, 8> IntvMap;
+ SE->finish(&IntvMap);
+ DebugVars->splitRegister(VirtReg.reg, LREdit.regs(), *LIS);
+
+ // If the new range has the same number of instructions as before, mark it as
+ // RS_Split2 so the next split will be forced to make progress. Otherwise,
+ // leave the new intervals as RS_New so they can compete.
+ bool LiveBefore = BestBefore != 0 || BI.LiveIn;
+ bool LiveAfter = BestAfter != NumGaps || BI.LiveOut;
+ unsigned NewGaps = LiveBefore + BestAfter - BestBefore + LiveAfter;
+ if (NewGaps >= NumGaps) {
+ DEBUG(dbgs() << "Tagging non-progress ranges: ");
+ assert(!ProgressRequired && "Didn't make progress when it was required.");
+ for (unsigned i = 0, e = IntvMap.size(); i != e; ++i)
+ if (IntvMap[i] == 1) {
+ setStage(LIS->getInterval(LREdit.get(i)), RS_Split2);
+ DEBUG(dbgs() << PrintReg(LREdit.get(i)));
+ }
+ DEBUG(dbgs() << '\n');
+ }
+ ++NumLocalSplits;
+
+ return 0;
+}
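// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): a
// brute-force version of the window search that tryLocalSplit performs
// incrementally above.  GapWeight, the block frequency and the weight
// estimate are made-up stand-ins for the real data structures; the point is
// only the criterion "the estimated new spill weight must beat the largest
// interference weight inside the chosen window of gaps".
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // One interference weight per gap between consecutive uses of the interval.
  std::vector<float> GapWeight = {0.5f, 4.0f, 0.25f, 0.25f, 3.0f};
  const float BlockFreq = 8.0f; // assumed frequency of the single basic block

  float BestDiff = 0;
  unsigned BestBefore = 0, BestAfter = 0;
  for (unsigned B = 0; B < GapWeight.size(); ++B) {
    for (unsigned A = B + 1; A <= GapWeight.size(); ++A) {
      // Largest interference weight inside the candidate window of gaps [B, A).
      float MaxGap = *std::max_element(GapWeight.begin() + B,
                                       GapWeight.begin() + A);
      // Crude stand-in for normalizeSpillWeight: wider windows cost more.
      float EstWeight = BlockFreq / float(A - B + 1);
      if (EstWeight > MaxGap && EstWeight - MaxGap > BestDiff) {
        BestDiff = EstWeight - MaxGap;
        BestBefore = B;
        BestAfter = A;
      }
    }
  }
  std::printf("best window: gaps [%u, %u), diff %.2f\n",
              BestBefore, BestAfter, BestDiff);
}
// ---------------------------------------------------------------------------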
+
+//===----------------------------------------------------------------------===//
+// Live Range Splitting
+//===----------------------------------------------------------------------===//
+
+/// trySplit - Try to split VirtReg or one of its interferences, making it
+/// assignable.
+/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
+unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ // Ranges must be Split2 or less.
+ if (getStage(VirtReg) >= RS_Spill)
+ return 0;
+
+ // Local intervals are handled separately.
+ if (LIS->intervalIsInOneMBB(VirtReg)) {
+ NamedRegionTimer T("Local Splitting", TimerGroupName, TimePassesIsEnabled);
+ SA->analyze(&VirtReg);
+ unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+ return tryInstructionSplit(VirtReg, Order, NewVRegs);
+ }
+
+ NamedRegionTimer T("Global Splitting", TimerGroupName, TimePassesIsEnabled);
+
+ SA->analyze(&VirtReg);
+
+ // FIXME: SplitAnalysis may repair broken live ranges coming from the
+ // coalescer. That may cause the range to become allocatable which means that
+ // tryRegionSplit won't be making progress. This check should be replaced with
+ // an assertion when the coalescer is fixed.
+ if (SA->didRepairRange()) {
+ // VirtReg has changed, so all cached queries are invalid.
+ Matrix->invalidateVirtRegs();
+ if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs))
+ return PhysReg;
+ }
+
+ // First try to split around a region spanning multiple blocks. RS_Split2
+ // ranges already made dubious progress with region splitting, so they go
+ // straight to single block splitting.
+ if (getStage(VirtReg) < RS_Split2) {
+ unsigned PhysReg = tryRegionSplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+ }
+
+ // Then isolate blocks.
+ return tryBlockSplit(VirtReg, Order, NewVRegs);
+}
+
+//===----------------------------------------------------------------------===//
+// Last Chance Recoloring
+//===----------------------------------------------------------------------===//
+
+/// mayRecolorAllInterferences - Check if the virtual registers that
+/// interfere with \p VirtReg on \p PhysReg (or one of its aliases) may be
+/// recolored to free \p PhysReg.
+/// When true is returned, \p RecoloringCandidates has been augmented with all
+/// the live intervals that need to be recolored in order to free \p PhysReg
+/// for \p VirtReg.
+/// \p FixedRegisters contains all the virtual registers that cannot be
+/// recolored.
+bool
+RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
+ SmallLISet &RecoloringCandidates,
+ const SmallVirtRegSet &FixedRegisters) {
+ const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg);
+
+ for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
+ LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
+ // If there are LastChanceRecoloringMaxInterference or more interferences,
+ // chances are one of them would not be recolorable.
+ if (Q.collectInterferingVRegs(LastChanceRecoloringMaxInterference) >=
+ LastChanceRecoloringMaxInterference && !ExhaustiveSearch) {
+ DEBUG(dbgs() << "Early abort: too many interferences.\n");
+ CutOffInfo |= CO_Interf;
+ return false;
+ }
+ for (unsigned i = Q.interferingVRegs().size(); i; --i) {
+ LiveInterval *Intf = Q.interferingVRegs()[i - 1];
+ // If Intf is done and sits in the same register class as VirtReg,
+ // it would not be recolorable as it is in the same state as VirtReg.
+ if ((getStage(*Intf) == RS_Done &&
+ MRI->getRegClass(Intf->reg) == CurRC) ||
+ FixedRegisters.count(Intf->reg)) {
+ DEBUG(dbgs() << "Early abort: the interference is not recolorable.\n");
+ return false;
+ }
+ RecoloringCandidates.insert(Intf);
+ }
+ }
+ return true;
+}
+
+/// tryLastChanceRecoloring - Try to assign a color to \p VirtReg by recoloring
+/// its interferences.
+/// Last chance recoloring chooses a color for \p VirtReg and recolors every
+/// virtual register that was using it. The recoloring process may recursively
+/// use the last chance recoloring. Therefore, when a virtual register has been
+/// assigned a color by this mechanism, it is marked as Fixed, i.e., it cannot
+/// be last-chance-recolored again during this recoloring "session".
+/// E.g.,
+/// Let
+/// vA can use {R1, R2 }
+/// vB can use { R2, R3}
+/// vC can use {R1 }
+/// Where vA, vB, and vC cannot be split anymore (they are reloads for
+/// instance) and they all interfere.
+///
+/// vA is assigned R1
+/// vB is assigned R2
+/// vC tries to evict vA but vA is already done.
+/// Regular register allocation fails.
+///
+/// Last chance recoloring kicks in:
+/// vC does as if vA was evicted => vC uses R1.
+/// vC is marked as fixed.
+/// vA needs to find a color.
+/// None are available.
+/// vA cannot evict vC: vC is a fixed virtual register now.
+/// vA does as if vB was evicted => vA uses R2.
+/// vB needs to find a color.
+/// R3 is available.
+/// Recoloring => vC = R1, vA = R2, vB = R3
+///
+/// \p Order defines the preferred allocation order for \p VirtReg.
+/// \p NewRegs will contain any new virtual register that have been created
+/// (split, spill) during the process and that must be assigned.
+/// \p FixedRegisters contains all the virtual registers that cannot be
+/// recolored.
+/// \p Depth gives the current depth of the last chance recoloring.
+/// \return a physical register that can be used for VirtReg or ~0u if none
+/// exists.
+unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ unsigned Depth) {
+ DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
+ // Ranges must be Done.
+ assert((getStage(VirtReg) >= RS_Done || !VirtReg.isSpillable()) &&
+ "Last chance recoloring should really be last chance");
+ // Set the max depth to LastChanceRecoloringMaxDepth.
+ // We may want to reconsider that if we end up with a too-large search space
+ // for targets with hundreds of registers.
+ // Indeed, in that case we may want to cut the search space earlier.
+ if (Depth >= LastChanceRecoloringMaxDepth && !ExhaustiveSearch) {
+ DEBUG(dbgs() << "Abort because max depth has been reached.\n");
+ CutOffInfo |= CO_Depth;
+ return ~0u;
+ }
+
+ // Set of Live intervals that will need to be recolored.
+ SmallLISet RecoloringCandidates;
+ // Record the original mapping from virtual register to physical register in
+ // case the recoloring fails.
+ DenseMap<unsigned, unsigned> VirtRegToPhysReg;
+ // Mark VirtReg as fixed, i.e., it will not be recolored past this point in
+ // this recoloring "session".
+ FixedRegisters.insert(VirtReg.reg);
+
+ Order.rewind();
+ while (unsigned PhysReg = Order.next()) {
+ DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
+ << PrintReg(PhysReg, TRI) << '\n');
+ RecoloringCandidates.clear();
+ VirtRegToPhysReg.clear();
+
+ // It is only possible to recolor virtual register interference.
+ if (Matrix->checkInterference(VirtReg, PhysReg) >
+ LiveRegMatrix::IK_VirtReg) {
+ DEBUG(dbgs() << "Some interferences are not with virtual registers.\n");
+
+ continue;
+ }
+
+ // Give up early on this PhysReg if it is obvious we cannot recolor all
+ // the interferences.
+ if (!mayRecolorAllInterferences(PhysReg, VirtReg, RecoloringCandidates,
+ FixedRegisters)) {
+ DEBUG(dbgs() << "Some interferences cannot be recolored.\n");
+ continue;
+ }
+
+ // RecoloringCandidates contains all the virtual registers that interfere
+ // with VirtReg on PhysReg (or one of its aliases).
+ // Enqueue them for recoloring and perform the actual recoloring.
+ PQueue RecoloringQueue;
+ for (SmallLISet::iterator It = RecoloringCandidates.begin(),
+ EndIt = RecoloringCandidates.end();
+ It != EndIt; ++It) {
+ unsigned ItVirtReg = (*It)->reg;
+ enqueue(RecoloringQueue, *It);
+ assert(VRM->hasPhys(ItVirtReg) &&
+ "Interferences are supposed to be with allocated variables");
+
+ // Record the current allocation.
+ VirtRegToPhysReg[ItVirtReg] = VRM->getPhys(ItVirtReg);
+ // unset the related struct.
+ Matrix->unassign(**It);
+ }
+
+ // Pretend VirtReg is assigned to PhysReg so that the underlying
+ // recoloring has the right information about the interferences and
+ // available colors.
+ Matrix->assign(VirtReg, PhysReg);
+
+ // Save the current recoloring state.
+ // If we cannot recolor all the interferences, we will have to start again
+ // at this point for the next physical register.
+ SmallVirtRegSet SaveFixedRegisters(FixedRegisters);
+ if (tryRecoloringCandidates(RecoloringQueue, NewVRegs, FixedRegisters,
+ Depth)) {
+ // Do not mess up the global assignment process.
+ // I.e., VirtReg must be unassigned.
+ Matrix->unassign(VirtReg);
+ return PhysReg;
+ }
+
+ DEBUG(dbgs() << "Fail to assign: " << VirtReg << " to "
+ << PrintReg(PhysReg, TRI) << '\n');
+
+ // The recoloring attempt failed, undo the changes.
+ FixedRegisters = SaveFixedRegisters;
+ Matrix->unassign(VirtReg);
+
+ for (SmallLISet::iterator It = RecoloringCandidates.begin(),
+ EndIt = RecoloringCandidates.end();
+ It != EndIt; ++It) {
+ unsigned ItVirtReg = (*It)->reg;
+ if (VRM->hasPhys(ItVirtReg))
+ Matrix->unassign(**It);
+ unsigned ItPhysReg = VirtRegToPhysReg[ItVirtReg];
+ Matrix->assign(**It, ItPhysReg);
+ }
+ }
+
+ // Last chance recoloring did not work either; give up.
+ return ~0u;
+}
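// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// vA/vB/vC example described in the tryLastChanceRecoloring comment above,
// solved with the same "fix one vreg, recursively recolor its interferences,
// undo on failure" idea.  Allowed sets, the interference relation and the
// initial assignment are made up to match that example.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

using Reg = std::string;

static std::map<Reg, std::vector<int>> Allowed = {
    {"vA", {1, 2}}, {"vB", {2, 3}}, {"vC", {1}}};
static std::map<Reg, int> Assignment; // vreg -> physreg, 0 = unassigned

// In this tiny example all three vregs mutually interfere.
static bool interferes(const Reg &A, const Reg &B) { return A != B; }

static bool recolor(const Reg &VReg, std::set<Reg> Fixed) {
  Fixed.insert(VReg);
  for (int PhysReg : Allowed[VReg]) {
    // Collect the interfering vregs currently holding PhysReg.
    std::vector<Reg> Evicted;
    bool Blocked = false;
    for (const auto &Entry : Assignment) {
      const Reg &Other = Entry.first;
      if (Other != VReg && Entry.second == PhysReg && interferes(VReg, Other)) {
        if (Fixed.count(Other)) { Blocked = true; break; }
        Evicted.push_back(Other);
      }
    }
    if (Blocked)
      continue;                            // a fixed vreg owns PhysReg, try next
    std::map<Reg, int> Saved = Assignment; // snapshot for backtracking
    for (const Reg &E : Evicted)
      Assignment[E] = 0;
    Assignment[VReg] = PhysReg;            // "do as if" the others were evicted
    bool OK = true;
    for (const Reg &E : Evicted)
      if (!recolor(E, Fixed)) { OK = false; break; }
    if (OK)
      return true;
    Assignment = Saved;                    // recoloring failed, undo and retry
  }
  return false;
}

int main() {
  Assignment = {{"vA", 1}, {"vB", 2}, {"vC", 0}}; // vC failed regular allocation
  if (recolor("vC", {}))
    for (const auto &Entry : Assignment)
      std::printf("%s -> R%d\n", Entry.first.c_str(), Entry.second);
  // Prints vA -> R2, vB -> R3, vC -> R1, matching the comment's result.
}
// ---------------------------------------------------------------------------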
+
+/// tryRecoloringCandidates - Try to assign a new color to every register
+/// in \p RecoloringQueue.
+/// \p NewRegs will contain any new virtual register created during the
+/// recoloring process.
+/// \p FixedRegisters[in/out] contains all the registers that have been
+/// recolored.
+/// \return true if all virtual registers in RecoloringQueue were successfully
+/// recolored, false otherwise.
+bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
+ SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ unsigned Depth) {
+ while (!RecoloringQueue.empty()) {
+ LiveInterval *LI = dequeue(RecoloringQueue);
+ DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
+ unsigned PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, Depth + 1);
+ if (PhysReg == ~0u || !PhysReg)
+ return false;
+ DEBUG(dbgs() << "Recoloring of " << *LI
+ << " succeeded with: " << PrintReg(PhysReg, TRI) << '\n');
+ Matrix->assign(*LI, PhysReg);
+ FixedRegisters.insert(LI->reg);
+ }
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Main Entry Point
+//===----------------------------------------------------------------------===//
+
+unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ CutOffInfo = CO_None;
+ LLVMContext &Ctx = MF->getFunction()->getContext();
+ SmallVirtRegSet FixedRegisters;
+ unsigned Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
+ if (Reg == ~0U && (CutOffInfo != CO_None)) {
+ uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
+ if (CutOffEncountered == CO_Depth)
+ Ctx.emitError("register allocation failed: maximum depth for recoloring "
+ "reached. Use -fexhaustive-register-search to skip "
+ "cutoffs");
+ else if (CutOffEncountered == CO_Interf)
+ Ctx.emitError("register allocation failed: maximum interference for "
+ "recoloring reached. Use -fexhaustive-register-search "
+ "to skip cutoffs");
+ else if (CutOffEncountered == (CO_Depth | CO_Interf))
+ Ctx.emitError("register allocation failed: maximum interference and "
+ "depth for recoloring reached. Use "
+ "-fexhaustive-register-search to skip cutoffs");
+ }
+ return Reg;
+}
+
+/// Using a CSR for the first time has a cost because it causes push|pop
+/// to be added to the prologue|epilogue. Splitting a cold section of the live
+/// range, or spilling the live range in the cold path, can have a lower cost
+/// than using the CSR for the first time. Returns the physical register if we
+/// decide to use the CSR; otherwise returns 0.
+unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
+ AllocationOrder &Order,
+ unsigned PhysReg,
+ unsigned &CostPerUseLimit,
+ SmallVectorImpl<unsigned> &NewVRegs) {
+ if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
+ // We choose spill over using the CSR for the first time if the spill cost
+ // is lower than CSRCost.
+ SA->analyze(&VirtReg);
+ if (calcSpillCost() >= CSRCost)
+ return PhysReg;
+
+ // We are going to spill, set CostPerUseLimit to 1 to make sure that
+ // we will not use a callee-saved register in tryEvict.
+ CostPerUseLimit = 1;
+ return 0;
+ }
+ if (getStage(VirtReg) < RS_Split) {
+ // We choose pre-splitting over using the CSR for the first time if
+ // the cost of splitting is lower than CSRCost.
+ SA->analyze(&VirtReg);
+ unsigned NumCands = 0;
+ BlockFrequency BestCost = CSRCost; // Don't modify CSRCost.
+ unsigned BestCand = calculateRegionSplitCost(VirtReg, Order, BestCost,
+ NumCands, true /*IgnoreCSR*/);
+ if (BestCand == NoCand)
+ // Use the CSR if we can't find a region split below CSRCost.
+ return PhysReg;
+
+ // Perform the actual pre-splitting.
+ doRegionSplit(VirtReg, BestCand, false/*HasCompact*/, NewVRegs);
+ return 0;
+ }
+ return PhysReg;
+}
+
+void RAGreedy::aboutToRemoveInterval(LiveInterval &LI) {
+ // Do not keep invalid information around.
+ SetOfBrokenHints.remove(&LI);
+}
+
+void RAGreedy::initializeCSRCost() {
+ // We use the larger of the command-line option and the value reported
+ // by TRI.
+ CSRCost = BlockFrequency(
+ std::max((unsigned)CSRFirstTimeCost, TRI->getCSRFirstUseCost()));
+ if (!CSRCost.getFrequency())
+ return;
+
+ // Raw cost is relative to Entry == 2^14; scale it appropriately.
+ uint64_t ActualEntry = MBFI->getEntryFreq();
+ if (!ActualEntry) {
+ CSRCost = 0;
+ return;
+ }
+ uint64_t FixedEntry = 1 << 14;
+ if (ActualEntry < FixedEntry)
+ CSRCost *= BranchProbability(ActualEntry, FixedEntry);
+ else if (ActualEntry <= UINT32_MAX)
+ // Invert the fraction and divide.
+ CSRCost /= BranchProbability(FixedEntry, ActualEntry);
+ else
+ // Can't use BranchProbability in general, since it takes 32-bit numbers.
+ CSRCost = CSRCost.getFrequency() * (ActualEntry / FixedEntry);
+}
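// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// rescaling performed by initializeCSRCost, written with plain integers and
// ignoring BranchProbability's rounding.  The raw CSR cost (the larger of the
// CSRFirstTimeCost option and the target's getCSRFirstUseCost value) is
// expressed relative to a fixed entry frequency of 2^14, so it is scaled by
// ActualEntry / 2^14 before being compared against real block frequencies.
// The raw cost of 5 used here is an assumption for the example.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t FixedEntry = 1 << 14; // 16384, the reference entry frequency
  const uint64_t RawCSRCost = 5;       // assumed raw CSR first-use cost

  for (uint64_t ActualEntry : {uint64_t(4096), uint64_t(65536)}) {
    // A smaller ActualEntry scales the cost down, a larger one scales it up;
    // both branches of initializeCSRCost reduce to Raw * Actual / Fixed.
    uint64_t Scaled = RawCSRCost * ActualEntry / FixedEntry;
    std::printf("entry freq %llu -> CSR cost %llu\n",
                (unsigned long long)ActualEntry, (unsigned long long)Scaled);
  }
}
// ---------------------------------------------------------------------------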
+
+/// \brief Collect the hint info for \p Reg.
+/// The results are stored into \p Out.
+/// \p Out is not cleared before being populated.
+void RAGreedy::collectHintInfo(unsigned Reg, HintsInfo &Out) {
+ for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) {
+ if (!Instr.isFullCopy())
+ continue;
+ // Look for the other end of the copy.
+ unsigned OtherReg = Instr.getOperand(0).getReg();
+ if (OtherReg == Reg) {
+ OtherReg = Instr.getOperand(1).getReg();
+ if (OtherReg == Reg)
+ continue;
+ }
+ // Get the current assignment.
+ unsigned OtherPhysReg = TargetRegisterInfo::isPhysicalRegister(OtherReg)
+ ? OtherReg
+ : VRM->getPhys(OtherReg);
+ // Push the collected information.
+ Out.push_back(HintInfo(MBFI->getBlockFreq(Instr.getParent()), OtherReg,
+ OtherPhysReg));
+ }
+}
+
+/// \brief Using the given \p List, compute the cost of the broken hints if
+/// \p PhysReg was used.
+/// \return The cost of \p List for \p PhysReg.
+BlockFrequency RAGreedy::getBrokenHintFreq(const HintsInfo &List,
+ unsigned PhysReg) {
+ BlockFrequency Cost = 0;
+ for (const HintInfo &Info : List) {
+ if (Info.PhysReg != PhysReg)
+ Cost += Info.Freq;
+ }
+ return Cost;
+}
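// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): what
// getBrokenHintFreq computes for a small, made-up list of copy hints.  A hint
// is "broken" when the other end of the copy lives in a different physical
// register, and its cost is the block frequency of the copy instruction.
#include <cstdio>
#include <vector>

struct HintInfo {
  unsigned Freq;    // block frequency of the copy instruction
  unsigned PhysReg; // physreg currently used by the other end of the copy
};

unsigned brokenHintFreq(const std::vector<HintInfo> &List, unsigned PhysReg) {
  unsigned Cost = 0;
  for (const HintInfo &Info : List)
    if (Info.PhysReg != PhysReg)
      Cost += Info.Freq; // this copy stays a real copy, pay its frequency
  return Cost;
}

int main() {
  std::vector<HintInfo> Hints = {{16, 1}, {4, 2}, {1, 1}};
  // Keeping R2 breaks the two copies hinted to R1: cost 16 + 1 = 17.
  std::printf("cost if assigned R2: %u\n", brokenHintFreq(Hints, 2));
  // Switching to R1 breaks only the copy hinted to R2: cost 4.
  std::printf("cost if assigned R1: %u\n", brokenHintFreq(Hints, 1));
}
// ---------------------------------------------------------------------------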
+
+/// \brief Using the register assigned to \p VirtReg, try to recolor
+/// all the live ranges that are copy-related with \p VirtReg.
+/// The recoloring is then propagated to all the live-ranges that have
+/// been recolored and so on, until no more copies can be coalesced or
+/// it is not profitable.
+/// For a given live range, profitability is determined by the sum of the
+/// frequencies of the non-identity copies it would introduce with the old
+/// and new register.
+void RAGreedy::tryHintRecoloring(LiveInterval &VirtReg) {
+ // We have a broken hint, check if it is possible to fix it by
+ // reusing PhysReg for the copy-related live-ranges. Indeed, we evicted
+ // some register and PhysReg may be available for the other live-ranges.
+ SmallSet<unsigned, 4> Visited;
+ SmallVector<unsigned, 2> RecoloringCandidates;
+ HintsInfo Info;
+ unsigned Reg = VirtReg.reg;
+ unsigned PhysReg = VRM->getPhys(Reg);
+ // Start the recoloring algorithm from the input live-interval, then
+ // it will propagate to the ones that are copy-related with it.
+ Visited.insert(Reg);
+ RecoloringCandidates.push_back(Reg);
+
+ DEBUG(dbgs() << "Trying to reconcile hints for: " << PrintReg(Reg, TRI) << '('
+ << PrintReg(PhysReg, TRI) << ")\n");
+
+ do {
+ Reg = RecoloringCandidates.pop_back_val();
+
+ // We cannot recolor physical registers.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ continue;
+
+ assert(VRM->hasPhys(Reg) && "We have unallocated variable!!");
+
+ // Get the live interval mapped with this virtual register to be able
+ // to check for the interference with the new color.
+ LiveInterval &LI = LIS->getInterval(Reg);
+ unsigned CurrPhys = VRM->getPhys(Reg);
+ // Check that the new color matches the register class constraints and
+ // that it is free for this live range.
+ if (CurrPhys != PhysReg && (!MRI->getRegClass(Reg)->contains(PhysReg) ||
+ Matrix->checkInterference(LI, PhysReg)))
+ continue;
+
+ DEBUG(dbgs() << PrintReg(Reg, TRI) << '(' << PrintReg(CurrPhys, TRI)
+ << ") is recolorable.\n");
+
+ // Gather the hint info.
+ Info.clear();
+ collectHintInfo(Reg, Info);
+ // Check if recoloring the live-range will increase the cost of the
+ // non-identity copies.
+ if (CurrPhys != PhysReg) {
+ DEBUG(dbgs() << "Checking profitability:\n");
+ BlockFrequency OldCopiesCost = getBrokenHintFreq(Info, CurrPhys);
+ BlockFrequency NewCopiesCost = getBrokenHintFreq(Info, PhysReg);
+ DEBUG(dbgs() << "Old Cost: " << OldCopiesCost.getFrequency()
+ << "\nNew Cost: " << NewCopiesCost.getFrequency() << '\n');
+ if (OldCopiesCost < NewCopiesCost) {
+ DEBUG(dbgs() << "=> Not profitable.\n");
+ continue;
+ }
+ // At this point, the new cost is either cheaper or equal. If it is
+ // equal, we consider it profitable because it may expose
+ // more recoloring opportunities.
+ DEBUG(dbgs() << "=> Profitable.\n");
+ // Recolor the live-range.
+ Matrix->unassign(LI);
+ Matrix->assign(LI, PhysReg);
+ }
+ // Push all copy-related live-ranges to keep reconciling the broken
+ // hints.
+ for (const HintInfo &HI : Info) {
+ if (Visited.insert(HI.Reg).second)
+ RecoloringCandidates.push_back(HI.Reg);
+ }
+ } while (!RecoloringCandidates.empty());
+}
+
+/// \brief Try to recolor broken hints.
+/// Broken hints may be repaired by recoloring when an evicted variable
+/// freed up a register for a larger live-range.
+/// Consider the following example:
+/// BB1:
+/// a =
+/// b =
+/// BB2:
+/// ...
+/// = b
+/// = a
+/// Let us assume b gets split:
+/// BB1:
+/// a =
+/// b =
+/// BB2:
+/// c = b
+/// ...
+/// d = c
+/// = d
+/// = a
+/// Because of how the allocation works, b, c, and d may be assigned different
+/// colors. Now, if a gets evicted later:
+/// BB1:
+/// a =
+/// st a, SpillSlot
+/// b =
+/// BB2:
+/// c = b
+/// ...
+/// d = c
+/// = d
+/// e = ld SpillSlot
+/// = e
+/// It is likely that we can assign the same register to b, c, and d,
+/// getting rid of 2 copies.
+void RAGreedy::tryHintsRecoloring() {
+ for (LiveInterval *LI : SetOfBrokenHints) {
+ assert(TargetRegisterInfo::isVirtualRegister(LI->reg) &&
+ "Recoloring is possible only for virtual registers");
+ // Some dead defs may be around (e.g., because of debug uses).
+ // Ignore those.
+ if (!VRM->hasPhys(LI->reg))
+ continue;
+ tryHintRecoloring(*LI);
+ }
+}
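// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// worklist propagation performed by tryHintRecoloring, run on made-up data
// shaped like the b/c/d example in the comment above.  Copy edges carry the
// frequency of the copy; the interference check against the target physreg is
// assumed to always succeed in this sketch.
#include <cstdio>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Copy { std::string Other; unsigned Freq; };

static std::map<std::string, unsigned> Phys = {{"b", 1}, {"c", 2}, {"d", 3}};
static std::map<std::string, std::vector<Copy>> Copies = {
    {"b", {{"c", 8}}}, {"c", {{"b", 8}, {"d", 8}}}, {"d", {{"c", 8}}}};

static unsigned brokenCost(const std::string &R, unsigned P) {
  unsigned Cost = 0;
  for (const Copy &C : Copies[R])
    if (Phys[C.Other] != P)
      Cost += C.Freq; // a copy to/from a different physreg stays a real copy
  return Cost;
}

int main() {
  const unsigned Target = 1;    // "b" holds R1; try to pull c and d along
  std::set<std::string> Visited = {"b"};
  std::vector<std::string> Worklist = {"b"};
  while (!Worklist.empty()) {
    std::string R = Worklist.back();
    Worklist.pop_back();
    if (Phys[R] != Target) {
      // Recolor R only when it does not make its copies more expensive.
      if (brokenCost(R, Target) > brokenCost(R, Phys[R]))
        continue;               // not profitable, stop propagating through R
      Phys[R] = Target;
    }
    for (const Copy &C : Copies[R])
      if (Visited.insert(C.Other).second)
        Worklist.push_back(C.Other);
  }
  for (const auto &Entry : Phys)
    std::printf("%s -> R%u\n", Entry.first.c_str(), Entry.second);
  // Prints b, c and d all mapped to R1, i.e. the two copies become identity
  // copies and can be removed.
}
// ---------------------------------------------------------------------------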
+
+unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
+ SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVirtRegSet &FixedRegisters,
+ unsigned Depth) {
+ unsigned CostPerUseLimit = ~0u;
+ // First try assigning a free register.
+ AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix);
+ if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs)) {
+ // When NewVRegs is not empty, we may have made decisions such as evicting
+ // a virtual register; go with the earlier decisions and use the physical
+ // register.
+ if (CSRCost.getFrequency() && isUnusedCalleeSavedReg(PhysReg) &&
+ NewVRegs.empty()) {
+ unsigned CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
+ CostPerUseLimit, NewVRegs);
+ if (CSRReg || !NewVRegs.empty())
+ // Return now if we decide to use a CSR or create new vregs due to
+ // pre-splitting.
+ return CSRReg;
+ } else
+ return PhysReg;
+ }
+
+ LiveRangeStage Stage = getStage(VirtReg);
+ DEBUG(dbgs() << StageName[Stage]
+ << " Cascade " << ExtraRegInfo[VirtReg.reg].Cascade << '\n');
+
+ // Try to evict a less worthy live range, but only for ranges from the primary
+ // queue. The RS_Split ranges already failed to do this, and they should not
+ // get a second chance until they have been split.
+ if (Stage != RS_Split)
+ if (unsigned PhysReg =
+ tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit)) {
+ unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
+ // If VirtReg has a hint and that hint is broken, record this
+ // virtual register as a recoloring candidate for a broken hint.
+ // Indeed, since we evicted a variable in its neighborhood it is
+ // likely we can at least partially recolor some of the
+ // copy-related live-ranges.
+ if (Hint && Hint != PhysReg)
+ SetOfBrokenHints.insert(&VirtReg);
+ return PhysReg;
+ }
+
+ assert(NewVRegs.empty() && "Cannot append to existing NewVRegs");
+
+ // The first time we see a live range, don't try to split or spill.
+ // Wait until the second time, when all smaller ranges have been allocated.
+ // This gives a better picture of the interference to split around.
+ if (Stage < RS_Split) {
+ setStage(VirtReg, RS_Split);
+ DEBUG(dbgs() << "wait for second round\n");
+ NewVRegs.push_back(VirtReg.reg);
+ return 0;
+ }
+
+ // If we couldn't allocate a register from spilling, there is probably some
+ // invalid inline assembly. The base class will report it.
+ if (Stage >= RS_Done || !VirtReg.isSpillable())
+ return tryLastChanceRecoloring(VirtReg, Order, NewVRegs, FixedRegisters,
+ Depth);
+
+ // Try splitting VirtReg or interferences.
+ unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs);
+ if (PhysReg || !NewVRegs.empty())
+ return PhysReg;
+
+ // Finally spill VirtReg itself.
+ if (EnableDeferredSpilling && getStage(VirtReg) < RS_Memory) {
+ // TODO: This is experimental and in particular, we do not model
+ // the live range splitting done by spilling correctly.
+ // We would need a deep integration with the spiller to do the
+ // right thing here. Anyway, that is still good for early testing.
+ setStage(VirtReg, RS_Memory);
+ DEBUG(dbgs() << "Do as if this register is in memory\n");
+ NewVRegs.push_back(VirtReg.reg);
+ } else {
+ NamedRegionTimer T("Spiller", TimerGroupName, TimePassesIsEnabled);
+ LiveRangeEdit LRE(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
+ spiller().spill(LRE);
+ setStage(NewVRegs.begin(), NewVRegs.end(), RS_Done);
+
+ if (VerifyEnabled)
+ MF->verify(this, "After spilling");
+ }
+
+ // The live virtual register requesting allocation was spilled, so tell
+ // the caller not to allocate anything during this round.
+ return 0;
+}
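// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// decision ladder that selectOrSplitImpl walks for a single live range.  The
// Stage and Action enums are simplified stand-ins for the RS_* LiveRangeStage
// machinery and the helper calls (tryAssign, tryEvict, trySplit, ...); the
// deferred-spilling path is ignored.
#include <cstdio>

enum class Stage { New, Split, Split2, Spill, Memory, Done };
enum class Action { Assign, Evict, Defer, Recolor, SplitRange, SpillRange };

Action nextAction(Stage S, bool FreeReg, bool CanEvict, bool Spillable) {
  if (FreeReg)
    return Action::Assign;     // a free register is always the cheapest outcome
  if (S != Stage::Split && CanEvict)
    return Action::Evict;      // push a less worthy live range out of the way
  if (S < Stage::Split)
    return Action::Defer;      // first round: wait until smaller ranges are done
  if (S >= Stage::Done || !Spillable)
    return Action::Recolor;    // nothing else left: last chance recoloring
  if (S < Stage::Spill)
    return Action::SplitRange; // splitting around the interference still allowed
  return Action::SpillRange;   // otherwise spill the range itself
}

int main() {
  // A second-round range (RS_Split2 analogue) with no free register and no
  // profitable eviction ends up in the splitter.
  Action A = nextAction(Stage::Split2, false, false, true);
  std::printf("chosen action: %d\n", static_cast<int>(A));
}
// ---------------------------------------------------------------------------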
+
+bool RAGreedy::runOnMachineFunction(MachineFunction &mf) {
+ DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n"
+ << "********** Function: " << mf.getName() << '\n');
+
+ MF = &mf;
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ RCI.runOnMachineFunction(mf);
+
+ EnableLocalReassign = EnableLocalReassignment ||
+ MF->getSubtarget().enableRALocalReassignment(
+ MF->getTarget().getOptLevel());
+
+ if (VerifyEnabled)
+ MF->verify(this, "Before greedy register allocator");
+
+ RegAllocBase::init(getAnalysis<VirtRegMap>(),
+ getAnalysis<LiveIntervals>(),
+ getAnalysis<LiveRegMatrix>());
+ Indexes = &getAnalysis<SlotIndexes>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ DomTree = &getAnalysis<MachineDominatorTree>();
+ SpillerInstance.reset(createInlineSpiller(*this, *MF, *VRM));
+ Loops = &getAnalysis<MachineLoopInfo>();
+ Bundles = &getAnalysis<EdgeBundles>();
+ SpillPlacer = &getAnalysis<SpillPlacement>();
+ DebugVars = &getAnalysis<LiveDebugVariables>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+ initializeCSRCost();
+
+ calculateSpillWeightsAndHints(*LIS, mf, VRM, *Loops, *MBFI);
+
+ DEBUG(LIS->dump());
+
+ SA.reset(new SplitAnalysis(*VRM, *LIS, *Loops));
+ SE.reset(new SplitEditor(*SA, *AA, *LIS, *VRM, *DomTree, *MBFI));
+ ExtraRegInfo.clear();
+ ExtraRegInfo.resize(MRI->getNumVirtRegs());
+ NextCascade = 1;
+ IntfCache.init(MF, Matrix->getLiveUnions(), Indexes, LIS, TRI);
+ GlobalCand.resize(32); // This will grow as needed.
+ SetOfBrokenHints.clear();
+
+ allocatePhysRegs();
+ tryHintsRecoloring();
+ postOptimization();
+
+ releaseMemory();
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
new file mode 100644
index 000000000000..d1221ec59bd4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -0,0 +1,893 @@
+//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a Partitioned Boolean Quadratic Programming (PBQP) based
+// register allocator for LLVM. This allocator works by constructing a PBQP
+// problem representing the register allocation problem under consideration,
+// solving this using a PBQP solver, and mapping the solution back to a
+// register assignment. If any variables are selected for spilling then spill
+// code is inserted and the process repeated.
+//
+// The PBQP solver (pbqp.c) provided for this allocator uses a heuristic tuned
+// for register allocation. For more information on PBQP for register
+// allocation, see the following papers:
+//
+// (1) Hames, L. and Scholz, B. 2006. Nearly optimal register allocation with
+// PBQP. In Proceedings of the 7th Joint Modular Languages Conference
+// (JMLC'06). LNCS, vol. 4228. Springer, New York, NY, USA. 346-361.
+//
+// (2) Scholz, B., Eckstein, E. 2002. Register allocation for irregular
+// architectures. In Proceedings of the Joint Conference on Languages,
+// Compilers and Tools for Embedded Systems (LCTES'02), ACM Press, New York,
+// NY, USA, 139-148.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegAllocPBQP.h"
+#include "RegisterCoalescer.h"
+#include "Spiller.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Printable.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <limits>
+#include <memory>
+#include <queue>
+#include <set>
+#include <sstream>
+#include <vector>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static RegisterRegAlloc
+RegisterPBQPRepAlloc("pbqp", "PBQP register allocator",
+ createDefaultPBQPRegisterAllocator);
+
+static cl::opt<bool>
+PBQPCoalescing("pbqp-coalescing",
+ cl::desc("Attempt coalescing during PBQP register allocation."),
+ cl::init(false), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<bool>
+PBQPDumpGraphs("pbqp-dump-graphs",
+ cl::desc("Dump graphs for each function/round in the compilation unit."),
+ cl::init(false), cl::Hidden);
+#endif
+
+namespace {
+
+///
+/// PBQP based allocators solve the register allocation problem by mapping
+/// register allocation problems to Partitioned Boolean Quadratic
+/// Programming problems.
+class RegAllocPBQP : public MachineFunctionPass {
+public:
+
+ static char ID;
+
+ /// Construct a PBQP register allocator.
+ RegAllocPBQP(char *cPassID = nullptr)
+ : MachineFunctionPass(ID), customPassID(cPassID) {
+ initializeSlotIndexesPass(*PassRegistry::getPassRegistry());
+ initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
+ initializeLiveStacksPass(*PassRegistry::getPassRegistry());
+ initializeVirtRegMapPass(*PassRegistry::getPassRegistry());
+ }
+
+ /// Return the pass name.
+ const char* getPassName() const override {
+ return "PBQP Register Allocator";
+ }
+
+ /// PBQP analysis usage.
+ void getAnalysisUsage(AnalysisUsage &au) const override;
+
+ /// Perform register allocation
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+
+ typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
+ typedef std::vector<const LiveInterval*> Node2LIMap;
+ typedef std::vector<unsigned> AllowedSet;
+ typedef std::vector<AllowedSet> AllowedSetMap;
+ typedef std::pair<unsigned, unsigned> RegPair;
+ typedef std::map<RegPair, PBQP::PBQPNum> CoalesceMap;
+ typedef std::set<unsigned> RegSet;
+
+ char *customPassID;
+
+ RegSet VRegsToAlloc, EmptyIntervalVRegs;
+
+ /// An instruction which defines an original reg, and whose defs are already
+ /// all dead after remat, is saved in DeadRemats. The deletion of such
+ /// instructions is postponed until all the allocations are done, so their
+ /// remat expressions are always available for the remat of all the siblings
+ /// of the original reg.
+ SmallPtrSet<MachineInstr *, 32> DeadRemats;
+
+ /// \brief Finds the initial set of vreg intervals to allocate.
+ void findVRegIntervalsToAlloc(const MachineFunction &MF, LiveIntervals &LIS);
+
+ /// \brief Constructs an initial graph.
+ void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
+
+ /// \brief Spill the given VReg.
+ void spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals,
+ MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
+ Spiller &VRegSpiller);
+
+ /// \brief Given a solved PBQP problem, maps the solution back to a register
+ /// assignment.
+ bool mapPBQPToRegAlloc(const PBQPRAGraph &G,
+ const PBQP::Solution &Solution,
+ VirtRegMap &VRM,
+ Spiller &VRegSpiller);
+
+ /// \brief Postprocessing before final spilling. Sets basic block "live in"
+ /// variables.
+ void finalizeAlloc(MachineFunction &MF, LiveIntervals &LIS,
+ VirtRegMap &VRM) const;
+
+ void postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS);
+};
+
+char RegAllocPBQP::ID = 0;
+
+/// @brief Set spill costs for each node in the PBQP reg-alloc graph.
+class SpillCosts : public PBQPRAConstraint {
+public:
+ void apply(PBQPRAGraph &G) override {
+ LiveIntervals &LIS = G.getMetadata().LIS;
+
+ // A minimum spill cost, so that register constraints can be set
+ // without normalization in the [0.0:MinSpillCost) interval.
+ const PBQP::PBQPNum MinSpillCost = 10.0;
+
+ for (auto NId : G.nodeIds()) {
+ PBQP::PBQPNum SpillCost =
+ LIS.getInterval(G.getNodeMetadata(NId).getVReg()).weight;
+ if (SpillCost == 0.0)
+ SpillCost = std::numeric_limits<PBQP::PBQPNum>::min();
+ else
+ SpillCost += MinSpillCost;
+ PBQPRAGraph::RawVector NodeCosts(G.getNodeCosts(NId));
+ NodeCosts[PBQP::RegAlloc::getSpillOptionIdx()] = SpillCost;
+ G.setNodeCosts(NId, std::move(NodeCosts));
+ }
+ }
+};
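// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// layout of a PBQP node cost vector as filled in by SpillCosts above (and by
// the callee-saved-register tweak in initializeGraph further below).  Option 0
// is the spill option; options 1..N line up with the node's allowed physregs.
// The interval weight and the CSR choice are assumptions for the example.
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  // Per-register costs for the allowed set; assume the third allowed register
  // is a callee-saved register and therefore carries a small extra cost.
  std::vector<double> AllowedRegCost = {0.0, 0.0, 1.0};
  double IntervalWeight = 2.5;        // stand-in for LiveInterval::weight
  const double MinSpillCost = 10.0;

  std::vector<double> NodeCosts(1 + AllowedRegCost.size(), 0.0);
  // The spill option is shifted above MinSpillCost so register constraints can
  // be expressed in [0, MinSpillCost) without renormalizing the vector.
  NodeCosts[0] = IntervalWeight == 0.0
                     ? std::numeric_limits<double>::min()
                     : IntervalWeight + MinSpillCost;
  for (std::size_t I = 0; I != AllowedRegCost.size(); ++I)
    NodeCosts[1 + I] = AllowedRegCost[I];

  for (std::size_t I = 0; I != NodeCosts.size(); ++I)
    std::printf("option %zu cost = %g\n", I, NodeCosts[I]);
}
// ---------------------------------------------------------------------------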
+
+/// @brief Add interference edges between overlapping vregs.
+class Interference : public PBQPRAConstraint {
+private:
+
+ typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
+ typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;
+ typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
+ typedef DenseSet<IKey> DisjointAllowedRegsCache;
+ typedef std::pair<PBQP::GraphBase::NodeId, PBQP::GraphBase::NodeId> IEdgeKey;
+ typedef DenseSet<IEdgeKey> IEdgeCache;
+
+ bool haveDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+ PBQPRAGraph::NodeId MId,
+ const DisjointAllowedRegsCache &D) const {
+ const auto *NRegs = &G.getNodeMetadata(NId).getAllowedRegs();
+ const auto *MRegs = &G.getNodeMetadata(MId).getAllowedRegs();
+
+ if (NRegs == MRegs)
+ return false;
+
+ if (NRegs < MRegs)
+ return D.count(IKey(NRegs, MRegs)) > 0;
+
+ return D.count(IKey(MRegs, NRegs)) > 0;
+ }
+
+ void setDisjointAllowedRegs(const PBQPRAGraph &G, PBQPRAGraph::NodeId NId,
+ PBQPRAGraph::NodeId MId,
+ DisjointAllowedRegsCache &D) {
+ const auto *NRegs = &G.getNodeMetadata(NId).getAllowedRegs();
+ const auto *MRegs = &G.getNodeMetadata(MId).getAllowedRegs();
+
+ assert(NRegs != MRegs && "AllowedRegs can not be disjoint with itself");
+
+ if (NRegs < MRegs)
+ D.insert(IKey(NRegs, MRegs));
+ else
+ D.insert(IKey(MRegs, NRegs));
+ }
+
+ // Holds (Interval, CurrentSegmentID, and NodeId). The first two are required
+ // for the fast interference graph construction algorithm. The last is there
+ // to save us from looking up node ids via the VRegToNode map in the graph
+ // metadata.
+ typedef std::tuple<LiveInterval*, size_t, PBQP::GraphBase::NodeId>
+ IntervalInfo;
+
+ static SlotIndex getStartPoint(const IntervalInfo &I) {
+ return std::get<0>(I)->segments[std::get<1>(I)].start;
+ }
+
+ static SlotIndex getEndPoint(const IntervalInfo &I) {
+ return std::get<0>(I)->segments[std::get<1>(I)].end;
+ }
+
+ static PBQP::GraphBase::NodeId getNodeId(const IntervalInfo &I) {
+ return std::get<2>(I);
+ }
+
+ static bool lowestStartPoint(const IntervalInfo &I1,
+ const IntervalInfo &I2) {
+ // Condition reversed because priority queue has the *highest* element at
+ // the front, rather than the lowest.
+ return getStartPoint(I1) > getStartPoint(I2);
+ }
+
+ static bool lowestEndPoint(const IntervalInfo &I1,
+ const IntervalInfo &I2) {
+ SlotIndex E1 = getEndPoint(I1);
+ SlotIndex E2 = getEndPoint(I2);
+
+ if (E1 < E2)
+ return true;
+
+ if (E1 > E2)
+ return false;
+
+ // If two intervals end at the same point, we need a way to break the tie or
+ // the set will assume they're actually equal and refuse to insert a
+ // "duplicate". Just compare the vregs - fast and guaranteed unique.
+ return std::get<0>(I1)->reg < std::get<0>(I2)->reg;
+ }
+
+ static bool isAtLastSegment(const IntervalInfo &I) {
+ return std::get<1>(I) == std::get<0>(I)->size() - 1;
+ }
+
+ static IntervalInfo nextSegment(const IntervalInfo &I) {
+ return std::make_tuple(std::get<0>(I), std::get<1>(I) + 1, std::get<2>(I));
+ }
+
+public:
+
+ void apply(PBQPRAGraph &G) override {
+ // The following is loosely based on the linear scan algorithm introduced in
+ // "Linear Scan Register Allocation" by Poletto and Sarkar. This version
+ // isn't linear, because the size of the active set isn't bound by the
+ // number of registers, but rather the size of the largest clique in the
+ // graph. Still, we expect this to be better than N^2.
+ LiveIntervals &LIS = G.getMetadata().LIS;
+
+ // Interference matrices are incredibly regular - they're only a function of
+ // the allowed sets, so we cache them to avoid the overhead of constructing
+ // and uniquing them.
+ IMatrixCache C;
+
+ // Finding an edge is expensive in the worst case (O(max_clique(G))), so
+ // locally cache the edges we have already seen.
+ IEdgeCache EC;
+
+ // Cache known disjoint allowed-register pairs.
+ DisjointAllowedRegsCache D;
+
+ typedef std::set<IntervalInfo, decltype(&lowestEndPoint)> IntervalSet;
+ typedef std::priority_queue<IntervalInfo, std::vector<IntervalInfo>,
+ decltype(&lowestStartPoint)> IntervalQueue;
+ IntervalSet Active(lowestEndPoint);
+ IntervalQueue Inactive(lowestStartPoint);
+
+ // Start by building the inactive set.
+ for (auto NId : G.nodeIds()) {
+ unsigned VReg = G.getNodeMetadata(NId).getVReg();
+ LiveInterval &LI = LIS.getInterval(VReg);
+ assert(!LI.empty() && "PBQP graph contains node for empty interval");
+ Inactive.push(std::make_tuple(&LI, 0, NId));
+ }
+
+ while (!Inactive.empty()) {
+ // Tentatively grab the "next" interval - this choice may be overridden
+ // below.
+ IntervalInfo Cur = Inactive.top();
+
+ // Retire any active intervals that end before Cur starts.
+ IntervalSet::iterator RetireItr = Active.begin();
+ while (RetireItr != Active.end() &&
+ (getEndPoint(*RetireItr) <= getStartPoint(Cur))) {
+ // If this interval has subsequent segments, add the next one to the
+ // inactive list.
+ if (!isAtLastSegment(*RetireItr))
+ Inactive.push(nextSegment(*RetireItr));
+
+ ++RetireItr;
+ }
+ Active.erase(Active.begin(), RetireItr);
+
+ // One of the newly retired segments may actually start before the
+ // Cur segment, so re-grab the front of the inactive list.
+ Cur = Inactive.top();
+ Inactive.pop();
+
+ // At this point we know that Cur overlaps all active intervals. Add the
+ // interference edges.
+ PBQP::GraphBase::NodeId NId = getNodeId(Cur);
+ for (const auto &A : Active) {
+ PBQP::GraphBase::NodeId MId = getNodeId(A);
+
+ // Do not add an edge when the nodes' allowed registers do not
+ // intersect: there is obviously no interference.
+ if (haveDisjointAllowedRegs(G, NId, MId, D))
+ continue;
+
+ // Check that we haven't already added this edge
+ IEdgeKey EK(std::min(NId, MId), std::max(NId, MId));
+ if (EC.count(EK))
+ continue;
+
+ // This is a new edge - add it to the graph.
+ if (!createInterferenceEdge(G, NId, MId, C))
+ setDisjointAllowedRegs(G, NId, MId, D);
+ else
+ EC.insert(EK);
+ }
+
+ // Finally, add Cur to the Active set.
+ Active.insert(Cur);
+ }
+ }
+
+private:
+
+ // Create an Interference edge and add it to the graph, unless it is
+ // a null matrix, meaning the nodes' allowed registers do not have any
+ // interference. This case occurs frequently between integer and floating
+ // point registers for example.
+ // Returns true iff the two nodes interfere.
+ bool createInterferenceEdge(PBQPRAGraph &G,
+ PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,
+ IMatrixCache &C) {
+
+ const TargetRegisterInfo &TRI =
+ *G.getMetadata().MF.getSubtarget().getRegisterInfo();
+ const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
+ const auto &MRegs = G.getNodeMetadata(MId).getAllowedRegs();
+
+ // Try looking the edge costs up in the IMatrixCache first.
+ IKey K(&NRegs, &MRegs);
+ IMatrixCache::iterator I = C.find(K);
+ if (I != C.end()) {
+ G.addEdgeBypassingCostAllocator(NId, MId, I->second);
+ return true;
+ }
+
+ PBQPRAGraph::RawMatrix M(NRegs.size() + 1, MRegs.size() + 1, 0);
+ bool NodesInterfere = false;
+ for (unsigned I = 0; I != NRegs.size(); ++I) {
+ unsigned PRegN = NRegs[I];
+ for (unsigned J = 0; J != MRegs.size(); ++J) {
+ unsigned PRegM = MRegs[J];
+ if (TRI.regsOverlap(PRegN, PRegM)) {
+ M[I + 1][J + 1] = std::numeric_limits<PBQP::PBQPNum>::infinity();
+ NodesInterfere = true;
+ }
+ }
+ }
+
+ if (!NodesInterfere)
+ return false;
+
+ PBQPRAGraph::EdgeId EId = G.addEdge(NId, MId, std::move(M));
+ C[K] = G.getEdgeCostsPtr(EId);
+
+ return true;
+ }
+};
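// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// matrix that createInterferenceEdge builds for two interfering nodes.
// Row/column 0 is the spill option and never interferes; entry (i+1, j+1)
// becomes infinity when allowed register i of one node overlaps allowed
// register j of the other.  Plain equality stands in for TRI.regsOverlap.
#include <cstddef>
#include <cstdio>
#include <limits>
#include <vector>

int main() {
  std::vector<unsigned> NRegs = {1, 2}; // allowed physregs of node N
  std::vector<unsigned> MRegs = {2, 3}; // allowed physregs of node M
  const double Inf = std::numeric_limits<double>::infinity();

  std::vector<std::vector<double>> M(
      NRegs.size() + 1, std::vector<double>(MRegs.size() + 1, 0.0));
  bool NodesInterfere = false;
  for (std::size_t I = 0; I != NRegs.size(); ++I)
    for (std::size_t J = 0; J != MRegs.size(); ++J)
      if (NRegs[I] == MRegs[J]) { // stand-in for TRI.regsOverlap(PRegN, PRegM)
        M[I + 1][J + 1] = Inf;
        NodesInterfere = true;
      }

  // Only node pairs that actually interfere get an edge in the PBQP graph.
  std::printf("nodes interfere: %s\n", NodesInterfere ? "yes" : "no");
  for (const auto &Row : M) {
    for (double C : Row)
      std::printf("%4g ", C);
    std::printf("\n");
  }
}
// ---------------------------------------------------------------------------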
+
+
+class Coalescing : public PBQPRAConstraint {
+public:
+ void apply(PBQPRAGraph &G) override {
+ MachineFunction &MF = G.getMetadata().MF;
+ MachineBlockFrequencyInfo &MBFI = G.getMetadata().MBFI;
+ CoalescerPair CP(*MF.getSubtarget().getRegisterInfo());
+
+ // Scan the machine function and add a coalescing cost whenever CoalescerPair
+ // gives the OK.
+ for (const auto &MBB : MF) {
+ for (const auto &MI : MBB) {
+
+ // Skip non-coalescable or already-coalesced copies.
+ if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
+ continue;
+
+ unsigned DstReg = CP.getDstReg();
+ unsigned SrcReg = CP.getSrcReg();
+
+ const float Scale = 1.0f / MBFI.getEntryFreq();
+ PBQP::PBQPNum CBenefit = MBFI.getBlockFreq(&MBB).getFrequency() * Scale;
+
+ if (CP.isPhys()) {
+ if (!MF.getRegInfo().isAllocatable(DstReg))
+ continue;
+
+ PBQPRAGraph::NodeId NId = G.getMetadata().getNodeIdForVReg(SrcReg);
+
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed =
+ G.getNodeMetadata(NId).getAllowedRegs();
+
+ unsigned PRegOpt = 0;
+ while (PRegOpt < Allowed.size() && Allowed[PRegOpt] != DstReg)
+ ++PRegOpt;
+
+ if (PRegOpt < Allowed.size()) {
+ PBQPRAGraph::RawVector NewCosts(G.getNodeCosts(NId));
+ NewCosts[PRegOpt + 1] -= CBenefit;
+ G.setNodeCosts(NId, std::move(NewCosts));
+ }
+ } else {
+ PBQPRAGraph::NodeId N1Id = G.getMetadata().getNodeIdForVReg(DstReg);
+ PBQPRAGraph::NodeId N2Id = G.getMetadata().getNodeIdForVReg(SrcReg);
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed1 =
+ &G.getNodeMetadata(N1Id).getAllowedRegs();
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector *Allowed2 =
+ &G.getNodeMetadata(N2Id).getAllowedRegs();
+
+ PBQPRAGraph::EdgeId EId = G.findEdge(N1Id, N2Id);
+ if (EId == G.invalidEdgeId()) {
+ PBQPRAGraph::RawMatrix Costs(Allowed1->size() + 1,
+ Allowed2->size() + 1, 0);
+ addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+ G.addEdge(N1Id, N2Id, std::move(Costs));
+ } else {
+ if (G.getEdgeNode1Id(EId) == N2Id) {
+ std::swap(N1Id, N2Id);
+ std::swap(Allowed1, Allowed2);
+ }
+ PBQPRAGraph::RawMatrix Costs(G.getEdgeCosts(EId));
+ addVirtRegCoalesce(Costs, *Allowed1, *Allowed2, CBenefit);
+ G.updateEdgeCosts(EId, std::move(Costs));
+ }
+ }
+ }
+ }
+ }
+
+private:
+
+ void addVirtRegCoalesce(
+ PBQPRAGraph::RawMatrix &CostMat,
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
+ const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed2,
+ PBQP::PBQPNum Benefit) {
+ assert(CostMat.getRows() == Allowed1.size() + 1 && "Size mismatch.");
+ assert(CostMat.getCols() == Allowed2.size() + 1 && "Size mismatch.");
+ for (unsigned I = 0; I != Allowed1.size(); ++I) {
+ unsigned PReg1 = Allowed1[I];
+ for (unsigned J = 0; J != Allowed2.size(); ++J) {
+ unsigned PReg2 = Allowed2[J];
+ if (PReg1 == PReg2)
+ CostMat[I + 1][J + 1] -= Benefit;
+ }
+ }
+ }
+
+};
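// ---------------------------------------------------------------------------
// Illustrative sketch (standalone C++, not part of the LLVM sources): the
// benefit applied by addVirtRegCoalesce above.  For a vreg-to-vreg copy, every
// matrix entry where both nodes would end up in the same physical register is
// made cheaper by the copy's scaled block frequency, so the solver prefers
// assignments that coalesce the copy away.  The allowed sets and benefit are
// assumptions for the example.
#include <cstddef>
#include <cstdio>
#include <vector>

int main() {
  std::vector<unsigned> Allowed1 = {1, 2}; // allowed physregs of the dst vreg
  std::vector<unsigned> Allowed2 = {2, 3}; // allowed physregs of the src vreg
  const double CBenefit = 0.75;            // copy's block freq / entry freq

  std::vector<std::vector<double>> Costs(
      Allowed1.size() + 1, std::vector<double>(Allowed2.size() + 1, 0.0));
  for (std::size_t I = 0; I != Allowed1.size(); ++I)
    for (std::size_t J = 0; J != Allowed2.size(); ++J)
      if (Allowed1[I] == Allowed2[J])
        Costs[I + 1][J + 1] -= CBenefit; // same physreg: the copy disappears

  for (const auto &Row : Costs) {
    for (double C : Row)
      std::printf("%6.2f ", C);
    std::printf("\n");
  }
}
// ---------------------------------------------------------------------------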
+
+} // End anonymous namespace.
+
+// Out-of-line destructor/anchor for PBQPRAConstraint.
+PBQPRAConstraint::~PBQPRAConstraint() {}
+void PBQPRAConstraint::anchor() {}
+void PBQPRAConstraintList::anchor() {}
+
+void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
+ au.setPreservesCFG();
+ au.addRequired<AAResultsWrapperPass>();
+ au.addPreserved<AAResultsWrapperPass>();
+ au.addRequired<SlotIndexes>();
+ au.addPreserved<SlotIndexes>();
+ au.addRequired<LiveIntervals>();
+ au.addPreserved<LiveIntervals>();
+ //au.addRequiredID(SplitCriticalEdgesID);
+ if (customPassID)
+ au.addRequiredID(*customPassID);
+ au.addRequired<LiveStacks>();
+ au.addPreserved<LiveStacks>();
+ au.addRequired<MachineBlockFrequencyInfo>();
+ au.addPreserved<MachineBlockFrequencyInfo>();
+ au.addRequired<MachineLoopInfo>();
+ au.addPreserved<MachineLoopInfo>();
+ au.addRequired<MachineDominatorTree>();
+ au.addPreserved<MachineDominatorTree>();
+ au.addRequired<VirtRegMap>();
+ au.addPreserved<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
+ LiveIntervals &LIS) {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Iterate over all live ranges.
+ for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (MRI.reg_nodbg_empty(Reg))
+ continue;
+ LiveInterval &LI = LIS.getInterval(Reg);
+
+ // If this live interval is non-empty we will use pbqp to allocate it.
+ // Empty intervals we allocate in a simple post-processing stage in
+ // finalizeAlloc.
+ if (!LI.empty()) {
+ VRegsToAlloc.insert(LI.reg);
+ } else {
+ EmptyIntervalVRegs.insert(LI.reg);
+ }
+ }
+}
+
+static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
+ const MachineFunction &MF) {
+ const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+ for (unsigned i = 0; CSR[i] != 0; ++i)
+ if (TRI.regsOverlap(reg, CSR[i]))
+ return true;
+ return false;
+}
+
+void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
+ Spiller &VRegSpiller) {
+ MachineFunction &MF = G.getMetadata().MF;
+
+ LiveIntervals &LIS = G.getMetadata().LIS;
+ const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+ const TargetRegisterInfo &TRI =
+ *G.getMetadata().MF.getSubtarget().getRegisterInfo();
+
+ std::vector<unsigned> Worklist(VRegsToAlloc.begin(), VRegsToAlloc.end());
+
+ while (!Worklist.empty()) {
+ unsigned VReg = Worklist.back();
+ Worklist.pop_back();
+
+ const TargetRegisterClass *TRC = MRI.getRegClass(VReg);
+ LiveInterval &VRegLI = LIS.getInterval(VReg);
+
+ // Record any overlaps with regmask operands.
+ BitVector RegMaskOverlaps;
+ LIS.checkRegMaskInterference(VRegLI, RegMaskOverlaps);
+
+ // Compute an initial allowed set for the current vreg.
+ std::vector<unsigned> VRegAllowed;
+ ArrayRef<MCPhysReg> RawPRegOrder = TRC->getRawAllocationOrder(MF);
+ for (unsigned I = 0; I != RawPRegOrder.size(); ++I) {
+ unsigned PReg = RawPRegOrder[I];
+ if (MRI.isReserved(PReg))
+ continue;
+
+ // vregLI crosses a regmask operand that clobbers preg.
+ if (!RegMaskOverlaps.empty() && !RegMaskOverlaps.test(PReg))
+ continue;
+
+ // vregLI overlaps fixed regunit interference.
+ bool Interference = false;
+ for (MCRegUnitIterator Units(PReg, &TRI); Units.isValid(); ++Units) {
+ if (VRegLI.overlaps(LIS.getRegUnit(*Units))) {
+ Interference = true;
+ break;
+ }
+ }
+ if (Interference)
+ continue;
+
+ // preg is usable for this virtual register.
+ VRegAllowed.push_back(PReg);
+ }
+
+ // Check for vregs that have no allowed registers. These should be
+ // pre-spilled and the new vregs added to the worklist.
+ if (VRegAllowed.empty()) {
+ SmallVector<unsigned, 8> NewVRegs;
+ spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+ Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
+ continue;
+ }
+
+ PBQPRAGraph::RawVector NodeCosts(VRegAllowed.size() + 1, 0);
+
+ // Tweak the cost of callee-saved registers, as using them forces spilling
+ // and restoring them. This would only happen in the prologue/epilogue though.
+ for (unsigned i = 0; i != VRegAllowed.size(); ++i)
+ if (isACalleeSavedRegister(VRegAllowed[i], TRI, MF))
+ NodeCosts[1 + i] += 1.0;
+
+ PBQPRAGraph::NodeId NId = G.addNode(std::move(NodeCosts));
+ G.getNodeMetadata(NId).setVReg(VReg);
+ G.getNodeMetadata(NId).setAllowedRegs(
+ G.getMetadata().getAllowedRegs(std::move(VRegAllowed)));
+ G.getMetadata().setNodeIdForVReg(VReg, NId);
+ }
+}
+
+void RegAllocPBQP::spillVReg(unsigned VReg,
+ SmallVectorImpl<unsigned> &NewIntervals,
+ MachineFunction &MF, LiveIntervals &LIS,
+ VirtRegMap &VRM, Spiller &VRegSpiller) {
+
+ VRegsToAlloc.erase(VReg);
+ LiveRangeEdit LRE(&LIS.getInterval(VReg), NewIntervals, MF, LIS, &VRM,
+ nullptr, &DeadRemats);
+ VRegSpiller.spill(LRE);
+
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ (void)TRI;
+ DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> SPILLED (Cost: "
+ << LRE.getParent().weight << ", New vregs: ");
+
+ // Copy any newly inserted live intervals into the list of regs to
+ // allocate.
+ for (LiveRangeEdit::iterator I = LRE.begin(), E = LRE.end();
+ I != E; ++I) {
+ const LiveInterval &LI = LIS.getInterval(*I);
+ assert(!LI.empty() && "Empty spill range.");
+ DEBUG(dbgs() << PrintReg(LI.reg, &TRI) << " ");
+ VRegsToAlloc.insert(LI.reg);
+ }
+
+ DEBUG(dbgs() << ")\n");
+}
+
+bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
+ const PBQP::Solution &Solution,
+ VirtRegMap &VRM,
+ Spiller &VRegSpiller) {
+ MachineFunction &MF = G.getMetadata().MF;
+ LiveIntervals &LIS = G.getMetadata().LIS;
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ (void)TRI;
+
+ // Set to true if we have any spills
+ bool AnotherRoundNeeded = false;
+
+ // Clear the existing allocation.
+ VRM.clearAllVirt();
+
+ // Iterate over the nodes mapping the PBQP solution to a register
+ // assignment.
+ for (auto NId : G.nodeIds()) {
+ unsigned VReg = G.getNodeMetadata(NId).getVReg();
+ unsigned AllocOption = Solution.getSelection(NId);
+
+ if (AllocOption != PBQP::RegAlloc::getSpillOptionIdx()) {
+ unsigned PReg = G.getNodeMetadata(NId).getAllowedRegs()[AllocOption - 1];
+ DEBUG(dbgs() << "VREG " << PrintReg(VReg, &TRI) << " -> "
+ << TRI.getName(PReg) << "\n");
+ assert(PReg != 0 && "Invalid preg selected.");
+ VRM.assignVirt2Phys(VReg, PReg);
+ } else {
+ // Spill VReg. If this introduces new intervals we'll need another round
+ // of allocation.
+ SmallVector<unsigned, 8> NewVRegs;
+ spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
+ AnotherRoundNeeded |= !NewVRegs.empty();
+ }
+ }
+
+ return !AnotherRoundNeeded;
+}
+
+void RegAllocPBQP::finalizeAlloc(MachineFunction &MF,
+ LiveIntervals &LIS,
+ VirtRegMap &VRM) const {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // First allocate registers for the empty intervals.
+ for (RegSet::const_iterator
+ I = EmptyIntervalVRegs.begin(), E = EmptyIntervalVRegs.end();
+ I != E; ++I) {
+ LiveInterval &LI = LIS.getInterval(*I);
+
+ unsigned PReg = MRI.getSimpleHint(LI.reg);
+
+ if (PReg == 0) {
+ const TargetRegisterClass &RC = *MRI.getRegClass(LI.reg);
+ PReg = RC.getRawAllocationOrder(MF).front();
+ }
+
+ VRM.assignVirt2Phys(LI.reg, PReg);
+ }
+}
+
+void RegAllocPBQP::postOptimization(Spiller &VRegSpiller, LiveIntervals &LIS) {
+ VRegSpiller.postOptimization();
+ /// Remove dead defs because of rematerialization.
+ for (auto DeadInst : DeadRemats) {
+ LIS.RemoveMachineInstrFromMaps(*DeadInst);
+ DeadInst->eraseFromParent();
+ }
+ DeadRemats.clear();
+}
+
+static inline float normalizePBQPSpillWeight(float UseDefFreq, unsigned Size,
+ unsigned NumInstr) {
+ // All intervals have a spill weight that is mostly proportional to the number
+ // of uses, with uses in loops having a bigger weight.
+ return NumInstr * normalizeSpillWeight(UseDefFreq, Size, 1);
+}
+
+bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
+ LiveIntervals &LIS = getAnalysis<LiveIntervals>();
+ MachineBlockFrequencyInfo &MBFI =
+ getAnalysis<MachineBlockFrequencyInfo>();
+
+ VirtRegMap &VRM = getAnalysis<VirtRegMap>();
+
+ calculateSpillWeightsAndHints(LIS, MF, &VRM, getAnalysis<MachineLoopInfo>(),
+ MBFI, normalizePBQPSpillWeight);
+
+ std::unique_ptr<Spiller> VRegSpiller(createInlineSpiller(*this, MF, VRM));
+
+ MF.getRegInfo().freezeReservedRegs(MF);
+
+ DEBUG(dbgs() << "PBQP Register Allocating for " << MF.getName() << "\n");
+
+ // Allocator main loop:
+ //
+ // * Map current regalloc problem to a PBQP problem
+ // * Solve the PBQP problem
+ // * Map the solution back to a register allocation
+ // * Spill if necessary
+ //
+ // This process is continued till no more spills are generated.
+
+ // Find the vreg intervals in need of allocation.
+ findVRegIntervalsToAlloc(MF, LIS);
+
+#ifndef NDEBUG
+ const Function &F = *MF.getFunction();
+ std::string FullyQualifiedName =
+ F.getParent()->getModuleIdentifier() + "." + F.getName().str();
+#endif
+
+ // If there are non-empty intervals allocate them using pbqp.
+ if (!VRegsToAlloc.empty()) {
+
+ const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
+ std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
+ llvm::make_unique<PBQPRAConstraintList>();
+ ConstraintsRoot->addConstraint(llvm::make_unique<SpillCosts>());
+ ConstraintsRoot->addConstraint(llvm::make_unique<Interference>());
+ if (PBQPCoalescing)
+ ConstraintsRoot->addConstraint(llvm::make_unique<Coalescing>());
+ ConstraintsRoot->addConstraint(Subtarget.getCustomPBQPConstraints());
+
+ bool PBQPAllocComplete = false;
+ unsigned Round = 0;
+
+ while (!PBQPAllocComplete) {
+ DEBUG(dbgs() << " PBQP Regalloc round " << Round << ":\n");
+
+ PBQPRAGraph G(PBQPRAGraph::GraphMetadata(MF, LIS, MBFI));
+ initializeGraph(G, VRM, *VRegSpiller);
+ ConstraintsRoot->apply(G);
+
+#ifndef NDEBUG
+ if (PBQPDumpGraphs) {
+ std::ostringstream RS;
+ RS << Round;
+ std::string GraphFileName = FullyQualifiedName + "." + RS.str() +
+ ".pbqpgraph";
+ std::error_code EC;
+ raw_fd_ostream OS(GraphFileName, EC, sys::fs::F_Text);
+ DEBUG(dbgs() << "Dumping graph for round " << Round << " to \""
+ << GraphFileName << "\"\n");
+ G.dump(OS);
+ }
+#endif
+
+ PBQP::Solution Solution = PBQP::RegAlloc::solve(G);
+ PBQPAllocComplete = mapPBQPToRegAlloc(G, Solution, VRM, *VRegSpiller);
+ ++Round;
+ }
+ }
+
+ // Finalize the allocation and allocate the empty ranges.
+ finalizeAlloc(MF, LIS, VRM);
+ postOptimization(*VRegSpiller, LIS);
+ VRegsToAlloc.clear();
+ EmptyIntervalVRegs.clear();
+
+ DEBUG(dbgs() << "Post alloc VirtRegMap:\n" << VRM << "\n");
+
+ return true;
+}
+
+/// Create Printable object for node and register info.
+static Printable PrintNodeInfo(PBQP::RegAlloc::PBQPRAGraph::NodeId NId,
+ const PBQP::RegAlloc::PBQPRAGraph &G) {
+ return Printable([NId, &G](raw_ostream &OS) {
+ const MachineRegisterInfo &MRI = G.getMetadata().MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo();
+ unsigned VReg = G.getNodeMetadata(NId).getVReg();
+ const char *RegClassName = TRI->getRegClassName(MRI.getRegClass(VReg));
+ OS << NId << " (" << RegClassName << ':' << PrintReg(VReg, TRI) << ')';
+ });
+}
+
+void PBQP::RegAlloc::PBQPRAGraph::dump(raw_ostream &OS) const {
+ for (auto NId : nodeIds()) {
+ const Vector &Costs = getNodeCosts(NId);
+ assert(Costs.getLength() != 0 && "Empty vector in graph.");
+ OS << PrintNodeInfo(NId, *this) << ": " << Costs << '\n';
+ }
+ OS << '\n';
+
+ for (auto EId : edgeIds()) {
+ NodeId N1Id = getEdgeNode1Id(EId);
+ NodeId N2Id = getEdgeNode2Id(EId);
+ assert(N1Id != N2Id && "PBQP graphs should not have self-edges.");
+ const Matrix &M = getEdgeCosts(EId);
+ assert(M.getRows() != 0 && "No rows in matrix.");
+ assert(M.getCols() != 0 && "No cols in matrix.");
+ OS << PrintNodeInfo(N1Id, *this) << ' ' << M.getRows() << " rows / ";
+ OS << PrintNodeInfo(N2Id, *this) << ' ' << M.getCols() << " cols:\n";
+ OS << M << '\n';
+ }
+}
+
+LLVM_DUMP_METHOD void PBQP::RegAlloc::PBQPRAGraph::dump() const { dump(dbgs()); }
+
+void PBQP::RegAlloc::PBQPRAGraph::printDot(raw_ostream &OS) const {
+ OS << "graph {\n";
+ for (auto NId : nodeIds()) {
+ OS << " node" << NId << " [ label=\""
+ << PrintNodeInfo(NId, *this) << "\\n"
+ << getNodeCosts(NId) << "\" ]\n";
+ }
+
+ OS << " edge [ len=" << nodeIds().size() << " ]\n";
+ for (auto EId : edgeIds()) {
+ OS << " node" << getEdgeNode1Id(EId)
+ << " -- node" << getEdgeNode2Id(EId)
+ << " [ label=\"";
+ const Matrix &EdgeCosts = getEdgeCosts(EId);
+ for (unsigned i = 0; i < EdgeCosts.getRows(); ++i) {
+ OS << EdgeCosts.getRowAsVector(i) << "\\n";
+ }
+ OS << "\" ]\n";
+ }
+ OS << "}\n";
+}
+
+FunctionPass *llvm::createPBQPRegisterAllocator(char *customPassID) {
+ return new RegAllocPBQP(customPassID);
+}
+
+FunctionPass* llvm::createDefaultPBQPRegisterAllocator() {
+ return createPBQPRegisterAllocator();
+}
+
+#undef DEBUG_TYPE
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
new file mode 100644
index 000000000000..50b885423375
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -0,0 +1,142 @@
+//===-- RegUsageInfoCollector.cpp - Register Usage Information Collector --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass is a simple MachineFunction pass which collects register usage
+/// details by iterating through each physical register, checking
+/// MRI::isPhysRegUsed(), and then creating a RegMask based on these details.
+/// The pass then stores this RegMask in the PhysicalRegisterUsageInfo analysis.
+///
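+/// As a rough illustration of the encoding (a sketch, not part of this file's
+/// interface): a RegMask is a bit vector packed into 32-bit words, where a set
+/// bit means the register is preserved across a call and a cleared bit means
+/// it is clobbered. A hypothetical query helper would look like:
+///
+///   static bool isPreserved(const uint32_t *RegMask, unsigned Reg) {
+///     return (RegMask[Reg / 32] & (1u << (Reg % 32))) != 0;
+///   }
+///
+/// (MachineOperand::clobbersPhysReg(), used below, tests the opposite
+/// condition.)
+///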
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+STATISTIC(NumCSROpt,
+ "Number of functions optimized for callee saved registers");
+
+namespace llvm {
+void initializeRegUsageInfoCollectorPass(PassRegistry &);
+}
+
+namespace {
+class RegUsageInfoCollector : public MachineFunctionPass {
+public:
+ RegUsageInfoCollector() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoCollectorPass(Registry);
+ }
+
+ const char *getPassName() const override {
+ return "Register Usage Information Collector Pass";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ void markRegClobbered(const TargetRegisterInfo *TRI, uint32_t *RegMask,
+ unsigned PReg);
+};
+} // end of anonymous namespace
+
+char RegUsageInfoCollector::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoCollector, "RegUsageInfoCollector",
+ "Register Usage Information Collector", false, false)
+
+FunctionPass *llvm::createRegUsageInfoCollector() {
+ return new RegUsageInfoCollector();
+}
+
+void RegUsageInfoCollector::markRegClobbered(const TargetRegisterInfo *TRI,
+ uint32_t *RegMask, unsigned PReg) {
+  // If PReg is clobbered then all of its aliases are also clobbered.
+ for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+ RegMask[*AI / 32] &= ~(1u << *AI % 32);
+}
+
+void RegUsageInfoCollector::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ const TargetMachine &TM = MF.getTarget();
+
+ DEBUG(dbgs() << " -------------------- " << getPassName()
+ << " -------------------- \n");
+ DEBUG(dbgs() << "Function Name : " << MF.getName() << "\n");
+
+ std::vector<uint32_t> RegMask;
+
+ // Compute the size of the bit vector to represent all the registers.
+ // The bit vector is broken into 32-bit chunks, thus takes the ceil of
+ // the number of registers divided by 32 for the size.
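+  // For example, with a hypothetical 200 physical registers this is
+  // (200 + 31) / 32 = 7 words.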
+ unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+ RegMask.resize(RegMaskSize, 0xFFFFFFFF);
+
+ const Function *F = MF.getFunction();
+
+ PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+
+ PRUI->setTargetMachine(&TM);
+
+ DEBUG(dbgs() << "Clobbered Registers: ");
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
+ if (MRI->isPhysRegModified(PReg, true))
+ markRegClobbered(TRI, &RegMask[0], PReg);
+
+ if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
+ const uint32_t *CallPreservedMask =
+ TRI->getCallPreservedMask(MF, F->getCallingConv());
+    // Set callee saved registers as preserved.
+ for (unsigned i = 0; i < RegMaskSize; ++i)
+ RegMask[i] = RegMask[i] | CallPreservedMask[i];
+ } else {
+ ++NumCSROpt;
+ DEBUG(dbgs() << MF.getName()
+ << " function optimized for not having CSR.\n");
+ }
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
+ if (MachineOperand::clobbersPhysReg(&(RegMask[0]), PReg))
+ DEBUG(dbgs() << TRI->getName(PReg) << " ");
+
+ DEBUG(dbgs() << " \n----------------------------------------\n");
+
+ PRUI->storeUpdateRegUsageInfo(F, std::move(RegMask));
+
+ return false;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
new file mode 100644
index 000000000000..759566147cc7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -0,0 +1,131 @@
+//=--- RegUsageInfoPropagate.cpp - Register Usage Information Propagation --=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+/// This pass iterates through the MachineInstrs in a given MachineFunction
+/// and at each call site queries RegisterUsageInfo for the RegMask (calculated
+/// based on the actual register allocation) of the callee function. If that
+/// RegMask detail is available, the pass updates the RegMask of the call
+/// instruction. This updated RegMask will be used by the register allocator
+/// while allocating the current MachineFunction.
+///
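+/// As an illustrative sketch of the per-call-site update (it mirrors the
+/// UpdateRegMask lambda in runOnMachineFunction below; the names here are
+/// placeholders):
+///
+///   if (const auto *Mask = PRUI->getRegUsageInfo(Callee)) // may be absent
+///     setRegMask(CallMI, &(*Mask)[0]);                    // tighter RegMask
+///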
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/IR/Module.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include <map>
+#include <string>
+
+namespace llvm {
+void initializeRegUsageInfoPropagationPassPass(PassRegistry &);
+}
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+#define RUIP_NAME "Register Usage Information Propagation"
+
+namespace {
+class RegUsageInfoPropagationPass : public MachineFunctionPass {
+
+public:
+ RegUsageInfoPropagationPass() : MachineFunctionPass(ID) {
+ PassRegistry &Registry = *PassRegistry::getPassRegistry();
+ initializeRegUsageInfoPropagationPassPass(Registry);
+ }
+
+ const char *getPassName() const override { return RUIP_NAME; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ static char ID;
+
+private:
+ static void setRegMask(MachineInstr &MI, const uint32_t *RegMask) {
+ for (MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask())
+ MO.setRegMask(RegMask);
+ }
+ }
+};
+} // end of anonymous namespace
+char RegUsageInfoPropagationPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(RegUsageInfoPropagationPass, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(PhysicalRegisterUsageInfo)
+INITIALIZE_PASS_END(RegUsageInfoPropagationPass, "reg-usage-propagation",
+ RUIP_NAME, false, false)
+
+FunctionPass *llvm::createRegUsageInfoPropPass() {
+ return new RegUsageInfoPropagationPass();
+}
+
+void RegUsageInfoPropagationPass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<PhysicalRegisterUsageInfo>();
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool RegUsageInfoPropagationPass::runOnMachineFunction(MachineFunction &MF) {
+ const Module *M = MF.getFunction()->getParent();
+ PhysicalRegisterUsageInfo *PRUI = &getAnalysis<PhysicalRegisterUsageInfo>();
+
+ DEBUG(dbgs() << " ++++++++++++++++++++ " << getPassName()
+ << " ++++++++++++++++++++ \n");
+ DEBUG(dbgs() << "MachineFunction : " << MF.getName() << "\n");
+
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ if (!MI.isCall())
+ continue;
+ DEBUG(dbgs()
+ << "Call Instruction Before Register Usage Info Propagation : \n");
+ DEBUG(dbgs() << MI << "\n");
+
+ auto UpdateRegMask = [&](const Function *F) {
+ const auto *RegMask = PRUI->getRegUsageInfo(F);
+ if (!RegMask)
+ return;
+ setRegMask(MI, &(*RegMask)[0]);
+ Changed = true;
+ };
+
+ MachineOperand &Operand = MI.getOperand(0);
+ if (Operand.isGlobal())
+ UpdateRegMask(cast<Function>(Operand.getGlobal()));
+ else if (Operand.isSymbol())
+ UpdateRegMask(M->getFunction(Operand.getSymbolName()));
+
+ DEBUG(dbgs()
+ << "Call Instruction After Register Usage Info Propagation : \n");
+ DEBUG(dbgs() << MI << "\n");
+ }
+ }
+
+ DEBUG(dbgs() << " +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++"
+ "++++++ \n");
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
new file mode 100644
index 000000000000..178fa18ac5a6
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterClassInfo.cpp
@@ -0,0 +1,181 @@
+//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterClassInfo class which provides dynamic
+// information about target register classes. Callee-saved vs. caller-saved and
+// reserved registers depend on calling conventions and other dynamic
+// information, so some things cannot be determined statically.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+static cl::opt<unsigned>
+StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
+ cl::desc("Limit all regclasses to N registers"));
+
+RegisterClassInfo::RegisterClassInfo()
+ : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {}
+
+void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
+ bool Update = false;
+ MF = &mf;
+
+ // Allocate new array the first time we see a new target.
+ if (MF->getSubtarget().getRegisterInfo() != TRI) {
+ TRI = MF->getSubtarget().getRegisterInfo();
+ RegClass.reset(new RCInfo[TRI->getNumRegClasses()]);
+ unsigned NumPSets = TRI->getNumRegPressureSets();
+ PSetLimits.reset(new unsigned[NumPSets]);
+ std::fill(&PSetLimits[0], &PSetLimits[NumPSets], 0);
+ Update = true;
+ }
+
+ // Does this MF have different CSRs?
+ assert(TRI && "no register info set");
+ const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+ if (Update || CSR != CalleeSaved) {
+ // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+ // overlapping CSR.
+ CSRNum.clear();
+ CSRNum.resize(TRI->getNumRegs(), 0);
+ for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+ CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+ Update = true;
+ }
+ CalleeSaved = CSR;
+
+ // Different reserved registers?
+ const BitVector &RR = MF->getRegInfo().getReservedRegs();
+ if (Reserved.size() != RR.size() || RR != Reserved) {
+ Update = true;
+ Reserved = RR;
+ }
+
+ // Invalidate cached information from previous function.
+ if (Update)
+ ++Tag;
+}
+
+/// compute - Compute the preferred allocation order for RC with reserved
+/// registers filtered out. Volatile registers come first followed by CSR
+/// aliases ordered according to the CSR order specified by the target.
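+///
+/// For instance (register names purely illustrative): if R0-R3 are volatile,
+/// R4-R5 alias callee-saved registers and R2 is reserved, the computed order
+/// is [R0, R1, R3, R4, R5].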
+void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
+ assert(RC && "no register class given");
+ RCInfo &RCI = RegClass[RC->getID()];
+
+ // Raw register count, including all reserved regs.
+ unsigned NumRegs = RC->getNumRegs();
+
+ if (!RCI.Order)
+ RCI.Order.reset(new MCPhysReg[NumRegs]);
+
+ unsigned N = 0;
+ SmallVector<MCPhysReg, 16> CSRAlias;
+ unsigned MinCost = 0xff;
+ unsigned LastCost = ~0u;
+ unsigned LastCostChange = 0;
+
+ // FIXME: Once targets reserve registers instead of removing them from the
+ // allocation order, we can simply use begin/end here.
+ ArrayRef<MCPhysReg> RawOrder = RC->getRawAllocationOrder(*MF);
+ for (unsigned i = 0; i != RawOrder.size(); ++i) {
+ unsigned PhysReg = RawOrder[i];
+ // Remove reserved registers from the allocation order.
+ if (Reserved.test(PhysReg))
+ continue;
+ unsigned Cost = TRI->getCostPerUse(PhysReg);
+ MinCost = std::min(MinCost, Cost);
+
+ if (CSRNum[PhysReg])
+ // PhysReg aliases a CSR, save it for later.
+ CSRAlias.push_back(PhysReg);
+ else {
+ if (Cost != LastCost)
+ LastCostChange = N;
+ RCI.Order[N++] = PhysReg;
+ LastCost = Cost;
+ }
+ }
+ RCI.NumRegs = N + CSRAlias.size();
+ assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+
+  // CSR aliases go after the volatile registers, preserving the target's order.
+ for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
+ unsigned PhysReg = CSRAlias[i];
+ unsigned Cost = TRI->getCostPerUse(PhysReg);
+ if (Cost != LastCost)
+ LastCostChange = N;
+ RCI.Order[N++] = PhysReg;
+ LastCost = Cost;
+ }
+
+ // Register allocator stress test. Clip register class to N registers.
+ if (StressRA && RCI.NumRegs > StressRA)
+ RCI.NumRegs = StressRA;
+
+ // Check if RC is a proper sub-class.
+ if (const TargetRegisterClass *Super =
+ TRI->getLargestLegalSuperClass(RC, *MF))
+ if (Super != RC && getNumAllocatableRegs(Super) > RCI.NumRegs)
+ RCI.ProperSubClass = true;
+
+ RCI.MinCost = uint8_t(MinCost);
+ RCI.LastCostChange = LastCostChange;
+
+ DEBUG({
+ dbgs() << "AllocationOrder(" << TRI->getRegClassName(RC) << ") = [";
+ for (unsigned I = 0; I != RCI.NumRegs; ++I)
+ dbgs() << ' ' << PrintReg(RCI.Order[I], TRI);
+ dbgs() << (RCI.ProperSubClass ? " ] (sub-class)\n" : " ]\n");
+ });
+
+ // RCI is now up-to-date.
+ RCI.Tag = Tag;
+}
+
+/// This is not accurate because two overlapping register sets may have some
+/// nonoverlapping reserved registers. However, computing the allocation order
+/// for all register classes would be too expensive.
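+/// For example (made-up numbers): with a raw pressure-set limit of 16, a
+/// register weight of 1, and 2 reserved registers in the chosen class, the
+/// returned limit is 16 - 1 * 2 = 14.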
+unsigned RegisterClassInfo::computePSetLimit(unsigned Idx) const {
+ const TargetRegisterClass *RC = nullptr;
+ unsigned NumRCUnits = 0;
+ for (TargetRegisterInfo::regclass_iterator
+ RI = TRI->regclass_begin(), RE = TRI->regclass_end(); RI != RE; ++RI) {
+ const int *PSetID = TRI->getRegClassPressureSets(*RI);
+ for (; *PSetID != -1; ++PSetID) {
+ if ((unsigned)*PSetID == Idx)
+ break;
+ }
+ if (*PSetID == -1)
+ continue;
+
+ // Found a register class that counts against this pressure set.
+ // For efficiency, only compute the set order for the largest set.
+ unsigned NUnits = TRI->getRegClassWeight(*RI).WeightLimit;
+ if (!RC || NUnits > NumRCUnits) {
+ RC = *RI;
+ NumRCUnits = NUnits;
+ }
+ }
+ compute(RC);
+ unsigned NReserved = RC->getNumRegs() - getNumAllocatableRegs(RC);
+ return TRI->getRegPressureSetLimit(*MF, Idx) -
+ TRI->getRegClassWeight(RC).RegWeight * NReserved;
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
new file mode 100644
index 000000000000..617ece902e0e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -0,0 +1,3058 @@
+//===- RegisterCoalescer.cpp - Generic Register Coalescing Interface -------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the generic RegisterCoalescer interface, which is the
+// common interface used by all clients and implementations of register
+// coalescing.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RegisterCoalescer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cmath>
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(numJoins , "Number of interval joins performed");
+STATISTIC(numCrossRCs , "Number of cross class joins performed");
+STATISTIC(numCommutes , "Number of instruction commuting performed");
+STATISTIC(numExtends , "Number of copies extended");
+STATISTIC(NumReMats , "Number of instructions re-materialized");
+STATISTIC(NumInflated , "Number of register classes inflated");
+STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
+STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
+
+static cl::opt<bool>
+EnableJoining("join-liveintervals",
+ cl::desc("Coalesce copies (default=true)"),
+ cl::init(true));
+
+static cl::opt<bool> UseTerminalRule("terminal-rule",
+ cl::desc("Apply the terminal rule"),
+ cl::init(false), cl::Hidden);
+
+/// Temporary flag to test critical edge unsplitting.
+static cl::opt<bool>
+EnableJoinSplits("join-splitedges",
+ cl::desc("Coalesce copies on split edges (default=subtarget)"), cl::Hidden);
+
+/// Temporary flag to test global copy optimization.
+static cl::opt<cl::boolOrDefault>
+EnableGlobalCopies("join-globalcopies",
+ cl::desc("Coalesce copies that span blocks (default=subtarget)"),
+ cl::init(cl::BOU_UNSET), cl::Hidden);
+
+static cl::opt<bool>
+VerifyCoalescing("verify-coalescing",
+ cl::desc("Verify machine instrs before and after register coalescing"),
+ cl::Hidden);
+
+namespace {
+ class RegisterCoalescer : public MachineFunctionPass,
+ private LiveRangeEdit::Delegate {
+ MachineFunction* MF;
+ MachineRegisterInfo* MRI;
+ const TargetMachine* TM;
+ const TargetRegisterInfo* TRI;
+ const TargetInstrInfo* TII;
+ LiveIntervals *LIS;
+ const MachineLoopInfo* Loops;
+ AliasAnalysis *AA;
+ RegisterClassInfo RegClassInfo;
+
+ /// A LaneMask to remember on which subregister live ranges we need to call
+ /// shrinkToUses() later.
+ LaneBitmask ShrinkMask;
+
+ /// True if the main range of the currently coalesced intervals should be
+ /// checked for smaller live intervals.
+ bool ShrinkMainRange;
+
+ /// \brief True if the coalescer should aggressively coalesce global copies
+ /// in favor of keeping local copies.
+ bool JoinGlobalCopies;
+
+ /// \brief True if the coalescer should aggressively coalesce fall-thru
+ /// blocks exclusively containing copies.
+ bool JoinSplitEdges;
+
+ /// Copy instructions yet to be coalesced.
+ SmallVector<MachineInstr*, 8> WorkList;
+ SmallVector<MachineInstr*, 8> LocalWorkList;
+
+ /// Set of instruction pointers that have been erased, and
+ /// that may be present in WorkList.
+ SmallPtrSet<MachineInstr*, 8> ErasedInstrs;
+
+ /// Dead instructions that are about to be deleted.
+ SmallVector<MachineInstr*, 8> DeadDefs;
+
+ /// Virtual registers to be considered for register class inflation.
+ SmallVector<unsigned, 8> InflateRegs;
+
+ /// Recursively eliminate dead defs in DeadDefs.
+ void eliminateDeadDefs();
+
+ /// LiveRangeEdit callback for eliminateDeadDefs().
+ void LRE_WillEraseInstruction(MachineInstr *MI) override;
+
+ /// Coalesce the LocalWorkList.
+ void coalesceLocals();
+
+ /// Join compatible live intervals
+ void joinAllIntervals();
+
+ /// Coalesce copies in the specified MBB, putting
+ /// copies that cannot yet be coalesced into WorkList.
+ void copyCoalesceInMBB(MachineBasicBlock *MBB);
+
+ /// Tries to coalesce all copies in CurrList. Returns true if any progress
+ /// was made.
+ bool copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList);
+
+ /// Attempt to join intervals corresponding to SrcReg/DstReg, which are the
+ /// src/dst of the copy instruction CopyMI. This returns true if the copy
+ /// was successfully coalesced away. If it is not currently possible to
+ /// coalesce this interval, but it may be possible if other things get
+ /// coalesced, then it returns true by reference in 'Again'.
+ bool joinCopy(MachineInstr *TheCopy, bool &Again);
+
+ /// Attempt to join these two intervals. On failure, this
+ /// returns false. The output "SrcInt" will not have been modified, so we
+ /// can use this information below to update aliases.
+ bool joinIntervals(CoalescerPair &CP);
+
+ /// Attempt joining two virtual registers. Return true on success.
+ bool joinVirtRegs(CoalescerPair &CP);
+
+ /// Attempt joining with a reserved physreg.
+ bool joinReservedPhysReg(CoalescerPair &CP);
+
+ /// Add the LiveRange @p ToMerge as a subregister liverange of @p LI.
+ /// Subranges in @p LI which only partially interfere with the desired
+    /// LaneMask are split as necessary. @p LaneMask is the set of lanes that
+    /// @p ToMerge will occupy in the coalesced register. @p LI has its subrange
+ /// lanemasks already adjusted to the coalesced register.
+ void mergeSubRangeInto(LiveInterval &LI, const LiveRange &ToMerge,
+ LaneBitmask LaneMask, CoalescerPair &CP);
+
+ /// Join the liveranges of two subregisters. Joins @p RRange into
+ /// @p LRange, @p RRange may be invalid afterwards.
+ void joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+ LaneBitmask LaneMask, const CoalescerPair &CP);
+
+ /// We found a non-trivially-coalescable copy. If the source value number is
+    /// defined by a copy from the destination reg, see if we can merge these
+    /// two destination reg value numbers into a single one, eliminating a copy.
+ /// This returns true if an interval was modified.
+ bool adjustCopiesBackFrom(const CoalescerPair &CP, MachineInstr *CopyMI);
+
+ /// Return true if there are definitions of IntB
+    /// other than BValNo val# that can reach uses of AValNo val# of IntA.
+ bool hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB,
+ VNInfo *AValNo, VNInfo *BValNo);
+
+ /// We found a non-trivially-coalescable copy.
+ /// If the source value number is defined by a commutable instruction and
+ /// its other operand is coalesced to the copy dest register, see if we
+ /// can transform the copy into a noop by commuting the definition.
+ /// This returns true if an interval was modified.
+ bool removeCopyByCommutingDef(const CoalescerPair &CP,MachineInstr *CopyMI);
+
+    /// If the source of a copy is defined by a trivial computation, replace
+    /// the copy by rematerializing the definition.
+ bool reMaterializeTrivialDef(const CoalescerPair &CP, MachineInstr *CopyMI,
+ bool &IsDefCopy);
+
+ /// Return true if a copy involving a physreg should be joined.
+ bool canJoinPhys(const CoalescerPair &CP);
+
+ /// Replace all defs and uses of SrcReg to DstReg and update the subregister
+ /// number if it is not zero. If DstReg is a physical register and the
+ /// existing subregister number of the def / use being updated is not zero,
+ /// make sure to set it to the correct physical subregister.
+ void updateRegDefsUses(unsigned SrcReg, unsigned DstReg, unsigned SubIdx);
+
+ /// If the given machine operand reads only undefined lanes add an undef
+ /// flag.
+ /// This can happen when undef uses were previously concealed by a copy
+ /// which we coalesced. Example:
+ /// %vreg0:sub0<def,read-undef> = ...
+ /// %vreg1 = COPY %vreg0 <-- Coalescing COPY reveals undef
+ /// = use %vreg1:sub1 <-- hidden undef use
+ void addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx);
+
+ /// Handle copies of undef values.
+ /// Returns true if @p CopyMI was a copy of an undef value and eliminated.
+ bool eliminateUndefCopy(MachineInstr *CopyMI);
+
+ /// Check whether or not we should apply the terminal rule on the
+ /// destination (Dst) of \p Copy.
+ /// When the terminal rule applies, Copy is not profitable to
+ /// coalesce.
+ /// Dst is terminal if it has exactly one affinity (Dst, Src) and
+ /// at least one interference (Dst, Dst2). If Dst is terminal, the
+    /// terminal rule consists in checking that at least one of the
+    /// interfering nodes, say Dst2, has an affinity of equal or greater
+ /// weight with Src.
+ /// In that case, Dst2 and Dst will not be able to be both coalesced
+ /// with Src. Since Dst2 exposes more coalescing opportunities than
+ /// Dst, we can drop \p Copy.
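+    /// For example (purely illustrative): given copies Dst = Src and
+    /// Dst2 = Src, where Dst interferes with Dst2, both cannot be coalesced
+    /// with Src; if the (Dst2, Src) affinity weighs at least as much as
+    /// (Dst, Src), dropping \p Copy loses nothing.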
+ bool applyTerminalRule(const MachineInstr &Copy) const;
+
+ /// Wrapper method for \see LiveIntervals::shrinkToUses.
+    /// This method does the proper fixing of the live-ranges when the
+    /// aforementioned method returns true.
+ void shrinkToUses(LiveInterval *LI,
+ SmallVectorImpl<MachineInstr * > *Dead = nullptr) {
+ if (LIS->shrinkToUses(LI, Dead)) {
+ /// Check whether or not \p LI is composed by multiple connected
+ /// components and if that is the case, fix that.
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS->splitSeparateComponents(*LI, SplitLIs);
+ }
+ }
+
+ public:
+ static char ID; ///< Class identification, replacement for typeinfo
+ RegisterCoalescer() : MachineFunctionPass(ID) {
+ initializeRegisterCoalescerPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ void releaseMemory() override;
+
+ /// This is the pass entry point.
+ bool runOnMachineFunction(MachineFunction&) override;
+
+ /// Implement the dump method.
+ void print(raw_ostream &O, const Module* = nullptr) const override;
+ };
+} // end anonymous namespace
+
+char &llvm::RegisterCoalescerID = RegisterCoalescer::ID;
+
+INITIALIZE_PASS_BEGIN(RegisterCoalescer, "simple-register-coalescing",
+ "Simple Register Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(RegisterCoalescer, "simple-register-coalescing",
+ "Simple Register Coalescing", false, false)
+
+char RegisterCoalescer::ID = 0;
+
+static bool isMoveInstr(const TargetRegisterInfo &tri, const MachineInstr *MI,
+ unsigned &Src, unsigned &Dst,
+ unsigned &SrcSub, unsigned &DstSub) {
+ if (MI->isCopy()) {
+ Dst = MI->getOperand(0).getReg();
+ DstSub = MI->getOperand(0).getSubReg();
+ Src = MI->getOperand(1).getReg();
+ SrcSub = MI->getOperand(1).getSubReg();
+ } else if (MI->isSubregToReg()) {
+ Dst = MI->getOperand(0).getReg();
+ DstSub = tri.composeSubRegIndices(MI->getOperand(0).getSubReg(),
+ MI->getOperand(3).getImm());
+ Src = MI->getOperand(2).getReg();
+ SrcSub = MI->getOperand(2).getSubReg();
+ } else
+ return false;
+ return true;
+}
+
+/// Return true if this block should be vacated by the coalescer to eliminate
+/// branches. The important cases to handle in the coalescer are critical edges
+/// split during phi elimination which contain only copies. Simple blocks that
+/// contain non-branches should also be vacated, but this can be handled by an
+/// earlier pass similar to early if-conversion.
+static bool isSplitEdge(const MachineBasicBlock *MBB) {
+ if (MBB->pred_size() != 1 || MBB->succ_size() != 1)
+ return false;
+
+ for (const auto &MI : *MBB) {
+ if (!MI.isCopyLike() && !MI.isUnconditionalBranch())
+ return false;
+ }
+ return true;
+}
+
+bool CoalescerPair::setRegisters(const MachineInstr *MI) {
+ SrcReg = DstReg = 0;
+ SrcIdx = DstIdx = 0;
+ NewRC = nullptr;
+ Flipped = CrossClass = false;
+
+ unsigned Src, Dst, SrcSub, DstSub;
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+ return false;
+ Partial = SrcSub || DstSub;
+
+ // If one register is a physreg, it must be Dst.
+ if (TargetRegisterInfo::isPhysicalRegister(Src)) {
+ if (TargetRegisterInfo::isPhysicalRegister(Dst))
+ return false;
+ std::swap(Src, Dst);
+ std::swap(SrcSub, DstSub);
+ Flipped = true;
+ }
+
+ const MachineRegisterInfo &MRI = MI->getParent()->getParent()->getRegInfo();
+
+ if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+ // Eliminate DstSub on a physreg.
+ if (DstSub) {
+ Dst = TRI.getSubReg(Dst, DstSub);
+ if (!Dst) return false;
+ DstSub = 0;
+ }
+
+ // Eliminate SrcSub by picking a corresponding Dst superregister.
+ if (SrcSub) {
+ Dst = TRI.getMatchingSuperReg(Dst, SrcSub, MRI.getRegClass(Src));
+ if (!Dst) return false;
+ } else if (!MRI.getRegClass(Src)->contains(Dst)) {
+ return false;
+ }
+ } else {
+ // Both registers are virtual.
+ const TargetRegisterClass *SrcRC = MRI.getRegClass(Src);
+ const TargetRegisterClass *DstRC = MRI.getRegClass(Dst);
+
+ // Both registers have subreg indices.
+ if (SrcSub && DstSub) {
+ // Copies between different sub-registers are never coalescable.
+ if (Src == Dst && SrcSub != DstSub)
+ return false;
+
+ NewRC = TRI.getCommonSuperRegClass(SrcRC, SrcSub, DstRC, DstSub,
+ SrcIdx, DstIdx);
+ if (!NewRC)
+ return false;
+ } else if (DstSub) {
+ // SrcReg will be merged with a sub-register of DstReg.
+ SrcIdx = DstSub;
+ NewRC = TRI.getMatchingSuperRegClass(DstRC, SrcRC, DstSub);
+ } else if (SrcSub) {
+ // DstReg will be merged with a sub-register of SrcReg.
+ DstIdx = SrcSub;
+ NewRC = TRI.getMatchingSuperRegClass(SrcRC, DstRC, SrcSub);
+ } else {
+ // This is a straight copy without sub-registers.
+ NewRC = TRI.getCommonSubClass(DstRC, SrcRC);
+ }
+
+ // The combined constraint may be impossible to satisfy.
+ if (!NewRC)
+ return false;
+
+ // Prefer SrcReg to be a sub-register of DstReg.
+ // FIXME: Coalescer should support subregs symmetrically.
+ if (DstIdx && !SrcIdx) {
+ std::swap(Src, Dst);
+ std::swap(SrcIdx, DstIdx);
+ Flipped = !Flipped;
+ }
+
+ CrossClass = NewRC != DstRC || NewRC != SrcRC;
+ }
+ // Check our invariants
+ assert(TargetRegisterInfo::isVirtualRegister(Src) && "Src must be virtual");
+ assert(!(TargetRegisterInfo::isPhysicalRegister(Dst) && DstSub) &&
+ "Cannot have a physical SubIdx");
+ SrcReg = Src;
+ DstReg = Dst;
+ return true;
+}
+
+bool CoalescerPair::flip() {
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return false;
+ std::swap(SrcReg, DstReg);
+ std::swap(SrcIdx, DstIdx);
+ Flipped = !Flipped;
+ return true;
+}
+
+bool CoalescerPair::isCoalescable(const MachineInstr *MI) const {
+ if (!MI)
+ return false;
+ unsigned Src, Dst, SrcSub, DstSub;
+ if (!isMoveInstr(TRI, MI, Src, Dst, SrcSub, DstSub))
+ return false;
+
+ // Find the virtual register that is SrcReg.
+ if (Dst == SrcReg) {
+ std::swap(Src, Dst);
+ std::swap(SrcSub, DstSub);
+ } else if (Src != SrcReg) {
+ return false;
+ }
+
+ // Now check that Dst matches DstReg.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ if (!TargetRegisterInfo::isPhysicalRegister(Dst))
+ return false;
+ assert(!DstIdx && !SrcIdx && "Inconsistent CoalescerPair state.");
+ // DstSub could be set for a physreg from INSERT_SUBREG.
+ if (DstSub)
+ Dst = TRI.getSubReg(Dst, DstSub);
+ // Full copy of Src.
+ if (!SrcSub)
+ return DstReg == Dst;
+ // This is a partial register copy. Check that the parts match.
+ return TRI.getSubReg(DstReg, SrcSub) == Dst;
+ } else {
+ // DstReg is virtual.
+ if (DstReg != Dst)
+ return false;
+ // Registers match, do the subregisters line up?
+ return TRI.composeSubRegIndices(SrcIdx, SrcSub) ==
+ TRI.composeSubRegIndices(DstIdx, DstSub);
+ }
+}
+
+void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<MachineLoopInfo>();
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void RegisterCoalescer::eliminateDeadDefs() {
+ SmallVector<unsigned, 8> NewRegs;
+ LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
+ nullptr, this).eliminateDeadDefs(DeadDefs);
+}
+
+void RegisterCoalescer::LRE_WillEraseInstruction(MachineInstr *MI) {
+ // MI may be in WorkList. Make sure we don't visit it.
+ ErasedInstrs.insert(MI);
+}
+
+bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert(!CP.isPartial() && "This doesn't work for partial copies.");
+ assert(!CP.isPhys() && "This doesn't work for physreg copies.");
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+
+ // We have a non-trivially-coalescable copy with IntA being the source and
+ // IntB being the dest, thus this defines a value number in IntB. If the
+ // source value number (in IntA) is defined by a copy from B, see if we can
+ // merge these two pieces of B into a single value number, eliminating a copy.
+ // For example:
+ //
+ // A3 = B0
+ // ...
+ // B1 = A3 <- this copy
+ //
+ // In this case, B0 can be extended to where the B1 copy lives, allowing the
+ // B1 value number to be replaced with B0 (which simplifies the B
+ // liveinterval).
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ LiveInterval::iterator BS = IntB.FindSegmentContaining(CopyIdx);
+ if (BS == IntB.end()) return false;
+ VNInfo *BValNo = BS->valno;
+
+ // Get the location that B is defined at. Two options: either this value has
+ // an unknown definition point or it is defined at CopyIdx. If unknown, we
+ // can't process it.
+ if (BValNo->def != CopyIdx) return false;
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ SlotIndex CopyUseIdx = CopyIdx.getRegSlot(true);
+ LiveInterval::iterator AS = IntA.FindSegmentContaining(CopyUseIdx);
+ // The live segment might not exist after fun with physreg coalescing.
+ if (AS == IntA.end()) return false;
+ VNInfo *AValNo = AS->valno;
+
+ // If AValNo is defined as a copy from IntB, we can potentially process this.
+ // Get the instruction that defines this value number.
+ MachineInstr *ACopyMI = LIS->getInstructionFromIndex(AValNo->def);
+ // Don't allow any partial copies, even if isCoalescable() allows them.
+ if (!CP.isCoalescable(ACopyMI) || !ACopyMI->isFullCopy())
+ return false;
+
+ // Get the Segment in IntB that this value number starts with.
+ LiveInterval::iterator ValS =
+ IntB.FindSegmentContaining(AValNo->def.getPrevSlot());
+ if (ValS == IntB.end())
+ return false;
+
+ // Make sure that the end of the live segment is inside the same block as
+ // CopyMI.
+ MachineInstr *ValSEndInst =
+ LIS->getInstructionFromIndex(ValS->end.getPrevSlot());
+ if (!ValSEndInst || ValSEndInst->getParent() != CopyMI->getParent())
+ return false;
+
+ // Okay, we now know that ValS ends in the same block that the CopyMI
+ // live-range starts. If there are no intervening live segments between them
+ // in IntB, we can merge them.
+ if (ValS+1 != BS) return false;
+
+ DEBUG(dbgs() << "Extending: " << PrintReg(IntB.reg, TRI));
+
+ SlotIndex FillerStart = ValS->end, FillerEnd = BS->start;
+  // We are about to delete CopyMI, so we need to remove it as the
+  // 'instruction that defines this value #'. Update the valnum with the new
+  // defining instruction #.
+ BValNo->def = FillerStart;
+
+ // Okay, we can merge them. We need to insert a new liverange:
+ // [ValS.end, BS.begin) of either value number, then we merge the
+ // two value numbers.
+ IntB.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, BValNo));
+
+ // Okay, merge "B1" into the same value number as "B0".
+ if (BValNo != ValS->valno)
+ IntB.MergeValueNumberInto(BValNo, ValS->valno);
+
+ // Do the same for the subregister segments.
+ for (LiveInterval::SubRange &S : IntB.subranges()) {
+ VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+ S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
+ VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
+ if (SubBValNo != SubValSNo)
+ S.MergeValueNumberInto(SubBValNo, SubValSNo);
+ }
+
+ DEBUG(dbgs() << " result = " << IntB << '\n');
+
+ // If the source instruction was killing the source register before the
+ // merge, unset the isKill marker given the live range has been extended.
+ int UIdx = ValSEndInst->findRegisterUseOperandIdx(IntB.reg, true);
+ if (UIdx != -1) {
+ ValSEndInst->getOperand(UIdx).setIsKill(false);
+ }
+
+ // Rewrite the copy. If the copy instruction was killing the destination
+ // register before the merge, find the last use and trim the live range. That
+ // will also add the isKill marker.
+ CopyMI->substituteRegister(IntA.reg, IntB.reg, 0, *TRI);
+ if (AS->end == CopyIdx)
+ shrinkToUses(&IntA);
+
+ ++numExtends;
+ return true;
+}
+
+bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA,
+ LiveInterval &IntB,
+ VNInfo *AValNo,
+ VNInfo *BValNo) {
+ // If AValNo has PHI kills, conservatively assume that IntB defs can reach
+ // the PHI values.
+ if (LIS->hasPHIKill(IntA, AValNo))
+ return true;
+
+ for (LiveRange::Segment &ASeg : IntA.segments) {
+ if (ASeg.valno != AValNo) continue;
+ LiveInterval::iterator BI =
+ std::upper_bound(IntB.begin(), IntB.end(), ASeg.start);
+ if (BI != IntB.begin())
+ --BI;
+ for (; BI != IntB.end() && ASeg.end >= BI->start; ++BI) {
+ if (BI->valno == BValNo)
+ continue;
+ if (BI->start <= ASeg.start && BI->end > ASeg.start)
+ return true;
+ if (BI->start > ASeg.start && BI->start < ASeg.end)
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Copy segments with value number @p SrcValNo from live range @p Src to live
+/// range @p Dst and use value number @p DstValNo there.
+static void addSegmentsWithValNo(LiveRange &Dst, VNInfo *DstValNo,
+ const LiveRange &Src, const VNInfo *SrcValNo)
+{
+ for (const LiveRange::Segment &S : Src.segments) {
+ if (S.valno != SrcValNo)
+ continue;
+ Dst.addSegment(LiveRange::Segment(S.start, S.end, DstValNo));
+ }
+}
+
+bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI) {
+ assert(!CP.isPhys());
+
+ LiveInterval &IntA =
+ LIS->getInterval(CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg());
+ LiveInterval &IntB =
+ LIS->getInterval(CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg());
+
+ // We found a non-trivially-coalescable copy with IntA being the source and
+ // IntB being the dest, thus this defines a value number in IntB. If the
+ // source value number (in IntA) is defined by a commutable instruction and
+ // its other operand is coalesced to the copy dest register, see if we can
+ // transform the copy into a noop by commuting the definition. For example,
+ //
+ // A3 = op A2 B0<kill>
+ // ...
+ // B1 = A3 <- this copy
+ // ...
+ // = op A3 <- more uses
+ //
+ // ==>
+ //
+ // B2 = op B0 A2<kill>
+ // ...
+ // B1 = B2 <- now an identity copy
+ // ...
+ // = op B2 <- more uses
+
+ // BValNo is a value number in B that is defined by a copy from A. 'B1' in
+ // the example above.
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ VNInfo *BValNo = IntB.getVNInfoAt(CopyIdx);
+ assert(BValNo != nullptr && BValNo->def == CopyIdx);
+
+ // AValNo is the value number in A that defines the copy, A3 in the example.
+ VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true));
+ assert(AValNo && !AValNo->isUnused() && "COPY source not live");
+ if (AValNo->isPHIDef())
+ return false;
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def);
+ if (!DefMI)
+ return false;
+ if (!DefMI->isCommutable())
+ return false;
+ // If DefMI is a two-address instruction then commuting it will change the
+ // destination register.
+ int DefIdx = DefMI->findRegisterDefOperandIdx(IntA.reg);
+ assert(DefIdx != -1);
+ unsigned UseOpIdx;
+ if (!DefMI->isRegTiedToUseOperand(DefIdx, &UseOpIdx))
+ return false;
+
+ // FIXME: The code below tries to commute 'UseOpIdx' operand with some other
+  // commutable operand, which is expressed by the 'CommuteAnyOperandIndex'
+  // value passed to the method. That _other_ operand is chosen by
+ // the findCommutedOpIndices() method.
+ //
+ // That is obviously an area for improvement in case of instructions having
+ // more than 2 operands. For example, if some instruction has 3 commutable
+ // operands then all possible variants (i.e. op#1<->op#2, op#1<->op#3,
+ // op#2<->op#3) of commute transformation should be considered/tried here.
+ unsigned NewDstIdx = TargetInstrInfo::CommuteAnyOperandIndex;
+ if (!TII->findCommutedOpIndices(*DefMI, UseOpIdx, NewDstIdx))
+ return false;
+
+ MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
+ unsigned NewReg = NewDstMO.getReg();
+ if (NewReg != IntB.reg || !IntB.Query(AValNo->def).isKill())
+ return false;
+
+ // Make sure there are no other definitions of IntB that would reach the
+ // uses which the new definition can reach.
+ if (hasOtherReachingDefs(IntA, IntB, AValNo, BValNo))
+ return false;
+
+  // If some of the uses of IntA.reg are already coalesced away, return false.
+ // It's not possible to determine whether it's safe to perform the coalescing.
+ for (MachineOperand &MO : MRI->use_nodbg_operands(IntA.reg)) {
+ MachineInstr *UseMI = MO.getParent();
+ unsigned OpNo = &MO - &UseMI->getOperand(0);
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI);
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ if (US == IntA.end() || US->valno != AValNo)
+ continue;
+ // If this use is tied to a def, we can't rewrite the register.
+ if (UseMI->isRegTiedToDefOperand(OpNo))
+ return false;
+ }
+
+ DEBUG(dbgs() << "\tremoveCopyByCommutingDef: " << AValNo->def << '\t'
+ << *DefMI);
+
+ // At this point we have decided that it is legal to do this
+ // transformation. Start by commuting the instruction.
+ MachineBasicBlock *MBB = DefMI->getParent();
+ MachineInstr *NewMI =
+ TII->commuteInstruction(*DefMI, false, UseOpIdx, NewDstIdx);
+ if (!NewMI)
+ return false;
+ if (TargetRegisterInfo::isVirtualRegister(IntA.reg) &&
+ TargetRegisterInfo::isVirtualRegister(IntB.reg) &&
+ !MRI->constrainRegClass(IntB.reg, MRI->getRegClass(IntA.reg)))
+ return false;
+ if (NewMI != DefMI) {
+ LIS->ReplaceMachineInstrInMaps(*DefMI, *NewMI);
+ MachineBasicBlock::iterator Pos = DefMI;
+ MBB->insert(Pos, NewMI);
+ MBB->erase(DefMI);
+ }
+
+  // If ALR and BLR overlap and BLR's end extends beyond ALR's end, e.g.
+ // A = or A, B
+ // ...
+ // B = A
+ // ...
+ // C = A<kill>
+ // ...
+ // = B
+
+ // Update uses of IntA of the specific Val# with IntB.
+ for (MachineRegisterInfo::use_iterator UI = MRI->use_begin(IntA.reg),
+ UE = MRI->use_end();
+ UI != UE; /* ++UI is below because of possible MI removal */) {
+ MachineOperand &UseMO = *UI;
+ ++UI;
+ if (UseMO.isUndef())
+ continue;
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI->isDebugValue()) {
+ // FIXME These don't have an instruction index. Not clear we have enough
+ // info to decide whether to do this replacement or not. For now do it.
+ UseMO.setReg(NewReg);
+ continue;
+ }
+ SlotIndex UseIdx = LIS->getInstructionIndex(*UseMI).getRegSlot(true);
+ LiveInterval::iterator US = IntA.FindSegmentContaining(UseIdx);
+ assert(US != IntA.end() && "Use must be live");
+ if (US->valno != AValNo)
+ continue;
+ // Kill flags are no longer accurate. They are recomputed after RA.
+ UseMO.setIsKill(false);
+ if (TargetRegisterInfo::isPhysicalRegister(NewReg))
+ UseMO.substPhysReg(NewReg, *TRI);
+ else
+ UseMO.setReg(NewReg);
+ if (UseMI == CopyMI)
+ continue;
+ if (!UseMI->isCopy())
+ continue;
+ if (UseMI->getOperand(0).getReg() != IntB.reg ||
+ UseMI->getOperand(0).getSubReg())
+ continue;
+
+ // This copy will become a noop. If it's defining a new val#, merge it into
+ // BValNo.
+ SlotIndex DefIdx = UseIdx.getRegSlot();
+ VNInfo *DVNI = IntB.getVNInfoAt(DefIdx);
+ if (!DVNI)
+ continue;
+ DEBUG(dbgs() << "\t\tnoop: " << DefIdx << '\t' << *UseMI);
+ assert(DVNI->def == DefIdx);
+ BValNo = IntB.MergeValueNumberInto(DVNI, BValNo);
+ for (LiveInterval::SubRange &S : IntB.subranges()) {
+ VNInfo *SubDVNI = S.getVNInfoAt(DefIdx);
+ if (!SubDVNI)
+ continue;
+ VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
+ assert(SubBValNo->def == CopyIdx);
+ S.MergeValueNumberInto(SubDVNI, SubBValNo);
+ }
+
+ ErasedInstrs.insert(UseMI);
+ LIS->RemoveMachineInstrFromMaps(*UseMI);
+ UseMI->eraseFromParent();
+ }
+
+ // Extend BValNo by merging in IntA live segments of AValNo. Val# definition
+ // is updated.
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ if (IntB.hasSubRanges()) {
+ if (!IntA.hasSubRanges()) {
+ LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(IntA.reg);
+ IntA.createSubRangeFrom(Allocator, Mask, IntA);
+ }
+ SlotIndex AIdx = CopyIdx.getRegSlot(true);
+ for (LiveInterval::SubRange &SA : IntA.subranges()) {
+ VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
+ assert(ASubValNo != nullptr);
+
+ LaneBitmask AMask = SA.LaneMask;
+ for (LiveInterval::SubRange &SB : IntB.subranges()) {
+ LaneBitmask BMask = SB.LaneMask;
+ LaneBitmask Common = BMask & AMask;
+ if (Common == 0)
+ continue;
+
+ DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
+ << " into " << PrintLaneMask(Common) << '\n');
+ LaneBitmask BRest = BMask & ~AMask;
+ LiveInterval::SubRange *CommonRange;
+ if (BRest != 0) {
+ SB.LaneMask = BRest;
+ DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
+ << '\n');
+ // Duplicate SubRange for newly merged common stuff.
+ CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
+ } else {
+          // We can reuse the SB SubRange.
+ SB.LaneMask = Common;
+ CommonRange = &SB;
+ }
+ LiveRange RangeCopy(SB, Allocator);
+
+ VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
+ assert(BSubValNo->def == CopyIdx);
+ BSubValNo->def = ASubValNo->def;
+ addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
+ AMask &= ~BMask;
+ }
+ if (AMask != 0) {
+ DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
+ LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
+ VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
+ addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
+ }
+ }
+ }
+
+ BValNo->def = AValNo->def;
+ addSegmentsWithValNo(IntB, BValNo, IntA, AValNo);
+ DEBUG(dbgs() << "\t\textended: " << IntB << '\n');
+
+ LIS->removeVRegDefAt(IntA, AValNo->def);
+
+ DEBUG(dbgs() << "\t\ttrimmed: " << IntA << '\n');
+ ++numCommutes;
+ return true;
+}
+
+/// Returns true if @p MI defines the full vreg @p Reg, as opposed to just
+/// defining a subregister.
+static bool definesFullReg(const MachineInstr &MI, unsigned Reg) {
+ assert(!TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "This code cannot handle physreg aliasing");
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isDef() || Op.getReg() != Reg)
+ continue;
+ // Return true if we define the full register or don't care about the value
+ // inside other subregisters.
+ if (Op.getSubReg() == 0 || Op.isUndef())
+ return true;
+ }
+ return false;
+}
+
+bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
+ MachineInstr *CopyMI,
+ bool &IsDefCopy) {
+ IsDefCopy = false;
+ unsigned SrcReg = CP.isFlipped() ? CP.getDstReg() : CP.getSrcReg();
+ unsigned SrcIdx = CP.isFlipped() ? CP.getDstIdx() : CP.getSrcIdx();
+ unsigned DstReg = CP.isFlipped() ? CP.getSrcReg() : CP.getDstReg();
+ unsigned DstIdx = CP.isFlipped() ? CP.getSrcIdx() : CP.getDstIdx();
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ return false;
+
+ LiveInterval &SrcInt = LIS->getInterval(SrcReg);
+ SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
+ VNInfo *ValNo = SrcInt.Query(CopyIdx).valueIn();
+ assert(ValNo && "CopyMI input register not live");
+ if (ValNo->isPHIDef() || ValNo->isUnused())
+ return false;
+ MachineInstr *DefMI = LIS->getInstructionFromIndex(ValNo->def);
+ if (!DefMI)
+ return false;
+ if (DefMI->isCopyLike()) {
+ IsDefCopy = true;
+ return false;
+ }
+ if (!TII->isAsCheapAsAMove(*DefMI))
+ return false;
+ if (!TII->isTriviallyReMaterializable(*DefMI, AA))
+ return false;
+ if (!definesFullReg(*DefMI, SrcReg))
+ return false;
+ bool SawStore = false;
+ if (!DefMI->isSafeToMove(AA, SawStore))
+ return false;
+ const MCInstrDesc &MCID = DefMI->getDesc();
+ if (MCID.getNumDefs() != 1)
+ return false;
+ // Only support subregister destinations when the def is read-undef.
+ MachineOperand &DstOperand = CopyMI->getOperand(0);
+ unsigned CopyDstReg = DstOperand.getReg();
+ if (DstOperand.getSubReg() && !DstOperand.isUndef())
+ return false;
+
+ // If both SrcIdx and DstIdx are set, correct rematerialization would widen
+ // the register substantially (beyond both source and dest size). This is bad
+ // for performance since it can cascade through a function, introducing many
+ // extra spills and fills (e.g. ARM can easily end up copying QQQQPR registers
+ // around after a few subreg copies).
+ if (SrcIdx && DstIdx)
+ return false;
+
+ const TargetRegisterClass *DefRC = TII->getRegClass(MCID, 0, TRI, *MF);
+ if (!DefMI->isImplicitDef()) {
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
+ unsigned NewDstReg = DstReg;
+
+ unsigned NewDstIdx = TRI->composeSubRegIndices(CP.getSrcIdx(),
+ DefMI->getOperand(0).getSubReg());
+ if (NewDstIdx)
+ NewDstReg = TRI->getSubReg(DstReg, NewDstIdx);
+
+ // Finally, make sure that the physical subregister that will be
+ // constructed later is permitted for the instruction.
+ if (!DefRC->contains(NewDstReg))
+ return false;
+ } else {
+ // Theoretically, some stack frame reference could exist. Just make sure
+ // it hasn't actually happened.
+ assert(TargetRegisterInfo::isVirtualRegister(DstReg) &&
+ "Only expect to deal with virtual or physical registers");
+ }
+ }
+
+ DebugLoc DL = CopyMI->getDebugLoc();
+ MachineBasicBlock *MBB = CopyMI->getParent();
+ MachineBasicBlock::iterator MII =
+ std::next(MachineBasicBlock::iterator(CopyMI));
+ TII->reMaterialize(*MBB, MII, DstReg, SrcIdx, *DefMI, *TRI);
+ MachineInstr &NewMI = *std::prev(MII);
+ NewMI.setDebugLoc(DL);
+
+ // In a situation like the following:
+ // %vreg0:subreg = instr ; DefMI, subreg = DstIdx
+ // %vreg1 = copy %vreg0:subreg ; CopyMI, SrcIdx = 0
+ // instead of widening %vreg1 to the register class of %vreg0 simply do:
+ // %vreg1 = instr
+ const TargetRegisterClass *NewRC = CP.getNewRC();
+ if (DstIdx != 0) {
+ MachineOperand &DefMO = NewMI.getOperand(0);
+ if (DefMO.getSubReg() == DstIdx) {
+ assert(SrcIdx == 0 && CP.isFlipped()
+ && "Shouldn't have SrcIdx+DstIdx at this point");
+ const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+ const TargetRegisterClass *CommonRC =
+ TRI->getCommonSubClass(DefRC, DstRC);
+ if (CommonRC != nullptr) {
+ NewRC = CommonRC;
+ DstIdx = 0;
+ DefMO.setSubReg(0);
+ }
+ }
+ }
+
+ // CopyMI may have implicit operands, save them so that we can transfer them
+ // over to the newly materialized instruction after CopyMI is removed.
+ SmallVector<MachineOperand, 4> ImplicitOps;
+ ImplicitOps.reserve(CopyMI->getNumOperands() -
+ CopyMI->getDesc().getNumOperands());
+ for (unsigned I = CopyMI->getDesc().getNumOperands(),
+ E = CopyMI->getNumOperands();
+ I != E; ++I) {
+ MachineOperand &MO = CopyMI->getOperand(I);
+ if (MO.isReg()) {
+      assert(MO.isImplicit() && "No explicit operands after implicit operands.");
+ // Discard VReg implicit defs.
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ ImplicitOps.push_back(MO);
+ }
+ }
+
+ LIS->ReplaceMachineInstrInMaps(*CopyMI, NewMI);
+ CopyMI->eraseFromParent();
+ ErasedInstrs.insert(CopyMI);
+
+ // NewMI may have dead implicit defs (E.g. EFLAGS for MOV<bits>r0 on X86).
+ // We need to remember these so we can add intervals once we insert
+ // NewMI into SlotIndexes.
+ SmallVector<unsigned, 4> NewMIImplDefs;
+ for (unsigned i = NewMI.getDesc().getNumOperands(),
+ e = NewMI.getNumOperands();
+ i != e; ++i) {
+ MachineOperand &MO = NewMI.getOperand(i);
+ if (MO.isReg() && MO.isDef()) {
+ assert(MO.isImplicit() && MO.isDead() &&
+ TargetRegisterInfo::isPhysicalRegister(MO.getReg()));
+ NewMIImplDefs.push_back(MO.getReg());
+ }
+ }
+
+ if (TargetRegisterInfo::isVirtualRegister(DstReg)) {
+ unsigned NewIdx = NewMI.getOperand(0).getSubReg();
+
+ if (DefRC != nullptr) {
+ if (NewIdx)
+ NewRC = TRI->getMatchingSuperRegClass(NewRC, DefRC, NewIdx);
+ else
+ NewRC = TRI->getCommonSubClass(NewRC, DefRC);
+ assert(NewRC && "subreg chosen for remat incompatible with instruction");
+ }
+ // Remap subranges to new lanemask and change register class.
+ LiveInterval &DstInt = LIS->getInterval(DstReg);
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ SR.LaneMask = TRI->composeSubRegIndexLaneMask(DstIdx, SR.LaneMask);
+ }
+ MRI->setRegClass(DstReg, NewRC);
+
+ // Update machine operands and add flags.
+ updateRegDefsUses(DstReg, DstReg, DstIdx);
+ NewMI.getOperand(0).setSubReg(NewIdx);
+ // Add dead subregister definitions if we are defining the whole register
+ // but only part of it is live.
+ // This could happen if the rematerialization instruction is rematerializing
+  // more than is actually used in the register.
+ // An example would be:
+ // vreg1 = LOAD CONSTANTS 5, 8 ; Loading both 5 and 8 in different subregs
+ // ; Copying only part of the register here, but the rest is undef.
+ // vreg2:sub_16bit<def, read-undef> = COPY vreg1:sub_16bit
+ // ==>
+ // ; Materialize all the constants but only using one
+ // vreg2 = LOAD_CONSTANTS 5, 8
+ //
+ // at this point for the part that wasn't defined before we could have
+ // subranges missing the definition.
+ if (NewIdx == 0 && DstInt.hasSubRanges()) {
+ SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
+ SlotIndex DefIndex =
+ CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(DstReg);
+ VNInfo::Allocator& Alloc = LIS->getVNInfoAllocator();
+ for (LiveInterval::SubRange &SR : DstInt.subranges()) {
+ if (!SR.liveAt(DefIndex))
+ SR.createDeadDef(DefIndex, Alloc);
+ MaxMask &= ~SR.LaneMask;
+ }
+ if (MaxMask != 0) {
+ LiveInterval::SubRange *SR = DstInt.createSubRange(Alloc, MaxMask);
+ SR->createDeadDef(DefIndex, Alloc);
+ }
+ }
+ } else if (NewMI.getOperand(0).getReg() != CopyDstReg) {
+ // The New instruction may be defining a sub-register of what's actually
+ // been asked for. If so it must implicitly define the whole thing.
+ assert(TargetRegisterInfo::isPhysicalRegister(DstReg) &&
+ "Only expect virtual or physical registers in remat");
+ NewMI.getOperand(0).setIsDead(true);
+ NewMI.addOperand(MachineOperand::CreateReg(
+ CopyDstReg, true /*IsDef*/, true /*IsImp*/, false /*IsKill*/));
+ // Record small dead def live-ranges for all the subregisters
+ // of the destination register.
+ // Otherwise, variables that live through may miss some
+ // interferences, thus creating invalid allocation.
+ // E.g., i386 code:
+ // vreg1 = somedef ; vreg1 GR8
+ // vreg2 = remat ; vreg2 GR32
+ // CL = COPY vreg2.sub_8bit
+ // = somedef vreg1 ; vreg1 GR8
+ // =>
+ // vreg1 = somedef ; vreg1 GR8
+ // ECX<def, dead> = remat ; CL<imp-def>
+ // = somedef vreg1 ; vreg1 GR8
+    // vreg1 will see the interferences with CL but not with CH since
+ // no live-ranges would have been created for ECX.
+ // Fix that!
+ SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+ for (MCRegUnitIterator Units(NewMI.getOperand(0).getReg(), TRI);
+ Units.isValid(); ++Units)
+ if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+ LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+ }
+
+ if (NewMI.getOperand(0).getSubReg())
+ NewMI.getOperand(0).setIsUndef();
+
+ // Transfer over implicit operands to the rematerialized instruction.
+ for (MachineOperand &MO : ImplicitOps)
+ NewMI.addOperand(MO);
+
+ SlotIndex NewMIIdx = LIS->getInstructionIndex(NewMI);
+ for (unsigned i = 0, e = NewMIImplDefs.size(); i != e; ++i) {
+ unsigned Reg = NewMIImplDefs[i];
+ for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units)
+ if (LiveRange *LR = LIS->getCachedRegUnit(*Units))
+ LR->createDeadDef(NewMIIdx.getRegSlot(), LIS->getVNInfoAllocator());
+ }
+
+ DEBUG(dbgs() << "Remat: " << NewMI);
+ ++NumReMats;
+
+ // The source interval can become smaller because we removed a use.
+ shrinkToUses(&SrcInt, &DeadDefs);
+ if (!DeadDefs.empty()) {
+ // If the virtual SrcReg is completely eliminated, update all DBG_VALUEs
+ // to describe DstReg instead.
+ for (MachineOperand &UseMO : MRI->use_operands(SrcReg)) {
+ MachineInstr *UseMI = UseMO.getParent();
+ if (UseMI->isDebugValue()) {
+ UseMO.setReg(DstReg);
+ DEBUG(dbgs() << "\t\tupdated: " << *UseMI);
+ }
+ }
+ eliminateDeadDefs();
+ }
+
+ return true;
+}
+
+bool RegisterCoalescer::eliminateUndefCopy(MachineInstr *CopyMI) {
+ // ProcessImplicitDefs may leave some copies of <undef> values; it only
+ // removes local variables. When we have a copy like:
+ //
+ // %vreg1 = COPY %vreg2<undef>
+ //
+ // We delete the copy and remove the corresponding value number from %vreg1.
+ // Any uses of that value number are marked as <undef>.
+
+ // Note that we do not query CoalescerPair here but redo isMoveInstr as the
+ // CoalescerPair may have a new register class with adjusted subreg indices
+ // at this point.
+ unsigned SrcReg, DstReg, SrcSubIdx, DstSubIdx;
+ isMoveInstr(*TRI, CopyMI, SrcReg, DstReg, SrcSubIdx, DstSubIdx);
+
+ SlotIndex Idx = LIS->getInstructionIndex(*CopyMI);
+ const LiveInterval &SrcLI = LIS->getInterval(SrcReg);
+ // CopyMI is undef iff SrcReg is not live before the instruction.
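+ // With subregister liveness, only the lanes actually copied (SrcSubIdx) need
+ // to be dead at Idx; other lanes of SrcReg may still be live.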
+ if (SrcSubIdx != 0 && SrcLI.hasSubRanges()) {
+ LaneBitmask SrcMask = TRI->getSubRegIndexLaneMask(SrcSubIdx);
+ for (const LiveInterval::SubRange &SR : SrcLI.subranges()) {
+ if ((SR.LaneMask & SrcMask) == 0)
+ continue;
+ if (SR.liveAt(Idx))
+ return false;
+ }
+ } else if (SrcLI.liveAt(Idx))
+ return false;
+
+ DEBUG(dbgs() << "\tEliminating copy of <undef> value\n");
+
+ // Remove any DstReg segments starting at the instruction.
+ LiveInterval &DstLI = LIS->getInterval(DstReg);
+ SlotIndex RegIndex = Idx.getRegSlot();
+ // Remove value or merge with previous one in case of a subregister def.
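+ // Idx is the base index of CopyMI, so PrevVNI is the value live into the
+ // instruction; if present, the value defined at RegIndex is merged back into
+ // it rather than removed outright.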
+ if (VNInfo *PrevVNI = DstLI.getVNInfoAt(Idx)) {
+ VNInfo *VNI = DstLI.getVNInfoAt(RegIndex);
+ DstLI.MergeValueNumberInto(VNI, PrevVNI);
+
+ // The affected subregister segments can be removed.
+ LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(DstSubIdx);
+ for (LiveInterval::SubRange &SR : DstLI.subranges()) {
+ if ((SR.LaneMask & DstMask) == 0)
+ continue;
+
+ VNInfo *SVNI = SR.getVNInfoAt(RegIndex);
+ assert(SVNI != nullptr && SlotIndex::isSameInstr(SVNI->def, RegIndex));
+ SR.removeValNo(SVNI);
+ }
+ DstLI.removeEmptySubRanges();
+ } else
+ LIS->removeVRegDefAt(DstLI, RegIndex);
+
+ // Mark uses as undef.
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(DstReg)) {
+ if (MO.isDef() /*|| MO.isUndef()*/)
+ continue;
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI);
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(MO.getSubReg());
+ bool isLive;
+ if (UseMask != ~0u && DstLI.hasSubRanges()) {
+ isLive = false;
+ for (const LiveInterval::SubRange &SR : DstLI.subranges()) {
+ if ((SR.LaneMask & UseMask) == 0)
+ continue;
+ if (SR.liveAt(UseIdx)) {
+ isLive = true;
+ break;
+ }
+ }
+ } else
+ isLive = DstLI.liveAt(UseIdx);
+ if (isLive)
+ continue;
+ MO.setIsUndef(true);
+ DEBUG(dbgs() << "\tnew undef: " << UseIdx << '\t' << MI);
+ }
+ return true;
+}
+
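+ /// Set the <undef> flag on MO if no live subrange of Int covers the lanes of
+ /// SubRegIdx at UseIdx. For a def operand the complementary lanes are checked,
+ /// since a sub-register def without <undef> also reads the remaining lanes.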
+void RegisterCoalescer::addUndefFlag(const LiveInterval &Int, SlotIndex UseIdx,
+ MachineOperand &MO, unsigned SubRegIdx) {
+ LaneBitmask Mask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+ if (MO.isDef())
+ Mask = ~Mask;
+ bool IsUndef = true;
+ for (const LiveInterval::SubRange &S : Int.subranges()) {
+ if ((S.LaneMask & Mask) == 0)
+ continue;
+ if (S.liveAt(UseIdx)) {
+ IsUndef = false;
+ break;
+ }
+ }
+ if (IsUndef) {
+ MO.setIsUndef(true);
+ // We found out some subregister use is actually reading an undefined
+ // value. In some cases the whole vreg has become undefined at this
+ // point so we have to potentially shrink the main range if the
+ // use was ending a live segment there.
+ LiveQueryResult Q = Int.Query(UseIdx);
+ if (Q.valueOut() == nullptr)
+ ShrinkMainRange = true;
+ }
+}
+
+void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
+ unsigned DstReg,
+ unsigned SubIdx) {
+ bool DstIsPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ LiveInterval *DstInt = DstIsPhys ? nullptr : &LIS->getInterval(DstReg);
+
+ if (DstInt && DstInt->hasSubRanges() && DstReg != SrcReg) {
+ for (MachineOperand &MO : MRI->reg_operands(DstReg)) {
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0 || MO.isUndef())
+ continue;
+ MachineInstr &MI = *MO.getParent();
+ if (MI.isDebugValue())
+ continue;
+ SlotIndex UseIdx = LIS->getInstructionIndex(MI).getRegSlot(true);
+ addUndefFlag(*DstInt, UseIdx, MO, SubReg);
+ }
+ }
+
+ SmallPtrSet<MachineInstr*, 8> Visited;
+ for (MachineRegisterInfo::reg_instr_iterator
+ I = MRI->reg_instr_begin(SrcReg), E = MRI->reg_instr_end();
+ I != E; ) {
+ MachineInstr *UseMI = &*(I++);
+
+ // Each instruction can only be rewritten once because sub-register
+ // composition is not always idempotent. When SrcReg != DstReg, rewriting
+ // the UseMI operands removes them from the SrcReg use-def chain, but when
+ // SrcReg is DstReg we could encounter UseMI twice if it has multiple
+ // operands mentioning the virtual register.
+ if (SrcReg == DstReg && !Visited.insert(UseMI).second)
+ continue;
+
+ SmallVector<unsigned,8> Ops;
+ bool Reads, Writes;
+ std::tie(Reads, Writes) = UseMI->readsWritesVirtualRegister(SrcReg, &Ops);
+
+ // If SrcReg wasn't read, it may still be the case that DstReg is live-in
+ // because SrcReg is a sub-register.
+ if (DstInt && !Reads && SubIdx)
+ Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
+
+ // Replace SrcReg with DstReg in all UseMI operands.
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+ MachineOperand &MO = UseMI->getOperand(Ops[i]);
+
+ // Adjust <undef> flags in case of sub-register joins. We don't want to
+ // turn a full def into a read-modify-write sub-register def and vice
+ // versa.
+ if (SubIdx && MO.isDef())
+ MO.setIsUndef(!Reads);
+
+ // A subreg use of a partially undef (super) register may now be a completely
+ // undef use, in which case it has to be marked that way.
+ if (SubIdx != 0 && MO.isUse() && MRI->shouldTrackSubRegLiveness(DstReg)) {
+ if (!DstInt->hasSubRanges()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ LaneBitmask Mask = MRI->getMaxLaneMaskForVReg(DstInt->reg);
+ DstInt->createSubRangeFrom(Allocator, Mask, *DstInt);
+ }
+ SlotIndex MIIdx = UseMI->isDebugValue()
+ ? LIS->getSlotIndexes()->getIndexBefore(*UseMI)
+ : LIS->getInstructionIndex(*UseMI);
+ SlotIndex UseIdx = MIIdx.getRegSlot(true);
+ addUndefFlag(*DstInt, UseIdx, MO, SubIdx);
+ }
+
+ if (DstIsPhys)
+ MO.substPhysReg(DstReg, *TRI);
+ else
+ MO.substVirtReg(DstReg, SubIdx, *TRI);
+ }
+
+ DEBUG({
+ dbgs() << "\t\tupdated: ";
+ if (!UseMI->isDebugValue())
+ dbgs() << LIS->getInstructionIndex(*UseMI) << "\t";
+ dbgs() << *UseMI;
+ });
+ }
+}
+
+bool RegisterCoalescer::canJoinPhys(const CoalescerPair &CP) {
+ // Always join simple intervals that are defined by a single copy from a
+ // reserved register. This doesn't increase register pressure, so it is
+ // always beneficial.
+ if (!MRI->isReserved(CP.getDstReg())) {
+ DEBUG(dbgs() << "\tCan only merge into reserved registers.\n");
+ return false;
+ }
+
+ LiveInterval &JoinVInt = LIS->getInterval(CP.getSrcReg());
+ if (JoinVInt.containsOneValue())
+ return true;
+
+ DEBUG(dbgs() << "\tCannot join complex intervals into reserved register.\n");
+ return false;
+}
+
+bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
+
+ Again = false;
+ DEBUG(dbgs() << LIS->getInstructionIndex(*CopyMI) << '\t' << *CopyMI);
+
+ CoalescerPair CP(*TRI);
+ if (!CP.setRegisters(CopyMI)) {
+ DEBUG(dbgs() << "\tNot coalescable.\n");
+ return false;
+ }
+
+ if (CP.getNewRC()) {
+ auto SrcRC = MRI->getRegClass(CP.getSrcReg());
+ auto DstRC = MRI->getRegClass(CP.getDstReg());
+ unsigned SrcIdx = CP.getSrcIdx();
+ unsigned DstIdx = CP.getDstIdx();
+ if (CP.isFlipped()) {
+ std::swap(SrcIdx, DstIdx);
+ std::swap(SrcRC, DstRC);
+ }
+ if (!TRI->shouldCoalesce(CopyMI, SrcRC, SrcIdx, DstRC, DstIdx,
+ CP.getNewRC())) {
+ DEBUG(dbgs() << "\tSubtarget bailed on coalescing.\n");
+ return false;
+ }
+ }
+
+ // Dead code elimination. This really should be handled by MachineDCE, but
+ // sometimes dead copies slip through, and we can't generate invalid live
+ // ranges.
+ if (!CP.isPhys() && CopyMI->allDefsAreDead()) {
+ DEBUG(dbgs() << "\tCopy is dead.\n");
+ DeadDefs.push_back(CopyMI);
+ eliminateDeadDefs();
+ return true;
+ }
+
+ // Eliminate undefs.
+ if (!CP.isPhys() && eliminateUndefCopy(CopyMI)) {
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
+ CopyMI->eraseFromParent();
+ return false; // Not coalescable.
+ }
+
+ // Coalesced copies are normally removed immediately, but transformations
+ // like removeCopyByCommutingDef() can inadvertently create identity copies.
+ // When that happens, just join the values and remove the copy.
+ if (CP.getSrcReg() == CP.getDstReg()) {
+ LiveInterval &LI = LIS->getInterval(CP.getSrcReg());
+ DEBUG(dbgs() << "\tCopy already coalesced: " << LI << '\n');
+ const SlotIndex CopyIdx = LIS->getInstructionIndex(*CopyMI);
+ LiveQueryResult LRQ = LI.Query(CopyIdx);
+ if (VNInfo *DefVNI = LRQ.valueDefined()) {
+ VNInfo *ReadVNI = LRQ.valueIn();
+ assert(ReadVNI && "No value before copy and no <undef> flag.");
+ assert(ReadVNI != DefVNI && "Cannot read and define the same value.");
+ LI.MergeValueNumberInto(DefVNI, ReadVNI);
+
+ // Process subregister liveranges.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveQueryResult SLRQ = S.Query(CopyIdx);
+ if (VNInfo *SDefVNI = SLRQ.valueDefined()) {
+ VNInfo *SReadVNI = SLRQ.valueIn();
+ S.MergeValueNumberInto(SDefVNI, SReadVNI);
+ }
+ }
+ DEBUG(dbgs() << "\tMerged values: " << LI << '\n');
+ }
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
+ CopyMI->eraseFromParent();
+ return true;
+ }
+
+ // Enforce policies.
+ if (CP.isPhys()) {
+ DEBUG(dbgs() << "\tConsidering merging " << PrintReg(CP.getSrcReg(), TRI)
+ << " with " << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx())
+ << '\n');
+ if (!canJoinPhys(CP)) {
+ // Before giving up coalescing, if definition of source is defined by
+ // trivial computation, try rematerializing it.
+ bool IsDefCopy;
+ if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
+ return true;
+ if (IsDefCopy)
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+ } else {
+ // When possible, let DstReg be the larger interval.
+ if (!CP.isPartial() && LIS->getInterval(CP.getSrcReg()).size() >
+ LIS->getInterval(CP.getDstReg()).size())
+ CP.flip();
+
+ DEBUG({
+ dbgs() << "\tConsidering merging to "
+ << TRI->getRegClassName(CP.getNewRC()) << " with ";
+ if (CP.getDstIdx() && CP.getSrcIdx())
+ dbgs() << PrintReg(CP.getDstReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getDstIdx()) << " and "
+ << PrintReg(CP.getSrcReg()) << " in "
+ << TRI->getSubRegIndexName(CP.getSrcIdx()) << '\n';
+ else
+ dbgs() << PrintReg(CP.getSrcReg(), TRI) << " in "
+ << PrintReg(CP.getDstReg(), TRI, CP.getSrcIdx()) << '\n';
+ });
+ }
+
+ ShrinkMask = 0;
+ ShrinkMainRange = false;
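+ // ShrinkMask and ShrinkMainRange are filled in while joining and rewriting
+ // below, when subregister ranges or the main range have to be recomputed
+ // after the merge.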
+
+ // Okay, attempt to join these two intervals. On failure, this returns false.
+ // Otherwise, if one of the intervals being joined is a physreg, this method
+ // always canonicalizes DstInt to be it. The output "SrcInt" will not have
+ // been modified, so we can use this information below to update aliases.
+ if (!joinIntervals(CP)) {
+ // Coalescing failed.
+
+ // If definition of source is defined by trivial computation, try
+ // rematerializing it.
+ bool IsDefCopy;
+ if (reMaterializeTrivialDef(CP, CopyMI, IsDefCopy))
+ return true;
+
+ // If we can eliminate the copy without merging the live segments, do so
+ // now.
+ if (!CP.isPartial() && !CP.isPhys()) {
+ if (adjustCopiesBackFrom(CP, CopyMI) ||
+ removeCopyByCommutingDef(CP, CopyMI)) {
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
+ CopyMI->eraseFromParent();
+ DEBUG(dbgs() << "\tTrivial!\n");
+ return true;
+ }
+ }
+
+ // Otherwise, we are unable to join the intervals.
+ DEBUG(dbgs() << "\tInterference!\n");
+ Again = true; // May be possible to coalesce later.
+ return false;
+ }
+
+ // Coalescing to a virtual register that is of a sub-register class of the
+ // other. Make sure the resulting register is set to the right register class.
+ if (CP.isCrossClass()) {
+ ++numCrossRCs;
+ MRI->setRegClass(CP.getDstReg(), CP.getNewRC());
+ }
+
+ // Removing sub-register copies can ease the register class constraints.
+ // Make sure we attempt to inflate the register class of DstReg.
+ if (!CP.isPhys() && RegClassInfo.isProperSubClass(CP.getNewRC()))
+ InflateRegs.push_back(CP.getDstReg());
+
+ // CopyMI has been erased by joinIntervals at this point. Remove it from
+ // ErasedInstrs since copyCoalesceWorkList() won't add a successful join back
+ // to the work list. This keeps ErasedInstrs from growing needlessly.
+ ErasedInstrs.erase(CopyMI);
+
+ // Rewrite all SrcReg operands to DstReg.
+ // Also update DstReg operands to include DstIdx if it is set.
+ if (CP.getDstIdx())
+ updateRegDefsUses(CP.getDstReg(), CP.getDstReg(), CP.getDstIdx());
+ updateRegDefsUses(CP.getSrcReg(), CP.getDstReg(), CP.getSrcIdx());
+
+ // Shrink subregister ranges if necessary.
+ if (ShrinkMask != 0) {
+ LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if ((S.LaneMask & ShrinkMask) == 0)
+ continue;
+ DEBUG(dbgs() << "Shrink LaneUses (Lane " << PrintLaneMask(S.LaneMask)
+ << ")\n");
+ LIS->shrinkToUses(S, LI.reg);
+ }
+ LI.removeEmptySubRanges();
+ }
+ if (ShrinkMainRange) {
+ LiveInterval &LI = LIS->getInterval(CP.getDstReg());
+ shrinkToUses(&LI);
+ }
+
+ // SrcReg is guaranteed to be the register whose live interval is being
+ // merged.
+ LIS->removeInterval(CP.getSrcReg());
+
+ // Update regalloc hint.
+ TRI->updateRegAllocHint(CP.getSrcReg(), CP.getDstReg(), *MF);
+
+ DEBUG({
+ dbgs() << "\tSuccess: " << PrintReg(CP.getSrcReg(), TRI, CP.getSrcIdx())
+ << " -> " << PrintReg(CP.getDstReg(), TRI, CP.getDstIdx()) << '\n';
+ dbgs() << "\tResult = ";
+ if (CP.isPhys())
+ dbgs() << PrintReg(CP.getDstReg(), TRI);
+ else
+ dbgs() << LIS->getInterval(CP.getDstReg());
+ dbgs() << '\n';
+ });
+
+ ++numJoins;
+ return true;
+}
+
+bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
+ unsigned DstReg = CP.getDstReg();
+ assert(CP.isPhys() && "Must be a physreg copy");
+ assert(MRI->isReserved(DstReg) && "Not a reserved register");
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
+
+ assert(RHS.containsOneValue() && "Invalid join with reserved register");
+
+ // Optimization for reserved registers like ESP. We can only merge with a
+ // reserved physreg if RHS has a single value that is a copy of DstReg.
+ // The live range of the reserved register will look like a set of dead defs
+ // - we don't properly track the live range of reserved registers.
+
+ // Deny any overlapping intervals. This relies on all the reserved
+ // register live ranges looking like dead defs.
+ for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI)
+ if (RHS.overlaps(LIS->getRegUnit(*UI))) {
+ DEBUG(dbgs() << "\t\tInterference: " << PrintRegUnit(*UI, TRI) << '\n');
+ return false;
+ }
+
+ // Skip any value computations, we are not adding new values to the
+ // reserved register. Also skip merging the live ranges, the reserved
+ // register live range doesn't need to be accurate as long as all the
+ // defs are there.
+
+ // Delete the identity copy.
+ MachineInstr *CopyMI;
+ if (CP.isFlipped()) {
+ CopyMI = MRI->getVRegDef(RHS.reg);
+ } else {
+ if (!MRI->hasOneNonDBGUse(RHS.reg)) {
+ DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
+ return false;
+ }
+
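+ // The vreg has a single non-debug use, which is the copy itself. Removing
+ // the copy effectively moves the def of the reserved physreg up to the
+ // vreg's defining instruction, so scan the range in between for interference.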
+ MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
+ CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
+ const SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+ const SlotIndex DestRegIdx = LIS->getInstructionIndex(*DestMI).getRegSlot();
+
+ // We checked above that there are no interfering defs of the physical
+ // register. However, for this case, where we intend to move up the def of
+ // the physical register, we also need to check for interfering uses.
+ SlotIndexes *Indexes = LIS->getSlotIndexes();
+ for (SlotIndex SI = Indexes->getNextNonNullIndex(DestRegIdx);
+ SI != CopyRegIdx; SI = Indexes->getNextNonNullIndex(SI)) {
+ MachineInstr *MI = LIS->getInstructionFromIndex(SI);
+ if (MI->readsRegister(DstReg, TRI)) {
+ DEBUG(dbgs() << "\t\tInterference (read): " << *MI);
+ return false;
+ }
+
+ // We must also check for clobbers caused by regmasks.
+ for (const auto &MO : MI->operands()) {
+ if (MO.isRegMask() && MO.clobbersPhysReg(DstReg)) {
+ DEBUG(dbgs() << "\t\tInterference (regmask clobber): " << *MI);
+ return false;
+ }
+ }
+ }
+
+ // We're going to remove the copy which defines a physical reserved
+ // register, so remove its valno, etc.
+ DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
+ << CopyRegIdx << "\n");
+
+ LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
+ // Create a new dead def at the new def location.
+ for (MCRegUnitIterator UI(DstReg, TRI); UI.isValid(); ++UI) {
+ LiveRange &LR = LIS->getRegUnit(*UI);
+ LR.createDeadDef(DestRegIdx, LIS->getVNInfoAllocator());
+ }
+ }
+
+ LIS->RemoveMachineInstrFromMaps(*CopyMI);
+ CopyMI->eraseFromParent();
+
+ // We don't track kills for reserved registers.
+ MRI->clearKillFlags(CP.getSrcReg());
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Interference checking and interval joining
+//===----------------------------------------------------------------------===//
+//
+// In the easiest case, the two live ranges being joined are disjoint, and
+// there is no interference to consider. It is quite common, though, to have
+// overlapping live ranges, and we need to check if the interference can be
+// resolved.
+//
+// The live range of a single SSA value forms a sub-tree of the dominator tree.
+// This means that two SSA values overlap if and only if the def of one value
+// is contained in the live range of the other value. As a special case, the
+// overlapping values can be defined at the same index.
+//
+// The interference from an overlapping def can be resolved in these cases:
+//
+// 1. Coalescable copies. The value is defined by a copy that would become an
+// identity copy after joining SrcReg and DstReg. The copy instruction will
+// be removed, and the value will be merged with the source value.
+//
+// There can be several copies back and forth, causing many values to be
+// merged into one. We compute a list of ultimate values in the joined live
+ // range as well as a mapping from the old value numbers.
+//
+// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
+// predecessors have a live out value. It doesn't cause real interference,
+// and can be merged into the value it overlaps. Like a coalescable copy, it
+// can be erased after joining.
+//
+// 3. Copy of external value. The overlapping def may be a copy of a value that
+// is already in the other register. This is like a coalescable copy, but
+// the live range of the source register must be trimmed after erasing the
+// copy instruction:
+//
+// %src = COPY %ext
+// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
+//
+// 4. Clobbering undefined lanes. Vector registers are sometimes built by
+// defining one lane at a time:
+//
+// %dst:ssub0<def,read-undef> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// The live range of %src overlaps the %dst value defined by FOO, but
+// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
+// which was undef anyway.
+//
+// The value mapping is more complicated in this case. The final live range
+// will have different value numbers for both FOO and BAR, but there is no
+// simple mapping from old to new values. It may even be necessary to add
+// new PHI values.
+//
+// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
+// is live, but never read. This can happen because we don't compute
+// individual live ranges per lane.
+//
+// %dst<def> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// This kind of interference is only resolved locally. If the clobbered
+// lane value escapes the block, the join is aborted.
+
+namespace {
+/// Track information about values in a single virtual register about to be
+/// joined. Objects of this class are always created in pairs - one for each
+/// side of the CoalescerPair (or one for each lane of a side of the coalescer
+ /// pair).
+class JoinVals {
+ /// Live range we work on.
+ LiveRange &LR;
+ /// (Main) register we work on.
+ const unsigned Reg;
+
+ /// Reg (and therefore the values in this liverange) will end up as
+ /// subregister SubIdx in the coalesced register. Either CP.DstIdx or
+ /// CP.SrcIdx.
+ const unsigned SubIdx;
+ /// The LaneMask that this liverange will occupy in the coalesced register.
+ /// May be smaller than the lanemask produced by SubIdx when merging subranges.
+ const LaneBitmask LaneMask;
+
+ /// This is true when joining sub register ranges, false when joining main
+ /// ranges.
+ const bool SubRangeJoin;
+ /// Whether the current LiveInterval tracks subregister liveness.
+ const bool TrackSubRegLiveness;
+
+ /// Values that will be present in the final live range.
+ SmallVectorImpl<VNInfo*> &NewVNInfo;
+
+ const CoalescerPair &CP;
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ const TargetRegisterInfo *TRI;
+
+ /// Value number assignments. Maps value numbers in LI to entries in
+ /// NewVNInfo. This is suitable for passing to LiveInterval::join().
+ SmallVector<int, 8> Assignments;
+
+ /// Conflict resolution for overlapping values.
+ enum ConflictResolution {
+ /// No overlap, simply keep this value.
+ CR_Keep,
+
+ /// Merge this value into OtherVNI and erase the defining instruction.
+ /// Used for IMPLICIT_DEF, coalescable copies, and copies from external
+ /// values.
+ CR_Erase,
+
+ /// Merge this value into OtherVNI but keep the defining instruction.
+ /// This is for the special case where OtherVNI is defined by the same
+ /// instruction.
+ CR_Merge,
+
+ /// Keep this value, and have it replace OtherVNI where possible. This
+ /// complicates value mapping since OtherVNI maps to two different values
+ /// before and after this def.
+ /// Used when clobbering undefined or dead lanes.
+ CR_Replace,
+
+ /// Unresolved conflict. Visit later when all values have been mapped.
+ CR_Unresolved,
+
+ /// Unresolvable conflict. Abort the join.
+ CR_Impossible
+ };
+
+ /// Per-value info for LI. The lane bit masks are all relative to the final
+ /// joined register, so they can be compared directly between SrcReg and
+ /// DstReg.
+ struct Val {
+ ConflictResolution Resolution;
+
+ /// Lanes written by this def, 0 for unanalyzed values.
+ LaneBitmask WriteLanes;
+
+ /// Lanes with defined values in this register. Other lanes are undef and
+ /// safe to clobber.
+ LaneBitmask ValidLanes;
+
+ /// Value in LI being redefined by this def.
+ VNInfo *RedefVNI;
+
+ /// Value in the other live range that overlaps this def, if any.
+ VNInfo *OtherVNI;
+
+ /// Is this value an IMPLICIT_DEF that can be erased?
+ ///
+ /// IMPLICIT_DEF values should only exist at the end of a basic block that
+ /// is a predecessor to a phi-value. These IMPLICIT_DEF instructions can be
+ /// safely erased if they are overlapping a live value in the other live
+ /// interval.
+ ///
+ /// Weird control flow graphs and incomplete PHI handling in
+ /// ProcessImplicitDefs can very rarely create IMPLICIT_DEF values with
+ /// longer live ranges. Such IMPLICIT_DEF values should be treated like
+ /// normal values.
+ bool ErasableImplicitDef;
+
+ /// True when the live range of this value will be pruned because of an
+ /// overlapping CR_Replace value in the other live range.
+ bool Pruned;
+
+ /// True once Pruned above has been computed.
+ bool PrunedComputed;
+
+ Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
+ RedefVNI(nullptr), OtherVNI(nullptr), ErasableImplicitDef(false),
+ Pruned(false), PrunedComputed(false) {}
+
+ bool isAnalyzed() const { return WriteLanes != 0; }
+ };
+
+ /// One entry per value number in LI.
+ SmallVector<Val, 8> Vals;
+
+ /// Compute the bitmask of lanes actually written by DefMI.
+ /// Set Redef if there are any partial register definitions that depend on the
+ /// previous value of the register.
+ LaneBitmask computeWriteLanes(const MachineInstr *DefMI, bool &Redef) const;
+
+ /// Find the ultimate value that VNI was copied from.
+ std::pair<const VNInfo*,unsigned> followCopyChain(const VNInfo *VNI) const;
+
+ bool valuesIdentical(VNInfo *Val0, VNInfo *Val1, const JoinVals &Other) const;
+
+ /// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+ /// Return a conflict resolution when possible, but leave the hard cases as
+ /// CR_Unresolved.
+ /// Recursively calls computeAssignment() on this and Other, guaranteeing that
+ /// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+ /// The recursion always goes upwards in the dominator tree, making loops
+ /// impossible.
+ ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+
+ /// Compute the value assignment for ValNo in LR.
+ /// This may be called recursively by analyzeValue(), but never for a ValNo on
+ /// the stack.
+ void computeAssignment(unsigned ValNo, JoinVals &Other);
+
+ /// Assuming ValNo is going to clobber some valid lanes in Other.LR, compute
+ /// the extent of the tainted lanes in the block.
+ ///
+ /// Multiple values in Other.LR can be affected since partial redefinitions
+ /// can preserve previously tainted lanes.
+ ///
+ /// 1 %dst = VLOAD <-- Define all lanes in %dst
+ /// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
+ /// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
+ /// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+ ///
+ /// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+ /// entry to TaintedVals.
+ ///
+ /// Returns false if the tainted lanes extend beyond the basic block.
+ bool taintExtent(unsigned, LaneBitmask, JoinVals&,
+ SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> >&);
+
+ /// Return true if MI uses any of the given Lanes from Reg.
+ /// This does not include partial redefinitions of Reg.
+ bool usesLanes(const MachineInstr &MI, unsigned, unsigned, LaneBitmask) const;
+
+ /// Determine if ValNo is a copy of a value number in LR or Other.LR that will
+ /// be pruned:
+ ///
+ /// %dst = COPY %src
+ /// %src = COPY %dst <-- This value to be pruned.
+ /// %dst = COPY %src <-- This value is a copy of a pruned value.
+ bool isPrunedValue(unsigned ValNo, JoinVals &Other);
+
+public:
+ JoinVals(LiveRange &LR, unsigned Reg, unsigned SubIdx, LaneBitmask LaneMask,
+ SmallVectorImpl<VNInfo*> &newVNInfo, const CoalescerPair &cp,
+ LiveIntervals *lis, const TargetRegisterInfo *TRI, bool SubRangeJoin,
+ bool TrackSubRegLiveness)
+ : LR(LR), Reg(Reg), SubIdx(SubIdx), LaneMask(LaneMask),
+ SubRangeJoin(SubRangeJoin), TrackSubRegLiveness(TrackSubRegLiveness),
+ NewVNInfo(newVNInfo), CP(cp), LIS(lis), Indexes(LIS->getSlotIndexes()),
+ TRI(TRI), Assignments(LR.getNumValNums(), -1), Vals(LR.getNumValNums())
+ {}
+
+ /// Analyze defs in LR and compute a value mapping in NewVNInfo.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool mapValues(JoinVals &Other);
+
+ /// Try to resolve conflicts that require all values to be mapped.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool resolveConflicts(JoinVals &Other);
+
+ /// Prune the live range of values in Other.LR where they would conflict with
+ /// CR_Replace values in LR. Collect end points for restoring the live range
+ /// after joining.
+ void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints,
+ bool changeInstrs);
+
+ /// Removes subranges starting at copies that get removed. This sometimes
+ /// happens when undefined subranges are copied around. These ranges contain
+ /// no useful information and can be removed.
+ void pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask);
+
+ /// Erase any machine instructions that have been coalesced away.
+ /// Add erased instructions to ErasedInstrs.
+ /// Add foreign virtual registers to ShrinkRegs if their live range ended at
+ /// the erased instrs.
+ void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs);
+
+ /// Remove liverange defs at places where implicit defs will be removed.
+ void removeImplicitDefs();
+
+ /// Get the value assignments suitable for passing to LiveInterval::join.
+ const int *getAssignments() const { return Assignments.data(); }
+};
+} // end anonymous namespace
+
+LaneBitmask JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef)
+ const {
+ LaneBitmask L = 0;
+ for (const MachineOperand &MO : DefMI->operands()) {
+ if (!MO.isReg() || MO.getReg() != Reg || !MO.isDef())
+ continue;
+ L |= TRI->getSubRegIndexLaneMask(
+ TRI->composeSubRegIndices(SubIdx, MO.getSubReg()));
+ if (MO.readsReg())
+ Redef = true;
+ }
+ return L;
+}
+
+std::pair<const VNInfo*, unsigned> JoinVals::followCopyChain(
+ const VNInfo *VNI) const {
+ unsigned Reg = this->Reg;
+
+ while (!VNI->isPHIDef()) {
+ SlotIndex Def = VNI->def;
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No defining instruction");
+ if (!MI->isFullCopy())
+ return std::make_pair(VNI, Reg);
+ unsigned SrcReg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg))
+ return std::make_pair(VNI, Reg);
+
+ const LiveInterval &LI = LIS->getInterval(SrcReg);
+ const VNInfo *ValueIn;
+ // No subrange involved.
+ if (!SubRangeJoin || !LI.hasSubRanges()) {
+ LiveQueryResult LRQ = LI.Query(Def);
+ ValueIn = LRQ.valueIn();
+ } else {
+ // Query subranges. Pick the first matching one.
+ ValueIn = nullptr;
+ for (const LiveInterval::SubRange &S : LI.subranges()) {
+ // Transform lanemask to a mask in the joined live interval.
+ LaneBitmask SMask = TRI->composeSubRegIndexLaneMask(SubIdx, S.LaneMask);
+ if ((SMask & LaneMask) == 0)
+ continue;
+ LiveQueryResult LRQ = S.Query(Def);
+ ValueIn = LRQ.valueIn();
+ break;
+ }
+ }
+ if (ValueIn == nullptr)
+ break;
+ VNI = ValueIn;
+ Reg = SrcReg;
+ }
+ return std::make_pair(VNI, Reg);
+}
+
+bool JoinVals::valuesIdentical(VNInfo *Value0, VNInfo *Value1,
+ const JoinVals &Other) const {
+ const VNInfo *Orig0;
+ unsigned Reg0;
+ std::tie(Orig0, Reg0) = followCopyChain(Value0);
+ if (Orig0 == Value1)
+ return true;
+
+ const VNInfo *Orig1;
+ unsigned Reg1;
+ std::tie(Orig1, Reg1) = Other.followCopyChain(Value1);
+
+ // The values are equal if they are defined at the same place and use the
+ // same register. Note that we cannot compare VNInfos directly as some of
+ // them might be from a copy created in mergeSubRangeInto() while the other
+ // is from the original LiveInterval.
+ return Orig0->def == Orig1->def && Reg0 == Reg1;
+}
+
+JoinVals::ConflictResolution
+JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ assert(!V.isAnalyzed() && "Value has already been analyzed!");
+ VNInfo *VNI = LR.getValNumInfo(ValNo);
+ if (VNI->isUnused()) {
+ V.WriteLanes = ~0u;
+ return CR_Keep;
+ }
+
+ // Get the instruction defining this value, compute the lanes written.
+ const MachineInstr *DefMI = nullptr;
+ if (VNI->isPHIDef()) {
+ // Conservatively assume that all lanes in a PHI are valid.
+ LaneBitmask Lanes = SubRangeJoin ? 1 : TRI->getSubRegIndexLaneMask(SubIdx);
+ V.ValidLanes = V.WriteLanes = Lanes;
+ } else {
+ DefMI = Indexes->getInstructionFromIndex(VNI->def);
+ assert(DefMI != nullptr);
+ if (SubRangeJoin) {
+ // We don't care about the lanes when joining subregister ranges.
+ V.WriteLanes = V.ValidLanes = 1;
+ if (DefMI->isImplicitDef()) {
+ V.ValidLanes = 0;
+ V.ErasableImplicitDef = true;
+ }
+ } else {
+ bool Redef = false;
+ V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+ // If this is a read-modify-write instruction, there may be more valid
+ // lanes than the ones written by this instruction.
+ // This only covers partial redef operands. DefMI may have normal use
+ // operands reading the register. They don't contribute valid lanes.
+ //
+ // This adds ssub1 to the set of valid lanes in %src:
+ //
+ // %src:ssub1<def> = FOO
+ //
+ // This leaves only ssub1 valid, making any other lanes undef:
+ //
+ // %src:ssub1<def,read-undef> = FOO %src:ssub2
+ //
+ // The <read-undef> flag on the def operand means that old lane values are
+ // not important.
+ if (Redef) {
+ V.RedefVNI = LR.Query(VNI->def).valueIn();
+ assert((TrackSubRegLiveness || V.RedefVNI) &&
+ "Instruction is reading nonexistent value");
+ if (V.RedefVNI != nullptr) {
+ computeAssignment(V.RedefVNI->id, Other);
+ V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+ }
+ }
+
+ // An IMPLICIT_DEF writes undef values.
+ if (DefMI->isImplicitDef()) {
+ // We normally expect IMPLICIT_DEF values to be live only until the end
+ // of their block. If the value is really live longer and gets pruned in
+ // another block, this flag is cleared again.
+ V.ErasableImplicitDef = true;
+ V.ValidLanes &= ~V.WriteLanes;
+ }
+ }
+ }
+
+ // Find the value in Other that overlaps VNI->def, if any.
+ LiveQueryResult OtherLRQ = Other.LR.Query(VNI->def);
+
+ // It is possible that both values are defined by the same instruction, or
+ // the values are PHIs defined in the same block. When that happens, the two
+ // values should be merged into one, but not into any preceding value.
+ // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
+ if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
+ assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
+
+ // One value stays, the other is merged. Keep the earlier one, or the first
+ // one we see.
+ if (OtherVNI->def < VNI->def)
+ Other.computeAssignment(OtherVNI->id, *this);
+ else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
+ // This is an early-clobber def overlapping a live-in value in the other
+ // register. Not mergeable.
+ V.OtherVNI = OtherLRQ.valueIn();
+ return CR_Impossible;
+ }
+ V.OtherVNI = OtherVNI;
+ Val &OtherV = Other.Vals[OtherVNI->id];
+ // Keep this value, check for conflicts when analyzing OtherVNI.
+ if (!OtherV.isAnalyzed())
+ return CR_Keep;
+ // Both sides have been analyzed now.
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Merge;
+ if (V.ValidLanes & OtherV.ValidLanes)
+ // Overlapping lanes can't be resolved.
+ return CR_Impossible;
+ else
+ return CR_Merge;
+ }
+
+ // No simultaneous def. Is Other live at the def?
+ V.OtherVNI = OtherLRQ.valueIn();
+ if (!V.OtherVNI)
+ // No overlap, no conflict.
+ return CR_Keep;
+
+ assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
+
+ // We have overlapping values, or possibly a kill of Other.
+ // Recursively compute assignments up the dominator tree.
+ Other.computeAssignment(V.OtherVNI->id, *this);
+ Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // Check if OtherV is an IMPLICIT_DEF that extends beyond its basic block.
+ // This shouldn't normally happen, but ProcessImplicitDefs can leave such
+ // IMPLICIT_DEF instructions behind, and there is nothing wrong with it
+ // technically.
+ //
+ // When it happens, treat that IMPLICIT_DEF as a normal value, and don't try
+ // to erase the IMPLICIT_DEF instruction.
+ if (OtherV.ErasableImplicitDef && DefMI &&
+ DefMI->getParent() != Indexes->getMBBFromIndex(V.OtherVNI->def)) {
+ DEBUG(dbgs() << "IMPLICIT_DEF defined at " << V.OtherVNI->def
+ << " extends into BB#" << DefMI->getParent()->getNumber()
+ << ", keeping it.\n");
+ OtherV.ErasableImplicitDef = false;
+ }
+
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Replace;
+
+ // Check for simple erasable conflicts.
+ if (DefMI->isImplicitDef()) {
+ // We need the def for the subregister if there is nothing else live in the
+ // subrange at this point.
+ if (TrackSubRegLiveness
+ && (V.WriteLanes & (OtherV.ValidLanes | OtherV.WriteLanes)) == 0)
+ return CR_Replace;
+ return CR_Erase;
+ }
+
+ // Include the non-conflict where DefMI is a coalescable copy that kills
+ // OtherVNI. We still want the copy erased and value numbers merged.
+ if (CP.isCoalescable(DefMI)) {
+ // Some of the lanes copied from OtherVNI may be undef, making them undef
+ // here too.
+ V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
+ return CR_Erase;
+ }
+
+ // This may not be a real conflict if DefMI simply kills Other and defines
+ // VNI.
+ if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
+ return CR_Keep;
+
+ // Handle the case where VNI and OtherVNI can be proven to be identical:
+ //
+ // %other = COPY %ext
+ // %this = COPY %ext <-- Erase this copy
+ //
+ if (DefMI->isFullCopy() && !CP.isPartial()
+ && valuesIdentical(VNI, V.OtherVNI, Other))
+ return CR_Erase;
+
+ // If the lanes written by this instruction were all undef in OtherVNI, it is
+ // still safe to join the live ranges. This can't be done with a simple value
+ // mapping, though - OtherVNI will map to multiple values:
+ //
+ // 1 %dst:ssub0 = FOO <-- OtherVNI
+ // 2 %src = BAR <-- VNI
+ // 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy.
+ // 4 BAZ %dst<kill>
+ // 5 QUUX %src<kill>
+ //
+ // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
+ // handles this complex value mapping.
+ if ((V.WriteLanes & OtherV.ValidLanes) == 0)
+ return CR_Replace;
+
+ // If the other live range is killed by DefMI and the live ranges are still
+ // overlapping, it must be because we're looking at an early clobber def:
+ //
+ // %dst<def,early-clobber> = ASM %src<kill>
+ //
+ // In this case, it is illegal to merge the two live ranges since the early
+ // clobber def would clobber %src before it was read.
+ if (OtherLRQ.isKill()) {
+ // This case where the def doesn't overlap the kill is handled above.
+ assert(VNI->def.isEarlyClobber() &&
+ "Only early clobber defs can overlap a kill");
+ return CR_Impossible;
+ }
+
+ // VNI is clobbering live lanes in OtherVNI, but there is still the
+ // possibility that no instructions actually read the clobbered lanes.
+ // If we're clobbering all the lanes in OtherVNI, at least one must be read.
+ // Otherwise Other.LR wouldn't be live here.
+ if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
+ return CR_Impossible;
+
+ // We need to verify that no instructions are reading the clobbered lanes. To
+ // save compile time, we'll only check that locally. Don't allow the tainted
+ // value to escape the basic block.
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
+ return CR_Impossible;
+
+ // There are still some things that could go wrong besides clobbered lanes
+ // being read, for example OtherVNI may be only partially redefined in MBB,
+ // and some clobbered lanes could escape the block. Save this analysis for
+ // resolveConflicts() when all values have been mapped. We need to know
+ // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
+ // that now - the recursive analyzeValue() calls must go upwards in the
+ // dominator tree.
+ return CR_Unresolved;
+}
+
+void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.isAnalyzed()) {
+ // Recursion should always move up the dominator tree, so ValNo is not
+ // supposed to reappear before it has been assigned.
+ assert(Assignments[ValNo] != -1 && "Bad recursion?");
+ return;
+ }
+ switch ((V.Resolution = analyzeValue(ValNo, Other))) {
+ case CR_Erase:
+ case CR_Merge:
+ // Merge this ValNo into OtherVNI.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
+ assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
+ Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
+ DEBUG(dbgs() << "\t\tmerge " << PrintReg(Reg) << ':' << ValNo << '@'
+ << LR.getValNumInfo(ValNo)->def << " into "
+ << PrintReg(Other.Reg) << ':' << V.OtherVNI->id << '@'
+ << V.OtherVNI->def << " --> @"
+ << NewVNInfo[Assignments[ValNo]]->def << '\n');
+ break;
+ case CR_Replace:
+ case CR_Unresolved: {
+ // The other value is going to be pruned if this join is successful.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
+ Val &OtherV = Other.Vals[V.OtherVNI->id];
+ // We cannot erase an IMPLICIT_DEF if we don't have valid values for all
+ // its lanes.
+ if ((OtherV.WriteLanes & ~V.ValidLanes) != 0 && TrackSubRegLiveness)
+ OtherV.ErasableImplicitDef = false;
+ OtherV.Pruned = true;
+ }
+ // Fall through.
+ default:
+ // This value number needs to go in the final joined live range.
+ Assignments[ValNo] = NewVNInfo.size();
+ NewVNInfo.push_back(LR.getValNumInfo(ValNo));
+ break;
+ }
+}
+
+bool JoinVals::mapValues(JoinVals &Other) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ computeAssignment(i, Other);
+ if (Vals[i].Resolution == CR_Impossible) {
+ DEBUG(dbgs() << "\t\tinterference at " << PrintReg(Reg) << ':' << i
+ << '@' << LR.getValNumInfo(i)->def << '\n');
+ return false;
+ }
+ }
+ return true;
+}
+
+bool JoinVals::
+taintExtent(unsigned ValNo, LaneBitmask TaintedLanes, JoinVals &Other,
+ SmallVectorImpl<std::pair<SlotIndex, LaneBitmask> > &TaintExtent) {
+ VNInfo *VNI = LR.getValNumInfo(ValNo);
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
+
+ // Scan Other.LR from VNI.def to MBBEnd.
+ LiveInterval::iterator OtherI = Other.LR.find(VNI->def);
+ assert(OtherI != Other.LR.end() && "No conflict?");
+ do {
+ // OtherI is pointing to a tainted value. Abort the join if the tainted
+ // lanes escape the block.
+ SlotIndex End = OtherI->end;
+ if (End >= MBBEnd) {
+ DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << '\n');
+ return false;
+ }
+ DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.Reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start
+ << " to " << End << '\n');
+ // A dead def is not a problem.
+ if (End.isDead())
+ break;
+ TaintExtent.push_back(std::make_pair(End, TaintedLanes));
+
+ // Check for another def in the MBB.
+ if (++OtherI == Other.LR.end() || OtherI->start >= MBBEnd)
+ break;
+
+ // Lanes written by the new def are no longer tainted.
+ const Val &OV = Other.Vals[OtherI->valno->id];
+ TaintedLanes &= ~OV.WriteLanes;
+ if (!OV.RedefVNI)
+ break;
+ } while (TaintedLanes);
+ return true;
+}
+
+bool JoinVals::usesLanes(const MachineInstr &MI, unsigned Reg, unsigned SubIdx,
+ LaneBitmask Lanes) const {
+ if (MI.isDebugValue())
+ return false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg() || MO.isDef() || MO.getReg() != Reg)
+ continue;
+ if (!MO.readsReg())
+ continue;
+ if (Lanes & TRI->getSubRegIndexLaneMask(
+ TRI->composeSubRegIndices(SubIdx, MO.getSubReg())))
+ return true;
+ }
+ return false;
+}
+
+bool JoinVals::resolveConflicts(JoinVals &Other) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
+ if (V.Resolution != CR_Unresolved)
+ continue;
+ DEBUG(dbgs() << "\t\tconflict at " << PrintReg(Reg) << ':' << i
+ << '@' << LR.getValNumInfo(i)->def << '\n');
+ if (SubRangeJoin)
+ return false;
+
+ ++NumLaneConflicts;
+ assert(V.OtherVNI && "Inconsistent conflict resolution.");
+ VNInfo *VNI = LR.getValNumInfo(i);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
+ // join, those lanes will be tainted with a wrong value. Get the extent of
+ // the tainted lanes.
+ LaneBitmask TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
+ SmallVector<std::pair<SlotIndex, LaneBitmask>, 8> TaintExtent;
+ if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
+ // Tainted lanes would extend beyond the basic block.
+ return false;
+
+ assert(!TaintExtent.empty() && "There should be at least one conflict.");
+
+ // Now look at the instructions from VNI->def to TaintExtent (inclusive).
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ MachineBasicBlock::iterator MI = MBB->begin();
+ if (!VNI->isPHIDef()) {
+ MI = Indexes->getInstructionFromIndex(VNI->def);
+ // No need to check the instruction defining VNI for reads.
+ ++MI;
+ }
+ assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
+ "Interference ends on VNI->def. Should have been handled earlier");
+ MachineInstr *LastMI =
+ Indexes->getInstructionFromIndex(TaintExtent.front().first);
+ assert(LastMI && "Range must end at a proper instruction");
+ unsigned TaintNum = 0;
+ for(;;) {
+ assert(MI != MBB->end() && "Bad LastMI");
+ if (usesLanes(*MI, Other.Reg, Other.SubIdx, TaintedLanes)) {
+ DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
+ return false;
+ }
+ // LastMI is the last instruction to use the current value.
+ if (&*MI == LastMI) {
+ if (++TaintNum == TaintExtent.size())
+ break;
+ LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
+ assert(LastMI && "Range must end at a proper instruction");
+ TaintedLanes = TaintExtent[TaintNum].second;
+ }
+ ++MI;
+ }
+
+ // The tainted lanes are unused.
+ V.Resolution = CR_Replace;
+ ++NumLaneResolves;
+ }
+ return true;
+}
+
+bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.Pruned || V.PrunedComputed)
+ return V.Pruned;
+
+ if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
+ return V.Pruned;
+
+ // Follow copies up the dominator tree and check if any intermediate value
+ // has been pruned.
+ V.PrunedComputed = true;
+ V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
+ return V.Pruned;
+}
+
+void JoinVals::pruneValues(JoinVals &Other,
+ SmallVectorImpl<SlotIndex> &EndPoints,
+ bool changeInstrs) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ SlotIndex Def = LR.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep:
+ break;
+ case CR_Replace: {
+ // This value takes precedence over the value in Other.LR.
+ LIS->pruneValue(Other.LR, Def, &EndPoints);
+ // Check if we're replacing an IMPLICIT_DEF value. The IMPLICIT_DEF
+ // instructions are only inserted to provide a live-out value for PHI
+ // predecessors, so the instruction should simply go away once its value
+ // has been replaced.
+ Val &OtherV = Other.Vals[Vals[i].OtherVNI->id];
+ bool EraseImpDef = OtherV.ErasableImplicitDef &&
+ OtherV.Resolution == CR_Keep;
+ if (!Def.isBlock()) {
+ if (changeInstrs) {
+ // Remove <def,read-undef> flags. This def is now a partial redef.
+ // Also remove <def,dead> flags since the joined live range will
+ // continue past this instruction.
+ for (MachineOperand &MO :
+ Indexes->getInstructionFromIndex(Def)->operands()) {
+ if (MO.isReg() && MO.isDef() && MO.getReg() == Reg) {
+ MO.setIsUndef(EraseImpDef);
+ MO.setIsDead(false);
+ }
+ }
+ }
+ // This value will reach instructions below, but we need to make sure
+ // the live range also reaches the instruction at Def.
+ if (!EraseImpDef)
+ EndPoints.push_back(Def);
+ }
+ DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.Reg) << " at " << Def
+ << ": " << Other.LR << '\n');
+ break;
+ }
+ case CR_Erase:
+ case CR_Merge:
+ if (isPrunedValue(i, Other)) {
+ // This value is ultimately a copy of a pruned value in LR or Other.LR.
+ // We can no longer trust the value mapping computed by
+ // computeAssignment(), the value that was originally copied could have
+ // been replaced.
+ LIS->pruneValue(LR, Def, &EndPoints);
+ DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(Reg) << " at "
+ << Def << ": " << LR << '\n');
+ }
+ break;
+ case CR_Unresolved:
+ case CR_Impossible:
+ llvm_unreachable("Unresolved conflicts");
+ }
+ }
+}
+
+void JoinVals::pruneSubRegValues(LiveInterval &LI, LaneBitmask &ShrinkMask)
+{
+ // Look for values being erased.
+ bool DidPrune = false;
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ if (Vals[i].Resolution != CR_Erase)
+ continue;
+
+ // Check subranges at the point where the copy will be removed.
+ SlotIndex Def = LR.getValNumInfo(i)->def;
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveQueryResult Q = S.Query(Def);
+
+ // If a subrange starts at the copy then an undefined value has been
+ // copied and we must remove that subrange value as well.
+ VNInfo *ValueOut = Q.valueOutOrDead();
+ if (ValueOut != nullptr && Q.valueIn() == nullptr) {
+ DEBUG(dbgs() << "\t\tPrune sublane " << PrintLaneMask(S.LaneMask)
+ << " at " << Def << "\n");
+ LIS->pruneValue(S, Def, nullptr);
+ DidPrune = true;
+ // Mark value number as unused.
+ ValueOut->markUnused();
+ continue;
+ }
+ // If a subrange ends at the copy, then a value was copied but only
+ // partially used later. Shrink the subregister range appropriately.
+ if (Q.valueIn() != nullptr && Q.valueOut() == nullptr) {
+ DEBUG(dbgs() << "\t\tDead uses at sublane " << PrintLaneMask(S.LaneMask)
+ << " at " << Def << "\n");
+ ShrinkMask |= S.LaneMask;
+ }
+ }
+ }
+ if (DidPrune)
+ LI.removeEmptySubRanges();
+}
+
+void JoinVals::removeImplicitDefs() {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ if (V.Resolution != CR_Keep || !V.ErasableImplicitDef || !V.Pruned)
+ continue;
+
+ VNInfo *VNI = LR.getValNumInfo(i);
+ VNI->markUnused();
+ LR.removeValNo(VNI);
+ }
+}
+
+void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs) {
+ for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
+ // Get the def location before markUnused() below invalidates it.
+ SlotIndex Def = LR.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep: {
+ // If an IMPLICIT_DEF value is pruned, it doesn't serve a purpose any
+ // longer. The IMPLICIT_DEF instructions are only inserted by
+ // PHIElimination to guarantee that all PHI predecessors have a value.
+ if (!Vals[i].ErasableImplicitDef || !Vals[i].Pruned)
+ break;
+ // Remove value number i from LR.
+ VNInfo *VNI = LR.getValNumInfo(i);
+ LR.removeValNo(VNI);
+ // Note that this VNInfo is reused and still referenced in NewVNInfo, so
+ // make it appear like an unused value number.
+ VNI->markUnused();
+ DEBUG(dbgs() << "\t\tremoved " << i << '@' << Def << ": " << LR << '\n');
+ // FALL THROUGH.
+ }
+
+ case CR_Erase: {
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No instruction to erase");
+ if (MI->isCopy()) {
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+ ShrinkRegs.push_back(Reg);
+ }
+ ErasedInstrs.insert(MI);
+ DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ break;
+ }
+ default:
+ break;
+ }
+ }
+}
+
+void RegisterCoalescer::joinSubRegRanges(LiveRange &LRange, LiveRange &RRange,
+ LaneBitmask LaneMask,
+ const CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ JoinVals RHSVals(RRange, CP.getSrcReg(), CP.getSrcIdx(), LaneMask,
+ NewVNInfo, CP, LIS, TRI, true, true);
+ JoinVals LHSVals(LRange, CP.getDstReg(), CP.getDstIdx(), LaneMask,
+ NewVNInfo, CP, LIS, TRI, true, true);
+
+ // Compute NewVNInfo and resolve conflicts (see also joinVirtRegs())
+ // We should be able to resolve all conflicts here as we could successfully do
+ // it on the main range already. There is, however, a problem when multiple
+ // ranges get mapped to the "overflow" lane mask bit, which creates unexpected
+ // interferences.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals)) {
+ // We already determined that it is legal to merge the intervals, so this
+ // should never fail.
+ llvm_unreachable("*** Couldn't join subrange!\n");
+ }
+ if (!LHSVals.resolveConflicts(RHSVals) ||
+ !RHSVals.resolveConflicts(LHSVals)) {
+ // We already determined that it is legal to merge the intervals, so this
+ // should never fail.
+ llvm_unreachable("*** Couldn't join subrange!\n");
+ }
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints, false);
+ RHSVals.pruneValues(LHSVals, EndPoints, false);
+
+ LHSVals.removeImplicitDefs();
+ RHSVals.removeImplicitDefs();
+
+ LRange.verify();
+ RRange.verify();
+
+ // Join RRange into LHS.
+ LRange.join(RRange, LHSVals.getAssignments(), RHSVals.getAssignments(),
+ NewVNInfo);
+
+ DEBUG(dbgs() << "\t\tjoined lanes: " << LRange << "\n");
+ if (EndPoints.empty())
+ return;
+
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+ << " points: " << LRange << '\n');
+ LIS->extendToIndices(LRange, EndPoints);
+}
+
+void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
+ const LiveRange &ToMerge,
+ LaneBitmask LaneMask,
+ CoalescerPair &CP) {
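+ // Merge ToMerge (covering LaneMask) into LI's subranges: existing subranges
+ // that only partially overlap LaneMask are split so the common lanes can be
+ // joined, and lanes not covered by any existing subrange get a new subrange
+ // copied from ToMerge.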
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ for (LiveInterval::SubRange &R : LI.subranges()) {
+ LaneBitmask RMask = R.LaneMask;
+ // LaneMask of subregisters common to subrange R and ToMerge.
+ LaneBitmask Common = RMask & LaneMask;
+ // There is nothing to do without common subregs.
+ if (Common == 0)
+ continue;
+
+ DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
+ << PrintLaneMask(Common) << '\n');
+ // LaneMask of subregisters contained in the R range but not in ToMerge,
+ // they have to split into their own subrange.
+ LaneBitmask LRest = RMask & ~LaneMask;
+ LiveInterval::SubRange *CommonRange;
+ if (LRest != 0) {
+ R.LaneMask = LRest;
+ DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
+ // Duplicate SubRange for newly merged common stuff.
+ CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+ } else {
+ // Reuse the existing range.
+ R.LaneMask = Common;
+ CommonRange = &R;
+ }
+ LiveRange RangeCopy(ToMerge, Allocator);
+ joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
+ LaneMask &= ~RMask;
+ }
+
+ if (LaneMask != 0) {
+ DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
+ LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
+ }
+}
+
+bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
+ bool TrackSubRegLiveness = MRI->shouldTrackSubRegLiveness(*CP.getNewRC());
+ JoinVals RHSVals(RHS, CP.getSrcReg(), CP.getSrcIdx(), 0, NewVNInfo, CP, LIS,
+ TRI, false, TrackSubRegLiveness);
+ JoinVals LHSVals(LHS, CP.getDstReg(), CP.getDstIdx(), 0, NewVNInfo, CP, LIS,
+ TRI, false, TrackSubRegLiveness);
+
+ DEBUG(dbgs() << "\t\tRHS = " << RHS
+ << "\n\t\tLHS = " << LHS
+ << '\n');
+
+ // First compute NewVNInfo and the simple value mappings.
+ // Detect impossible conflicts early.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+ return false;
+
+ // Some conflicts can only be resolved after all values have been mapped.
+ if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+ return false;
+
+ // All clear, the live ranges can be merged.
+ if (RHS.hasSubRanges() || LHS.hasSubRanges()) {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+
+ // Transform lanemasks from the LHS to masks in the coalesced register and
+ // create initial subranges if necessary.
+ unsigned DstIdx = CP.getDstIdx();
+ if (!LHS.hasSubRanges()) {
+ LaneBitmask Mask = DstIdx == 0 ? CP.getNewRC()->getLaneMask()
+ : TRI->getSubRegIndexLaneMask(DstIdx);
+ // LHS must support subregs or we wouldn't be in this codepath.
+ assert(Mask != 0);
+ LHS.createSubRangeFrom(Allocator, Mask, LHS);
+ } else if (DstIdx != 0) {
+ // Transform LHS lanemasks to new register class if necessary.
+ for (LiveInterval::SubRange &R : LHS.subranges()) {
+ LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(DstIdx, R.LaneMask);
+ R.LaneMask = Mask;
+ }
+ }
+ DEBUG(dbgs() << "\t\tLHST = " << PrintReg(CP.getDstReg())
+ << ' ' << LHS << '\n');
+
+ // Determine lanemasks of RHS in the coalesced register and merge subranges.
+ unsigned SrcIdx = CP.getSrcIdx();
+ if (!RHS.hasSubRanges()) {
+ LaneBitmask Mask = SrcIdx == 0 ? CP.getNewRC()->getLaneMask()
+ : TRI->getSubRegIndexLaneMask(SrcIdx);
+ mergeSubRangeInto(LHS, RHS, Mask, CP);
+ } else {
+ // Pair up subranges and merge.
+ for (LiveInterval::SubRange &R : RHS.subranges()) {
+ LaneBitmask Mask = TRI->composeSubRegIndexLaneMask(SrcIdx, R.LaneMask);
+ mergeSubRangeInto(LHS, R, Mask, CP);
+ }
+ }
+ DEBUG(dbgs() << "\tJoined SubRanges " << LHS << "\n");
+
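+ // Drop the subrange values that did not survive the join; lanes whose
+ // ranges may need shrinking afterwards are accumulated in ShrinkMask.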
+ LHSVals.pruneSubRegValues(LHS, ShrinkMask);
+ RHSVals.pruneSubRegValues(LHS, ShrinkMask);
+ }
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints, true);
+ RHSVals.pruneValues(LHSVals, EndPoints, true);
+
+ // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
+ // registers to require trimming.
+ SmallVector<unsigned, 8> ShrinkRegs;
+ LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ while (!ShrinkRegs.empty())
+ shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
+
+ // Join RHS into LHS.
+ LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo);
+
+ // Kill flags are going to be wrong if the live ranges were overlapping.
+ // Eventually, we should simply clear all kill flags when computing live
+ // ranges. They are reinserted after register allocation.
+ MRI->clearKillFlags(LHS.reg);
+ MRI->clearKillFlags(RHS.reg);
+
+ if (!EndPoints.empty()) {
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+ << " points: " << LHS << '\n');
+ LIS->extendToIndices((LiveRange&)LHS, EndPoints);
+ }
+
+ return true;
+}
+
+bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
+ return CP.isPhys() ? joinReservedPhysReg(CP) : joinVirtRegs(CP);
+}
+
+namespace {
+/// Information concerning MBB coalescing priority.
+struct MBBPriorityInfo {
+ MachineBasicBlock *MBB;
+ unsigned Depth;
+ bool IsSplit;
+
+ MBBPriorityInfo(MachineBasicBlock *mbb, unsigned depth, bool issplit)
+ : MBB(mbb), Depth(depth), IsSplit(issplit) {}
+};
+}
+
+/// C-style comparator that sorts first based on the loop depth of the basic
+/// block (the unsigned), and then on the MBB number.
+///
+/// EnableGlobalCopies assumes that the primary sort key is loop depth.
+static int compareMBBPriority(const MBBPriorityInfo *LHS,
+ const MBBPriorityInfo *RHS) {
+ // Deeper loops first
+ if (LHS->Depth != RHS->Depth)
+ return LHS->Depth > RHS->Depth ? -1 : 1;
+
+ // Try to unsplit critical edges next.
+ if (LHS->IsSplit != RHS->IsSplit)
+ return LHS->IsSplit ? -1 : 1;
+
+ // Prefer blocks that are more connected in the CFG. This takes care of
+ // the most difficult copies first while intervals are short.
+ unsigned cl = LHS->MBB->pred_size() + LHS->MBB->succ_size();
+ unsigned cr = RHS->MBB->pred_size() + RHS->MBB->succ_size();
+ if (cl != cr)
+ return cl > cr ? -1 : 1;
+
+ // As a last resort, sort by block number.
+ return LHS->MBB->getNumber() < RHS->MBB->getNumber() ? -1 : 1;
+}
+
+/// \returns true if the given copy uses or defines a local live range.
+static bool isLocalCopy(MachineInstr *Copy, const LiveIntervals *LIS) {
+ if (!Copy->isCopy())
+ return false;
+
+ if (Copy->getOperand(1).isUndef())
+ return false;
+
+ unsigned SrcReg = Copy->getOperand(1).getReg();
+ unsigned DstReg = Copy->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(SrcReg)
+ || TargetRegisterInfo::isPhysicalRegister(DstReg))
+ return false;
+
+ return LIS->intervalIsInOneMBB(LIS->getInterval(SrcReg))
+ || LIS->intervalIsInOneMBB(LIS->getInterval(DstReg));
+}
+
+bool RegisterCoalescer::
+copyCoalesceWorkList(MutableArrayRef<MachineInstr*> CurrList) {
+ bool Progress = false;
+ for (unsigned i = 0, e = CurrList.size(); i != e; ++i) {
+ if (!CurrList[i])
+ continue;
+ // Skip instruction pointers that have already been erased, for example by
+ // dead code elimination.
+ if (ErasedInstrs.erase(CurrList[i])) {
+ CurrList[i] = nullptr;
+ continue;
+ }
+ bool Again = false;
+ bool Success = joinCopy(CurrList[i], Again);
+ Progress |= Success;
+ if (Success || !Again)
+ CurrList[i] = nullptr;
+ }
+ return Progress;
+}
+
+/// Check if DstReg is a terminal node.
+/// I.e., it does not have any affinity other than \p Copy.
+static bool isTerminalReg(unsigned DstReg, const MachineInstr &Copy,
+ const MachineRegisterInfo *MRI) {
+ assert(Copy.isCopyLike());
+ // Check if the destination of this copy has any other affinity.
+ for (const MachineInstr &MI : MRI->reg_nodbg_instructions(DstReg))
+ if (&MI != &Copy && MI.isCopyLike())
+ return false;
+ return true;
+}
+
+bool RegisterCoalescer::applyTerminalRule(const MachineInstr &Copy) const {
+ assert(Copy.isCopyLike());
+ if (!UseTerminalRule)
+ return false;
+ unsigned DstReg, DstSubReg, SrcReg, SrcSubReg;
+ isMoveInstr(*TRI, &Copy, SrcReg, DstReg, SrcSubReg, DstSubReg);
+ // Check if the destination of this copy has any other affinity.
+ if (TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ // If SrcReg is a physical register, the copy won't be coalesced.
+ // Ignoring it may have other side effect (like missing
+ // rematerialization). So keep it.
+ TargetRegisterInfo::isPhysicalRegister(SrcReg) ||
+ !isTerminalReg(DstReg, Copy, MRI))
+ return false;
+
+ // DstReg is a terminal node. Check if it interferes with any other
+ // copy involving SrcReg.
+ const MachineBasicBlock *OrigBB = Copy.getParent();
+ const LiveInterval &DstLI = LIS->getInterval(DstReg);
+ for (const MachineInstr &MI : MRI->reg_nodbg_instructions(SrcReg)) {
+ // Technically we should check if the weight of the new copy is
+ // interesting compared to the other one and update the weight
+ // of the copies accordingly. However, this would only work if
+ // we would gather all the copies first then coalesce, whereas
+ // right now we interleave both actions.
+ // For now, just consider the copies that are in the same block.
+ if (&MI == &Copy || !MI.isCopyLike() || MI.getParent() != OrigBB)
+ continue;
+ unsigned OtherReg, OtherSubReg, OtherSrcReg, OtherSrcSubReg;
+ isMoveInstr(*TRI, &Copy, OtherSrcReg, OtherReg, OtherSrcSubReg,
+ OtherSubReg);
+ if (OtherReg == SrcReg)
+ OtherReg = OtherSrcReg;
+ // Check if OtherReg is a non-terminal.
+ if (TargetRegisterInfo::isPhysicalRegister(OtherReg) ||
+ isTerminalReg(OtherReg, MI, MRI))
+ continue;
+ // Check that OtherReg interferes with DstReg.
+ if (LIS->getInterval(OtherReg).overlaps(DstLI)) {
+ DEBUG(dbgs() << "Apply terminal rule for: " << PrintReg(DstReg) << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+void
+RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << MBB->getName() << ":\n");
+
+ // Collect all copy-like instructions in MBB. Don't start coalescing anything
+ // yet; it might invalidate the iterator.
+ const unsigned PrevSize = WorkList.size();
+ if (JoinGlobalCopies) {
+ SmallVector<MachineInstr*, 2> LocalTerminals;
+ SmallVector<MachineInstr*, 2> GlobalTerminals;
+ // Coalesce copies bottom-up to coalesce local defs before local uses. They
+ // are not inherently easier to resolve, but slightly preferable until we
+ // have local live range splitting. In particular this is required by
+ // cmp+jmp macro fusion.
+ for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end();
+ MII != E; ++MII) {
+ if (!MII->isCopyLike())
+ continue;
+ bool ApplyTerminalRule = applyTerminalRule(*MII);
+ if (isLocalCopy(&(*MII), LIS)) {
+ if (ApplyTerminalRule)
+ LocalTerminals.push_back(&(*MII));
+ else
+ LocalWorkList.push_back(&(*MII));
+ } else {
+ if (ApplyTerminalRule)
+ GlobalTerminals.push_back(&(*MII));
+ else
+ WorkList.push_back(&(*MII));
+ }
+ }
+ // Append the copies evicted by the terminal rule at the end of the list.
+ LocalWorkList.append(LocalTerminals.begin(), LocalTerminals.end());
+ WorkList.append(GlobalTerminals.begin(), GlobalTerminals.end());
+ }
+ else {
+ SmallVector<MachineInstr*, 2> Terminals;
+ for (MachineInstr &MII : *MBB)
+ if (MII.isCopyLike()) {
+ if (applyTerminalRule(MII))
+ Terminals.push_back(&MII);
+ else
+ WorkList.push_back(&MII);
+ }
+ // Append the copies evicted by the terminal rule at the end of the list.
+ WorkList.append(Terminals.begin(), Terminals.end());
+ }
+ // Try coalescing the collected copies immediately, and remove the nulls.
+ // This prevents the WorkList from getting too large since most copies are
+ // joinable on the first attempt.
+ MutableArrayRef<MachineInstr*>
+ CurrList(WorkList.begin() + PrevSize, WorkList.end());
+ if (copyCoalesceWorkList(CurrList))
+ WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(),
+ (MachineInstr*)nullptr), WorkList.end());
+}
+
+void RegisterCoalescer::coalesceLocals() {
+ copyCoalesceWorkList(LocalWorkList);
+ for (unsigned j = 0, je = LocalWorkList.size(); j != je; ++j) {
+ if (LocalWorkList[j])
+ WorkList.push_back(LocalWorkList[j]);
+ }
+ LocalWorkList.clear();
+}
+
+void RegisterCoalescer::joinAllIntervals() {
+ DEBUG(dbgs() << "********** JOINING INTERVALS ***********\n");
+ assert(WorkList.empty() && LocalWorkList.empty() && "Old data still around.");
+
+ std::vector<MBBPriorityInfo> MBBs;
+ MBBs.reserve(MF->size());
+ for (MachineFunction::iterator I = MF->begin(), E = MF->end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ MBBs.push_back(MBBPriorityInfo(MBB, Loops->getLoopDepth(MBB),
+ JoinSplitEdges && isSplitEdge(MBB)));
+ }
+ array_pod_sort(MBBs.begin(), MBBs.end(), compareMBBPriority);
+
+ // Coalesce intervals in MBB priority order.
+ unsigned CurrDepth = UINT_MAX;
+ for (unsigned i = 0, e = MBBs.size(); i != e; ++i) {
+ // Try coalescing the collected local copies for deeper loops.
+ if (JoinGlobalCopies && MBBs[i].Depth < CurrDepth) {
+ coalesceLocals();
+ CurrDepth = MBBs[i].Depth;
+ }
+ copyCoalesceInMBB(MBBs[i].MBB);
+ }
+ coalesceLocals();
+
+ // Joining intervals can allow other intervals to be joined. Iteratively join
+ // until we make no progress.
+ while (copyCoalesceWorkList(WorkList))
+ /* empty */ ;
+}
+
+void RegisterCoalescer::releaseMemory() {
+ ErasedInstrs.clear();
+ WorkList.clear();
+ DeadDefs.clear();
+ InflateRegs.clear();
+}
+
+bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ MRI = &fn.getRegInfo();
+ TM = &fn.getTarget();
+ const TargetSubtargetInfo &STI = fn.getSubtarget();
+ TRI = STI.getRegisterInfo();
+ TII = STI.getInstrInfo();
+ LIS = &getAnalysis<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ Loops = &getAnalysis<MachineLoopInfo>();
+ if (EnableGlobalCopies == cl::BOU_UNSET)
+ JoinGlobalCopies = STI.enableJoinGlobalCopies();
+ else
+ JoinGlobalCopies = (EnableGlobalCopies == cl::BOU_TRUE);
+
+ // The MachineScheduler does not currently require JoinSplitEdges. This will
+ // either be enabled unconditionally or replaced by a more general live range
+ // splitting optimization.
+ JoinSplitEdges = EnableJoinSplits;
+
+ DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
+ << "********** Function: " << MF->getName() << '\n');
+
+ if (VerifyCoalescing)
+ MF->verify(this, "Before register coalescing");
+
+ RegClassInfo.runOnMachineFunction(fn);
+
+ // Join (coalesce) intervals if requested.
+ if (EnableJoining)
+ joinAllIntervals();
+
+ // After deleting a lot of copies, register classes may be less constrained.
+ // Removing sub-register operands may allow GR32_ABCD -> GR32 and DPR_VFP2 ->
+ // DPR inflation.
+ array_pod_sort(InflateRegs.begin(), InflateRegs.end());
+ InflateRegs.erase(std::unique(InflateRegs.begin(), InflateRegs.end()),
+ InflateRegs.end());
+ DEBUG(dbgs() << "Trying to inflate " << InflateRegs.size() << " regs.\n");
+ for (unsigned i = 0, e = InflateRegs.size(); i != e; ++i) {
+ unsigned Reg = InflateRegs[i];
+ if (MRI->reg_nodbg_empty(Reg))
+ continue;
+ if (MRI->recomputeRegClass(Reg)) {
+ DEBUG(dbgs() << PrintReg(Reg) << " inflated to "
+ << TRI->getRegClassName(MRI->getRegClass(Reg)) << '\n');
+ ++NumInflated;
+
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (LI.hasSubRanges()) {
+ // If the inflated register class does not support subregisters anymore,
+ // remove the subranges.
+ if (!MRI->shouldTrackSubRegLiveness(Reg)) {
+ LI.clearSubRanges();
+ } else {
+#ifndef NDEBUG
+ LaneBitmask MaxMask = MRI->getMaxLaneMaskForVReg(Reg);
+ // If subranges are still supported, then the same subregs
+ // should still be supported.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ assert((S.LaneMask & ~MaxMask) == 0);
+ }
+#endif
+ }
+ }
+ }
+ }
+
+ DEBUG(dump());
+ if (VerifyCoalescing)
+ MF->verify(this, "After register coalescing");
+ return true;
+}
+
+void RegisterCoalescer::print(raw_ostream &O, const Module* m) const {
+ LIS->print(O, m);
+}
diff --git a/contrib/llvm/lib/CodeGen/RegisterCoalescer.h b/contrib/llvm/lib/CodeGen/RegisterCoalescer.h
new file mode 100644
index 000000000000..04067a1427af
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterCoalescer.h
@@ -0,0 +1,116 @@
+//===-- RegisterCoalescer.h - Register Coalescing Interface -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the abstract interface for register coalescers,
+// allowing them to interact with and query register allocators.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
+#define LLVM_LIB_CODEGEN_REGISTERCOALESCER_H
+
+namespace llvm {
+
+ class MachineInstr;
+ class TargetRegisterInfo;
+ class TargetRegisterClass;
+ class TargetInstrInfo;
+
+ /// A helper class for register coalescers. When deciding if
+ /// two registers can be coalesced, CoalescerPair can determine if a copy
+ /// instruction would become an identity copy after coalescing.
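+ ///
+ /// Typical use (a sketch following this pass's conventions): construct a
+ /// CoalescerPair from the TargetRegisterInfo, call setRegisters() on a
+ /// candidate copy instruction, then query getDstReg(), getSrcReg() and
+ /// getNewRC() to decide how to join the corresponding live intervals.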
+ class CoalescerPair {
+ const TargetRegisterInfo &TRI;
+
+ /// The register that will be left after coalescing. It can be a
+ /// virtual or physical register.
+ unsigned DstReg;
+
+ /// The virtual register that will be coalesced into DstReg.
+ unsigned SrcReg;
+
+ /// The sub-register index of the old DstReg in the new coalesced register.
+ unsigned DstIdx;
+
+ /// The sub-register index of the old SrcReg in the new coalesced register.
+ unsigned SrcIdx;
+
+ /// True when the original copy was a partial subregister copy.
+ bool Partial;
+
+ /// True when both regs are virtual and newRC is constrained.
+ bool CrossClass;
+
+ /// True when DstReg and SrcReg are reversed from the original
+ /// copy instruction.
+ bool Flipped;
+
+ /// The register class of the coalesced register, or NULL if DstReg
+ /// is a physreg. This register class may be a super-register of both
+ /// SrcReg and DstReg.
+ const TargetRegisterClass *NewRC;
+
+ public:
+ CoalescerPair(const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(0), SrcReg(0), DstIdx(0), SrcIdx(0),
+ Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {}
+
+ /// Create a CoalescerPair representing a virtreg-to-physreg copy.
+ /// No need to call setRegisters().
+ CoalescerPair(unsigned VirtReg, unsigned PhysReg,
+ const TargetRegisterInfo &tri)
+ : TRI(tri), DstReg(PhysReg), SrcReg(VirtReg), DstIdx(0), SrcIdx(0),
+ Partial(false), CrossClass(false), Flipped(false), NewRC(nullptr) {}
+
+ /// Set registers to match the copy instruction MI. Return
+ /// false if MI is not a coalescable copy instruction.
+ bool setRegisters(const MachineInstr*);
+
+ /// Swap SrcReg and DstReg. Return false if swapping is impossible
+ /// because DstReg is a physical register, or SubIdx is set.
+ bool flip();
+
+ /// Return true if MI is a copy instruction that will become
+ /// an identity copy after coalescing.
+ bool isCoalescable(const MachineInstr*) const;
+
+ /// Return true if DstReg is a physical register.
+ bool isPhys() const { return !NewRC; }
+
+ /// Return true if the original copy instruction did not copy
+ /// the full register, but was a subreg operation.
+ bool isPartial() const { return Partial; }
+
+ /// Return true if DstReg is virtual and NewRC is a smaller
+ /// register class than DstReg's.
+ bool isCrossClass() const { return CrossClass; }
+
+ /// Return true when getSrcReg is the register being defined by
+ /// the original copy instruction.
+ bool isFlipped() const { return Flipped; }
+
+ /// Return the register (virtual or physical) that will remain
+ /// after coalescing.
+ unsigned getDstReg() const { return DstReg; }
+
+ /// Return the virtual register that will be coalesced away.
+ unsigned getSrcReg() const { return SrcReg; }
+
+ /// Return the subregister index that DstReg will be coalesced into, or 0.
+ unsigned getDstIdx() const { return DstIdx; }
+
+ /// Return the subregister index that SrcReg will be coalesced into, or 0.
+ unsigned getSrcIdx() const { return SrcIdx; }
+
+ /// Return the register class of the coalesced register.
+ const TargetRegisterClass *getNewRC() const { return NewRC; }
+ };
+} // End llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/RegisterPressure.cpp b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
new file mode 100644
index 000000000000..a21d6c1d4d63
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -0,0 +1,1350 @@
+//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the RegisterPressure class which can be used to track
+// MachineInstr level register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+/// Increase pressure for each pressure set provided by TargetRegisterInfo.
+static void increaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ const MachineRegisterInfo &MRI, unsigned Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ assert((PrevMask & ~NewMask) == 0 && "Must not remove bits");
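+ // Pressure is tracked per register, not per lane: only act when the register
+ // goes from dead (PrevMask empty) to live (NewMask non-empty).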
+ if (PrevMask != 0 || NewMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI)
+ CurrSetPressure[*PSetI] += Weight;
+}
+
+/// Decrease pressure for each pressure set provided by TargetRegisterInfo.
+static void decreaseSetPressure(std::vector<unsigned> &CurrSetPressure,
+ const MachineRegisterInfo &MRI, unsigned Reg,
+ LaneBitmask PrevMask, LaneBitmask NewMask) {
+ assert((NewMask & ~PrevMask) == 0 && "Must not add bits");
+ if (NewMask != 0 || PrevMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI.getPressureSets(Reg);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ assert(CurrSetPressure[*PSetI] >= Weight && "register pressure underflow");
+ CurrSetPressure[*PSetI] -= Weight;
+ }
+}
+
+LLVM_DUMP_METHOD
+void llvm::dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
+ const TargetRegisterInfo *TRI) {
+ bool Empty = true;
+ for (unsigned i = 0, e = SetPressure.size(); i < e; ++i) {
+ if (SetPressure[i] != 0) {
+ dbgs() << TRI->getRegPressureSetName(i) << "=" << SetPressure[i] << '\n';
+ Empty = false;
+ }
+ }
+ if (Empty)
+ dbgs() << "\n";
+}
+
+LLVM_DUMP_METHOD
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
+ dbgs() << "Max Pressure: ";
+ dumpRegSetPressure(MaxSetPressure, TRI);
+ dbgs() << "Live In: ";
+ for (const RegisterMaskPair &P : LiveInRegs) {
+ dbgs() << PrintVRegOrUnit(P.RegUnit, TRI);
+ if (P.LaneMask != ~0u)
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+ dbgs() << "Live Out: ";
+ for (const RegisterMaskPair &P : LiveOutRegs) {
+ dbgs() << PrintVRegOrUnit(P.RegUnit, TRI);
+ if (P.LaneMask != ~0u)
+ dbgs() << ':' << PrintLaneMask(P.LaneMask);
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+}
+
+LLVM_DUMP_METHOD
+void RegPressureTracker::dump() const {
+ if (!isTopClosed() || !isBottomClosed()) {
+ dbgs() << "Curr Pressure: ";
+ dumpRegSetPressure(CurrSetPressure, TRI);
+ }
+ P.dump(TRI);
+}
+
+void PressureDiff::dump(const TargetRegisterInfo &TRI) const {
+ const char *sep = "";
+ for (const PressureChange &Change : *this) {
+ if (!Change.isValid())
+ break;
+ dbgs() << sep << TRI.getRegPressureSetName(Change.getPSet())
+ << " " << Change.getUnitInc();
+ sep = " ";
+ }
+ dbgs() << '\n';
+}
+
+void RegPressureTracker::increaseRegPressure(unsigned RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ if (PreviousMask != 0 || NewMask == 0)
+ return;
+
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ unsigned Weight = PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ CurrSetPressure[*PSetI] += Weight;
+ P.MaxSetPressure[*PSetI] =
+ std::max(P.MaxSetPressure[*PSetI], CurrSetPressure[*PSetI]);
+ }
+}
+
+void RegPressureTracker::decreaseRegPressure(unsigned RegUnit,
+ LaneBitmask PreviousMask,
+ LaneBitmask NewMask) {
+ decreaseSetPressure(CurrSetPressure, *MRI, RegUnit, PreviousMask, NewMask);
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void IntervalPressure::reset() {
+ TopIdx = BottomIdx = SlotIndex();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// Clear the result so it can be used for another round of pressure tracking.
+void RegionPressure::reset() {
+ TopPos = BottomPos = MachineBasicBlock::const_iterator();
+ MaxSetPressure.clear();
+ LiveInRegs.clear();
+ LiveOutRegs.clear();
+}
+
+/// If the current top is not less than or equal to the next index, open it.
+/// We happen to need the SlotIndex for the next top for pressure update.
+void IntervalPressure::openTop(SlotIndex NextTop) {
+ if (TopIdx <= NextTop)
+ return;
+ TopIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current top is the previous instruction (before receding), open it.
+void RegionPressure::openTop(MachineBasicBlock::const_iterator PrevTop) {
+ if (TopPos != PrevTop)
+ return;
+ TopPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is not greater than the previous index, open it.
+void IntervalPressure::openBottom(SlotIndex PrevBottom) {
+ if (BottomIdx > PrevBottom)
+ return;
+ BottomIdx = SlotIndex();
+ LiveInRegs.clear();
+}
+
+/// If the current bottom is the previous instr (before advancing), open it.
+void RegionPressure::openBottom(MachineBasicBlock::const_iterator PrevBottom) {
+ if (BottomPos != PrevBottom)
+ return;
+ BottomPos = MachineBasicBlock::const_iterator();
+ LiveInRegs.clear();
+}
+
+void LiveRegSet::init(const MachineRegisterInfo &MRI) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ unsigned NumRegUnits = TRI.getNumRegs();
+ unsigned NumVirtRegs = MRI.getNumVirtRegs();
+ Regs.setUniverse(NumRegUnits + NumVirtRegs);
+ this->NumRegUnits = NumRegUnits;
+}
+
+void LiveRegSet::clear() {
+ Regs.clear();
+}
+
+static const LiveRange *getLiveRange(const LiveIntervals &LIS, unsigned Reg) {
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return &LIS.getInterval(Reg);
+ return LIS.getCachedRegUnit(Reg);
+}
+
+void RegPressureTracker::reset() {
+ MBB = nullptr;
+ LIS = nullptr;
+
+ CurrSetPressure.clear();
+ LiveThruPressure.clear();
+ P.MaxSetPressure.clear();
+
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).reset();
+ else
+ static_cast<RegionPressure&>(P).reset();
+
+ LiveRegs.clear();
+ UntiedDefs.clear();
+}
+
+/// Setup the RegPressureTracker.
+///
+/// TODO: Add support for pressure without LiveIntervals.
+void RegPressureTracker::init(const MachineFunction *mf,
+ const RegisterClassInfo *rci,
+ const LiveIntervals *lis,
+ const MachineBasicBlock *mbb,
+ MachineBasicBlock::const_iterator pos,
+ bool TrackLaneMasks, bool TrackUntiedDefs) {
+ reset();
+
+ MF = mf;
+ TRI = MF->getSubtarget().getRegisterInfo();
+ RCI = rci;
+ MRI = &MF->getRegInfo();
+ MBB = mbb;
+ this->TrackUntiedDefs = TrackUntiedDefs;
+ this->TrackLaneMasks = TrackLaneMasks;
+
+ if (RequireIntervals) {
+ assert(lis && "IntervalPressure requires LiveIntervals");
+ LIS = lis;
+ }
+
+ CurrPos = pos;
+ CurrSetPressure.assign(TRI->getNumRegPressureSets(), 0);
+
+ P.MaxSetPressure = CurrSetPressure;
+
+ LiveRegs.init(*MRI);
+ if (TrackUntiedDefs)
+ UntiedDefs.setUniverse(MRI->getNumVirtRegs());
+}
+
+/// Does this pressure result have a valid top position and live ins.
+bool RegPressureTracker::isTopClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).TopIdx.isValid();
+ return (static_cast<RegionPressure&>(P).TopPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+/// Does this pressure result have a valid bottom position and live outs.
+bool RegPressureTracker::isBottomClosed() const {
+ if (RequireIntervals)
+ return static_cast<IntervalPressure&>(P).BottomIdx.isValid();
+ return (static_cast<RegionPressure&>(P).BottomPos ==
+ MachineBasicBlock::const_iterator());
+}
+
+
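+/// Return the SlotIndex of the first non-debug instruction at or after
+/// CurrPos, or the end index of the block if there is none.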
+SlotIndex RegPressureTracker::getCurrSlot() const {
+ MachineBasicBlock::const_iterator IdxPos = CurrPos;
+ while (IdxPos != MBB->end() && IdxPos->isDebugValue())
+ ++IdxPos;
+ if (IdxPos == MBB->end())
+ return LIS->getMBBEndIdx(MBB);
+ return LIS->getInstructionIndex(*IdxPos).getRegSlot();
+}
+
+/// Set the boundary for the top of the region and summarize live ins.
+void RegPressureTracker::closeTop() {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).TopIdx = getCurrSlot();
+ else
+ static_cast<RegionPressure&>(P).TopPos = CurrPos;
+
+ assert(P.LiveInRegs.empty() && "inconsistent max pressure result");
+ P.LiveInRegs.reserve(LiveRegs.size());
+ LiveRegs.appendTo(P.LiveInRegs);
+}
+
+/// Set the boundary for the bottom of the region and summarize live outs.
+void RegPressureTracker::closeBottom() {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).BottomIdx = getCurrSlot();
+ else
+ static_cast<RegionPressure&>(P).BottomPos = CurrPos;
+
+ assert(P.LiveOutRegs.empty() && "inconsistent max pressure result");
+ P.LiveOutRegs.reserve(LiveRegs.size());
+ LiveRegs.appendTo(P.LiveOutRegs);
+}
+
+/// Finalize the region boundaries and record live ins and live outs.
+void RegPressureTracker::closeRegion() {
+ if (!isTopClosed() && !isBottomClosed()) {
+ assert(LiveRegs.size() == 0 && "no region boundary");
+ return;
+ }
+ if (!isBottomClosed())
+ closeBottom();
+ else if (!isTopClosed())
+ closeTop();
+ // If both top and bottom are closed, do nothing.
+}
+
+/// The register tracker is unaware of global liveness so ignores normal
+/// live-thru ranges. However, two-address or coalesced chains can also lead
+/// to live ranges with no holes. Count these to inform heuristics that we
+/// can never drop below this pressure.
+void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
+ LiveThruPressure.assign(TRI->getNumRegPressureSets(), 0);
+ assert(isBottomClosed() && "need bottom-up tracking to initialize.");
+ for (const RegisterMaskPair &Pair : P.LiveOutRegs) {
+ unsigned RegUnit = Pair.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit)
+ && !RPTracker.hasUntiedDef(RegUnit))
+ increaseSetPressure(LiveThruPressure, *MRI, RegUnit, 0, Pair.LaneMask);
+ }
+}
+
+static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
+ unsigned RegUnit) {
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end())
+ return 0;
+ return I->LaneMask;
+}
+
+static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ unsigned RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask != 0);
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(Pair);
+ } else {
+ I->LaneMask |= Pair.LaneMask;
+ }
+}
+
+static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ unsigned RegUnit) {
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I == RegUnits.end()) {
+ RegUnits.push_back(RegisterMaskPair(RegUnit, 0));
+ } else {
+ I->LaneMask = 0;
+ }
+}
+
+static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
+ RegisterMaskPair Pair) {
+ unsigned RegUnit = Pair.RegUnit;
+ assert(Pair.LaneMask != 0);
+ auto I = std::find_if(RegUnits.begin(), RegUnits.end(),
+ [RegUnit](const RegisterMaskPair Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ if (I != RegUnits.end()) {
+ I->LaneMask &= ~Pair.LaneMask;
+ if (I->LaneMask == 0)
+ RegUnits.erase(I);
+ }
+}
+
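+/// Return the lanes of \p RegUnit for which \p Property holds at \p Pos.
+/// For physical register units without cached liveness information this
+/// conservatively returns \p SafeDefault.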
+static LaneBitmask getLanesWithProperty(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI, bool TrackLaneMasks, unsigned RegUnit,
+ SlotIndex Pos, LaneBitmask SafeDefault,
+ bool(*Property)(const LiveRange &LR, SlotIndex Pos)) {
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit)) {
+ const LiveInterval &LI = LIS.getInterval(RegUnit);
+ LaneBitmask Result = 0;
+ if (TrackLaneMasks && LI.hasSubRanges()) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (Property(SR, Pos))
+ Result |= SR.LaneMask;
+ }
+ } else if (Property(LI, Pos)) {
+ Result = TrackLaneMasks ? MRI.getMaxLaneMaskForVReg(RegUnit) : ~0u;
+ }
+
+ return Result;
+ } else {
+ const LiveRange *LR = LIS.getCachedRegUnit(RegUnit);
+ // Be prepared for missing live ranges: we usually do not compute live ranges
+ // for physical registers on targets with many registers (GPUs).
+ if (LR == nullptr)
+ return SafeDefault;
+ return Property(*LR, Pos) ? ~0u : 0;
+ }
+}
+
+static LaneBitmask getLiveLanesAt(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, unsigned RegUnit,
+ SlotIndex Pos) {
+ return getLanesWithProperty(LIS, MRI, TrackLaneMasks, RegUnit, Pos, ~0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
+}
+
+
+namespace {
+
+/// Collect this instruction's unique uses and defs into SmallVectors for
+/// processing defs and uses in order.
+///
+/// FIXME: always ignore tied opers
+class RegisterOperandsCollector {
+ RegisterOperands &RegOpers;
+ const TargetRegisterInfo &TRI;
+ const MachineRegisterInfo &MRI;
+ bool IgnoreDead;
+
+ RegisterOperandsCollector(RegisterOperands &RegOpers,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI, bool IgnoreDead)
+ : RegOpers(RegOpers), TRI(TRI), MRI(MRI), IgnoreDead(IgnoreDead) {}
+
+ void collectInstr(const MachineInstr &MI) const {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+ collectOperand(*OperI);
+
+ // Remove redundant physreg dead defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
+ }
+
+ void collectInstrLanes(const MachineInstr &MI) const {
+ for (ConstMIBundleOperands OperI(MI); OperI.isValid(); ++OperI)
+ collectOperandLanes(*OperI);
+
+ // Remove redundant physreg dead defs.
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ removeRegLanes(RegOpers.DeadDefs, P);
+ }
+
+ /// Push this operand's register onto the correct vectors.
+ void collectOperand(const MachineOperand &MO) const {
+ if (!MO.isReg() || !MO.getReg())
+ return;
+ unsigned Reg = MO.getReg();
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushReg(Reg, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Subregister definitions may imply a register read.
+ if (MO.readsReg())
+ pushReg(Reg, RegOpers.Uses);
+
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushReg(Reg, RegOpers.DeadDefs);
+ } else
+ pushReg(Reg, RegOpers.Defs);
+ }
+ }
+
+ void pushReg(unsigned Reg,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, ~0u));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ addRegLanes(RegUnits, RegisterMaskPair(*Units, ~0u));
+ }
+ }
+
+ void collectOperandLanes(const MachineOperand &MO) const {
+ if (!MO.isReg() || !MO.getReg())
+ return;
+ unsigned Reg = MO.getReg();
+ unsigned SubRegIdx = MO.getSubReg();
+ if (MO.isUse()) {
+ if (!MO.isUndef() && !MO.isInternalRead())
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Uses);
+ } else {
+ assert(MO.isDef());
+ // Treat read-undef subreg defs as definitions of the whole register.
+ if (MO.isUndef())
+ SubRegIdx = 0;
+
+ if (MO.isDead()) {
+ if (!IgnoreDead)
+ pushRegLanes(Reg, SubRegIdx, RegOpers.DeadDefs);
+ } else
+ pushRegLanes(Reg, SubRegIdx, RegOpers.Defs);
+ }
+ }
+
+ void pushRegLanes(unsigned Reg, unsigned SubRegIdx,
+ SmallVectorImpl<RegisterMaskPair> &RegUnits) const {
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ LaneBitmask LaneMask = SubRegIdx != 0
+ ? TRI.getSubRegIndexLaneMask(SubRegIdx)
+ : MRI.getMaxLaneMaskForVReg(Reg);
+ addRegLanes(RegUnits, RegisterMaskPair(Reg, LaneMask));
+ } else if (MRI.isAllocatable(Reg)) {
+ for (MCRegUnitIterator Units(Reg, &TRI); Units.isValid(); ++Units)
+ addRegLanes(RegUnits, RegisterMaskPair(*Units, ~0u));
+ }
+ }
+
+ friend class llvm::RegisterOperands;
+};
+
+} // namespace
+
+void RegisterOperands::collect(const MachineInstr &MI,
+ const TargetRegisterInfo &TRI,
+ const MachineRegisterInfo &MRI,
+ bool TrackLaneMasks, bool IgnoreDead) {
+ RegisterOperandsCollector Collector(*this, TRI, MRI, IgnoreDead);
+ if (TrackLaneMasks)
+ Collector.collectInstrLanes(MI);
+ else
+ Collector.collectInstr(MI);
+}
+
+void RegisterOperands::detectDeadDefs(const MachineInstr &MI,
+ const LiveIntervals &LIS) {
+ SlotIndex SlotIdx = LIS.getInstructionIndex(MI);
+ for (auto RI = Defs.begin(); RI != Defs.end(); /*empty*/) {
+ unsigned Reg = RI->RegUnit;
+ const LiveRange *LR = getLiveRange(LIS, Reg);
+ if (LR != nullptr) {
+ LiveQueryResult LRQ = LR->Query(SlotIdx);
+ if (LRQ.isDeadDef()) {
+ // LiveIntervals knows this def is dead even though its MachineOperand is
+ // not flagged as such.
+ DeadDefs.push_back(*RI);
+ RI = Defs.erase(RI);
+ continue;
+ }
+ }
+ ++RI;
+ }
+}
+
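+/// Trim the collected Defs and Uses to the lanes that are actually live
+/// around \p Pos and, when \p AddFlagsMI is given, mark defs as read-undef if
+/// nothing else of the register remains live after the instruction.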
+void RegisterOperands::adjustLaneLiveness(const LiveIntervals &LIS,
+ const MachineRegisterInfo &MRI,
+ SlotIndex Pos,
+ MachineInstr *AddFlagsMI) {
+ for (auto I = Defs.begin(); I != Defs.end(); ) {
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getDeadSlot());
+ // If the def is all that is live after the instruction, then in case
+ // of a subregister def we need a read-undef flag.
+ unsigned RegUnit = I->RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ AddFlagsMI != nullptr && (LiveAfter & ~I->LaneMask) == 0)
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+
+ LaneBitmask ActualDef = I->LaneMask & LiveAfter;
+ if (ActualDef == 0) {
+ I = Defs.erase(I);
+ } else {
+ I->LaneMask = ActualDef;
+ ++I;
+ }
+ }
+ for (auto I = Uses.begin(); I != Uses.end(); ) {
+ LaneBitmask LiveBefore = getLiveLanesAt(LIS, MRI, true, I->RegUnit,
+ Pos.getBaseIndex());
+ LaneBitmask LaneMask = I->LaneMask & LiveBefore;
+ if (LaneMask == 0) {
+ I = Uses.erase(I);
+ } else {
+ I->LaneMask = LaneMask;
+ ++I;
+ }
+ }
+ if (AddFlagsMI != nullptr) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned RegUnit = P.RegUnit;
+ if (!TargetRegisterInfo::isVirtualRegister(RegUnit))
+ continue;
+ LaneBitmask LiveAfter = getLiveLanesAt(LIS, MRI, true, RegUnit,
+ Pos.getDeadSlot());
+ if (LiveAfter == 0)
+ AddFlagsMI->setRegisterDefReadUndef(RegUnit);
+ }
+ }
+}
+
+/// Initialize an array of N PressureDiffs.
+void PressureDiffs::init(unsigned N) {
+ Size = N;
+ if (N <= Max) {
+ memset(PDiffArray, 0, N * sizeof(PressureDiff));
+ return;
+ }
+ Max = Size;
+ free(PDiffArray);
+ PDiffArray = reinterpret_cast<PressureDiff*>(calloc(N, sizeof(PressureDiff)));
+}
+
+void PressureDiffs::addInstruction(unsigned Idx,
+ const RegisterOperands &RegOpers,
+ const MachineRegisterInfo &MRI) {
+ PressureDiff &PDiff = (*this)[Idx];
+ assert(!PDiff.begin()->isValid() && "stale PDiff");
+ for (const RegisterMaskPair &P : RegOpers.Defs)
+ PDiff.addPressureChange(P.RegUnit, true, &MRI);
+
+ for (const RegisterMaskPair &P : RegOpers.Uses)
+ PDiff.addPressureChange(P.RegUnit, false, &MRI);
+}
+
+/// Add a change in pressure to the pressure diff of a given instruction.
+void PressureDiff::addPressureChange(unsigned RegUnit, bool IsDec,
+ const MachineRegisterInfo *MRI) {
+ PSetIterator PSetI = MRI->getPressureSets(RegUnit);
+ int Weight = IsDec ? -PSetI.getWeight() : PSetI.getWeight();
+ for (; PSetI.isValid(); ++PSetI) {
+ // Find an existing entry in the pressure diff for this PSet.
+ PressureDiff::iterator I = nonconst_begin(), E = nonconst_end();
+ for (; I != E && I->isValid(); ++I) {
+ if (I->getPSet() >= *PSetI)
+ break;
+ }
+ // If all pressure sets are more constrained, skip the remaining PSets.
+ if (I == E)
+ break;
+ // Insert this PressureChange.
+ if (!I->isValid() || I->getPSet() != *PSetI) {
+ PressureChange PTmp = PressureChange(*PSetI);
+ for (PressureDiff::iterator J = I; J != E && PTmp.isValid(); ++J)
+ std::swap(*J, PTmp);
+ }
+ // Update the units for this pressure set.
+ unsigned NewUnitInc = I->getUnitInc() + Weight;
+ if (NewUnitInc != 0) {
+ I->setUnitInc(NewUnitInc);
+ } else {
+ // Remove the entry by shifting the remaining valid entries down one slot.
+ PressureDiff::iterator J;
+ for (J = std::next(I); J != E && J->isValid(); ++J, ++I)
+ *I = *J;
+ if (J != E)
+ *I = *J;
+ }
+ }
+}
+
+/// Force liveness of registers.
+void RegPressureTracker::addLiveRegs(ArrayRef<RegisterMaskPair> Regs) {
+ for (const RegisterMaskPair &P : Regs) {
+ LaneBitmask PrevMask = LiveRegs.insert(P);
+ LaneBitmask NewMask = PrevMask | P.LaneMask;
+ increaseRegPressure(P.RegUnit, PrevMask, NewMask);
+ }
+}
+
+void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
+ SmallVectorImpl<RegisterMaskPair> &LiveInOrOut) {
+ assert(Pair.LaneMask != 0);
+
+ unsigned RegUnit = Pair.RegUnit;
+ auto I = std::find_if(LiveInOrOut.begin(), LiveInOrOut.end(),
+ [RegUnit](const RegisterMaskPair &Other) {
+ return Other.RegUnit == RegUnit;
+ });
+ LaneBitmask PrevMask;
+ LaneBitmask NewMask;
+ if (I == LiveInOrOut.end()) {
+ PrevMask = 0;
+ NewMask = Pair.LaneMask;
+ LiveInOrOut.push_back(Pair);
+ } else {
+ PrevMask = I->LaneMask;
+ NewMask = PrevMask | Pair.LaneMask;
+ I->LaneMask = NewMask;
+ }
+ increaseSetPressure(P.MaxSetPressure, *MRI, RegUnit, PrevMask, NewMask);
+}
+
+void RegPressureTracker::discoverLiveIn(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveInRegs);
+}
+
+void RegPressureTracker::discoverLiveOut(RegisterMaskPair Pair) {
+ discoverLiveInOrOut(Pair, P.LiveOutRegs);
+}
+
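+/// Temporarily bump and then drop the pressure of all dead defs so that only
+/// MaxSetPressure records the spike they cause.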
+void RegPressureTracker::bumpDeadDefs(ArrayRef<RegisterMaskPair> DeadDefs) {
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ increaseRegPressure(Reg, LiveMask, BumpedMask);
+ }
+ for (const RegisterMaskPair &P : DeadDefs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask BumpedMask = LiveMask | P.LaneMask;
+ decreaseRegPressure(Reg, BumpedMask, LiveMask);
+ }
+}
+
+/// Recede across the previous instruction. If LiveUses is provided, record any
+/// RegUnits that are made live by the current instruction's uses. This includes
+/// registers that are both defined and used by the instruction. If a pressure
+/// difference pointer is provided, record the changes in pressure caused by this
+/// instruction independent of liveness.
+void RegPressureTracker::recede(const RegisterOperands &RegOpers,
+ SmallVectorImpl<RegisterMaskPair> *LiveUses) {
+ assert(!CurrPos->isDebugValue());
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
+ // TODO: consider earlyclobbers?
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned Reg = Def.RegUnit;
+
+ LaneBitmask PreviousMask = LiveRegs.erase(Def);
+ LaneBitmask NewMask = PreviousMask & ~Def.LaneMask;
+
+ LaneBitmask LiveOut = Def.LaneMask & ~PreviousMask;
+ if (LiveOut != 0) {
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ // Retroactively model effects on pressure of the live out lanes.
+ increaseSetPressure(CurrSetPressure, *MRI, Reg, 0, LiveOut);
+ PreviousMask = LiveOut;
+ }
+
+ if (NewMask == 0) {
+ // Add a 0 entry to LiveUses as a marker that the complete vreg has become
+ // dead.
+ if (TrackLaneMasks && LiveUses != nullptr)
+ setRegZero(*LiveUses, Reg);
+ }
+
+ decreaseRegPressure(Reg, PreviousMask, NewMask);
+ }
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+
+ // Generate liveness for uses.
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ assert(Use.LaneMask != 0);
+ LaneBitmask PreviousMask = LiveRegs.insert(Use);
+ LaneBitmask NewMask = PreviousMask | Use.LaneMask;
+ if (NewMask == PreviousMask)
+ continue;
+
+ // Did the register just become live?
+ if (PreviousMask == 0) {
+ if (LiveUses != nullptr) {
+ if (!TrackLaneMasks) {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ auto I = std::find_if(LiveUses->begin(), LiveUses->end(),
+ [Reg](const RegisterMaskPair Other) {
+ return Other.RegUnit == Reg;
+ });
+ bool IsRedef = I != LiveUses->end();
+ if (IsRedef) {
+ // ignore re-defs here...
+ assert(I->LaneMask == 0);
+ removeRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ } else {
+ addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
+ }
+ }
+ }
+
+ // Discover live outs if this may be the first occurrence of this register.
+ if (RequireIntervals) {
+ LaneBitmask LiveOut = getLiveThroughAt(Reg, SlotIdx);
+ if (LiveOut != 0)
+ discoverLiveOut(RegisterMaskPair(Reg, LiveOut));
+ }
+ }
+
+ increaseRegPressure(Reg, PreviousMask, NewMask);
+ }
+ if (TrackUntiedDefs) {
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned RegUnit = Def.RegUnit;
+ if (TargetRegisterInfo::isVirtualRegister(RegUnit) &&
+ (LiveRegs.contains(RegUnit) & Def.LaneMask) == 0)
+ UntiedDefs.insert(RegUnit);
+ }
+ }
+}
+
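+/// Step CurrPos back to the previous non-debug instruction, closing the
+/// bottom and opening the top of the region as needed, without updating
+/// pressure.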
+void RegPressureTracker::recedeSkipDebugValues() {
+ assert(CurrPos != MBB->begin());
+ if (!isBottomClosed())
+ closeBottom();
+
+ // Open the top of the region using block iterators.
+ if (!RequireIntervals && isTopClosed())
+ static_cast<RegionPressure&>(P).openTop(CurrPos);
+
+ // Find the previous instruction.
+ do
+ --CurrPos;
+ while (CurrPos != MBB->begin() && CurrPos->isDebugValue());
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+
+ // Open the top of the region using slot indexes.
+ if (RequireIntervals && isTopClosed())
+ static_cast<IntervalPressure&>(P).openTop(SlotIdx);
+}
+
+void RegPressureTracker::recede(SmallVectorImpl<RegisterMaskPair> *LiveUses) {
+ recedeSkipDebugValues();
+
+ const MachineInstr &MI = *CurrPos;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(*CurrPos).getRegSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ } else if (RequireIntervals) {
+ RegOpers.detectDeadDefs(MI, *LIS);
+ }
+
+ recede(RegOpers, LiveUses);
+}
+
+/// Advance across the current instruction.
+void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
+ assert(!TrackUntiedDefs && "unsupported mode");
+ assert(CurrPos != MBB->end());
+ if (!isTopClosed())
+ closeTop();
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = getCurrSlot();
+
+ // Open the bottom of the region using slot indexes.
+ if (isBottomClosed()) {
+ if (RequireIntervals)
+ static_cast<IntervalPressure&>(P).openBottom(SlotIdx);
+ else
+ static_cast<RegionPressure&>(P).openBottom(CurrPos);
+ }
+
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask LiveIn = Use.LaneMask & ~LiveMask;
+ if (LiveIn != 0) {
+ discoverLiveIn(RegisterMaskPair(Reg, LiveIn));
+ increaseRegPressure(Reg, LiveMask, LiveMask | LiveIn);
+ LiveRegs.insert(RegisterMaskPair(Reg, LiveIn));
+ }
+ // Kill liveness at last uses.
+ if (RequireIntervals) {
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask != 0) {
+ LiveRegs.erase(RegisterMaskPair(Reg, LastUseMask));
+ decreaseRegPressure(Reg, LiveMask, LiveMask & ~LastUseMask);
+ }
+ }
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ LaneBitmask PreviousMask = LiveRegs.insert(Def);
+ LaneBitmask NewMask = PreviousMask | Def.LaneMask;
+ increaseRegPressure(Def.RegUnit, PreviousMask, NewMask);
+ }
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Find the next instruction.
+ do
+ ++CurrPos;
+ while (CurrPos != MBB->end() && CurrPos->isDebugValue());
+}
+
+void RegPressureTracker::advance() {
+ const MachineInstr &MI = *CurrPos;
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = getCurrSlot();
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ }
+ advance(RegOpers);
+}
+
+/// Find the max change in excess pressure across all sets.
+static void computeExcessPressureDelta(ArrayRef<unsigned> OldPressureVec,
+ ArrayRef<unsigned> NewPressureVec,
+ RegPressureDelta &Delta,
+ const RegisterClassInfo *RCI,
+ ArrayRef<unsigned> LiveThruPressureVec) {
+ Delta.Excess = PressureChange();
+ for (unsigned i = 0, e = OldPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldPressureVec[i];
+ unsigned PNew = NewPressureVec[i];
+ int PDiff = (int)PNew - (int)POld;
+ if (!PDiff) // No change in this set in the common case.
+ continue;
+ // Only consider change beyond the limit.
+ unsigned Limit = RCI->getRegPressureSetLimit(i);
+ if (!LiveThruPressureVec.empty())
+ Limit += LiveThruPressureVec[i];
+
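+ // Only the portion of the change that crosses the limit counts as excess.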
+ if (Limit > POld) {
+ if (Limit > PNew)
+ PDiff = 0; // Under the limit
+ else
+ PDiff = PNew - Limit; // Just exceeded limit.
+ } else if (Limit > PNew)
+ PDiff = Limit - POld; // Just obeyed limit.
+
+ if (PDiff) {
+ Delta.Excess = PressureChange(i);
+ Delta.Excess.setUnitInc(PDiff);
+ break;
+ }
+ }
+}
+
+/// Find the max change in max pressure that either surpasses a critical PSet
+/// limit or exceeds the current MaxPressureLimit.
+///
+/// FIXME: comparing each element of the old and new MaxPressure vectors here is
+/// silly. It's done now to demonstrate the concept but will go away with a
+/// RegPressureTracker API change to work with pressure differences.
+static void computeMaxPressureDelta(ArrayRef<unsigned> OldMaxPressureVec,
+ ArrayRef<unsigned> NewMaxPressureVec,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit,
+ RegPressureDelta &Delta) {
+ Delta.CriticalMax = PressureChange();
+ Delta.CurrentMax = PressureChange();
+
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (unsigned i = 0, e = OldMaxPressureVec.size(); i < e; ++i) {
+ unsigned POld = OldMaxPressureVec[i];
+ unsigned PNew = NewMaxPressureVec[i];
+ if (PNew == POld) // No change in this set in the common case.
+ continue;
+
+ if (!Delta.CriticalMax.isValid()) {
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < i)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == i) {
+ int PDiff = (int)PNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ if (PDiff > 0) {
+ Delta.CriticalMax = PressureChange(i);
+ Delta.CriticalMax.setUnitInc(PDiff);
+ }
+ }
+ }
+ // Find the first increase above MaxPressureLimit.
+ // (Ignores negative MDiff).
+ if (!Delta.CurrentMax.isValid() && PNew > MaxPressureLimit[i]) {
+ Delta.CurrentMax = PressureChange(i);
+ Delta.CurrentMax.setUnitInc(PNew - POld);
+ if (CritIdx == CritEnd || Delta.CriticalMax.isValid())
+ break;
+ }
+ }
+}
+
+/// Record the upward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, /*IgnoreDead=*/true);
+ assert(RegOpers.DeadDefs.size() == 0);
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+ else if (RequireIntervals)
+ RegOpers.detectDeadDefs(*MI, *LIS);
+
+ // Boost max pressure for all dead defs together.
+ // Since CurrSetPressure and MaxSetPressure are updated together when pressure
+ // rises, this bump leaves its mark only in MaxSetPressure.
+ bumpDeadDefs(RegOpers.DeadDefs);
+
+ // Kill liveness at live defs.
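+ // Moving upwards past MI, the defined lanes stop being live unless MI also
+ // reads them.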
+ for (const RegisterMaskPair &P : RegOpers.Defs) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask UseLanes = getRegLanes(RegOpers.Uses, Reg);
+ LaneBitmask DefLanes = P.LaneMask;
+ LaneBitmask LiveAfter = (LiveLanes & ~DefLanes) | UseLanes;
+ decreaseRegPressure(Reg, LiveLanes, LiveAfter);
+ }
+ // Generate liveness for uses.
+ for (const RegisterMaskPair &P : RegOpers.Uses) {
+ unsigned Reg = P.RegUnit;
+ LaneBitmask LiveLanes = LiveRegs.contains(Reg);
+ LaneBitmask LiveAfter = LiveLanes | P.LaneMask;
+ increaseRegPressure(Reg, LiveLanes, LiveAfter);
+ }
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// bottom-up. Find the pressure set with the most change beyond its pressure
+/// limit based on the tracker's current pressure, and return the change in
+/// number of register units of that pressure set introduced by this
+/// instruction.
+///
+/// This assumes that the current LiveOut set is sufficient.
+///
+/// This is expensive for an on-the-fly query because it calls
+/// bumpUpwardPressure to recompute the pressure sets based on current
+/// liveness. This mainly exists to verify correctness, e.g. with
+/// -verify-misched. getUpwardPressureDelta is the fast version of this query
+/// that uses the per-SUnit cache of the PressureDiff.
+void RegPressureTracker::
+getMaxUpwardPressureDelta(const MachineInstr *MI, PressureDiff *PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ // FIXME: The snapshot heap space should persist. But I'm planning to
+ // summarize the pressure effect so we don't need to snapshot at all.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+ LiveThruPressure);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+
+#ifndef NDEBUG
+ if (!PDiff)
+ return;
+
+ // Check if the alternate algorithm yields the same result.
+ RegPressureDelta Delta2;
+ getUpwardPressureDelta(MI, *PDiff, Delta2, CriticalPSets, MaxPressureLimit);
+ if (Delta != Delta2) {
+ dbgs() << "PDiff: ";
+ PDiff->dump(*TRI);
+ dbgs() << "DELTA: " << *MI;
+ if (Delta.Excess.isValid())
+ dbgs() << "Excess1 " << TRI->getRegPressureSetName(Delta.Excess.getPSet())
+ << " " << Delta.Excess.getUnitInc() << "\n";
+ if (Delta.CriticalMax.isValid())
+ dbgs() << "Critic1 " << TRI->getRegPressureSetName(Delta.CriticalMax.getPSet())
+ << " " << Delta.CriticalMax.getUnitInc() << "\n";
+ if (Delta.CurrentMax.isValid())
+ dbgs() << "CurrMx1 " << TRI->getRegPressureSetName(Delta.CurrentMax.getPSet())
+ << " " << Delta.CurrentMax.getUnitInc() << "\n";
+ if (Delta2.Excess.isValid())
+ dbgs() << "Excess2 " << TRI->getRegPressureSetName(Delta2.Excess.getPSet())
+ << " " << Delta2.Excess.getUnitInc() << "\n";
+ if (Delta2.CriticalMax.isValid())
+ dbgs() << "Critic2 " << TRI->getRegPressureSetName(Delta2.CriticalMax.getPSet())
+ << " " << Delta2.CriticalMax.getUnitInc() << "\n";
+ if (Delta2.CurrentMax.isValid())
+ dbgs() << "CurrMx2 " << TRI->getRegPressureSetName(Delta2.CurrentMax.getPSet())
+ << " " << Delta2.CurrentMax.getUnitInc() << "\n";
+ llvm_unreachable("RegP Delta Mismatch");
+ }
+#endif
+}
+
+/// This is the fast version of querying register pressure that does not
+/// directly depend on current liveness.
+///
+/// @param Delta captures information needed for heuristics.
+///
+/// @param CriticalPSets Are the pressure sets that are known to exceed some
+/// limit within the region, not necessarily at the current position.
+///
+/// @param MaxPressureLimit Is the max pressure within the region, not
+/// necessarily at the current position.
+void RegPressureTracker::
+getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
+ RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) const {
+ unsigned CritIdx = 0, CritEnd = CriticalPSets.size();
+ for (PressureDiff::const_iterator
+ PDiffI = PDiff.begin(), PDiffE = PDiff.end();
+ PDiffI != PDiffE && PDiffI->isValid(); ++PDiffI) {
+
+ unsigned PSetID = PDiffI->getPSet();
+ unsigned Limit = RCI->getRegPressureSetLimit(PSetID);
+ if (!LiveThruPressure.empty())
+ Limit += LiveThruPressure[PSetID];
+
+ unsigned POld = CurrSetPressure[PSetID];
+ unsigned MOld = P.MaxSetPressure[PSetID];
+ unsigned MNew = MOld;
+ // Ignore DeadDefs here because they aren't captured by PressureChange.
+ unsigned PNew = POld + PDiffI->getUnitInc();
+ assert((PDiffI->getUnitInc() >= 0) == (PNew >= POld)
+ && "PSet overflow/underflow");
+ if (PNew > MOld)
+ MNew = PNew;
+ // Check if current pressure has exceeded the limit.
+ if (!Delta.Excess.isValid()) {
+ unsigned ExcessInc = 0;
+ if (PNew > Limit)
+ ExcessInc = POld > Limit ? PNew - POld : PNew - Limit;
+ else if (POld > Limit)
+ ExcessInc = Limit - POld;
+ if (ExcessInc) {
+ Delta.Excess = PressureChange(PSetID);
+ Delta.Excess.setUnitInc(ExcessInc);
+ }
+ }
+ // Check if max pressure has exceeded a critical pressure set max.
+ if (MNew == MOld)
+ continue;
+ if (!Delta.CriticalMax.isValid()) {
+ while (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() < PSetID)
+ ++CritIdx;
+
+ if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
+ int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
+ if (CritInc > 0 && CritInc <= INT16_MAX) {
+ Delta.CriticalMax = PressureChange(PSetID);
+ Delta.CriticalMax.setUnitInc(CritInc);
+ }
+ }
+ }
+ // Check if max pressure has exceeded the current max.
+ if (!Delta.CurrentMax.isValid() && MNew > MaxPressureLimit[PSetID]) {
+ Delta.CurrentMax = PressureChange(PSetID);
+ Delta.CurrentMax.setUnitInc(MNew - MOld);
+ }
+ }
+}
+
+/// Helper to find a vreg use between two indices [PriorUseIdx, NextUseIdx).
+/// The query starts with a lane bitmask which gets lanes/bits removed for every
+/// use we find.
+static LaneBitmask findUseBetween(unsigned Reg, LaneBitmask LastUseMask,
+ SlotIndex PriorUseIdx, SlotIndex NextUseIdx,
+ const MachineRegisterInfo &MRI,
+ const LiveIntervals *LIS) {
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ for (const MachineOperand &MO : MRI.use_nodbg_operands(Reg)) {
+ if (MO.isUndef())
+ continue;
+ const MachineInstr *MI = MO.getParent();
+ SlotIndex InstSlot = LIS->getInstructionIndex(*MI).getRegSlot();
+ if (InstSlot >= PriorUseIdx && InstSlot < NextUseIdx) {
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ LastUseMask &= ~UseMask;
+ if (LastUseMask == 0)
+ return 0;
+ }
+ }
+ return LastUseMask;
+}
+
+LaneBitmask RegPressureTracker::getLiveLanesAt(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos, ~0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ return LR.liveAt(Pos);
+ });
+}
+
+LaneBitmask RegPressureTracker::getLastUsedLanes(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit,
+ Pos.getBaseIndex(), 0,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->end == Pos.getRegSlot();
+ });
+}
+
+LaneBitmask RegPressureTracker::getLiveThroughAt(unsigned RegUnit,
+ SlotIndex Pos) const {
+ assert(RequireIntervals);
+ return getLanesWithProperty(*LIS, *MRI, TrackLaneMasks, RegUnit, Pos, 0u,
+ [](const LiveRange &LR, SlotIndex Pos) {
+ const LiveRange::Segment *S = LR.getSegmentContaining(Pos);
+ return S != nullptr && S->start < Pos.getRegSlot(true) &&
+ S->end != Pos.getDeadSlot();
+ });
+}
+
+/// Record the downward impact of a single instruction on current register
+/// pressure. Unlike the advance/recede pressure tracking interface, this does
+/// not discover live in/outs.
+///
+/// This is intended for speculative queries. It leaves pressure inconsistent
+/// with the current position, so must be restored by the caller.
+void RegPressureTracker::bumpDownwardPressure(const MachineInstr *MI) {
+ assert(!MI->isDebugValue() && "Expect a nondebug instruction.");
+
+ SlotIndex SlotIdx;
+ if (RequireIntervals)
+ SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+
+ // Account for register pressure similar to RegPressureTracker::recede().
+ RegisterOperands RegOpers;
+ RegOpers.collect(*MI, *TRI, *MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks)
+ RegOpers.adjustLaneLiveness(*LIS, *MRI, SlotIdx);
+
+ if (RequireIntervals) {
+ for (const RegisterMaskPair &Use : RegOpers.Uses) {
+ unsigned Reg = Use.RegUnit;
+ LaneBitmask LastUseMask = getLastUsedLanes(Reg, SlotIdx);
+ if (LastUseMask == 0)
+ continue;
+ // The LastUseMask is queried from the liveness information of an
+ // instruction that may be further down the schedule. Some lanes may
+ // actually not be last uses for the current position.
+ // FIXME: allow the caller to pass in the list of vreg uses that remain
+ // to be bottom-scheduled to avoid searching uses at each query.
+ SlotIndex CurrIdx = getCurrSlot();
+ LastUseMask
+ = findUseBetween(Reg, LastUseMask, CurrIdx, SlotIdx, *MRI, LIS);
+ if (LastUseMask == 0)
+ continue;
+
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask & ~LastUseMask;
+ decreaseRegPressure(Reg, LiveMask, NewMask);
+ }
+ }
+
+ // Generate liveness for defs.
+ for (const RegisterMaskPair &Def : RegOpers.Defs) {
+ unsigned Reg = Def.RegUnit;
+ LaneBitmask LiveMask = LiveRegs.contains(Reg);
+ LaneBitmask NewMask = LiveMask | Def.LaneMask;
+ increaseRegPressure(Reg, LiveMask, NewMask);
+ }
+
+ // Boost pressure for all dead defs together.
+ bumpDeadDefs(RegOpers.DeadDefs);
+}
+
+/// Consider the pressure increase caused by traversing this instruction
+/// top-down. Find the register class with the most change in its pressure limit
+/// based on the tracker's current pressure, and return the number of excess
+/// register units of that pressure set introduced by this instruction.
+///
+/// This assumes that the current LiveIn set is sufficient.
+///
+/// This is expensive for an on-the-fly query because it calls
+/// bumpDownwardPressure to recompute the pressure sets based on current
+/// liveness. We don't yet have a fast version of downward pressure tracking
+/// analogous to getUpwardPressureDelta.
+void RegPressureTracker::
+getMaxDownwardPressureDelta(const MachineInstr *MI, RegPressureDelta &Delta,
+ ArrayRef<PressureChange> CriticalPSets,
+ ArrayRef<unsigned> MaxPressureLimit) {
+ // Snapshot Pressure.
+ std::vector<unsigned> SavedPressure = CurrSetPressure;
+ std::vector<unsigned> SavedMaxPressure = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ computeExcessPressureDelta(SavedPressure, CurrSetPressure, Delta, RCI,
+ LiveThruPressure);
+ computeMaxPressureDelta(SavedMaxPressure, P.MaxSetPressure, CriticalPSets,
+ MaxPressureLimit, Delta);
+ assert(Delta.CriticalMax.getUnitInc() >= 0 &&
+ Delta.CurrentMax.getUnitInc() >= 0 && "cannot decrease max pressure");
+
+ // Restore the tracker's state.
+ P.MaxSetPressure.swap(SavedMaxPressure);
+ CurrSetPressure.swap(SavedPressure);
+}
+
+/// Get the pressure of each PSet after traversing this instruction bottom-up.
+void RegPressureTracker::
+getUpwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpUpwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
+
+/// Get the pressure of each PSet after traversing this instruction top-down.
+void RegPressureTracker::
+getDownwardPressure(const MachineInstr *MI,
+ std::vector<unsigned> &PressureResult,
+ std::vector<unsigned> &MaxPressureResult) {
+ // Snapshot pressure.
+ PressureResult = CurrSetPressure;
+ MaxPressureResult = P.MaxSetPressure;
+
+ bumpDownwardPressure(MI);
+
+ // Current pressure becomes the result. Restore current pressure.
+ P.MaxSetPressure.swap(MaxPressureResult);
+ CurrSetPressure.swap(PressureResult);
+}
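+
+// A hedged sketch of the snapshot/bump/swap pattern used by both helpers
+// above, assuming a top-down tracker `TopRPT` (hypothetical name): the caller
+// can probe "what if this MI is issued next?" without perturbing the tracker.
+//
+//   std::vector<unsigned> Pressure, MaxPressure;
+//   TopRPT.getDownwardPressure(MI, Pressure, MaxPressure);
+//   // Pressure[PSet] now holds the set pressure after issuing MI top-down,
+//   // while TopRPT itself remains at its original position.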
diff --git a/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
new file mode 100644
index 000000000000..6b80179190db
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -0,0 +1,458 @@
+//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file implements the machine register scavenger. It can provide
+/// information, such as unused registers, at any point in a machine basic
+/// block. It also provides a mechanism to make registers available by evicting
+/// them to spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "reg-scavenging"
+
+void RegScavenger::setRegUsed(unsigned Reg, LaneBitmask LaneMask) {
+ for (MCRegUnitMaskIterator RUI(Reg, TRI); RUI.isValid(); ++RUI) {
+ LaneBitmask UnitMask = (*RUI).second;
+ if (UnitMask == 0 || (LaneMask & UnitMask) != 0)
+ RegUnitsAvailable.reset((*RUI).first);
+ }
+}
+
+void RegScavenger::initRegState() {
+ for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
+ IE = Scavenged.end(); I != IE; ++I) {
+ I->Reg = 0;
+ I->Restore = nullptr;
+ }
+
+ // All register units start out unused.
+ RegUnitsAvailable.set();
+
+ // Live-in registers are in use.
+ for (const auto &LI : MBB->liveins())
+ setRegUsed(LI.PhysReg, LI.LaneMask);
+
+ // Pristine CSRs are also unavailable.
+ const MachineFunction &MF = *MBB->getParent();
+ BitVector PR = MF.getFrameInfo()->getPristineRegs(MF);
+ for (int I = PR.find_first(); I>0; I = PR.find_next(I))
+ setRegUsed(I);
+}
+
+void RegScavenger::enterBasicBlock(MachineBasicBlock &MBB) {
+ MachineFunction &MF = *MBB.getParent();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+
+ assert((NumRegUnits == 0 || NumRegUnits == TRI->getNumRegUnits()) &&
+ "Target changed?");
+
+ // It is not possible to use the register scavenger after late optimization
+ // passes that don't preserve accurate liveness information.
+ assert(MRI->tracksLiveness() &&
+ "Cannot use register scavenger with inaccurate liveness");
+
+ // Self-initialize.
+ if (!this->MBB) {
+ NumRegUnits = TRI->getNumRegUnits();
+ RegUnitsAvailable.resize(NumRegUnits);
+ KillRegUnits.resize(NumRegUnits);
+ DefRegUnits.resize(NumRegUnits);
+ TmpRegUnits.resize(NumRegUnits);
+ }
+ this->MBB = &MBB;
+
+ initRegState();
+
+ Tracking = false;
+}
+
+void RegScavenger::addRegUnits(BitVector &BV, unsigned Reg) {
+ for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+ BV.set(*RUI);
+}
+
+void RegScavenger::determineKillsAndDefs() {
+ assert(Tracking && "Must be tracking to determine kills and defs");
+
+ MachineInstr &MI = *MBBI;
+ assert(!MI.isDebugValue() && "Debug values have no kills or defs");
+
+ // Find out which registers are early clobbered, killed, defined, and marked
+ // def-dead in this instruction.
+ KillRegUnits.reset();
+ DefRegUnits.reset();
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isRegMask()) {
+ TmpRegUnits.clear();
+ for (unsigned RU = 0, RUEnd = TRI->getNumRegUnits(); RU != RUEnd; ++RU) {
+ for (MCRegUnitRootIterator RURI(RU, TRI); RURI.isValid(); ++RURI) {
+ if (MO.clobbersPhysReg(*RURI)) {
+ TmpRegUnits.set(RU);
+ break;
+ }
+ }
+ }
+
+ // Apply the mask.
+ KillRegUnits |= TmpRegUnits;
+ }
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
+ continue;
+
+ if (MO.isUse()) {
+ // Ignore undef uses.
+ if (MO.isUndef())
+ continue;
+ if (MO.isKill())
+ addRegUnits(KillRegUnits, Reg);
+ } else {
+ assert(MO.isDef());
+ if (MO.isDead())
+ addRegUnits(KillRegUnits, Reg);
+ else
+ addRegUnits(DefRegUnits, Reg);
+ }
+ }
+}
+
+void RegScavenger::unprocess() {
+ assert(Tracking && "Cannot unprocess because we're not tracking");
+
+ MachineInstr &MI = *MBBI;
+ if (!MI.isDebugValue()) {
+ determineKillsAndDefs();
+
+ // Commit the changes.
+ setUsed(KillRegUnits);
+ setUnused(DefRegUnits);
+ }
+
+ if (MBBI == MBB->begin()) {
+ MBBI = MachineBasicBlock::iterator(nullptr);
+ Tracking = false;
+ } else
+ --MBBI;
+}
+
+void RegScavenger::forward() {
+ // Move ptr forward.
+ if (!Tracking) {
+ MBBI = MBB->begin();
+ Tracking = true;
+ } else {
+ assert(MBBI != MBB->end() && "Already past the end of the basic block!");
+ MBBI = std::next(MBBI);
+ }
+ assert(MBBI != MBB->end() && "Already at the end of the basic block!");
+
+ MachineInstr &MI = *MBBI;
+
+ for (SmallVectorImpl<ScavengedInfo>::iterator I = Scavenged.begin(),
+ IE = Scavenged.end(); I != IE; ++I) {
+ if (I->Restore != &MI)
+ continue;
+
+ I->Reg = 0;
+ I->Restore = nullptr;
+ }
+
+ if (MI.isDebugValue())
+ return;
+
+ determineKillsAndDefs();
+
+ // Verify uses and defs.
+#ifndef NDEBUG
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isPhysicalRegister(Reg) || isReserved(Reg))
+ continue;
+ if (MO.isUse()) {
+ if (MO.isUndef())
+ continue;
+ if (!isRegUsed(Reg)) {
+ // Check if it's partially live: e.g.
+ // D0 = insert_subreg D0<undef>, S0
+ // ... D0
+ // The problem is the insert_subreg could be eliminated. The use of
+ // D0 is reading a partially undef value. This is not *incorrect* since
+ // S1 can be freely clobbered.
+ // Ideally we would like a way to model this, but leaving the
+ // insert_subreg around causes both correctness and performance issues.
+ bool SubUsed = false;
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+ if (isRegUsed(*SubRegs)) {
+ SubUsed = true;
+ break;
+ }
+ bool SuperUsed = false;
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid(); ++SR) {
+ if (isRegUsed(*SR)) {
+ SuperUsed = true;
+ break;
+ }
+ }
+ if (!SubUsed && !SuperUsed) {
+ MBB->getParent()->verify(nullptr, "In Register Scavenger");
+ llvm_unreachable("Using an undefined register!");
+ }
+ (void)SubUsed;
+ (void)SuperUsed;
+ }
+ } else {
+ assert(MO.isDef());
+#if 0
+ // FIXME: Enable this once we've figured out how to correctly transfer
+ // implicit kills during codegen passes like the coalescer.
+ assert((KillRegs.test(Reg) || isUnused(Reg) ||
+ isLiveInButUnusedBefore(Reg, MI, MBB, TRI, MRI)) &&
+ "Re-defining a live register!");
+#endif
+ }
+ }
+#endif // NDEBUG
+
+ // Commit the changes.
+ setUnused(KillRegUnits);
+ setUsed(DefRegUnits);
+}
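+
+// A minimal driver sketch, assuming a client pass that walks a block forward;
+// `SomeReg` is a placeholder register:
+//
+//   RegScavenger RS;
+//   RS.enterBasicBlock(MBB);
+//   for (MachineInstr &MI : MBB) {
+//     RS.forward();             // account for MI's kills and defs
+//     if (!RS.isRegUsed(SomeReg))
+//       ;                       // SomeReg is free after MI
+//   }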
+
+bool RegScavenger::isRegUsed(unsigned Reg, bool includeReserved) const {
+ if (includeReserved && isReserved(Reg))
+ return true;
+ for (MCRegUnitIterator RUI(Reg, TRI); RUI.isValid(); ++RUI)
+ if (!RegUnitsAvailable.test(*RUI))
+ return true;
+ return false;
+}
+
+unsigned RegScavenger::FindUnusedReg(const TargetRegisterClass *RC) const {
+ for (unsigned Reg : *RC) {
+ if (!isRegUsed(Reg)) {
+ DEBUG(dbgs() << "Scavenger found unused reg: " << TRI->getName(Reg) <<
+ "\n");
+ return Reg;
+ }
+ }
+ return 0;
+}
+
+BitVector RegScavenger::getRegsAvailable(const TargetRegisterClass *RC) {
+ BitVector Mask(TRI->getNumRegs());
+ for (unsigned Reg : *RC)
+ if (!isRegUsed(Reg))
+ Mask.set(Reg);
+ return Mask;
+}
+
+unsigned RegScavenger::findSurvivorReg(MachineBasicBlock::iterator StartMI,
+ BitVector &Candidates,
+ unsigned InstrLimit,
+ MachineBasicBlock::iterator &UseMI) {
+ int Survivor = Candidates.find_first();
+ assert(Survivor > 0 && "No candidates for scavenging");
+
+ MachineBasicBlock::iterator ME = MBB->getFirstTerminator();
+ assert(StartMI != ME && "MI already at terminator");
+ MachineBasicBlock::iterator RestorePointMI = StartMI;
+ MachineBasicBlock::iterator MI = StartMI;
+
+ bool inVirtLiveRange = false;
+ for (++MI; InstrLimit > 0 && MI != ME; ++MI, --InstrLimit) {
+ if (MI->isDebugValue()) {
+ ++InstrLimit; // Don't count debug instructions
+ continue;
+ }
+ bool isVirtKillInsn = false;
+ bool isVirtDefInsn = false;
+ // Remove any candidates touched by instruction.
+ for (const MachineOperand &MO : MI->operands()) {
+ if (MO.isRegMask())
+ Candidates.clearBitsNotInMask(MO.getRegMask());
+ if (!MO.isReg() || MO.isUndef() || !MO.getReg())
+ continue;
+ if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (MO.isDef())
+ isVirtDefInsn = true;
+ else if (MO.isKill())
+ isVirtKillInsn = true;
+ continue;
+ }
+ for (MCRegAliasIterator AI(MO.getReg(), TRI, true); AI.isValid(); ++AI)
+ Candidates.reset(*AI);
+ }
+ // If we're not in a virtual reg's live range, this is a valid
+ // restore point.
+ if (!inVirtLiveRange) RestorePointMI = MI;
+
+ // Update whether we're in the live range of a virtual register
+ if (isVirtKillInsn) inVirtLiveRange = false;
+ if (isVirtDefInsn) inVirtLiveRange = true;
+
+ // Was our survivor untouched by this instruction?
+ if (Candidates.test(Survivor))
+ continue;
+
+ // All candidates gone?
+ if (Candidates.none())
+ break;
+
+ Survivor = Candidates.find_first();
+ }
+ // If we ran off the end, that's where we want to restore.
+ if (MI == ME) RestorePointMI = ME;
+ assert(RestorePointMI != StartMI &&
+ "No available scavenger restore location!");
+
+ // We ran out of candidates, so stop the search.
+ UseMI = RestorePointMI;
+ return Survivor;
+}
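+
+// Worked example: with candidates {R1, R2} and InstrLimit = 25, if R1 is
+// clobbered two instructions below StartMI while R2 stays untouched, R1 is
+// dropped when that clobber is seen and R2 survives; UseMI is set to the last
+// point reached that does not sit inside a virtual register's live range,
+// which is where the scavenged value would have to be restored.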
+
+static unsigned getFrameIndexOperandNum(MachineInstr &MI) {
+ unsigned i = 0;
+ while (!MI.getOperand(i).isFI()) {
+ ++i;
+ assert(i < MI.getNumOperands() && "Instr doesn't have FrameIndex operand!");
+ }
+ return i;
+}
+
+unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator I,
+ int SPAdj) {
+ MachineInstr &MI = *I;
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ // Consider all allocatable registers in the register class initially
+ BitVector Candidates = TRI->getAllocatableSet(MF, RC);
+
+ // Exclude all the registers being used by the instruction.
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.getReg() != 0 && !(MO.isUse() && MO.isUndef()) &&
+ !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ Candidates.reset(MO.getReg());
+ }
+
+ // Try to find a register that's unused if there is one, as then we won't
+ // have to spill.
+ BitVector Available = getRegsAvailable(RC);
+ Available &= Candidates;
+ if (Available.any())
+ Candidates = Available;
+
+ // Find the register whose use is furthest away.
+ MachineBasicBlock::iterator UseMI;
+ unsigned SReg = findSurvivorReg(I, Candidates, 25, UseMI);
+
+ // If we found an unused register there is no reason to spill it.
+ if (!isRegUsed(SReg)) {
+ DEBUG(dbgs() << "Scavenged register: " << TRI->getName(SReg) << "\n");
+ return SReg;
+ }
+
+ // Find an available scavenging slot with size and alignment matching
+ // the requirements of the class RC.
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ unsigned NeedSize = RC->getSize();
+ unsigned NeedAlign = RC->getAlignment();
+
+ unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+ int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
+ for (unsigned I = 0; I < Scavenged.size(); ++I) {
+ if (Scavenged[I].Reg != 0)
+ continue;
+ // Verify that this slot is valid for this register.
+ int FI = Scavenged[I].FrameIndex;
+ if (FI < FIB || FI >= FIE)
+ continue;
+ unsigned S = MFI.getObjectSize(FI);
+ unsigned A = MFI.getObjectAlignment(FI);
+ if (NeedSize > S || NeedAlign > A)
+ continue;
+ // Avoid wasting slots with large size and/or large alignment. Pick one
+ // that is the best fit for this register class (in street metric).
+ // Picking a larger slot than necessary could happen if a slot for a
+ // larger register is reserved before a slot for a smaller one. When
+ // trying to spill a smaller register, the large slot would be found
+ // first, thus making it impossible to spill the larger register later.
+ unsigned D = (S-NeedSize) + (A-NeedAlign);
+ if (D < Diff) {
+ SI = I;
+ Diff = D;
+ }
+ }
+
+ if (SI == Scavenged.size()) {
+ // We need to scavenge a register but have no spill slot; the target
+ // must know how to do it (if not, we'll assert below).
+ Scavenged.push_back(ScavengedInfo(FIE));
+ }
+
+ // Avoid infinite regress
+ Scavenged[SI].Reg = SReg;
+
+ // If the target knows how to save/restore the register, let it do so;
+ // otherwise, use the emergency stack spill slot.
+ if (!TRI->saveScavengerRegister(*MBB, I, UseMI, RC, SReg)) {
+ // Spill the scavenged register before I.
+ int FI = Scavenged[SI].FrameIndex;
+ if (FI < FIB || FI >= FIE) {
+ std::string Msg = std::string("Error while trying to spill ") +
+ TRI->getName(SReg) + " from class " + TRI->getRegClassName(RC) +
+ ": Cannot scavenge register without an emergency spill slot!";
+ report_fatal_error(Msg.c_str());
+ }
+ TII->storeRegToStackSlot(*MBB, I, SReg, true, Scavenged[SI].FrameIndex,
+ RC, TRI);
+ MachineBasicBlock::iterator II = std::prev(I);
+
+ unsigned FIOperandNum = getFrameIndexOperandNum(*II);
+ TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+
+ // Restore the scavenged register before its use (or first terminator).
+ TII->loadRegFromStackSlot(*MBB, UseMI, SReg, Scavenged[SI].FrameIndex,
+ RC, TRI);
+ II = std::prev(UseMI);
+
+ FIOperandNum = getFrameIndexOperandNum(*II);
+ TRI->eliminateFrameIndex(II, SPAdj, FIOperandNum, this);
+ }
+
+ Scavenged[SI].Restore = &*std::prev(UseMI);
+
+ // Doing this here leads to infinite regress.
+ // Scavenged[SI].Reg = SReg;
+
+ DEBUG(dbgs() << "Scavenged register (with spill): " << TRI->getName(SReg) <<
+ "\n");
+
+ return SReg;
+}
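+
+// A rough usage sketch, assuming a target's frame-index elimination that
+// needs a scratch register; `RC`, `AddOpc`, `BaseReg` and `Offset` are
+// placeholders and the real choices are target specific:
+//
+//   unsigned Scratch = RS->scavengeRegister(RC, II, SPAdj);
+//   BuildMI(MBB, II, DL, TII->get(AddOpc), Scratch)
+//       .addReg(BaseReg).addImm(Offset);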
diff --git a/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
new file mode 100644
index 000000000000..5cf3e57eb3d3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RegisterUsageInfo.cpp
@@ -0,0 +1,93 @@
+//===- RegisterUsageInfo.cpp - Register Usage Information Storage ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This pass is required to take advantage of the interprocedural register
+/// allocation infrastructure.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ip-regalloc"
+
+cl::opt<bool> DumpRegUsage(
+ "print-regusage", cl::init(false), cl::Hidden,
+ cl::desc("print register usage details collected for analysis."));
+
+INITIALIZE_PASS(PhysicalRegisterUsageInfo, "reg-usage-info",
+ "Register Usage Informartion Stroage", false, true)
+
+char PhysicalRegisterUsageInfo::ID = 0;
+
+void PhysicalRegisterUsageInfo::anchor() {}
+
+bool PhysicalRegisterUsageInfo::doInitialization(Module &M) {
+ RegMasks.grow(M.size());
+ return false;
+}
+
+bool PhysicalRegisterUsageInfo::doFinalization(Module &M) {
+ if (DumpRegUsage)
+ print(errs());
+
+ RegMasks.shrink_and_clear();
+ return false;
+}
+
+void PhysicalRegisterUsageInfo::storeUpdateRegUsageInfo(
+ const Function *FP, std::vector<uint32_t> RegMask) {
+ assert(FP != nullptr && "Function * can't be nullptr.");
+ RegMasks[FP] = std::move(RegMask);
+}
+
+const std::vector<uint32_t> *
+PhysicalRegisterUsageInfo::getRegUsageInfo(const Function *FP) {
+ auto It = RegMasks.find(FP);
+ if (It != RegMasks.end())
+ return &(It->second);
+ return nullptr;
+}
+
+void PhysicalRegisterUsageInfo::print(raw_ostream &OS, const Module *M) const {
+ const TargetRegisterInfo *TRI;
+
+ typedef std::pair<const Function *, std::vector<uint32_t>> FuncPtrRegMaskPair;
+
+ SmallVector<const FuncPtrRegMaskPair *, 64> FPRMPairVector;
+
+ // Create a vector of pointers to the RegMasks entries.
+ for (const auto &RegMask : RegMasks)
+ FPRMPairVector.push_back(&RegMask);
+
+ // Sort the vector to print the analysis in alphabetical order of function name.
+ std::sort(
+ FPRMPairVector.begin(), FPRMPairVector.end(),
+ [](const FuncPtrRegMaskPair *A, const FuncPtrRegMaskPair *B) -> bool {
+ return A->first->getName() < B->first->getName();
+ });
+
+ for (const FuncPtrRegMaskPair *FPRMPair : FPRMPairVector) {
+ OS << FPRMPair->first->getName() << " "
+ << "Clobbered Registers: ";
+ TRI = TM->getSubtarget<TargetSubtargetInfo>(*(FPRMPair->first))
+ .getRegisterInfo();
+
+ for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+ if (MachineOperand::clobbersPhysReg(&(FPRMPair->second[0]), PReg))
+ OS << TRI->getName(PReg) << " ";
+ }
+ OS << "\n";
+ }
+}
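+
+// A rough flow sketch, assuming a collector pass and an interprocedural
+// consumer; `PRUI`, `F`, `RegMask` and `Callee` are placeholder names:
+//
+//   PRUI.storeUpdateRegUsageInfo(&F, std::move(RegMask));
+//   ...
+//   if (const std::vector<uint32_t> *Mask = PRUI.getRegUsageInfo(&Callee))
+//     ; // use *Mask as a more precise clobber mask for calls to Callee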
diff --git a/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
new file mode 100644
index 000000000000..ea952d9088fc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/RenameIndependentSubregs.cpp
@@ -0,0 +1,388 @@
+//===-- RenameIndependentSubregs.cpp - Live Interval Analysis -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// The rename-independent-subregs pass looks for virtual registers with
+/// independently used subregisters and renames them to new virtual registers.
+/// Example: In the following:
+/// %vreg0:sub0<read-undef> = ...
+/// %vreg0:sub1 = ...
+/// use %vreg0:sub0
+/// %vreg0:sub0 = ...
+/// use %vreg0:sub0
+/// use %vreg0:sub1
+/// sub0 and sub1 are never used together, and we have two independent sub0
+/// definitions. This pass will rename to:
+/// %vreg0:sub0<read-undef> = ...
+/// %vreg1:sub1<read-undef> = ...
+/// use %vreg1:sub1
+/// %vreg2:sub1<read-undef> = ...
+/// use %vreg2:sub1
+/// use %vreg0:sub0
+//
+//===----------------------------------------------------------------------===//
+
+#include "LiveRangeUtils.h"
+#include "PHIEliminationUtils.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "rename-independent-subregs"
+
+namespace {
+
+class RenameIndependentSubregs : public MachineFunctionPass {
+public:
+ static char ID;
+ RenameIndependentSubregs() : MachineFunctionPass(ID) {}
+
+ const char *getPassName() const override {
+ return "Rename Disconnected Subregister Components";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ struct SubRangeInfo {
+ ConnectedVNInfoEqClasses ConEQ;
+ LiveInterval::SubRange *SR;
+ unsigned Index;
+
+ SubRangeInfo(LiveIntervals &LIS, LiveInterval::SubRange &SR,
+ unsigned Index)
+ : ConEQ(LIS), SR(&SR), Index(Index) {}
+ };
+
+ /// Split unrelated subregister components and rename them to new vregs.
+ bool renameComponents(LiveInterval &LI) const;
+
+ /// \brief Build a vector of SubRange infos and a union find set of
+ /// equivalence classes.
+ /// Returns true if more than 1 equivalence class was found.
+ bool findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const;
+
+ /// \brief Distribute the LiveInterval segments into the new LiveIntervals
+ /// belonging to their class.
+ void distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ /// \brief Constructs main liverange and add missing undef+dead flags.
+ void computeMainRangesFixFlags(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+ /// Rewrite Machine Operands to use the new vreg belonging to their class.
+ void rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const;
+
+
+ LiveIntervals *LIS;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+};
+
+} // end anonymous namespace
+
+char RenameIndependentSubregs::ID;
+
+char &llvm::RenameIndependentSubregsID = RenameIndependentSubregs::ID;
+
+INITIALIZE_PASS_BEGIN(RenameIndependentSubregs, "rename-independent-subregs",
+ "Rename Independent Subregisters", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_END(RenameIndependentSubregs, "rename-independent-subregs",
+ "Rename Independent Subregisters", false, false)
+
+bool RenameIndependentSubregs::renameComponents(LiveInterval &LI) const {
+ // Shortcut: We cannot have split components with a single definition.
+ if (LI.valnos.size() < 2)
+ return false;
+
+ SmallVector<SubRangeInfo, 4> SubRangeInfos;
+ IntEqClasses Classes;
+ if (!findComponents(Classes, SubRangeInfos, LI))
+ return false;
+
+ // Create a new VReg for each class.
+ unsigned Reg = LI.reg;
+ const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+ SmallVector<LiveInterval*, 4> Intervals;
+ Intervals.push_back(&LI);
+ DEBUG(dbgs() << PrintReg(Reg) << ": Found " << Classes.getNumClasses()
+ << " equivalence classes.\n");
+ DEBUG(dbgs() << PrintReg(Reg) << ": Splitting into newly created:");
+ for (unsigned I = 1, NumClasses = Classes.getNumClasses(); I < NumClasses;
+ ++I) {
+ unsigned NewVReg = MRI->createVirtualRegister(RegClass);
+ LiveInterval &NewLI = LIS->createEmptyInterval(NewVReg);
+ Intervals.push_back(&NewLI);
+ DEBUG(dbgs() << ' ' << PrintReg(NewVReg));
+ }
+ DEBUG(dbgs() << '\n');
+
+ rewriteOperands(Classes, SubRangeInfos, Intervals);
+ distribute(Classes, SubRangeInfos, Intervals);
+ computeMainRangesFixFlags(Classes, SubRangeInfos, Intervals);
+ return true;
+}
+
+bool RenameIndependentSubregs::findComponents(IntEqClasses &Classes,
+ SmallVectorImpl<RenameIndependentSubregs::SubRangeInfo> &SubRangeInfos,
+ LiveInterval &LI) const {
+ // First step: Create connected components for the VNInfos inside the
+ // subranges and count the global number of such components.
+ unsigned NumComponents = 0;
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ SubRangeInfos.push_back(SubRangeInfo(*LIS, SR, NumComponents));
+ ConnectedVNInfoEqClasses &ConEQ = SubRangeInfos.back().ConEQ;
+
+ unsigned NumSubComponents = ConEQ.Classify(SR);
+ NumComponents += NumSubComponents;
+ }
+ // Shortcut: With only 1 subrange, the normal separate component tests are
+ // enough and we do not need to perform the union-find on the subregister
+ // segments.
+ if (SubRangeInfos.size() < 2)
+ return false;
+
+ // Next step: Build union-find structure over all subranges and merge classes
+ // across subranges when they are affected by the same MachineOperand.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ Classes.grow(NumComponents);
+ unsigned Reg = LI.reg;
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+ unsigned MergedID = ~0u;
+ for (RenameIndependentSubregs::SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask) == 0)
+ continue;
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ Pos = MO.isDef() ? Pos.getRegSlot(MO.isEarlyClobber())
+ : Pos.getBaseIndex();
+ const VNInfo *VNI = SR.getVNInfoAt(Pos);
+ if (VNI == nullptr)
+ continue;
+
+ // Map to the local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(VNI);
+ // Global ID
+ unsigned ID = LocalID + SRInfo.Index;
+ // Merge other sets
+ MergedID = MergedID == ~0u ? ID : Classes.join(MergedID, ID);
+ }
+ }
+
+ // Early exit if we ended up with a single equivalence class.
+ Classes.compress();
+ unsigned NumClasses = Classes.getNumClasses();
+ return NumClasses > 1;
+}
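+
+// Worked example: suppose sub0 contributes local classes {0,1} (Index 0) and
+// sub1 contributes {0} (Index 2). An operand that touches a class-1 value in
+// sub0 and the class-0 value in sub1 joins global IDs 1 and 2, leaving two
+// equivalence classes {0} and {1,2}, so the function returns true and the
+// vreg is split in two.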
+
+void RenameIndependentSubregs::rewriteOperands(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ unsigned Reg = Intervals[0]->reg;
+ for (MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(Reg),
+ E = MRI->reg_nodbg_end(); I != E; ) {
+ MachineOperand &MO = *I++;
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ MachineInstr &MI = *MO.getParent();
+
+ SlotIndex Pos = LIS->getInstructionIndex(MI);
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubRegIdx);
+
+ unsigned ID = ~0u;
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ const LiveInterval::SubRange &SR = *SRInfo.SR;
+ if ((SR.LaneMask & LaneMask) == 0)
+ continue;
+ LiveRange::const_iterator I = SR.find(Pos);
+ if (I == SR.end())
+ continue;
+
+ const VNInfo &VNI = *I->valno;
+ // Map to the local representative ID.
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(&VNI);
+ // Global ID
+ ID = Classes[LocalID + SRInfo.Index];
+ break;
+ }
+
+ unsigned VReg = Intervals[ID]->reg;
+ MO.setReg(VReg);
+ }
+ // TODO: We could attempt to recompute new register classes while visiting
+ // the operands: some of the split registers may get by with less constrained
+ // classes than the original vreg.
+}
+
+void RenameIndependentSubregs::distribute(const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ unsigned NumClasses = Classes.getNumClasses();
+ SmallVector<unsigned, 8> VNIMapping;
+ SmallVector<LiveInterval::SubRange*, 8> SubRanges;
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ for (const SubRangeInfo &SRInfo : SubRangeInfos) {
+ LiveInterval::SubRange &SR = *SRInfo.SR;
+ unsigned NumValNos = SR.valnos.size();
+ VNIMapping.clear();
+ VNIMapping.reserve(NumValNos);
+ SubRanges.clear();
+ SubRanges.resize(NumClasses-1, nullptr);
+ for (unsigned I = 0; I < NumValNos; ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ unsigned LocalID = SRInfo.ConEQ.getEqClass(&VNI);
+ unsigned ID = Classes[LocalID + SRInfo.Index];
+ VNIMapping.push_back(ID);
+ if (ID > 0 && SubRanges[ID-1] == nullptr)
+ SubRanges[ID-1] = Intervals[ID]->createSubRange(Allocator, SR.LaneMask);
+ }
+ DistributeRange(SR, SubRanges.data(), VNIMapping);
+ }
+}
+
+static bool subRangeLiveAt(const LiveInterval &LI, SlotIndex Pos) {
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if (SR.liveAt(Pos))
+ return true;
+ }
+ return false;
+}
+
+void RenameIndependentSubregs::computeMainRangesFixFlags(
+ const IntEqClasses &Classes,
+ const SmallVectorImpl<SubRangeInfo> &SubRangeInfos,
+ const SmallVectorImpl<LiveInterval*> &Intervals) const {
+ BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
+ const SlotIndexes &Indexes = *LIS->getSlotIndexes();
+ for (size_t I = 0, E = Intervals.size(); I < E; ++I) {
+ LiveInterval &LI = *Intervals[I];
+ unsigned Reg = LI.reg;
+
+ LI.removeEmptySubRanges();
+
+ // There must be a def (or live-in) before every use. Splitting vregs may
+ // violate this principle, as the split vreg may not have a definition on
+ // every path. Fix this by creating IMPLICIT_DEF instructions as necessary.
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ // Search for "PHI" value numbers in the subranges. We must find a live
+ // value in each predecessor block and add an IMPLICIT_DEF where it is
+ // missing.
+ for (unsigned I = 0; I < SR.valnos.size(); ++I) {
+ const VNInfo &VNI = *SR.valnos[I];
+ if (VNI.isUnused() || !VNI.isPHIDef())
+ continue;
+
+ SlotIndex Def = VNI.def;
+ MachineBasicBlock &MBB = *Indexes.getMBBFromIndex(Def);
+ for (MachineBasicBlock *PredMBB : MBB.predecessors()) {
+ SlotIndex PredEnd = Indexes.getMBBEndIdx(PredMBB);
+ if (subRangeLiveAt(LI, PredEnd.getPrevSlot()))
+ continue;
+
+ MachineBasicBlock::iterator InsertPos =
+ llvm::findPHICopyInsertPoint(PredMBB, &MBB, Reg);
+ const MCInstrDesc &MCDesc = TII->get(TargetOpcode::IMPLICIT_DEF);
+ MachineInstrBuilder ImpDef = BuildMI(*PredMBB, InsertPos,
+ DebugLoc(), MCDesc, Reg);
+ SlotIndex DefIdx = LIS->InsertMachineInstrInMaps(*ImpDef);
+ SlotIndex RegDefIdx = DefIdx.getRegSlot();
+ for (LiveInterval::SubRange &SR : LI.subranges()) {
+ VNInfo *SRVNI = SR.getNextValue(RegDefIdx, Allocator);
+ SR.addSegment(LiveRange::Segment(RegDefIdx, PredEnd, SRVNI));
+ }
+ }
+ }
+ }
+
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef())
+ continue;
+ unsigned SubRegIdx = MO.getSubReg();
+ if (SubRegIdx == 0)
+ continue;
+ // After assigning the new vreg, other sublanes may no longer be live
+ // into and out of the instruction. We need to add new dead and undef
+ // flags in these cases.
+ if (!MO.isUndef()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent());
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsUndef();
+ }
+ if (!MO.isDead()) {
+ SlotIndex Pos = LIS->getInstructionIndex(*MO.getParent()).getDeadSlot();
+ if (!subRangeLiveAt(LI, Pos))
+ MO.setIsDead();
+ }
+ }
+
+ if (I == 0)
+ LI.clear();
+ LIS->constructMainRangeFromSubranges(LI);
+ }
+}
+
+bool RenameIndependentSubregs::runOnMachineFunction(MachineFunction &MF) {
+ // Skip renaming if liveness of subregister is not tracked.
+ if (!MF.getSubtarget().enableSubRegLiveness())
+ return false;
+
+ DEBUG(dbgs() << "Renaming independent subregister live ranges in "
+ << MF.getName() << '\n');
+
+ LIS = &getAnalysis<LiveIntervals>();
+ MRI = &MF.getRegInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+
+ // Iterate over all vregs. Note that we only query getNumVirtRegs() once:
+ // the newly created vregs end up with higher numbers and do not need to be
+ // visited, as there can't be any further splitting.
+ bool Changed = false;
+ for (size_t I = 0, E = MRI->getNumVirtRegs(); I < E; ++I) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+ if (!LIS->hasInterval(Reg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(Reg);
+ if (!LI.hasSubRanges())
+ continue;
+
+ Changed |= renameComponents(LI);
+ }
+
+ return Changed;
+}
diff --git a/contrib/llvm/lib/CodeGen/SafeStack.cpp b/contrib/llvm/lib/CodeGen/SafeStack.cpp
new file mode 100644
index 000000000000..19cd59b9dba7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SafeStack.cpp
@@ -0,0 +1,851 @@
+//===-- SafeStack.cpp - Safe Stack Insertion ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass splits the stack into the safe stack (kept as-is for LLVM backend)
+// and the unsafe stack (explicitly allocated and managed through the runtime
+// support library).
+//
+// http://clang.llvm.org/docs/SafeStack.html
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackColoring.h"
+#include "SafeStackLayout.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestack"
+
+enum UnsafeStackPtrStorageVal { ThreadLocalUSP, SingleThreadUSP };
+
+static cl::opt<UnsafeStackPtrStorageVal> USPStorage("safe-stack-usp-storage",
+ cl::Hidden, cl::init(ThreadLocalUSP),
+ cl::desc("Type of storage for the unsafe stack pointer"),
+ cl::values(clEnumValN(ThreadLocalUSP, "thread-local",
+ "Thread-local storage"),
+ clEnumValN(SingleThreadUSP, "single-thread",
+ "Non-thread-local storage"),
+ clEnumValEnd));
+
+namespace llvm {
+
+STATISTIC(NumFunctions, "Total number of functions");
+STATISTIC(NumUnsafeStackFunctions, "Number of functions with unsafe stack");
+STATISTIC(NumUnsafeStackRestorePointsFunctions,
+ "Number of functions that use setjmp or exceptions");
+
+STATISTIC(NumAllocas, "Total number of allocas");
+STATISTIC(NumUnsafeStaticAllocas, "Number of unsafe static allocas");
+STATISTIC(NumUnsafeDynamicAllocas, "Number of unsafe dynamic allocas");
+STATISTIC(NumUnsafeByValArguments, "Number of unsafe byval arguments");
+STATISTIC(NumUnsafeStackRestorePoints, "Number of setjmps and landingpads");
+
+} // namespace llvm
+
+namespace {
+
+/// Rewrite an SCEV expression for a memory access address to an expression
+/// that represents the offset from the given alloca.
+///
+/// The implementation simply replaces all mentions of the alloca with zero.
+class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
+ const Value *AllocaPtr;
+
+public:
+ AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
+ : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
+
+ const SCEV *visitUnknown(const SCEVUnknown *Expr) {
+ if (Expr->getValue() == AllocaPtr)
+ return SE.getZero(Expr->getType());
+ return Expr;
+ }
+};
+
+/// The SafeStack pass splits the stack of each function into the safe
+/// stack, which is only accessed through memory safe dereferences (as
+/// determined statically), and the unsafe stack, which contains all
+/// local variables that are accessed in ways that we can't prove to
+/// be safe.
+class SafeStack : public FunctionPass {
+ const TargetMachine *TM;
+ const TargetLoweringBase *TL;
+ const DataLayout *DL;
+ ScalarEvolution *SE;
+
+ Type *StackPtrTy;
+ Type *IntPtrTy;
+ Type *Int32Ty;
+ Type *Int8Ty;
+
+ Value *UnsafeStackPtr = nullptr;
+
+ /// Unsafe stack alignment. Each stack frame must ensure that the stack is
+ /// aligned to this value. We need to re-align the unsafe stack if the
+ /// alignment of any object on the stack exceeds this value.
+ ///
+ /// 16 seems like a reasonable upper bound on the alignment of objects that we
+ /// might expect to appear on the stack on most common targets.
+ enum { StackAlignment = 16 };
+
+ /// \brief Build a value representing a pointer to the unsafe stack pointer.
+ Value *getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F);
+
+ /// \brief Return the value of the stack canary.
+ Value *getStackGuard(IRBuilder<> &IRB, Function &F);
+
+ /// \brief Load stack guard from the frame and check if it has changed.
+ void checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard);
+
+ /// \brief Find all static allocas, dynamic allocas, return instructions and
+ /// stack restore points (exception unwind blocks and setjmp calls) in the
+ /// given function and append them to the respective vectors.
+ void findInsts(Function &F, SmallVectorImpl<AllocaInst *> &StaticAllocas,
+ SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+ SmallVectorImpl<Argument *> &ByValArguments,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ SmallVectorImpl<Instruction *> &StackRestorePoints);
+
+ /// \brief Calculate the allocation size of a given alloca. Returns 0 if the
+ /// size can not be statically determined.
+ uint64_t getStaticAllocaAllocationSize(const AllocaInst* AI);
+
+ /// \brief Allocate space for all static allocas in \p StaticAllocas,
+ /// replace allocas with pointers into the unsafe stack and generate code to
+ /// restore the stack pointer before all return instructions in \p Returns.
+ ///
+ /// \returns A pointer to the top of the unsafe stack after all unsafe static
+ /// allocas are allocated.
+ Value *moveStaticAllocasToUnsafeStack(IRBuilder<> &IRB, Function &F,
+ ArrayRef<AllocaInst *> StaticAllocas,
+ ArrayRef<Argument *> ByValArguments,
+ ArrayRef<ReturnInst *> Returns,
+ Instruction *BasePointer,
+ AllocaInst *StackGuardSlot);
+
+ /// \brief Generate code to restore the stack after all stack restore points
+ /// in \p StackRestorePoints.
+ ///
+ /// \returns A local variable in which to maintain the dynamic top of the
+ /// unsafe stack if needed.
+ AllocaInst *
+ createStackRestorePoints(IRBuilder<> &IRB, Function &F,
+ ArrayRef<Instruction *> StackRestorePoints,
+ Value *StaticTop, bool NeedDynamicTop);
+
+ /// \brief Replace all allocas in \p DynamicAllocas with code to allocate
+ /// space dynamically on the unsafe stack and store the dynamic unsafe stack
+ /// top to \p DynamicTop if non-null.
+ void moveDynamicAllocasToUnsafeStack(Function &F, Value *UnsafeStackPtr,
+ AllocaInst *DynamicTop,
+ ArrayRef<AllocaInst *> DynamicAllocas);
+
+ bool IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize);
+
+ bool IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
+ const Value *AllocaPtr, uint64_t AllocaSize);
+ bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr,
+ uint64_t AllocaSize);
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ SafeStack(const TargetMachine *TM)
+ : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) {
+ initializeSafeStackPass(*PassRegistry::getPassRegistry());
+ }
+ SafeStack() : SafeStack(nullptr) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ }
+
+ bool doInitialization(Module &M) override {
+ DL = &M.getDataLayout();
+
+ StackPtrTy = Type::getInt8PtrTy(M.getContext());
+ IntPtrTy = DL->getIntPtrType(M.getContext());
+ Int32Ty = Type::getInt32Ty(M.getContext());
+ Int8Ty = Type::getInt8Ty(M.getContext());
+
+ return false;
+ }
+
+ bool runOnFunction(Function &F) override;
+}; // class SafeStack
+
+uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) {
+ uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType());
+ if (AI->isArrayAllocation()) {
+ auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+ if (!C)
+ return 0;
+ Size *= C->getZExtValue();
+ }
+ return Size;
+}
+
+bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize,
+ const Value *AllocaPtr, uint64_t AllocaSize) {
+ AllocaOffsetRewriter Rewriter(*SE, AllocaPtr);
+ const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr));
+
+ uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType());
+ ConstantRange AccessStartRange = SE->getUnsignedRange(Expr);
+ ConstantRange SizeRange =
+ ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize));
+ ConstantRange AccessRange = AccessStartRange.add(SizeRange);
+ ConstantRange AllocaRange =
+ ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AllocaSize));
+ bool Safe = AllocaRange.contains(AccessRange);
+
+ DEBUG(dbgs() << "[SafeStack] "
+ << (isa<AllocaInst>(AllocaPtr) ? "Alloca " : "ByValArgument ")
+ << *AllocaPtr << "\n"
+ << " Access " << *Addr << "\n"
+ << " SCEV " << *Expr
+ << " U: " << SE->getUnsignedRange(Expr)
+ << ", S: " << SE->getSignedRange(Expr) << "\n"
+ << " Range " << AccessRange << "\n"
+ << " AllocaRange " << AllocaRange << "\n"
+ << " " << (Safe ? "safe" : "unsafe") << "\n");
+
+ return Safe;
+}
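+
+// Numeric example: for a 40-byte alloca, AllocaRange is [0, 40). An 8-byte
+// access whose rewritten start offset is known to lie in [8, 16) touches at
+// most bytes 8..22, so its AccessRange stays inside AllocaRange and the
+// access is safe; a start offset in [36, 37) could touch byte 43, escaping
+// the alloca, so the access is reported unsafe.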
+
+bool SafeStack::IsMemIntrinsicSafe(const MemIntrinsic *MI, const Use &U,
+ const Value *AllocaPtr,
+ uint64_t AllocaSize) {
+ // All MemIntrinsics have destination address in Arg0 and size in Arg2.
+ if (MI->getRawDest() != U) return true;
+ const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
+ // Non-constant size => unsafe. FIXME: try SCEV getRange.
+ if (!Len) return false;
+ return IsAccessSafe(U, Len->getZExtValue(), AllocaPtr, AllocaSize);
+}
+
+/// Check whether a given allocation must be put on the safe
+/// stack or not. The function analyzes all uses of AI and checks whether it is
+/// only accessed in a memory safe way (as decided statically).
+bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
+ // Go through all uses of this alloca and check whether all accesses to the
+ // allocated object are statically known to be memory safe and, hence, the
+ // object can be placed on the safe stack.
+ SmallPtrSet<const Value *, 16> Visited;
+ SmallVector<const Value *, 8> WorkList;
+ WorkList.push_back(AllocaPtr);
+
+ // A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc.
+ while (!WorkList.empty()) {
+ const Value *V = WorkList.pop_back_val();
+ for (const Use &UI : V->uses()) {
+ auto I = cast<const Instruction>(UI.getUser());
+ assert(V == UI.get());
+
+ switch (I->getOpcode()) {
+ case Instruction::Load: {
+ if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr,
+ AllocaSize))
+ return false;
+ break;
+ }
+ case Instruction::VAArg:
+ // "va-arg" from a pointer is safe.
+ break;
+ case Instruction::Store: {
+ if (V == I->getOperand(0)) {
+ // Stored the pointer - conservatively assume it may be unsafe.
+ DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n store of address: " << *I << "\n");
+ return false;
+ }
+
+ if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()),
+ AllocaPtr, AllocaSize))
+ return false;
+ break;
+ }
+ case Instruction::Ret: {
+ // Information leak.
+ return false;
+ }
+
+ case Instruction::Call:
+ case Instruction::Invoke: {
+ ImmutableCallSite CS(I);
+
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end)
+ continue;
+ }
+
+ if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
+ if (!IsMemIntrinsicSafe(MI, UI, AllocaPtr, AllocaSize)) {
+ DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe memintrinsic: " << *I
+ << "\n");
+ return false;
+ }
+ continue;
+ }
+
+ // LLVM 'nocapture' attribute is only set for arguments whose address
+ // is not stored, passed around, or used in any other non-trivial way.
+ // We assume that passing a pointer to an object as a 'nocapture
+ // readnone' argument is safe.
+ // FIXME: a more precise solution would require an interprocedural
+ // analysis here, which would look at all uses of an argument inside
+ // the function being called.
+ ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
+ for (ImmutableCallSite::arg_iterator A = B; A != E; ++A)
+ if (A->get() == V)
+ if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
+ CS.doesNotAccessMemory()))) {
+ DEBUG(dbgs() << "[SafeStack] Unsafe alloca: " << *AllocaPtr
+ << "\n unsafe call: " << *I << "\n");
+ return false;
+ }
+ continue;
+ }
+
+ default:
+ if (Visited.insert(I).second)
+ WorkList.push_back(cast<const Instruction>(I));
+ }
+ }
+ }
+
+ // All uses of the alloca are safe, we can place it on the safe stack.
+ return true;
+}
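+
+// Illustrative outcomes: an alloca that is only read and written through
+// statically in-bounds offsets passes every IsAccessSafe check and stays on
+// the safe stack, while an alloca whose address is returned, stored, or
+// passed to a callee without nocapture/readnone guarantees fails here and is
+// moved to the unsafe stack.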
+
+Value *SafeStack::getOrCreateUnsafeStackPtr(IRBuilder<> &IRB, Function &F) {
+ // Check if there is a target-specific location for the unsafe stack pointer.
+ if (TL)
+ if (Value *V = TL->getSafeStackPointerLocation(IRB))
+ return V;
+
+ // Otherwise, assume the target links with compiler-rt, which provides a
+ // thread-local variable with a magic name.
+ Module &M = *F.getParent();
+ const char *UnsafeStackPtrVar = "__safestack_unsafe_stack_ptr";
+ auto UnsafeStackPtr =
+ dyn_cast_or_null<GlobalVariable>(M.getNamedValue(UnsafeStackPtrVar));
+
+ bool UseTLS = USPStorage == ThreadLocalUSP;
+
+ if (!UnsafeStackPtr) {
+ auto TLSModel = UseTLS ?
+ GlobalValue::InitialExecTLSModel :
+ GlobalValue::NotThreadLocal;
+ // The global variable is not defined yet, define it ourselves.
+ // We use the initial-exec TLS model because we do not support the
+ // variable living anywhere other than in the main executable.
+ UnsafeStackPtr = new GlobalVariable(
+ M, StackPtrTy, false, GlobalValue::ExternalLinkage, nullptr,
+ UnsafeStackPtrVar, nullptr, TLSModel);
+ } else {
+ // The variable exists, check its type and attributes.
+ if (UnsafeStackPtr->getValueType() != StackPtrTy)
+ report_fatal_error(Twine(UnsafeStackPtrVar) + " must have void* type");
+ if (UseTLS != UnsafeStackPtr->isThreadLocal())
+ report_fatal_error(Twine(UnsafeStackPtrVar) + " must " +
+ (UseTLS ? "" : "not ") + "be thread-local");
+ }
+ return UnsafeStackPtr;
+}
+
+Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) {
+ Value *StackGuardVar = nullptr;
+ if (TL)
+ StackGuardVar = TL->getIRStackGuard(IRB);
+ if (!StackGuardVar)
+ StackGuardVar =
+ F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy);
+ return IRB.CreateLoad(StackGuardVar, "StackGuard");
+}
+
+void SafeStack::findInsts(Function &F,
+ SmallVectorImpl<AllocaInst *> &StaticAllocas,
+ SmallVectorImpl<AllocaInst *> &DynamicAllocas,
+ SmallVectorImpl<Argument *> &ByValArguments,
+ SmallVectorImpl<ReturnInst *> &Returns,
+ SmallVectorImpl<Instruction *> &StackRestorePoints) {
+ for (Instruction &I : instructions(&F)) {
+ if (auto AI = dyn_cast<AllocaInst>(&I)) {
+ ++NumAllocas;
+
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (IsSafeStackAlloca(AI, Size))
+ continue;
+
+ if (AI->isStaticAlloca()) {
+ ++NumUnsafeStaticAllocas;
+ StaticAllocas.push_back(AI);
+ } else {
+ ++NumUnsafeDynamicAllocas;
+ DynamicAllocas.push_back(AI);
+ }
+ } else if (auto RI = dyn_cast<ReturnInst>(&I)) {
+ Returns.push_back(RI);
+ } else if (auto CI = dyn_cast<CallInst>(&I)) {
+ // setjmps require stack restore.
+ if (CI->getCalledFunction() && CI->canReturnTwice())
+ StackRestorePoints.push_back(CI);
+ } else if (auto LP = dyn_cast<LandingPadInst>(&I)) {
+ // Exception landing pads require stack restore.
+ StackRestorePoints.push_back(LP);
+ } else if (auto II = dyn_cast<IntrinsicInst>(&I)) {
+ if (II->getIntrinsicID() == Intrinsic::gcroot)
+ llvm::report_fatal_error(
+ "gcroot intrinsic not compatible with safestack attribute");
+ }
+ }
+ for (Argument &Arg : F.args()) {
+ if (!Arg.hasByValAttr())
+ continue;
+ uint64_t Size =
+ DL->getTypeStoreSize(Arg.getType()->getPointerElementType());
+ if (IsSafeStackAlloca(&Arg, Size))
+ continue;
+
+ ++NumUnsafeByValArguments;
+ ByValArguments.push_back(&Arg);
+ }
+}
+
+AllocaInst *
+SafeStack::createStackRestorePoints(IRBuilder<> &IRB, Function &F,
+ ArrayRef<Instruction *> StackRestorePoints,
+ Value *StaticTop, bool NeedDynamicTop) {
+ assert(StaticTop && "The stack top isn't set.");
+
+ if (StackRestorePoints.empty())
+ return nullptr;
+
+ // We need the current value of the shadow stack pointer to restore
+ // after longjmp or exception catching.
+
+ // FIXME: On some platforms this could be handled by the longjmp/exception
+ // runtime itself.
+
+ AllocaInst *DynamicTop = nullptr;
+ if (NeedDynamicTop) {
+ // If we also have dynamic allocas, the stack pointer value changes
+ // throughout the function. For now we store it in an alloca.
+ DynamicTop = IRB.CreateAlloca(StackPtrTy, /*ArraySize=*/nullptr,
+ "unsafe_stack_dynamic_ptr");
+ IRB.CreateStore(StaticTop, DynamicTop);
+ }
+
+ // Restore current stack pointer after longjmp/exception catch.
+ for (Instruction *I : StackRestorePoints) {
+ ++NumUnsafeStackRestorePoints;
+
+ IRB.SetInsertPoint(I->getNextNode());
+ Value *CurrentTop = DynamicTop ? IRB.CreateLoad(DynamicTop) : StaticTop;
+ IRB.CreateStore(CurrentTop, UnsafeStackPtr);
+ }
+
+ return DynamicTop;
+}
+
+void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
+ AllocaInst *StackGuardSlot, Value *StackGuard) {
+ Value *V = IRB.CreateLoad(StackGuardSlot);
+ Value *Cmp = IRB.CreateICmpNE(StackGuard, V);
+
+ auto SuccessProb = BranchProbabilityInfo::getBranchProbStackProtector(true);
+ auto FailureProb = BranchProbabilityInfo::getBranchProbStackProtector(false);
+ MDNode *Weights = MDBuilder(F.getContext())
+ .createBranchWeights(SuccessProb.getNumerator(),
+ FailureProb.getNumerator());
+ Instruction *CheckTerm =
+ SplitBlockAndInsertIfThen(Cmp, &RI,
+ /* Unreachable */ true, Weights);
+ IRBuilder<> IRBFail(CheckTerm);
+ // FIXME: respect -fsanitize-trap / -ftrap-function here?
+ Constant *StackChkFail = F.getParent()->getOrInsertFunction(
+ "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+ IRBFail.CreateCall(StackChkFail, {});
+}
+
+/// We explicitly compute and set the unsafe stack layout for all unsafe
+/// static alloca instructions. We save the unsafe "base pointer" in the
+/// prologue into a local variable and restore it in the epilogue.
+Value *SafeStack::moveStaticAllocasToUnsafeStack(
+ IRBuilder<> &IRB, Function &F, ArrayRef<AllocaInst *> StaticAllocas,
+ ArrayRef<Argument *> ByValArguments, ArrayRef<ReturnInst *> Returns,
+ Instruction *BasePointer, AllocaInst *StackGuardSlot) {
+ if (StaticAllocas.empty() && ByValArguments.empty())
+ return BasePointer;
+
+ DIBuilder DIB(*F.getParent());
+
+ StackColoring SSC(F, StaticAllocas);
+ SSC.run();
+ SSC.removeAllMarkers();
+
+ // Unsafe stack always grows down.
+ StackLayout SSL(StackAlignment);
+ if (StackGuardSlot) {
+ Type *Ty = StackGuardSlot->getAllocatedType();
+ unsigned Align =
+ std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment());
+ SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot),
+ Align, SSC.getLiveRange(StackGuardSlot));
+ }
+
+ for (Argument *Arg : ByValArguments) {
+ Type *Ty = Arg->getType()->getPointerElementType();
+ uint64_t Size = DL->getTypeStoreSize(Ty);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
+ unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty),
+ Arg->getParamAlignment());
+ SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange());
+ }
+
+ for (AllocaInst *AI : StaticAllocas) {
+ Type *Ty = AI->getAllocatedType();
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ // Ensure the object is properly aligned.
+ unsigned Align =
+ std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment());
+
+ SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
+ }
+
+ SSL.computeLayout();
+ unsigned FrameAlignment = SSL.getFrameAlignment();
+
+ // FIXME: tell SSL that we start at a less-than-MaxAlignment aligned location
+ // (AlignmentSkew).
+ if (FrameAlignment > StackAlignment) {
+ // Re-align the base pointer according to the max requested alignment.
+ assert(isPowerOf2_32(FrameAlignment));
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+ BasePointer = cast<Instruction>(IRB.CreateIntToPtr(
+ IRB.CreateAnd(IRB.CreatePtrToInt(BasePointer, IntPtrTy),
+ ConstantInt::get(IntPtrTy, ~uint64_t(FrameAlignment - 1))),
+ StackPtrTy));
+ }
+
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+
+ if (StackGuardSlot) {
+ unsigned Offset = SSL.getObjectOffset(StackGuardSlot);
+ Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *NewAI =
+ IRB.CreateBitCast(Off, StackGuardSlot->getType(), "StackGuardSlot");
+
+ // Replace alloc with the new location.
+ StackGuardSlot->replaceAllUsesWith(NewAI);
+ StackGuardSlot->eraseFromParent();
+ }
+
+ for (Argument *Arg : ByValArguments) {
+ unsigned Offset = SSL.getObjectOffset(Arg);
+ Type *Ty = Arg->getType()->getPointerElementType();
+
+ uint64_t Size = DL->getTypeStoreSize(Ty);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ Value *Off = IRB.CreateGEP(BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *NewArg = IRB.CreateBitCast(Off, Arg->getType(),
+ Arg->getName() + ".unsafe-byval");
+
+ // Replace alloc with the new location.
+ replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
+ /*Deref=*/true, -Offset);
+ Arg->replaceAllUsesWith(NewArg);
+ IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
+ IRB.CreateMemCpy(Off, Arg, Size, Arg->getParamAlignment());
+ }
+
+ // Allocate space for every unsafe static AllocaInst on the unsafe stack.
+ for (AllocaInst *AI : StaticAllocas) {
+ IRB.SetInsertPoint(AI);
+ unsigned Offset = SSL.getObjectOffset(AI);
+
+ uint64_t Size = getStaticAllocaAllocationSize(AI);
+ if (Size == 0)
+ Size = 1; // Don't create zero-sized stack objects.
+
+ replaceDbgDeclareForAlloca(AI, BasePointer, DIB, /*Deref=*/true, -Offset);
+ replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
+
+ // Replace uses of the alloca with the new location.
+ // Insert address calculation close to each use to work around PR27844.
+ std::string Name = std::string(AI->getName()) + ".unsafe";
+ while (!AI->use_empty()) {
+ Use &U = *AI->use_begin();
+ Instruction *User = cast<Instruction>(U.getUser());
+
+ Instruction *InsertBefore;
+ if (auto *PHI = dyn_cast<PHINode>(User))
+ InsertBefore = PHI->getIncomingBlock(U)->getTerminator();
+ else
+ InsertBefore = User;
+
+ IRBuilder<> IRBUser(InsertBefore);
+ Value *Off = IRBUser.CreateGEP(BasePointer, // BasePointer is i8*
+ ConstantInt::get(Int32Ty, -Offset));
+ Value *Replacement = IRBUser.CreateBitCast(Off, AI->getType(), Name);
+
+ if (auto *PHI = dyn_cast<PHINode>(User)) {
+ // PHI nodes may have multiple incoming edges from the same BB; all of
+ // them must be updated at once with the same incoming value.
+ auto *BB = PHI->getIncomingBlock(U);
+ for (unsigned I = 0; I < PHI->getNumIncomingValues(); ++I)
+ if (PHI->getIncomingBlock(I) == BB)
+ PHI->setIncomingValue(I, Replacement);
+ } else {
+ U.set(Replacement);
+ }
+ }
+
+ AI->eraseFromParent();
+ }
+
+ // Re-align BasePointer so that our callees would see it aligned as
+ // expected.
+ // FIXME: no need to update BasePointer in leaf functions.
+ unsigned FrameSize = alignTo(SSL.getFrameSize(), StackAlignment);
+
+ // Update shadow stack pointer in the function epilogue.
+ IRB.SetInsertPoint(BasePointer->getNextNode());
+
+ Value *StaticTop =
+ IRB.CreateGEP(BasePointer, ConstantInt::get(Int32Ty, -FrameSize),
+ "unsafe_stack_static_top");
+ IRB.CreateStore(StaticTop, UnsafeStackPtr);
+ return StaticTop;
+}
+
+void SafeStack::moveDynamicAllocasToUnsafeStack(
+ Function &F, Value *UnsafeStackPtr, AllocaInst *DynamicTop,
+ ArrayRef<AllocaInst *> DynamicAllocas) {
+ DIBuilder DIB(*F.getParent());
+
+ for (AllocaInst *AI : DynamicAllocas) {
+ IRBuilder<> IRB(AI);
+
+ // Compute the new SP value (after AI).
+ Value *ArraySize = AI->getArraySize();
+ if (ArraySize->getType() != IntPtrTy)
+ ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false);
+
+ Type *Ty = AI->getAllocatedType();
+ uint64_t TySize = DL->getTypeAllocSize(Ty);
+ Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize));
+
+ Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy);
+ SP = IRB.CreateSub(SP, Size);
+
+ // Align the SP value to satisfy the AllocaInst, type and stack alignments.
+ unsigned Align = std::max(
+ std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+ (unsigned)StackAlignment);
+
+ assert(isPowerOf2_32(Align));
+ Value *NewTop = IRB.CreateIntToPtr(
+ IRB.CreateAnd(SP, ConstantInt::get(IntPtrTy, ~uint64_t(Align - 1))),
+ StackPtrTy);
+
+ // Save the stack pointer.
+ IRB.CreateStore(NewTop, UnsafeStackPtr);
+ if (DynamicTop)
+ IRB.CreateStore(NewTop, DynamicTop);
+
+ Value *NewAI = IRB.CreatePointerCast(NewTop, AI->getType());
+ if (AI->hasName() && isa<Instruction>(NewAI))
+ NewAI->takeName(AI);
+
+ replaceDbgDeclareForAlloca(AI, NewAI, DIB, /*Deref=*/true);
+ AI->replaceAllUsesWith(NewAI);
+ AI->eraseFromParent();
+ }
+
+ if (!DynamicAllocas.empty()) {
+ // Now go through the instructions again, replacing stacksave/stackrestore.
+ for (inst_iterator It = inst_begin(&F), Ie = inst_end(&F); It != Ie;) {
+ Instruction *I = &*(It++);
+ auto II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ continue;
+
+ if (II->getIntrinsicID() == Intrinsic::stacksave) {
+ IRBuilder<> IRB(II);
+ Instruction *LI = IRB.CreateLoad(UnsafeStackPtr);
+ LI->takeName(II);
+ II->replaceAllUsesWith(LI);
+ II->eraseFromParent();
+ } else if (II->getIntrinsicID() == Intrinsic::stackrestore) {
+ IRBuilder<> IRB(II);
+ Instruction *SI = IRB.CreateStore(II->getArgOperand(0), UnsafeStackPtr);
+ SI->takeName(II);
+ assert(II->use_empty());
+ II->eraseFromParent();
+ }
+ }
+ }
+}
+
+bool SafeStack::runOnFunction(Function &F) {
+ DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+ if (!F.hasFnAttribute(Attribute::SafeStack)) {
+ DEBUG(dbgs() << "[SafeStack] safestack is not requested"
+ " for this function\n");
+ return false;
+ }
+
+ if (F.isDeclaration()) {
+ DEBUG(dbgs() << "[SafeStack] function definition"
+ " is not available\n");
+ return false;
+ }
+
+ TL = TM ? TM->getSubtargetImpl(F)->getTargetLowering() : nullptr;
+ SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+ ++NumFunctions;
+
+ SmallVector<AllocaInst *, 16> StaticAllocas;
+ SmallVector<AllocaInst *, 4> DynamicAllocas;
+ SmallVector<Argument *, 4> ByValArguments;
+ SmallVector<ReturnInst *, 4> Returns;
+
+ // Collect all points where the stack gets unwound and needs to be restored.
+ // This is only necessary because the runtime (setjmp and unwind code) is
+ // not aware of the unsafe stack and won't unwind/restore it properly.
+ // To work around this problem without changing the runtime, we insert
+ // instrumentation to restore the unsafe stack pointer when necessary.
+ SmallVector<Instruction *, 4> StackRestorePoints;
+
+ // Find all static and dynamic alloca instructions that must be moved to the
+ // unsafe stack, all return instructions and stack restore points.
+ findInsts(F, StaticAllocas, DynamicAllocas, ByValArguments, Returns,
+ StackRestorePoints);
+
+ if (StaticAllocas.empty() && DynamicAllocas.empty() &&
+ ByValArguments.empty() && StackRestorePoints.empty())
+ return false; // Nothing to do in this function.
+
+ if (!StaticAllocas.empty() || !DynamicAllocas.empty() ||
+ !ByValArguments.empty())
+ ++NumUnsafeStackFunctions; // This function has the unsafe stack.
+
+ if (!StackRestorePoints.empty())
+ ++NumUnsafeStackRestorePointsFunctions;
+
+ IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
+ UnsafeStackPtr = getOrCreateUnsafeStackPtr(IRB, F);
+
+ // Load the current stack pointer (we'll also use it as a base pointer).
+ // FIXME: use a dedicated register for it?
+ Instruction *BasePointer =
+ IRB.CreateLoad(UnsafeStackPtr, false, "unsafe_stack_ptr");
+ assert(BasePointer->getType() == StackPtrTy);
+
+ AllocaInst *StackGuardSlot = nullptr;
+ // FIXME: implement weaker forms of stack protector.
+ if (F.hasFnAttribute(Attribute::StackProtect) ||
+ F.hasFnAttribute(Attribute::StackProtectStrong) ||
+ F.hasFnAttribute(Attribute::StackProtectReq)) {
+ Value *StackGuard = getStackGuard(IRB, F);
+ StackGuardSlot = IRB.CreateAlloca(StackPtrTy, nullptr);
+ IRB.CreateStore(StackGuard, StackGuardSlot);
+
+ for (ReturnInst *RI : Returns) {
+ IRBuilder<> IRBRet(RI);
+ checkStackGuard(IRBRet, F, *RI, StackGuardSlot, StackGuard);
+ }
+ }
+
+ // The top of the unsafe stack after all unsafe static allocas are
+ // allocated.
+ Value *StaticTop =
+ moveStaticAllocasToUnsafeStack(IRB, F, StaticAllocas, ByValArguments,
+ Returns, BasePointer, StackGuardSlot);
+
+ // Safe stack object that stores the current unsafe stack top. It is updated
+ // as unsafe dynamic (non-constant-sized) allocas are allocated and freed.
+ // This is only needed if we need to restore stack pointer after longjmp
+ // or exceptions, and we have dynamic allocations.
+ // FIXME: a better alternative might be to store the unsafe stack pointer
+ // before setjmp / invoke instructions.
+ AllocaInst *DynamicTop = createStackRestorePoints(
+ IRB, F, StackRestorePoints, StaticTop, !DynamicAllocas.empty());
+
+ // Handle dynamic allocas.
+ moveDynamicAllocasToUnsafeStack(F, UnsafeStackPtr, DynamicTop,
+ DynamicAllocas);
+
+ // Restore the unsafe stack pointer before each return.
+ for (ReturnInst *RI : Returns) {
+ IRB.SetInsertPoint(RI);
+ IRB.CreateStore(BasePointer, UnsafeStackPtr);
+ }
+
+ DEBUG(dbgs() << "[SafeStack] safestack applied\n");
+ return true;
+}
+
+} // anonymous namespace
+
+char SafeStack::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+ "Safe Stack instrumentation pass", false, false)
+INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+ "Safe Stack instrumentation pass", false, false)
+
+FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
+ return new SafeStack(TM);
+}
diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
new file mode 100644
index 000000000000..709614f57e7d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.cpp
@@ -0,0 +1,289 @@
+//===-- SafeStackColoring.cpp - SafeStack frame coloring -------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackColoring.h"
+
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestackcoloring"
+
+static cl::opt<bool> ClColoring("safe-stack-coloring",
+ cl::desc("enable safe stack coloring"),
+ cl::Hidden, cl::init(true));
+
+const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
+ return LiveRanges[AllocaNumbering[AI]];
+}
+
+bool StackColoring::readMarker(Instruction *I, bool *IsStart) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II || (II->getIntrinsicID() != Intrinsic::lifetime_start &&
+ II->getIntrinsicID() != Intrinsic::lifetime_end))
+ return false;
+
+ *IsStart = II->getIntrinsicID() == Intrinsic::lifetime_start;
+ return true;
+}
+
+void StackColoring::removeAllMarkers() {
+ for (auto *I : Markers) {
+ auto *Op = dyn_cast<Instruction>(I->getOperand(1));
+ I->eraseFromParent();
+ // Remove the operand bitcast, too, if it has no more uses left.
+ if (Op && Op->use_empty())
+ Op->eraseFromParent();
+ }
+}
+
+void StackColoring::collectMarkers() {
+ InterestingAllocas.resize(NumAllocas);
+ DenseMap<BasicBlock *, SmallDenseMap<Instruction *, Marker>> BBMarkerSet;
+
+ // Compute the set of start/end markers per basic block.
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ AllocaInst *AI = Allocas[AllocaNo];
+ SmallVector<Instruction *, 8> WorkList;
+ WorkList.push_back(AI);
+ while (!WorkList.empty()) {
+ Instruction *I = WorkList.pop_back_val();
+ for (User *U : I->users()) {
+ if (auto *BI = dyn_cast<BitCastInst>(U)) {
+ WorkList.push_back(BI);
+ continue;
+ }
+ auto *UI = dyn_cast<Instruction>(U);
+ if (!UI)
+ continue;
+ bool IsStart;
+ if (!readMarker(UI, &IsStart))
+ continue;
+ if (IsStart)
+ InterestingAllocas.set(AllocaNo);
+ BBMarkerSet[UI->getParent()][UI] = {AllocaNo, IsStart};
+ Markers.push_back(UI);
+ }
+ }
+ }
+
+ // Compute instruction numbering. Only the following instructions are
+ // considered:
+ // * Basic block entries
+ // * Lifetime markers
+ // For each basic block, compute
+ // * the list of markers in the instruction order
+ // * the sets of allocas whose lifetime starts or ends in this BB
+ DEBUG(dbgs() << "Instructions:\n");
+ unsigned InstNo = 0;
+ for (BasicBlock *BB : depth_first(&F)) {
+ DEBUG(dbgs() << " " << InstNo << ": BB " << BB->getName() << "\n");
+ unsigned BBStart = InstNo++;
+
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+ BlockInfo.Begin.resize(NumAllocas);
+ BlockInfo.End.resize(NumAllocas);
+ BlockInfo.LiveIn.resize(NumAllocas);
+ BlockInfo.LiveOut.resize(NumAllocas);
+
+ auto &BlockMarkerSet = BBMarkerSet[BB];
+ if (BlockMarkerSet.empty()) {
+ unsigned BBEnd = InstNo;
+ BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ continue;
+ }
+
+ auto ProcessMarker = [&](Instruction *I, const Marker &M) {
+ DEBUG(dbgs() << " " << InstNo << ": "
+ << (M.IsStart ? "start " : "end ") << M.AllocaNo << ", "
+ << *I << "\n");
+
+ BBMarkers[BB].push_back({InstNo, M});
+
+ InstructionNumbering[I] = InstNo++;
+
+ if (M.IsStart) {
+ if (BlockInfo.End.test(M.AllocaNo))
+ BlockInfo.End.reset(M.AllocaNo);
+ BlockInfo.Begin.set(M.AllocaNo);
+ } else {
+ if (BlockInfo.Begin.test(M.AllocaNo))
+ BlockInfo.Begin.reset(M.AllocaNo);
+ BlockInfo.End.set(M.AllocaNo);
+ }
+ };
+
+ if (BlockMarkerSet.size() == 1) {
+ ProcessMarker(BlockMarkerSet.begin()->getFirst(),
+ BlockMarkerSet.begin()->getSecond());
+ } else {
+ // Scan the BB to determine the marker order.
+ for (Instruction &I : *BB) {
+ auto It = BlockMarkerSet.find(&I);
+ if (It == BlockMarkerSet.end())
+ continue;
+ ProcessMarker(&I, It->getSecond());
+ }
+ }
+
+ unsigned BBEnd = InstNo;
+ BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ }
+ NumInst = InstNo;
+}
+
+void StackColoring::calculateLocalLiveness() {
+ bool changed = true;
+ while (changed) {
+ changed = false;
+
+ for (BasicBlock *BB : depth_first(&F)) {
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+
+ // Compute LiveIn by unioning together the LiveOut sets of all preds.
+ BitVector LocalLiveIn;
+ for (auto *PredBB : predecessors(BB)) {
+ LivenessMap::const_iterator I = BlockLiveness.find(PredBB);
+ assert(I != BlockLiveness.end() && "Predecessor not found");
+ LocalLiveIn |= I->second.LiveOut;
+ }
+
+ // Compute LiveOut by subtracting out lifetimes that end in this
+ // block, then adding in lifetimes that begin in this block. If
+ // we have both BEGIN and END markers in the same basic block
+ // then we know that the BEGIN marker comes after the END,
+ // because we already handle the case where the BEGIN comes
+ // before the END when collecting the markers (and building the
+ // BEGIN/END vectors).
+ BitVector LocalLiveOut = LocalLiveIn;
+ LocalLiveOut.reset(BlockInfo.End);
+ LocalLiveOut |= BlockInfo.Begin;
+
+ // Update block LiveIn set, noting whether it has changed.
+ if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+ changed = true;
+ BlockInfo.LiveIn |= LocalLiveIn;
+ }
+
+ // Update block LiveOut set, noting whether it has changed.
+ if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+ changed = true;
+ BlockInfo.LiveOut |= LocalLiveOut;
+ }
+ }
+ } // while changed.
+}
+
+void StackColoring::calculateLiveIntervals() {
+ for (auto IT : BlockLiveness) {
+ BasicBlock *BB = IT.getFirst();
+ BlockLifetimeInfo &BlockInfo = IT.getSecond();
+ unsigned BBStart, BBEnd;
+ std::tie(BBStart, BBEnd) = BlockInstRange[BB];
+
+ BitVector Started, Ended;
+ Started.resize(NumAllocas);
+ Ended.resize(NumAllocas);
+ SmallVector<unsigned, 8> Start;
+ Start.resize(NumAllocas);
+
+ // LiveIn ranges start at the first instruction.
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ if (BlockInfo.LiveIn.test(AllocaNo)) {
+ Started.set(AllocaNo);
+ Start[AllocaNo] = BBStart;
+ }
+ }
+
+ for (auto &It : BBMarkers[BB]) {
+ unsigned InstNo = It.first;
+ bool IsStart = It.second.IsStart;
+ unsigned AllocaNo = It.second.AllocaNo;
+
+ if (IsStart) {
+ assert(!Started.test(AllocaNo));
+ Started.set(AllocaNo);
+ Ended.reset(AllocaNo);
+ Start[AllocaNo] = InstNo;
+ } else {
+ assert(!Ended.test(AllocaNo));
+ if (Started.test(AllocaNo)) {
+ LiveRanges[AllocaNo].AddRange(Start[AllocaNo], InstNo);
+ Started.reset(AllocaNo);
+ }
+ Ended.set(AllocaNo);
+ }
+ }
+
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
+ if (Started.test(AllocaNo))
+ LiveRanges[AllocaNo].AddRange(Start[AllocaNo], BBEnd);
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
+ dbgs() << "Allocas:\n";
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
+ dbgs() << " " << AllocaNo << ": " << *Allocas[AllocaNo] << "\n";
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpBlockLiveness() {
+ dbgs() << "Block liveness:\n";
+ for (auto IT : BlockLiveness) {
+ BasicBlock *BB = IT.getFirst();
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+ auto BlockRange = BlockInstRange[BB];
+ dbgs() << " BB [" << BlockRange.first << ", " << BlockRange.second
+ << "): begin " << BlockInfo.Begin << ", end " << BlockInfo.End
+ << ", livein " << BlockInfo.LiveIn << ", liveout "
+ << BlockInfo.LiveOut << "\n";
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
+ dbgs() << "Alloca liveness:\n";
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
+ LiveRange &Range = LiveRanges[AllocaNo];
+ dbgs() << " " << AllocaNo << ": " << Range << "\n";
+ }
+}
+
+void StackColoring::run() {
+ DEBUG(dumpAllocas());
+
+ for (unsigned I = 0; I < NumAllocas; ++I)
+ AllocaNumbering[Allocas[I]] = I;
+ LiveRanges.resize(NumAllocas);
+
+ collectMarkers();
+
+ if (!ClColoring) {
+ for (auto &R : LiveRanges) {
+ R.SetMaximum(1);
+ R.AddRange(0, 1);
+ }
+ return;
+ }
+
+ for (auto &R : LiveRanges)
+ R.SetMaximum(NumInst);
+ for (unsigned I = 0; I < NumAllocas; ++I)
+ if (!InterestingAllocas.test(I))
+ LiveRanges[I] = getFullLiveRange();
+
+ calculateLocalLiveness();
+ DEBUG(dumpBlockLiveness());
+ calculateLiveIntervals();
+ DEBUG(dumpLiveRanges());
+}
diff --git a/contrib/llvm/lib/CodeGen/SafeStackColoring.h b/contrib/llvm/lib/CodeGen/SafeStackColoring.h
new file mode 100644
index 000000000000..08b179ccb7f1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SafeStackColoring.h
@@ -0,0 +1,149 @@
+//===-- SafeStackColoring.h - SafeStack frame coloring ---------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
+#define LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/raw_os_ostream.h"
+
+namespace llvm {
+class AllocaInst;
+
+namespace safestack {
+/// Compute live ranges of allocas.
+/// Live ranges are represented as sets of "interesting" instructions, which are
+/// defined as instructions that may start or end an alloca's lifetime. These
+/// are:
+/// * lifetime.start and lifetime.end intrinsics
+/// * first instruction of any basic block
+/// Interesting instructions are numbered in the depth-first walk of the CFG,
+/// and in the program order inside each basic block.
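+/// For example, given two blocks BB0 and BB1 visited in that order, with a
+/// lifetime.start of %a in BB0 and the matching lifetime.end in BB1, the
+/// numbering is 0 = entry of BB0, 1 = the start marker, 2 = entry of BB1,
+/// 3 = the end marker; %a's live range is then the set {1, 2}, i.e. from the
+/// start marker up to, but not including, the end marker.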
+class StackColoring {
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+ /// Which slots BEGIN in each basic block.
+ BitVector Begin;
+ /// Which slots END in each basic block.
+ BitVector End;
+ /// Which slots are marked as LIVE_IN, coming into each basic block.
+ BitVector LiveIn;
+ /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+ BitVector LiveOut;
+ };
+
+public:
+ /// This class represents a set of interesting instructions where an alloca is
+ /// live.
+ struct LiveRange {
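+ // Each set bit is the number of an "interesting" instruction at which the
+ // alloca may be live.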
+ BitVector bv;
+ void SetMaximum(int size) { bv.resize(size); }
+ void AddRange(unsigned start, unsigned end) { bv.set(start, end); }
+ bool Overlaps(const LiveRange &Other) const {
+ return bv.anyCommon(Other.bv);
+ }
+ void Join(const LiveRange &Other) { bv |= Other.bv; }
+ };
+
+private:
+ Function &F;
+
+ /// Maps active slots (per bit) for each basic block.
+ typedef DenseMap<BasicBlock *, BlockLifetimeInfo> LivenessMap;
+ LivenessMap BlockLiveness;
+
+ /// Number of interesting instructions.
+ int NumInst;
+ /// Numeric ids for interesting instructions.
+ DenseMap<Instruction *, unsigned> InstructionNumbering;
+ /// A range [Start, End) of instruction ids for each basic block.
+ /// Instructions inside each BB have monotonic and consecutive ids.
+ DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
+
+ ArrayRef<AllocaInst *> Allocas;
+ unsigned NumAllocas;
+ DenseMap<AllocaInst *, unsigned> AllocaNumbering;
+ /// LiveRange for allocas.
+ SmallVector<LiveRange, 8> LiveRanges;
+
+ /// The set of allocas that have at least one lifetime.start. All other
+ /// allocas get LiveRange that corresponds to the entire function.
+ BitVector InterestingAllocas;
+ SmallVector<Instruction *, 8> Markers;
+
+ struct Marker {
+ unsigned AllocaNo;
+ bool IsStart;
+ };
+
+ /// List of {InstNo, {AllocaNo, IsStart}} for each BB, ordered by InstNo.
+ DenseMap<BasicBlock *, SmallVector<std::pair<unsigned, Marker>, 4>> BBMarkers;
+
+ void dumpAllocas();
+ void dumpBlockLiveness();
+ void dumpLiveRanges();
+
+ bool readMarker(Instruction *I, bool *IsStart);
+ void collectMarkers();
+ void calculateLocalLiveness();
+ void calculateLiveIntervals();
+
+public:
+ StackColoring(Function &F, ArrayRef<AllocaInst *> Allocas)
+ : F(F), NumInst(-1), Allocas(Allocas), NumAllocas(Allocas.size()) {}
+
+ void run();
+ void removeAllMarkers();
+
+ /// Returns a set of "interesting" instructions where the given alloca is
+ /// live. Not all instructions in a function are interesting: we pick a set
+ /// that is large enough for LiveRange::Overlaps to be correct.
+ const LiveRange &getLiveRange(AllocaInst *AI);
+
+ /// Returns a live range that represents an alloca that is live throughout the
+ /// entire function.
+ LiveRange getFullLiveRange() {
+ assert(NumInst >= 0);
+ LiveRange R;
+ R.SetMaximum(NumInst);
+ R.AddRange(0, NumInst);
+ return R;
+ }
+};
+
+static inline raw_ostream &operator<<(raw_ostream &OS, const BitVector &V) {
+ OS << "{";
+ int idx = V.find_first();
+ bool first = true;
+ while (idx >= 0) {
+ if (!first) {
+ OS << ", ";
+ }
+ first = false;
+ OS << idx;
+ idx = V.find_next(idx);
+ }
+ OS << "}";
+ return OS;
+}
+
+static inline raw_ostream &operator<<(raw_ostream &OS,
+ const StackColoring::LiveRange &R) {
+ return OS << R.bv;
+}
+
+} // namespace safestack
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp
new file mode 100644
index 000000000000..b8190e0f2153
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -0,0 +1,138 @@
+//===-- SafeStackLayout.cpp - SafeStack frame layout -----------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SafeStackLayout.h"
+
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::safestack;
+
+#define DEBUG_TYPE "safestacklayout"
+
+static cl::opt<bool> ClLayout("safe-stack-layout",
+ cl::desc("enable safe stack layout"), cl::Hidden,
+ cl::init(true));
+
+LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
+ OS << "Stack regions:\n";
+ for (unsigned i = 0; i < Regions.size(); ++i) {
+ OS << " " << i << ": [" << Regions[i].Start << ", " << Regions[i].End
+ << "), range " << Regions[i].Range << "\n";
+ }
+ OS << "Stack objects:\n";
+ for (auto &IT : ObjectOffsets) {
+ OS << " at " << IT.getSecond() << ": " << *IT.getFirst() << "\n";
+ }
+}
+
+void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
+ const StackColoring::LiveRange &Range) {
+ StackObjects.push_back({V, Size, Alignment, Range});
+ MaxAlignment = std::max(MaxAlignment, Alignment);
+}
+
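+// Returns the smallest Start >= Offset such that Start + Size is a multiple
+// of Alignment. Since an object with layout end offset E is addressed as
+// BasePointer - E, aligning the end offset aligns the object itself; e.g.
+// Offset = 20, Size = 8, Alignment = 16 gives Start = 24 and End = 32.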
+static unsigned AdjustStackOffset(unsigned Offset, unsigned Size,
+ unsigned Alignment) {
+ return alignTo(Offset + Size, Alignment) - Size;
+}
+
+void StackLayout::layoutObject(StackObject &Obj) {
+ if (!ClLayout) {
+ // If layout is disabled, just grab the next aligned address.
+ // This effectively disables stack coloring as well.
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ unsigned Start = AdjustStackOffset(LastRegionEnd, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ Regions.emplace_back(Start, End, Obj.Range);
+ ObjectOffsets[Obj.Handle] = End;
+ return;
+ }
+
+ DEBUG(dbgs() << "Layout: size " << Obj.Size << ", align " << Obj.Alignment
+ << ", range " << Obj.Range << "\n");
+ assert(Obj.Alignment <= MaxAlignment);
+ unsigned Start = AdjustStackOffset(0, Obj.Size, Obj.Alignment);
+ unsigned End = Start + Obj.Size;
+ DEBUG(dbgs() << " First candidate: " << Start << " .. " << End << "\n");
+ for (const StackRegion &R : Regions) {
+ DEBUG(dbgs() << " Examining region: " << R.Start << " .. " << R.End
+ << ", range " << R.Range << "\n");
+ assert(End >= R.Start);
+ if (Start >= R.End) {
+ DEBUG(dbgs() << " Does not intersect, skip.\n");
+ continue;
+ }
+ if (Obj.Range.Overlaps(R.Range)) {
+ // Find the next appropriate location.
+ Start = AdjustStackOffset(R.End, Obj.Size, Obj.Alignment);
+ End = Start + Obj.Size;
+ DEBUG(dbgs() << " Overlaps. Next candidate: " << Start << " .. " << End
+ << "\n");
+ continue;
+ }
+ if (End <= R.End) {
+ DEBUG(dbgs() << " Reusing region(s).\n");
+ break;
+ }
+ }
+
+ unsigned LastRegionEnd = Regions.empty() ? 0 : Regions.back().End;
+ if (End > LastRegionEnd) {
+ // Insert a new region at the end. Maybe two.
+ if (Start > LastRegionEnd) {
+ DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
+ << Start << "\n");
+ Regions.emplace_back(LastRegionEnd, Start, StackColoring::LiveRange());
+ LastRegionEnd = Start;
+ }
+ DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. " << End
+ << ", range " << Obj.Range << "\n");
+ Regions.emplace_back(LastRegionEnd, End, Obj.Range);
+ LastRegionEnd = End;
+ }
+
+ // Split starting and ending regions if necessary.
+ for (StackRegion &R : Regions) {
+ if (Start > R.Start && Start < R.End) {
+ StackRegion R0 = R;
+ R.Start = R0.End = Start;
+ Regions.insert(&R, R0);
+ continue;
+ }
+ if (End > R.Start && End < R.End) {
+ StackRegion R0 = R;
+ R0.End = R.Start = End;
+ Regions.insert(&R, R0);
+ break;
+ }
+ }
+
+ // Update live ranges for all affected regions.
+ for (StackRegion &R : Regions) {
+ if (Start < R.End && End > R.Start)
+ R.Range.Join(Obj.Range);
+ if (End <= R.End)
+ break;
+ }
+
+ ObjectOffsets[Obj.Handle] = End;
+}
+
+void StackLayout::computeLayout() {
+ // Simple greedy algorithm.
+ // If this is replaced with something smarter, it must preserve the property
+ // that the first object is always at the offset 0 in the stack frame (for
+ // StackProtectorSlot), or handle stack protector in some other way.
+ for (auto &Obj : StackObjects)
+ layoutObject(Obj);
+
+ DEBUG(print(dbgs()));
+}
diff --git a/contrib/llvm/lib/CodeGen/SafeStackLayout.h b/contrib/llvm/lib/CodeGen/SafeStackLayout.h
new file mode 100644
index 000000000000..313ed21c8869
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SafeStackLayout.h
@@ -0,0 +1,68 @@
+//===-- SafeStackLayout.h - SafeStack frame layout -------------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+#define LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
+
+#include "SafeStackColoring.h"
+
+namespace llvm {
+namespace safestack {
+
+/// Compute the layout of an unsafe stack frame.
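+/// Typical use (see SafeStack.cpp): addObject() is called once per stack
+/// object, computeLayout() packs objects with non-overlapping live ranges
+/// into the same stack space, and getObjectOffset() / getFrameSize() /
+/// getFrameAlignment() are then used to materialize addresses relative to
+/// the unsafe stack base pointer.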
+class StackLayout {
+ unsigned MaxAlignment;
+
+ struct StackRegion {
+ unsigned Start;
+ unsigned End;
+ StackColoring::LiveRange Range;
+ StackRegion(unsigned Start, unsigned End,
+ const StackColoring::LiveRange &Range)
+ : Start(Start), End(End), Range(Range) {}
+ };
+ /// The list of current stack regions, sorted by StackRegion::Start.
+ SmallVector<StackRegion, 16> Regions;
+
+ struct StackObject {
+ const Value *Handle;
+ unsigned Size, Alignment;
+ StackColoring::LiveRange Range;
+ };
+ SmallVector<StackObject, 8> StackObjects;
+
+ DenseMap<const Value *, unsigned> ObjectOffsets;
+
+ void layoutObject(StackObject &Obj);
+
+public:
+ StackLayout(unsigned StackAlignment) : MaxAlignment(StackAlignment) {}
+ /// Add an object to the stack frame. Value pointer is opaque and used as a
+ /// handle to retrieve the object's offset in the frame later.
+ void addObject(const Value *V, unsigned Size, unsigned Alignment,
+ const StackColoring::LiveRange &Range);
+
+ /// Run the layout computation for all previously added objects.
+ void computeLayout();
+
+ /// Returns the offset to the object start in the stack frame.
+ unsigned getObjectOffset(const Value *V) { return ObjectOffsets[V]; }
+
+ /// Returns the size of the entire frame.
+ unsigned getFrameSize() { return Regions.empty() ? 0 : Regions.back().End; }
+
+ /// Returns the alignment of the frame.
+ unsigned getFrameAlignment() { return MaxAlignment; }
+ void print(raw_ostream &OS);
+};
+
+} // namespace safestack
+} // namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
new file mode 100644
index 000000000000..efde61ece639
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -0,0 +1,641 @@
+//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG class, which is a base class used by
+// scheduling implementation classes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <climits>
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+#ifndef NDEBUG
+static cl::opt<bool> StressSchedOpt(
+ "stress-sched", cl::Hidden, cl::init(false),
+ cl::desc("Stress test instruction scheduling"));
+#endif
+
+void SchedulingPriorityQueue::anchor() { }
+
+ScheduleDAG::ScheduleDAG(MachineFunction &mf)
+ : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
+ TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
+ MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
+#ifndef NDEBUG
+ StressSched = StressSchedOpt;
+#endif
+}
+
+ScheduleDAG::~ScheduleDAG() {}
+
+/// Clear the DAG state (e.g. between scheduling regions).
+void ScheduleDAG::clearDAG() {
+ SUnits.clear();
+ EntrySU = SUnit();
+ ExitSU = SUnit();
+}
+
+/// getInstrDesc helper to handle SDNodes.
+const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
+ if (!Node || !Node->isMachineOpcode()) return nullptr;
+ return &TII->get(Node->getMachineOpcode());
+}
+
+/// addPred - This adds the specified edge as a pred of the current node if
+/// it is not already present. It also adds the current node as a successor of
+/// the specified node.
+bool SUnit::addPred(const SDep &D, bool Required) {
+ // If this node already has this dependence, don't add a redundant one.
+ for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I) {
+ // Zero-latency weak edges may be added purely for heuristic ordering. Don't
+ // add them if another kind of edge already exists.
+ if (!Required && I->getSUnit() == D.getSUnit())
+ return false;
+ if (I->overlaps(D)) {
+ // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
+ if (I->getLatency() < D.getLatency()) {
+ SUnit *PredSU = I->getSUnit();
+ // Find the corresponding successor in N.
+ SDep ForwardD = *I;
+ ForwardD.setSUnit(this);
+ for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(),
+ EE = PredSU->Succs.end(); II != EE; ++II) {
+ if (*II == ForwardD) {
+ II->setLatency(D.getLatency());
+ break;
+ }
+ }
+ I->setLatency(D.getLatency());
+ }
+ return false;
+ }
+ }
+ // Now add a corresponding succ to N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ // Update the bookkeeping.
+ if (D.getKind() == SDep::Data) {
+ assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
+ assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+ ++NumPreds;
+ ++N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak()) {
+ ++WeakPredsLeft;
+ }
+ else {
+ assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+ ++NumPredsLeft;
+ }
+ }
+ if (!isScheduled) {
+ if (D.isWeak()) {
+ ++N->WeakSuccsLeft;
+ }
+ else {
+ assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+ ++N->NumSuccsLeft;
+ }
+ }
+ Preds.push_back(D);
+ N->Succs.push_back(P);
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+ return true;
+}
+
+/// removePred - This removes the specified edge as a pred of the current
+/// node if it exists. It also removes the current node as a successor of
+/// the specified node.
+void SUnit::removePred(const SDep &D) {
+ // Find the matching predecessor.
+ for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I)
+ if (*I == D) {
+ // Find the corresponding successor in N.
+ SDep P = D;
+ P.setSUnit(this);
+ SUnit *N = D.getSUnit();
+ SmallVectorImpl<SDep>::iterator Succ = std::find(N->Succs.begin(),
+ N->Succs.end(), P);
+ assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+ N->Succs.erase(Succ);
+ Preds.erase(I);
+ // Update the bookkeeping.
+ if (P.getKind() == SDep::Data) {
+ assert(NumPreds > 0 && "NumPreds will underflow!");
+ assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+ --NumPreds;
+ --N->NumSuccs;
+ }
+ if (!N->isScheduled) {
+ if (D.isWeak())
+ --WeakPredsLeft;
+ else {
+ assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+ --NumPredsLeft;
+ }
+ }
+ if (!isScheduled) {
+ if (D.isWeak())
+ --N->WeakSuccsLeft;
+ else {
+ assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+ --N->NumSuccsLeft;
+ }
+ }
+ if (P.getLatency() != 0) {
+ this->setDepthDirty();
+ N->setHeightDirty();
+ }
+ return;
+ }
+}
+
+void SUnit::setDepthDirty() {
+ if (!isDepthCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isDepthCurrent = false;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(),
+ E = SU->Succs.end(); I != E; ++I) {
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isDepthCurrent)
+ WorkList.push_back(SuccSU);
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::setHeightDirty() {
+ if (!isHeightCurrent) return;
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *SU = WorkList.pop_back_val();
+ SU->isHeightCurrent = false;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(),
+ E = SU->Preds.end(); I != E; ++I) {
+ SUnit *PredSU = I->getSUnit();
+ if (PredSU->isHeightCurrent)
+ WorkList.push_back(PredSU);
+ }
+ } while (!WorkList.empty());
+}
+
+/// setDepthToAtLeast - Update this node's successors to reflect the
+/// fact that this node's depth just increased.
+///
+void SUnit::setDepthToAtLeast(unsigned NewDepth) {
+ if (NewDepth <= getDepth())
+ return;
+ setDepthDirty();
+ Depth = NewDepth;
+ isDepthCurrent = true;
+}
+
+/// setHeightToAtLeast - Update this node's predecessors to reflect the
+/// fact that this node's height just increased.
+///
+void SUnit::setHeightToAtLeast(unsigned NewHeight) {
+ if (NewHeight <= getHeight())
+ return;
+ setHeightDirty();
+ Height = NewHeight;
+ isHeightCurrent = true;
+}
+
+/// ComputeDepth - Calculate the length of the maximal path from a node with
+/// no predecessors (the entry) to this node.
+///
+void SUnit::ComputeDepth() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxPredDepth = 0;
+ for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
+ E = Cur->Preds.end(); I != E; ++I) {
+ SUnit *PredSU = I->getSUnit();
+ if (PredSU->isDepthCurrent)
+ MaxPredDepth = std::max(MaxPredDepth,
+ PredSU->Depth + I->getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(PredSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxPredDepth != Cur->Depth) {
+ Cur->setDepthDirty();
+ Cur->Depth = MaxPredDepth;
+ }
+ Cur->isDepthCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
+/// ComputeHeight - Calculate the length of the maximal path from this node to
+/// a node with no successors (the exit).
+///
+void SUnit::ComputeHeight() {
+ SmallVector<SUnit*, 8> WorkList;
+ WorkList.push_back(this);
+ do {
+ SUnit *Cur = WorkList.back();
+
+ bool Done = true;
+ unsigned MaxSuccHeight = 0;
+ for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
+ E = Cur->Succs.end(); I != E; ++I) {
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isHeightCurrent)
+ MaxSuccHeight = std::max(MaxSuccHeight,
+ SuccSU->Height + I->getLatency());
+ else {
+ Done = false;
+ WorkList.push_back(SuccSU);
+ }
+ }
+
+ if (Done) {
+ WorkList.pop_back();
+ if (MaxSuccHeight != Cur->Height) {
+ Cur->setHeightDirty();
+ Cur->Height = MaxSuccHeight;
+ }
+ Cur->isHeightCurrent = true;
+ }
+ } while (!WorkList.empty());
+}
+
+void SUnit::biasCriticalPath() {
+ if (NumPreds < 2)
+ return;
+
+ SUnit::pred_iterator BestI = Preds.begin();
+ unsigned MaxDepth = BestI->getSUnit()->getDepth();
+ for (SUnit::pred_iterator I = std::next(BestI), E = Preds.end(); I != E;
+ ++I) {
+ if (I->getKind() == SDep::Data && I->getSUnit()->getDepth() > MaxDepth)
+ BestI = I;
+ }
+ if (BestI != Preds.begin())
+ std::swap(*Preds.begin(), *BestI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or
+/// a group of nodes flagged together.
+void SUnit::dump(const ScheduleDAG *G) const {
+ dbgs() << "SU(" << NodeNum << "): ";
+ G->dumpNode(this);
+}
+
+void SUnit::dumpAll(const ScheduleDAG *G) const {
+ dump(G);
+
+ dbgs() << " # preds left : " << NumPredsLeft << "\n";
+ dbgs() << " # succs left : " << NumSuccsLeft << "\n";
+ if (WeakPredsLeft)
+ dbgs() << " # weak preds left : " << WeakPredsLeft << "\n";
+ if (WeakSuccsLeft)
+ dbgs() << " # weak succs left : " << WeakSuccsLeft << "\n";
+ dbgs() << " # rdefs left : " << NumRegDefsLeft << "\n";
+ dbgs() << " Latency : " << Latency << "\n";
+ dbgs() << " Depth : " << getDepth() << "\n";
+ dbgs() << " Height : " << getHeight() << "\n";
+
+ if (Preds.size() != 0) {
+ dbgs() << " Predecessors:\n";
+ for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end();
+ I != E; ++I) {
+ dbgs() << " ";
+ switch (I->getKind()) {
+ case SDep::Data: dbgs() << "val "; break;
+ case SDep::Anti: dbgs() << "anti"; break;
+ case SDep::Output: dbgs() << "out "; break;
+ case SDep::Order: dbgs() << "ch "; break;
+ }
+ dbgs() << "SU(" << I->getSUnit()->NodeNum << ")";
+ if (I->isArtificial())
+ dbgs() << " *";
+ dbgs() << ": Latency=" << I->getLatency();
+ if (I->isAssignedRegDep())
+ dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+ dbgs() << "\n";
+ }
+ }
+ if (Succs.size() != 0) {
+ dbgs() << " Successors:\n";
+ for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
+ I != E; ++I) {
+ dbgs() << " ";
+ switch (I->getKind()) {
+ case SDep::Data: dbgs() << "val "; break;
+ case SDep::Anti: dbgs() << "anti"; break;
+ case SDep::Output: dbgs() << "out "; break;
+ case SDep::Order: dbgs() << "ch "; break;
+ }
+ dbgs() << "SU(" << I->getSUnit()->NodeNum << ")";
+ if (I->isArtificial())
+ dbgs() << " *";
+ dbgs() << ": Latency=" << I->getLatency();
+ if (I->isAssignedRegDep())
+ dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+ dbgs() << "\n";
+ }
+ }
+}
+#endif
+
+#ifndef NDEBUG
+/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
+/// their state is consistent. Return the number of scheduled nodes.
+///
+unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
+ bool AnyNotSched = false;
+ unsigned DeadNodes = 0;
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ if (!SUnits[i].isScheduled) {
+ if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+ ++DeadNodes;
+ continue;
+ }
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ dbgs() << "has not been scheduled!\n";
+ AnyNotSched = true;
+ }
+ if (SUnits[i].isScheduled &&
+ (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
+ unsigned(INT_MAX)) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ dbgs() << "has an unexpected "
+ << (isBottomUp ? "Height" : "Depth") << " value!\n";
+ AnyNotSched = true;
+ }
+ if (isBottomUp) {
+ if (SUnits[i].NumSuccsLeft != 0) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ dbgs() << "has successors left!\n";
+ AnyNotSched = true;
+ }
+ } else {
+ if (SUnits[i].NumPredsLeft != 0) {
+ if (!AnyNotSched)
+ dbgs() << "*** Scheduling failed! ***\n";
+ SUnits[i].dump(this);
+ dbgs() << "has predecessors left!\n";
+ AnyNotSched = true;
+ }
+ }
+ }
+ assert(!AnyNotSched);
+ return SUnits.size() - DeadNodes;
+}
+#endif
+
+/// InitDAGTopologicalSorting - create the initial topological
+/// ordering from the DAG to be scheduled.
+///
+/// The idea of the algorithm is taken from
+/// "Online algorithms for managing the topological order of
+/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly.
+/// This is the MNR algorithm, which was first introduced by
+/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+/// "Maintaining a topological order under edge insertions".
+///
+/// Short description of the algorithm:
+///
+/// Topological ordering, ord, of a DAG maps each node to a topological
+/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+///
+/// This means that if there is a path from the node X to the node Z,
+/// then ord(X) < ord(Z).
+///
+/// This property can be used to check for reachability of nodes:
+/// if Z is reachable from X, then an insertion of the edge Z->X would
+/// create a cycle.
+///
+/// The algorithm first computes a topological ordering for the DAG by
+/// initializing the Index2Node and Node2Index arrays and then tries to keep
+/// the ordering up-to-date after edge insertions by reordering the DAG.
+///
+/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
+/// the nodes reachable from Y, and then shifts them using Shift to lie
+/// immediately after X in Index2Node.
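+///
+/// For example, if ord(Y) = 0 and ord(X) = 1 and the new edge X->Y is
+/// inserted, the DFS from Y marks only Y (assuming Y has no successors in
+/// the affected index range), and Shift then renumbers the nodes so that
+/// ord(X) = 0 and ord(Y) = 1, restoring ord(X) < ord(Y). Had X been
+/// reachable from Y, the DFS would instead have found a node with index
+/// equal to ord(X) and reported a cycle.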
+void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+ unsigned DAGSize = SUnits.size();
+ std::vector<SUnit*> WorkList;
+ WorkList.reserve(DAGSize);
+
+ Index2Node.resize(DAGSize);
+ Node2Index.resize(DAGSize);
+
+ // Initialize the data structures.
+ if (ExitSU)
+ WorkList.push_back(ExitSU);
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ int NodeNum = SU->NodeNum;
+ unsigned Degree = SU->Succs.size();
+ // Temporarily use the Node2Index array as scratch space for degree counts.
+ Node2Index[NodeNum] = Degree;
+
+ // Is it a node without dependencies?
+ if (Degree == 0) {
+ assert(SU->Succs.empty() && "SUnit should have no successors");
+ // Collect leaf nodes.
+ WorkList.push_back(SU);
+ }
+ }
+
+ int Id = DAGSize;
+ while (!WorkList.empty()) {
+ SUnit *SU = WorkList.back();
+ WorkList.pop_back();
+ if (SU->NodeNum < DAGSize)
+ Allocate(SU->NodeNum, --Id);
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit *SU = I->getSUnit();
+ if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
+ // If all dependencies of the node are processed already,
+ // then the node can be computed now.
+ WorkList.push_back(SU);
+ }
+ }
+
+ Visited.resize(DAGSize);
+
+#ifndef NDEBUG
+ // Check correctness of the ordering
+ for (unsigned i = 0, e = DAGSize; i != e; ++i) {
+ SUnit *SU = &SUnits[i];
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+ "Wrong topological sorting");
+ }
+ }
+#endif
+}
+
+/// AddPred - Updates the topological ordering to accommodate an edge
+/// to be added from SUnit X to SUnit Y.
+void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[Y->NodeNum];
+ UpperBound = Node2Index[X->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(X) < Ord(Y) ?
+ if (LowerBound < UpperBound) {
+ // Update the topological order.
+ Visited.reset();
+ DFS(Y, UpperBound, HasLoop);
+ assert(!HasLoop && "Inserted edge creates a loop!");
+ // Recompute topological indexes.
+ Shift(Visited, LowerBound, UpperBound);
+ }
+}
+
+/// RemovePred - Updates the topological ordering to accommodate the removal
+/// of an edge from the specified node N to the current node M, i.e. N being
+/// removed from M's predecessor list.
+void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
+ // InitDAGTopologicalSorting();
+}
+
+/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
+/// all nodes affected by the edge insertion. These nodes will later get new
+/// topological indexes by means of the Shift method.
+void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
+ bool &HasLoop) {
+ std::vector<const SUnit*> WorkList;
+ WorkList.reserve(SUnits.size());
+
+ WorkList.push_back(SU);
+ do {
+ SU = WorkList.back();
+ WorkList.pop_back();
+ Visited.set(SU->NodeNum);
+ for (int I = SU->Succs.size()-1; I >= 0; --I) {
+ unsigned s = SU->Succs[I].getSUnit()->NodeNum;
+ // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+ if (s >= Node2Index.size())
+ continue;
+ if (Node2Index[s] == UpperBound) {
+ HasLoop = true;
+ return;
+ }
+ // Visit successors if not already and in affected region.
+ if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+ WorkList.push_back(SU->Succs[I].getSUnit());
+ }
+ }
+ } while (!WorkList.empty());
+}
+
+/// Shift - Renumber the nodes so that the topological ordering is
+/// preserved.
+void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
+ int UpperBound) {
+ std::vector<int> L;
+ int shift = 0;
+ int i;
+
+ for (i = LowerBound; i <= UpperBound; ++i) {
+ // w is node at topological index i.
+ int w = Index2Node[i];
+ if (Visited.test(w)) {
+ // Unmark.
+ Visited.reset(w);
+ L.push_back(w);
+ shift = shift + 1;
+ } else {
+ Allocate(w, i - shift);
+ }
+ }
+
+ for (unsigned j = 0; j < L.size(); ++j) {
+ Allocate(L[j], i - shift);
+ i = i + 1;
+ }
+}
+
+
+/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will
+/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).
+bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
+ // Is SU reachable from TargetSU via successor edges?
+ if (IsReachable(SU, TargetSU))
+ return true;
+ for (SUnit::pred_iterator
+ I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I)
+ if (I->isAssignedRegDep() &&
+ IsReachable(SU, I->getSUnit()))
+ return true;
+ return false;
+}
+
+/// IsReachable - Checks if SU is reachable from TargetSU.
+bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
+ const SUnit *TargetSU) {
+ // If insertion of the edge SU->TargetSU would create a cycle
+ // then there is a path from TargetSU to SU.
+ int UpperBound, LowerBound;
+ LowerBound = Node2Index[TargetSU->NodeNum];
+ UpperBound = Node2Index[SU->NodeNum];
+ bool HasLoop = false;
+ // Is Ord(TargetSU) < Ord(SU) ?
+ if (LowerBound < UpperBound) {
+ Visited.reset();
+ // There may be a path from TargetSU to SU. Check for it.
+ DFS(TargetSU, UpperBound, HasLoop);
+ }
+ return HasLoop;
+}
+
+/// Allocate - assign the topological index to the node n.
+void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
+ Node2Index[n] = index;
+ Index2Node[index] = n;
+}
+
+ScheduleDAGTopologicalSort::
+ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
+ : SUnits(sunits), ExitSU(exitsu) {}
+
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
new file mode 100644
index 000000000000..22bfd4d1964c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -0,0 +1,1711 @@
+//===---- ScheduleDAGInstrs.cpp - MachineInstr Rescheduling ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGInstrs class, which implements re-scheduling
+// of MachineInstrs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/ADT/IntEqClasses.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/ScheduleDFS.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+static cl::opt<bool> EnableAASchedMI("enable-aa-sched-mi", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Enable use of AA during MI DAG construction"));
+
+static cl::opt<bool> UseTBAA("use-tbaa-in-sched-mi", cl::Hidden,
+ cl::init(true), cl::desc("Enable use of TBAA during MI DAG construction"));
+
+// Note: the two options below might be used in tuning compile time vs
+// output quality. Setting HugeRegion so large that it will never be
+// reached means best-effort, but may be slow.
+
+// When Stores and Loads maps (or NonAliasStores and NonAliasLoads)
+// together hold this many SUs, a reduction of maps will be done.
+static cl::opt<unsigned> HugeRegion("dag-maps-huge-region", cl::Hidden,
+ cl::init(1000), cl::desc("The limit to use while constructing the DAG "
+ "prior to scheduling, at which point a trade-off "
+ "is made to avoid excessive compile time."));
+
+static cl::opt<unsigned> ReductionSize(
+ "dag-maps-reduction-size", cl::Hidden,
+ cl::desc("A huge scheduling region will have maps reduced by this many "
+ "nodes at a time. Defaults to HugeRegion / 2."));
+
+static unsigned getReductionSize() {
+ // Always reduce a huge region by half of its elements, except when the
+ // user sets this number explicitly.
+ if (ReductionSize.getNumOccurrences() == 0)
+ return HugeRegion / 2;
+ return ReductionSize;
+}
+
+static void dumpSUList(ScheduleDAGInstrs::SUList &L) {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ dbgs() << "{ ";
+ for (auto *su : L) {
+ dbgs() << "SU(" << su->NodeNum << ")";
+ if (su != L.back())
+ dbgs() << ", ";
+ }
+ dbgs() << "}\n";
+#endif
+}
+
+ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
+ const MachineLoopInfo *mli,
+ bool RemoveKillFlags)
+ : ScheduleDAG(mf), MLI(mli), MFI(mf.getFrameInfo()),
+ RemoveKillFlags(RemoveKillFlags), CanHandleTerminators(false),
+ TrackLaneMasks(false), AAForDep(nullptr), BarrierChain(nullptr),
+ UnknownValue(UndefValue::get(
+ Type::getVoidTy(mf.getFunction()->getContext()))),
+ FirstDbgValue(nullptr) {
+ DbgValues.clear();
+
+ const TargetSubtargetInfo &ST = mf.getSubtarget();
+ SchedModel.init(ST.getSchedModel(), &ST, TII);
+}
+
+/// getUnderlyingObjectFromInt - This is the function that does the work of
+/// looking through basic ptrtoint+arithmetic+inttoptr sequences.
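+/// For example, for the integer expression (add (ptrtoint i8* %p to i64), 8)
+/// it returns %p, whose type is a pointer, so getUnderlyingObjects can then
+/// resolve it further.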
+static const Value *getUnderlyingObjectFromInt(const Value *V) {
+ do {
+ if (const Operator *U = dyn_cast<Operator>(V)) {
+ // If we find a ptrtoint, we can transfer control back to the
+ // regular getUnderlyingObjectFromInt.
+ if (U->getOpcode() == Instruction::PtrToInt)
+ return U->getOperand(0);
+ // If we find an add of a constant, a multiplied value, or a phi, it's
+ // likely that the other operand will lead us to the base
+ // object. We don't have to worry about the case where the
+ // object address is somehow being computed by the multiply,
+ // because our callers only care when the result is an
+ // identifiable object.
+ if (U->getOpcode() != Instruction::Add ||
+ (!isa<ConstantInt>(U->getOperand(1)) &&
+ Operator::getOpcode(U->getOperand(1)) != Instruction::Mul &&
+ !isa<PHINode>(U->getOperand(1))))
+ return V;
+ V = U->getOperand(0);
+ } else {
+ return V;
+ }
+ assert(V->getType()->isIntegerTy() && "Unexpected operand type!");
+ } while (1);
+}
+
+/// getUnderlyingObjects - This is a wrapper around GetUnderlyingObjects
+/// and adds support for basic ptrtoint+arithmetic+inttoptr sequences.
+static void getUnderlyingObjects(const Value *V,
+ SmallVectorImpl<Value *> &Objects,
+ const DataLayout &DL) {
+ SmallPtrSet<const Value *, 16> Visited;
+ SmallVector<const Value *, 4> Working(1, V);
+ do {
+ V = Working.pop_back_val();
+
+ SmallVector<Value *, 4> Objs;
+ GetUnderlyingObjects(const_cast<Value *>(V), Objs, DL);
+
+ for (SmallVectorImpl<Value *>::iterator I = Objs.begin(), IE = Objs.end();
+ I != IE; ++I) {
+ V = *I;
+ if (!Visited.insert(V).second)
+ continue;
+ if (Operator::getOpcode(V) == Instruction::IntToPtr) {
+ const Value *O =
+ getUnderlyingObjectFromInt(cast<User>(V)->getOperand(0));
+ if (O->getType()->isPointerTy()) {
+ Working.push_back(O);
+ continue;
+ }
+ }
+ Objects.push_back(const_cast<Value *>(V));
+ }
+ } while (!Working.empty());
+}
+
+/// getUnderlyingObjectsForInstr - If this machine instr has memory reference
+/// information and it can be tracked to a normal reference to a known
+/// object, return the Value for that object.
+static void getUnderlyingObjectsForInstr(const MachineInstr *MI,
+ const MachineFrameInfo *MFI,
+ UnderlyingObjectsVector &Objects,
+ const DataLayout &DL) {
+ auto allMMOsOkay = [&]() {
+ for (const MachineMemOperand *MMO : MI->memoperands()) {
+ if (MMO->isVolatile())
+ return false;
+
+ if (const PseudoSourceValue *PSV = MMO->getPseudoValue()) {
+ // Functions that contain tail calls don't have unique PseudoSourceValue
+ // objects. Two PseudoSourceValues might refer to the same or
+ // overlapping locations. The client code calling this function assumes
+ // this is not the case. So return a conservative answer of no known
+ // object.
+ if (MFI->hasTailCall())
+ return false;
+
+ // For now, ignore PseudoSourceValues which may alias LLVM IR values
+ // because the code that uses this function has no way to cope with
+ // such aliases.
+ if (PSV->isAliased(MFI))
+ return false;
+
+ bool MayAlias = PSV->mayAlias(MFI);
+ Objects.push_back(UnderlyingObjectsVector::value_type(PSV, MayAlias));
+ } else if (const Value *V = MMO->getValue()) {
+ SmallVector<Value *, 4> Objs;
+ getUnderlyingObjects(V, Objs, DL);
+
+ for (Value *V : Objs) {
+ if (!isIdentifiedObject(V))
+ return false;
+
+ Objects.push_back(UnderlyingObjectsVector::value_type(V, true));
+ }
+ } else
+ return false;
+ }
+ return true;
+ };
+
+ if (!allMMOsOkay())
+ Objects.clear();
+}
+
+void ScheduleDAGInstrs::startBlock(MachineBasicBlock *bb) {
+ BB = bb;
+}
+
+void ScheduleDAGInstrs::finishBlock() {
+ // Subclasses should no longer refer to the old block.
+ BB = nullptr;
+}
+
+/// Initialize the DAG and common scheduler state for the current scheduling
+/// region. This does not actually create the DAG, only clears it. The
+/// scheduling driver may call BuildSchedGraph multiple times per scheduling
+/// region.
+void ScheduleDAGInstrs::enterRegion(MachineBasicBlock *bb,
+ MachineBasicBlock::iterator begin,
+ MachineBasicBlock::iterator end,
+ unsigned regioninstrs) {
+ assert(bb == BB && "startBlock should set BB");
+ RegionBegin = begin;
+ RegionEnd = end;
+ NumRegionInstrs = regioninstrs;
+}
+
+/// Close the current scheduling region. Don't clear any state in case the
+/// driver wants to refer to the previous scheduling region.
+void ScheduleDAGInstrs::exitRegion() {
+ // Nothing to do.
+}
+
+/// addSchedBarrierDeps - Add dependencies from instructions in the current
+/// list of instructions being scheduled to the scheduling barrier by adding
+/// the exit SU to the register defs and uses lists. This is because we want
+/// to make sure instructions which define registers that are either used by
+/// the terminator or are live-out are properly scheduled. This is
+/// especially important when the definition latency of the return value(s)
+/// is too high to be hidden by the branch or when the liveout registers
+/// are used by instructions in the fallthrough block.
+void ScheduleDAGInstrs::addSchedBarrierDeps() {
+ MachineInstr *ExitMI = RegionEnd != BB->end() ? &*RegionEnd : nullptr;
+ ExitSU.setInstr(ExitMI);
+ bool AllDepKnown = ExitMI &&
+ (ExitMI->isCall() || ExitMI->isBarrier());
+ if (ExitMI && AllDepKnown) {
+ // If it's a call or a barrier, add dependencies on the defs and uses of
+ // the instruction.
+ for (unsigned i = 0, e = ExitMI->getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = ExitMI->getOperand(i);
+ if (!MO.isReg() || MO.isDef()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+
+ if (TRI->isPhysicalRegister(Reg))
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, Reg));
+ else if (MO.readsReg()) // ignore undef operands
+ addVRegUseDeps(&ExitSU, i);
+ }
+ } else {
+ // For others, e.g. fallthrough, conditional branch, assume the exit
+ // uses all the registers that are livein to the successor blocks.
+ assert(Uses.empty() && "Uses in set before adding deps?");
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI)
+ for (const auto &LI : (*SI)->liveins()) {
+ if (!Uses.contains(LI.PhysReg))
+ Uses.insert(PhysRegSUOper(&ExitSU, -1, LI.PhysReg));
+ }
+ }
+}
+
+/// MO is an operand of SU's instruction that defines a physical register. Add
+/// data dependencies from SU to any uses of the physical register.
+void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineOperand &MO = SU->getInstr()->getOperand(OperIdx);
+ assert(MO.isDef() && "expect physreg def");
+
+ // Ask the target if address-backscheduling is desirable, and if so how much.
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
+ for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+ Alias.isValid(); ++Alias) {
+ if (!Uses.contains(*Alias))
+ continue;
+ for (Reg2SUnitsMap::iterator I = Uses.find(*Alias); I != Uses.end(); ++I) {
+ SUnit *UseSU = I->SU;
+ if (UseSU == SU)
+ continue;
+
+ // Adjust the dependence latency using operand def/use information,
+ // then allow the target to perform its own adjustments.
+ int UseOp = I->OpIdx;
+ MachineInstr *RegUse = nullptr;
+ SDep Dep;
+ if (UseOp < 0)
+ Dep = SDep(SU, SDep::Artificial);
+ else {
+ // Set the hasPhysRegDefs only for physreg defs that have a use within
+ // the scheduling region.
+ SU->hasPhysRegDefs = true;
+ Dep = SDep(SU, SDep::Data, *Alias);
+ RegUse = UseSU->getInstr();
+ }
+ Dep.setLatency(
+ SchedModel.computeOperandLatency(SU->getInstr(), OperIdx, RegUse,
+ UseOp));
+
+ ST.adjustSchedDependency(SU, UseSU, Dep);
+ UseSU->addPred(Dep);
+ }
+ }
+}
+
+/// addPhysRegDeps - Add register dependencies (data, anti, and output) from
+/// this SUnit to following instructions in the same scheduling region that
+/// depend on the physical register referenced at OperIdx.
+void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
+ MachineInstr *MI = SU->getInstr();
+ MachineOperand &MO = MI->getOperand(OperIdx);
+
+ // Optionally add output and anti dependencies. For anti
+ // dependencies we use a latency of 0 because for a multi-issue
+ // target we want to allow the defining instruction to issue
+ // in the same cycle as the using instruction.
+ // TODO: Using a latency of 1 here for output dependencies assumes
+ // there's no cost for reusing registers.
+ SDep::Kind Kind = MO.isUse() ? SDep::Anti : SDep::Output;
+ for (MCRegAliasIterator Alias(MO.getReg(), TRI, true);
+ Alias.isValid(); ++Alias) {
+ if (!Defs.contains(*Alias))
+ continue;
+ for (Reg2SUnitsMap::iterator I = Defs.find(*Alias); I != Defs.end(); ++I) {
+ SUnit *DefSU = I->SU;
+ if (DefSU == &ExitSU)
+ continue;
+ if (DefSU != SU &&
+ (Kind != SDep::Output || !MO.isDead() ||
+ !DefSU->getInstr()->registerDefIsDead(*Alias))) {
+ if (Kind == SDep::Anti)
+ DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
+ else {
+ SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ Dep.setLatency(
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+ DefSU->addPred(Dep);
+ }
+ }
+ }
+ }
+
+ if (!MO.isDef()) {
+ SU->hasPhysRegUses = true;
+ // Either insert a new Reg2SUnits entry with an empty SUnits list, or
+ // retrieve the existing SUnits list for this register's uses.
+ // Push this SUnit on the use list.
+ Uses.insert(PhysRegSUOper(SU, OperIdx, MO.getReg()));
+ if (RemoveKillFlags)
+ MO.setIsKill(false);
+ }
+ else {
+ addPhysRegDataDeps(SU, OperIdx);
+ unsigned Reg = MO.getReg();
+
+ // clear this register's use list
+ if (Uses.contains(Reg))
+ Uses.eraseAll(Reg);
+
+ if (!MO.isDead()) {
+ Defs.eraseAll(Reg);
+ } else if (SU->isCall) {
+ // Calls will not be reordered because of chain dependencies (see
+ // below). Since call operands are dead, calls may continue to be added
+ // to the DefList making dependence checking quadratic in the size of
+ // the block. Instead, we leave only one call at the back of the
+ // DefList.
+ Reg2SUnitsMap::RangePair P = Defs.equal_range(Reg);
+ Reg2SUnitsMap::iterator B = P.first;
+ Reg2SUnitsMap::iterator I = P.second;
+ for (bool isBegin = I == B; !isBegin; /* empty */) {
+ isBegin = (--I) == B;
+ if (!I->SU->isCall)
+ break;
+ I = Defs.erase(I);
+ }
+ }
+
+ // Defs are pushed in the order they are visited and never reordered.
+ Defs.insert(PhysRegSUOper(SU, OperIdx, Reg));
+ }
+}
+
+LaneBitmask ScheduleDAGInstrs::getLaneMaskForMO(const MachineOperand &MO) const
+{
+ unsigned Reg = MO.getReg();
+ // No point in tracking lanemasks if we don't have interesting subregisters.
+ const TargetRegisterClass &RC = *MRI.getRegClass(Reg);
+ if (!RC.HasDisjunctSubRegs)
+ return ~0u;
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg == 0)
+ return RC.getLaneMask();
+ return TRI->getSubRegIndexLaneMask(SubReg);
+}
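+
+// Worked example for getLaneMaskForMO (register class and subregister names
+// are hypothetical): if a register class has two disjoint subregisters
+// sub_lo and sub_hi with lane masks 0x1 and 0x2, a full def of a vreg in
+// that class yields the class lane mask 0x3, while an operand with
+// SubReg == sub_hi yields 0x2, so only the high lanes are tracked for it.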
+
+/// addVRegDefDeps - Add register output and data dependencies from this SUnit
+/// to instructions that occur later in the same scheduling region if they read
+/// from or write to the virtual register defined at OperIdx.
+///
+/// TODO: Hoist loop induction variable increments. This has to be
+/// reevaluated. Generally, IV scheduling should be done before coalescing.
+void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
+ MachineInstr *MI = SU->getInstr();
+ MachineOperand &MO = MI->getOperand(OperIdx);
+ unsigned Reg = MO.getReg();
+
+ LaneBitmask DefLaneMask;
+ LaneBitmask KillLaneMask;
+ if (TrackLaneMasks) {
+ bool IsKill = MO.getSubReg() == 0 || MO.isUndef();
+ DefLaneMask = getLaneMaskForMO(MO);
+ // If we have a <read-undef> flag, none of the lane values comes from an
+ // earlier instruction.
+ KillLaneMask = IsKill ? ~0u : DefLaneMask;
+
+ // Clear undef flag, we'll re-add it later once we know which subregister
+ // Def is first.
+ MO.setIsUndef(false);
+ } else {
+ DefLaneMask = ~0u;
+ KillLaneMask = ~0u;
+ }
+
+ if (MO.isDead()) {
+ assert(CurrentVRegUses.find(Reg) == CurrentVRegUses.end() &&
+ "Dead defs should have no uses");
+ } else {
+ // Add data dependence to all uses we found so far.
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ for (VReg2SUnitOperIdxMultiMap::iterator I = CurrentVRegUses.find(Reg),
+ E = CurrentVRegUses.end(); I != E; /*empty*/) {
+ LaneBitmask LaneMask = I->LaneMask;
+ // Ignore uses of other lanes.
+ if ((LaneMask & KillLaneMask) == 0) {
+ ++I;
+ continue;
+ }
+
+ if ((LaneMask & DefLaneMask) != 0) {
+ SUnit *UseSU = I->SU;
+ MachineInstr *Use = UseSU->getInstr();
+ SDep Dep(SU, SDep::Data, Reg);
+ Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
+ I->OperandIndex));
+ ST.adjustSchedDependency(SU, UseSU, Dep);
+ UseSU->addPred(Dep);
+ }
+
+ LaneMask &= ~KillLaneMask;
+ // If we found a Def for all lanes of this use, remove it from the list.
+ if (LaneMask != 0) {
+ I->LaneMask = LaneMask;
+ ++I;
+ } else
+ I = CurrentVRegUses.erase(I);
+ }
+ }
+
+ // Shortcut: Singly defined vregs do not have output/anti dependencies.
+ if (MRI.hasOneDef(Reg))
+ return;
+
+ // Add output dependence to the next nearest defs of this vreg.
+ //
+ // Unless this definition is dead, the output dependence should be
+ // transitively redundant with antidependencies from this definition's
+ // uses. We're conservative for now until we have a way to guarantee the uses
+ // are not eliminated sometime during scheduling. The output dependence edge
+ // is also useful if output latency exceeds def-use latency.
+ LaneBitmask LaneMask = DefLaneMask;
+ for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+ CurrentVRegDefs.end())) {
+ // Ignore defs for other lanes.
+ if ((V2SU.LaneMask & LaneMask) == 0)
+ continue;
+ // Add an output dependence.
+ SUnit *DefSU = V2SU.SU;
+ // Ignore additional defs of the same lanes in one instruction. This can
+ // happen because lanemasks are shared for targets with too many
+ // subregisters. We also use some representation tricks/hacks where we
+ // add super-register defs/uses, to imply that although we only access parts
+ // of the reg we care about the full one.
+ if (DefSU == SU)
+ continue;
+ SDep Dep(SU, SDep::Output, Reg);
+ Dep.setLatency(
+ SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
+ DefSU->addPred(Dep);
+
+ // Update current definition. This can get tricky if the def covered a
+ // bigger lanemask before. We then have to shrink it and create a new
+ // VReg2SUnit for the non-overlapping part.
+ LaneBitmask OverlapMask = V2SU.LaneMask & LaneMask;
+ LaneBitmask NonOverlapMask = V2SU.LaneMask & ~LaneMask;
+ V2SU.SU = SU;
+ V2SU.LaneMask = OverlapMask;
+ if (NonOverlapMask != 0)
+ CurrentVRegDefs.insert(VReg2SUnit(Reg, NonOverlapMask, DefSU));
+ }
+ // If there was no CurrentVRegDefs entry for some lanes yet, create one.
+ if (LaneMask != 0)
+ CurrentVRegDefs.insert(VReg2SUnit(Reg, LaneMask, SU));
+}
+
+/// addVRegUseDeps - Add a register data dependency if the instruction that
+/// defines the virtual register used at OperIdx is mapped to an SUnit. Add a
+/// register antidependency from this SUnit to instructions that occur later in
+/// the same scheduling region if they write the virtual register.
+///
+/// TODO: Handle ExitSU "uses" properly.
+void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
+ const MachineInstr *MI = SU->getInstr();
+ const MachineOperand &MO = MI->getOperand(OperIdx);
+ unsigned Reg = MO.getReg();
+
+ // Remember the use. Data dependencies will be added when we find the def.
+ LaneBitmask LaneMask = TrackLaneMasks ? getLaneMaskForMO(MO) : ~0u;
+ CurrentVRegUses.insert(VReg2SUnitOperIdx(Reg, LaneMask, OperIdx, SU));
+
+ // Add antidependences to the following defs of the vreg.
+ for (VReg2SUnit &V2SU : make_range(CurrentVRegDefs.find(Reg),
+ CurrentVRegDefs.end())) {
+ // Ignore defs for unrelated lanes.
+ LaneBitmask PrevDefLaneMask = V2SU.LaneMask;
+ if ((PrevDefLaneMask & LaneMask) == 0)
+ continue;
+ if (V2SU.SU == SU)
+ continue;
+
+ V2SU.SU->addPred(SDep(SU, SDep::Anti, Reg));
+ }
+}
+
+/// Return true if MI is an instruction we are unable to reason about
+/// (like a call or something with unmodeled side effects).
+static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
+ return MI->isCall() || MI->hasUnmodeledSideEffects() ||
+ (MI->hasOrderedMemoryRef() && !MI->isInvariantLoad(AA));
+}
+
+/// This returns true if the two MIs need a chain edge between them.
+/// This is called on normal stores and loads.
+static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
+ const DataLayout &DL, MachineInstr *MIa,
+ MachineInstr *MIb) {
+ const MachineFunction *MF = MIa->getParent()->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+
+ assert ((MIa->mayStore() || MIb->mayStore()) &&
+ "Dependency checked between two loads");
+
+ // Let the target decide if memory accesses cannot possibly overlap.
+ if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA))
+ return false;
+
+ // To this point analysis is generic. From here on we do need AA.
+ if (!AA)
+ return true;
+
+ // FIXME: Need to handle multiple memory operands to support all targets.
+ if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
+ return true;
+
+ MachineMemOperand *MMOa = *MIa->memoperands_begin();
+ MachineMemOperand *MMOb = *MIb->memoperands_begin();
+
+ if (!MMOa->getValue() || !MMOb->getValue())
+ return true;
+
+ // The following interface to AA is fashioned after DAGCombiner::isAlias
+ // and operates with MachineMemOperand offset with some important
+ // assumptions:
+ // - LLVM fundamentally assumes flat address spaces.
+ // - MachineOperand offset can *only* result from legalization and
+ // cannot affect queries other than the trivial case of overlap
+ // checking.
+ // - These offsets never wrap and never step outside
+ // of allocated objects.
+ // - There should never be any negative offsets here.
+ //
+ // FIXME: Modify API to hide this math from "user"
+ // FIXME: Even before we go to AA we can reason locally about some
+ // memory objects. It can save compile time, and possibly catch some
+ // corner cases not currently covered.
+
+ assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
+ assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
+
+ int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
+ int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
+ int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
+
+ AliasResult AAResult =
+ AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
+ UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
+ MemoryLocation(MMOb->getValue(), Overlapb,
+ UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
+
+ return (AAResult != NoAlias);
+}
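+
+// Worked example of the overlap arithmetic above (illustrative numbers):
+// with MMOa = {Offset = 8, Size = 4} and MMOb = {Offset = 0, Size = 16},
+// MinOffset is 0, so Overlapa = 4 + 8 - 0 = 12 and Overlapb = 16 + 0 - 0 = 16.
+// The AA query then compares the ranges starting at the two IR values with
+// sizes 12 and 16, which is enough to cover both accesses from their bases.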
+
+/// Check whether two objects need a chain edge and add it if needed.
+void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
+ unsigned Latency) {
+ if (MIsNeedChainEdge(AAForDep, MFI, MF.getDataLayout(), SUa->getInstr(),
+ SUb->getInstr())) {
+ SDep Dep(SUa, SDep::MayAliasMem);
+ Dep.setLatency(Latency);
+ SUb->addPred(Dep);
+ }
+}
+
+/// Create an SUnit for each real instruction, numbered in top-down topological
+/// order. The instruction order A < B implies that no edge exists from B to A.
+///
+/// Map each real instruction to its SUnit.
+///
+/// After initSUnits, the SUnits vector cannot be resized and the scheduler may
+/// hang onto SUnit pointers. We may relax this in the future by using SUnit IDs
+/// instead of pointers.
+///
+/// MachineScheduler relies on initSUnits numbering the nodes by their order in
+/// the original instruction list.
+void ScheduleDAGInstrs::initSUnits() {
+ // We'll be allocating one SUnit for each real instruction in the region,
+ // which is contained within a basic block.
+ SUnits.reserve(NumRegionInstrs);
+
+ for (MachineInstr &MI : llvm::make_range(RegionBegin, RegionEnd)) {
+ if (MI.isDebugValue())
+ continue;
+
+ SUnit *SU = newSUnit(&MI);
+ MISUnitMap[&MI] = SU;
+
+ SU->isCall = MI.isCall();
+ SU->isCommutable = MI.isCommutable();
+
+ // Assign the Latency field of SU using target-provided information.
+ SU->Latency = SchedModel.computeInstrLatency(SU->getInstr());
+
+ // If this SUnit uses a reserved or unbuffered resource, mark it as such.
+ //
+ // Reserved resources block an instruction from issuing and stall the
+ // entire pipeline. These are identified by BufferSize=0.
+ //
+ // Unbuffered resources prevent execution of subsequent instructions that
+ // require the same resources. This is used for in-order execution pipelines
+ // within an out-of-order core. These are identified by BufferSize=1.
+ if (SchedModel.hasInstrSchedModel()) {
+ const MCSchedClassDesc *SC = getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel.getWriteProcResBegin(SC),
+ PE = SchedModel.getWriteProcResEnd(SC); PI != PE; ++PI) {
+ switch (SchedModel.getProcResource(PI->ProcResourceIdx)->BufferSize) {
+ case 0:
+ SU->hasReservedResource = true;
+ break;
+ case 1:
+ SU->isUnbuffered = true;
+ break;
+ default:
+ break;
+ }
+ }
+ }
+ }
+}
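+
+// For illustration (hypothetical machine model, not a real target): an
+// unpipelined divide unit declared with `let BufferSize = 0` on its
+// ProcResource would cause SUs writing it to get hasReservedResource set
+// here, while an in-order pipe modeled with `let BufferSize = 1` would mark
+// its SUs as isUnbuffered; both flags are then available to the scheduler's
+// strategy when it picks nodes.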
+
+void ScheduleDAGInstrs::collectVRegUses(SUnit *SU) {
+ const MachineInstr *MI = SU->getInstr();
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ if (!MO.readsReg())
+ continue;
+ if (TrackLaneMasks && !MO.isUse())
+ continue;
+
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+
+ // Ignore re-defs.
+ if (TrackLaneMasks) {
+ bool FoundDef = false;
+ for (const MachineOperand &MO2 : MI->operands()) {
+ if (MO2.isReg() && MO2.isDef() && MO2.getReg() == Reg && !MO2.isDead()) {
+ FoundDef = true;
+ break;
+ }
+ }
+ if (FoundDef)
+ continue;
+ }
+
+ // Record this local VReg use.
+ VReg2SUnitMultiMap::iterator UI = VRegUses.find(Reg);
+ for (; UI != VRegUses.end(); ++UI) {
+ if (UI->SU == SU)
+ break;
+ }
+ if (UI == VRegUses.end())
+ VRegUses.insert(VReg2SUnit(Reg, 0, SU));
+ }
+}
+
+class ScheduleDAGInstrs::Value2SUsMap : public MapVector<ValueType, SUList> {
+
+ /// Current total number of SUs in map.
+ unsigned NumNodes;
+
+ /// 1 for loads, 0 for stores. (see comment in SUList)
+ unsigned TrueMemOrderLatency;
+public:
+
+ Value2SUsMap(unsigned lat = 0) : NumNodes(0), TrueMemOrderLatency(lat) {}
+
+ /// To keep NumNodes up to date, insert() is used instead of
+ /// this operator w/ push_back().
+ ValueType &operator[](const SUList &Key) {
+ llvm_unreachable("Don't use. Use insert() instead."); };
+
+ /// Add SU to the SUList of V. If Map grows huge, reduce its size
+ /// by calling reduce().
+ void inline insert(SUnit *SU, ValueType V) {
+ MapVector::operator[](V).push_back(SU);
+ NumNodes++;
+ }
+
+ /// Clears the list of SUs mapped to V.
+ void inline clearList(ValueType V) {
+ iterator Itr = find(V);
+ if (Itr != end()) {
+ assert (NumNodes >= Itr->second.size());
+ NumNodes -= Itr->second.size();
+
+ Itr->second.clear();
+ }
+ }
+
+ /// Clears map from all contents.
+ void clear() {
+ MapVector<ValueType, SUList>::clear();
+ NumNodes = 0;
+ }
+
+ unsigned inline size() const { return NumNodes; }
+
+ /// Count the number of SUs in this map after a reduction.
+ void reComputeSize(void) {
+ NumNodes = 0;
+ for (auto &I : *this)
+ NumNodes += I.second.size();
+ }
+
+ unsigned inline getTrueMemOrderLatency() const {
+ return TrueMemOrderLatency;
+ }
+
+ void dump();
+};
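+
+// Minimal usage sketch for Value2SUsMap (illustrative; mirrors how
+// buildSchedGraph below uses it):
+//   Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+//   Stores.insert(SU, V);            // append SU to the SUList of V
+//   Stores.clearList(V);             // forget all SUs recorded for V
+//   if (Stores.size() + Loads.size() >= HugeRegion)
+//     reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());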
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap) {
+ for (auto &I : Val2SUsMap)
+ addChainDependencies(SU, I.second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addChainDependencies(SUnit *SU,
+ Value2SUsMap &Val2SUsMap,
+ ValueType V) {
+ Value2SUsMap::iterator Itr = Val2SUsMap.find(V);
+ if (Itr != Val2SUsMap.end())
+ addChainDependencies(SU, Itr->second,
+ Val2SUsMap.getTrueMemOrderLatency());
+}
+
+void ScheduleDAGInstrs::addBarrierChain(Value2SUsMap &map) {
+ assert (BarrierChain != nullptr);
+
+ for (auto &I : map) {
+ SUList &sus = I.second;
+ for (auto *SU : sus)
+ SU->addPredBarrier(BarrierChain);
+ }
+ map.clear();
+}
+
+void ScheduleDAGInstrs::insertBarrierChain(Value2SUsMap &map) {
+ assert (BarrierChain != nullptr);
+
+ // Go through all lists of SUs.
+ for (Value2SUsMap::iterator I = map.begin(), EE = map.end(); I != EE;) {
+ Value2SUsMap::iterator CurrItr = I++;
+ SUList &sus = CurrItr->second;
+ SUList::iterator SUItr = sus.begin(), SUEE = sus.end();
+ for (; SUItr != SUEE; ++SUItr) {
+ // Stop on BarrierChain or any instruction above it.
+ if ((*SUItr)->NodeNum <= BarrierChain->NodeNum)
+ break;
+
+ (*SUItr)->addPredBarrier(BarrierChain);
+ }
+
+ // Also remove the BarrierChain from the list if present.
+ if (SUItr != SUEE && *SUItr == BarrierChain)
+ SUItr++;
+
+ // Remove all SUs that are now successors of BarrierChain.
+ if (SUItr != sus.begin())
+ sus.erase(sus.begin(), SUItr);
+ }
+
+ // Remove all entries with empty SU lists.
+ map.remove_if([&](std::pair<ValueType, SUList> &mapEntry) {
+ return (mapEntry.second.empty()); });
+
+ // Recompute the size of the map (NumNodes).
+ map.reComputeSize();
+}
+
+/// If RPTracker is non-null, compute register pressure as a side effect. The
+/// DAG builder is an efficient place to do it because it already visits
+/// operands.
+void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
+ RegPressureTracker *RPTracker,
+ PressureDiffs *PDiffs,
+ LiveIntervals *LIS,
+ bool TrackLaneMasks) {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+ bool UseAA = EnableAASchedMI.getNumOccurrences() > 0 ? EnableAASchedMI
+ : ST.useAA();
+ AAForDep = UseAA ? AA : nullptr;
+
+ BarrierChain = nullptr;
+
+ this->TrackLaneMasks = TrackLaneMasks;
+ MISUnitMap.clear();
+ ScheduleDAG::clearDAG();
+
+ // Create an SUnit for each real instruction.
+ initSUnits();
+
+ if (PDiffs)
+ PDiffs->init(SUnits.size());
+
+ // We build scheduling units by walking a block's instruction list
+ // from bottom to top.
+
+ // Each MI's memory operand(s) is analyzed to a list of underlying
+ // objects. The SU is then inserted in the SUList(s) mapped from the
+ // Value(s). Each Value thus gets mapped to lists of SUs depending
+ // on it, stores and loads kept separately. Two SUs are trivially
+ // non-aliasing if they both depend on only identified Values and do
+ // not share any common Value.
+ Value2SUsMap Stores, Loads(1 /*TrueMemOrderLatency*/);
+
+ // Certain memory accesses are known to not alias any SU in Stores
+ // or Loads, and have therefore their own 'NonAlias'
+ // domain. E.g. spill / reload instructions never alias LLVM IR
+ // Values. It would be nice to assume that this type of memory
+ // accesses always have a proper memory operand modelling, and are
+ // therefore never unanalyzable, but this is conservatively not
+ // done.
+ Value2SUsMap NonAliasStores, NonAliasLoads(1 /*TrueMemOrderLatency*/);
+
+ // Remove any stale debug info; sometimes BuildSchedGraph is called again
+ // without emitting the info from the previous call.
+ DbgValues.clear();
+ FirstDbgValue = nullptr;
+
+ assert(Defs.empty() && Uses.empty() &&
+ "Only BuildGraph should update Defs/Uses");
+ Defs.setUniverse(TRI->getNumRegs());
+ Uses.setUniverse(TRI->getNumRegs());
+
+ assert(CurrentVRegDefs.empty() && "nobody else should use CurrentVRegDefs");
+ assert(CurrentVRegUses.empty() && "nobody else should use CurrentVRegUses");
+ unsigned NumVirtRegs = MRI.getNumVirtRegs();
+ CurrentVRegDefs.setUniverse(NumVirtRegs);
+ CurrentVRegUses.setUniverse(NumVirtRegs);
+
+ VRegUses.clear();
+ VRegUses.setUniverse(NumVirtRegs);
+
+ // Model data dependencies between instructions being scheduled and the
+ // ExitSU.
+ addSchedBarrierDeps();
+
+ // Walk the list of instructions, from bottom moving up.
+ MachineInstr *DbgMI = nullptr;
+ for (MachineBasicBlock::iterator MII = RegionEnd, MIE = RegionBegin;
+ MII != MIE; --MII) {
+ MachineInstr &MI = *std::prev(MII);
+ if (DbgMI) {
+ DbgValues.push_back(std::make_pair(DbgMI, &MI));
+ DbgMI = nullptr;
+ }
+
+ if (MI.isDebugValue()) {
+ DbgMI = &MI;
+ continue;
+ }
+ SUnit *SU = MISUnitMap[&MI];
+ assert(SU && "No SUnit mapped to this MI");
+
+ if (RPTracker) {
+ collectVRegUses(SU);
+
+ RegisterOperands RegOpers;
+ RegOpers.collect(MI, *TRI, MRI, TrackLaneMasks, false);
+ if (TrackLaneMasks) {
+ SlotIndex SlotIdx = LIS->getInstructionIndex(MI);
+ RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx);
+ }
+ if (PDiffs != nullptr)
+ PDiffs->addInstruction(SU->NodeNum, RegOpers, MRI);
+
+ RPTracker->recedeSkipDebugValues();
+ assert(&*RPTracker->getPos() == &MI && "RPTracker in sync");
+ RPTracker->recede(RegOpers);
+ }
+
+ assert(
+ (CanHandleTerminators || (!MI.isTerminator() && !MI.isPosition())) &&
+ "Cannot schedule terminators or labels!");
+
+ // Add register-based dependencies (data, anti, and output).
+ // For some instructions (calls, returns, inline-asm, etc.) there can
+ // be explicit uses and implicit defs, in which case the use will appear
+ // on the operand list before the def. Do two passes over the operand
+ // list to make sure that defs are processed before any uses.
+ bool HasVRegDef = false;
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ if (!MO.isReg() || !MO.isDef())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (TRI->isPhysicalRegister(Reg))
+ addPhysRegDeps(SU, j);
+ else {
+ HasVRegDef = true;
+ addVRegDefDeps(SU, j);
+ }
+ }
+ // Now process all uses.
+ for (unsigned j = 0, n = MI.getNumOperands(); j != n; ++j) {
+ const MachineOperand &MO = MI.getOperand(j);
+ // Only look at use operands.
+ // We do not need to check for MO.readsReg() here because subsequent
+ // subregister defs will get output dependence edges and need no
+ // additional use dependencies.
+ if (!MO.isReg() || !MO.isUse())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ if (TRI->isPhysicalRegister(Reg))
+ addPhysRegDeps(SU, j);
+ else if (MO.readsReg()) // ignore undef operands
+ addVRegUseDeps(SU, j);
+ }
+
+ // If we haven't seen any uses in this scheduling region, create a
+ // dependence edge to ExitSU to model the live-out latency. This is required
+ // for vreg defs with no in-region use, and prefetches with no vreg def.
+ //
+ // FIXME: NumDataSuccs would be more precise than NumSuccs here. This
+ // check currently relies on being called before adding chain deps.
+ if (SU->NumSuccs == 0 && SU->Latency > 1 && (HasVRegDef || MI.mayLoad())) {
+ SDep Dep(SU, SDep::Artificial);
+ Dep.setLatency(SU->Latency - 1);
+ ExitSU.addPred(Dep);
+ }
+
+ // Add memory dependencies (Note: isStoreToStackSlot and
+ // isLoadFromStackSlot are not usable after stack slots are lowered to
+ // actual addresses).
+
+ // This is a barrier event that acts as a pivotal node in the DAG.
+ if (isGlobalMemoryObject(AA, &MI)) {
+
+ // Become the barrier chain.
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+ BarrierChain = SU;
+
+ DEBUG(dbgs() << "Global memory object and new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+
+ // Add dependencies against everything below it and clear maps.
+ addBarrierChain(Stores);
+ addBarrierChain(Loads);
+ addBarrierChain(NonAliasStores);
+ addBarrierChain(NonAliasLoads);
+
+ continue;
+ }
+
+ // If it's not a store or a variant load, we're done.
+ if (!MI.mayStore() && !(MI.mayLoad() && !MI.isInvariantLoad(AA)))
+ continue;
+
+ // Always add a dependency edge to BarrierChain if present.
+ if (BarrierChain)
+ BarrierChain->addPredBarrier(SU);
+
+ // Find the underlying objects for MI. The Objs vector is either
+ // empty, or filled with the Values of memory locations which this
+ // SU depends on. An empty vector means the memory location is
+ // unknown, and may alias anything.
+ UnderlyingObjectsVector Objs;
+ getUnderlyingObjectsForInstr(&MI, MFI, Objs, MF.getDataLayout());
+
+ if (MI.mayStore()) {
+ if (Objs.empty()) {
+ // An unknown store depends on all stores and loads.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+ addChainDependencies(SU, Loads);
+ addChainDependencies(SU, NonAliasLoads);
+
+ // Map this store to 'UnknownValue'.
+ Stores.insert(SU, UnknownValue);
+ } else {
+ // Add precise dependencies against all previously seen memory
+ // accesses mapped to the same Value(s).
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add dependencies to previous stores and loads mapped to V.
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+ addChainDependencies(SU, (ThisMayAlias ? Loads : NonAliasLoads), V);
+ }
+ // Update the store map after all chains have been added to avoid adding
+ // a self-loop edge if multiple underlying objects are present.
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Map this store to V.
+ (ThisMayAlias ? Stores : NonAliasStores).insert(SU, V);
+ }
+ // The store may have dependencies to unanalyzable loads and
+ // stores.
+ addChainDependencies(SU, Loads, UnknownValue);
+ addChainDependencies(SU, Stores, UnknownValue);
+ }
+ } else { // SU is a load.
+ if (Objs.empty()) {
+ // An unknown load depends on all stores.
+ addChainDependencies(SU, Stores);
+ addChainDependencies(SU, NonAliasStores);
+
+ Loads.insert(SU, UnknownValue);
+ } else {
+ for (const UnderlyingObject &UnderlObj : Objs) {
+ ValueType V = UnderlObj.getValue();
+ bool ThisMayAlias = UnderlObj.mayAlias();
+
+ // Add precise dependencies against all previously seen stores
+ // mapping to the same Value(s).
+ addChainDependencies(SU, (ThisMayAlias ? Stores : NonAliasStores), V);
+
+ // Map this load to V.
+ (ThisMayAlias ? Loads : NonAliasLoads).insert(SU, V);
+ }
+ // The load may have dependencies to unanalyzable stores.
+ addChainDependencies(SU, Stores, UnknownValue);
+ }
+ }
+
+ // Reduce maps if they grow huge.
+ if (Stores.size() + Loads.size() >= HugeRegion) {
+ DEBUG(dbgs() << "Reducing Stores and Loads maps.\n";);
+ reduceHugeMemNodeMaps(Stores, Loads, getReductionSize());
+ }
+ if (NonAliasStores.size() + NonAliasLoads.size() >= HugeRegion) {
+ DEBUG(dbgs() << "Reducing NonAliasStores and NonAliasLoads maps.\n";);
+ reduceHugeMemNodeMaps(NonAliasStores, NonAliasLoads, getReductionSize());
+ }
+ }
+
+ if (DbgMI)
+ FirstDbgValue = DbgMI;
+
+ Defs.clear();
+ Uses.clear();
+ CurrentVRegDefs.clear();
+ CurrentVRegUses.clear();
+}
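+
+// Sketch of the expected driver sequence around buildSchedGraph (the real
+// call sites live in the machine schedulers; shown here only to tie the
+// pieces above together):
+//   DAG->startBlock(MBB);
+//   DAG->enterRegion(MBB, RegionBegin, RegionEnd, NumRegionInstrs);
+//   DAG->buildSchedGraph(AA, RPTracker, PDiffs, LIS, TrackLaneMasks);
+//   // ... schedule the SUnits ...
+//   DAG->exitRegion();
+//   DAG->finishBlock();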
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const PseudoSourceValue* PSV) {
+ PSV->printCustom(OS);
+ return OS;
+}
+
+void ScheduleDAGInstrs::Value2SUsMap::dump() {
+ for (auto &Itr : *this) {
+ if (Itr.first.is<const Value*>()) {
+ const Value *V = Itr.first.get<const Value*>();
+ if (isa<UndefValue>(V))
+ dbgs() << "Unknown";
+ else
+ V->printAsOperand(dbgs());
+ }
+ else if (Itr.first.is<const PseudoSourceValue*>())
+ dbgs() << Itr.first.get<const PseudoSourceValue*>();
+ else
+ llvm_unreachable("Unknown Value type.");
+
+ dbgs() << " : ";
+ dumpSUList(Itr.second);
+ }
+}
+
+/// Reduce maps in FIFO order, by N SUs. This is better than turning
+/// every Nth memory SU into BarrierChain in buildSchedGraph(), since
+/// it avoids unnecessary edges between seen SUs above the new
+/// BarrierChain, and those below it.
+void ScheduleDAGInstrs::reduceHugeMemNodeMaps(Value2SUsMap &stores,
+ Value2SUsMap &loads, unsigned N) {
+ DEBUG(dbgs() << "Before reduction:\nStoring SUnits:\n";
+ stores.dump();
+ dbgs() << "Loading SUnits:\n";
+ loads.dump());
+
+ // Insert all SU's NodeNums into a vector and sort it.
+ std::vector<unsigned> NodeNums;
+ NodeNums.reserve(stores.size() + loads.size());
+ for (auto &I : stores)
+ for (auto *SU : I.second)
+ NodeNums.push_back(SU->NodeNum);
+ for (auto &I : loads)
+ for (auto *SU : I.second)
+ NodeNums.push_back(SU->NodeNum);
+ std::sort(NodeNums.begin(), NodeNums.end());
+
+ // The last N elements in NodeNums will be removed, and the SU with
+ // the lowest NodeNum among them will become the new BarrierChain to
+ // let the not-yet-seen SUs have a dependency on the removed SUs.
+ assert (N <= NodeNums.size());
+ SUnit *newBarrierChain = &SUnits[*(NodeNums.end() - N)];
+ if (BarrierChain) {
+ // The aliasing and non-aliasing maps reduce independently of each
+ // other, but share a common BarrierChain. Check if the
+ // newBarrierChain is above the former one. If it is not, it may
+ // introduce a loop to use newBarrierChain, so keep the old one.
+ if (newBarrierChain->NodeNum < BarrierChain->NodeNum) {
+ BarrierChain->addPredBarrier(newBarrierChain);
+ BarrierChain = newBarrierChain;
+ DEBUG(dbgs() << "Inserting new barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+ }
+ else
+ DEBUG(dbgs() << "Keeping old barrier chain: SU("
+ << BarrierChain->NodeNum << ").\n";);
+ }
+ else
+ BarrierChain = newBarrierChain;
+
+ insertBarrierChain(stores);
+ insertBarrierChain(loads);
+
+ DEBUG(dbgs() << "After reduction:\nStoring SUnits:\n";
+ stores.dump();
+ dbgs() << "Loading SUnits:\n";
+ loads.dump());
+}
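+
+// Worked example (illustrative numbers): with N = 2 and sorted NodeNums of
+// {3, 5, 7, 9, 12}, *(NodeNums.end() - N) is 9, so SUnits[9] becomes the
+// candidate newBarrierChain. insertBarrierChain() then gives SU(12) a barrier
+// edge to SU(9), removes both from the maps, and SUs visited later depend
+// only on the new BarrierChain rather than on each removed SU.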
+
+/// \brief Initialize register live-range state for updating kills.
+void ScheduleDAGInstrs::startBlockForKills(MachineBasicBlock *BB) {
+ // Start with no live registers.
+ LiveRegs.reset();
+
+ // Examine the live-in regs of all successors.
+ for (MachineBasicBlock::succ_iterator SI = BB->succ_begin(),
+ SE = BB->succ_end(); SI != SE; ++SI) {
+ for (const auto &LI : (*SI)->liveins()) {
+ // Repeat, for reg and all subregs.
+ for (MCSubRegIterator SubRegs(LI.PhysReg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ LiveRegs.set(*SubRegs);
+ }
+ }
+}
+
+/// \brief If we change a kill flag on the bundle instruction's implicit register
+/// operands, then we also need to propagate that to any instructions inside
+/// the bundle which had the same kill state.
+static void toggleBundleKillFlag(MachineInstr *MI, unsigned Reg,
+ bool NewKillState,
+ const TargetRegisterInfo *TRI) {
+ if (MI->getOpcode() != TargetOpcode::BUNDLE)
+ return;
+
+ // Walk backwards from the last instruction in the bundle to the first.
+ // Once we set a kill flag on an instruction, we bail out, as otherwise we
+ // might set it on too many operands. We will clear as many flags as we
+ // can though.
+ MachineBasicBlock::instr_iterator Begin = MI->getIterator();
+ MachineBasicBlock::instr_iterator End = getBundleEnd(*MI);
+ while (Begin != End) {
+ if (NewKillState) {
+ if ((--End)->addRegisterKilled(Reg, TRI, /* addIfNotFound= */ false))
+ return;
+ } else
+ (--End)->clearRegisterKills(Reg, TRI);
+ }
+}
+
+bool ScheduleDAGInstrs::toggleKillFlag(MachineInstr *MI, MachineOperand &MO) {
+ // Setting kill flag...
+ if (!MO.isKill()) {
+ MO.setIsKill(true);
+ toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
+ return false;
+ }
+
+ // If MO itself is live, clear the kill flag...
+ if (LiveRegs.test(MO.getReg())) {
+ MO.setIsKill(false);
+ toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
+ return false;
+ }
+
+ // If any subreg of MO is live, then create an imp-def for that
+ // subreg and keep MO marked as killed.
+ MO.setIsKill(false);
+ toggleBundleKillFlag(MI, MO.getReg(), false, TRI);
+ bool AllDead = true;
+ const unsigned SuperReg = MO.getReg();
+ MachineInstrBuilder MIB(MF, MI);
+ for (MCSubRegIterator SubRegs(SuperReg, TRI); SubRegs.isValid(); ++SubRegs) {
+ if (LiveRegs.test(*SubRegs)) {
+ MIB.addReg(*SubRegs, RegState::ImplicitDefine);
+ AllDead = false;
+ }
+ }
+
+ if (AllDead) {
+ MO.setIsKill(true);
+ toggleBundleKillFlag(MI, MO.getReg(), true, TRI);
+ }
+ return false;
+}
+
+// FIXME: Reuse the LivePhysRegs utility for this.
+void ScheduleDAGInstrs::fixupKills(MachineBasicBlock *MBB) {
+ DEBUG(dbgs() << "Fixup kills for BB#" << MBB->getNumber() << '\n');
+
+ LiveRegs.resize(TRI->getNumRegs());
+ BitVector killedRegs(TRI->getNumRegs());
+
+ startBlockForKills(MBB);
+
+ // Examine block from end to start...
+ unsigned Count = MBB->size();
+ for (MachineBasicBlock::iterator I = MBB->end(), E = MBB->begin();
+ I != E; --Count) {
+ MachineInstr &MI = *--I;
+ if (MI.isDebugValue())
+ continue;
+
+ // Update liveness. Registers that are defined but not used in this
+ // instruction are now dead. Clear the register and all of its subregs
+ // from the live set, since they are completely defined.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isRegMask())
+ LiveRegs.clearBitsNotInMask(MO.getRegMask());
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0) continue;
+ if (!MO.isDef()) continue;
+ // Ignore two-addr defs.
+ if (MI.isRegTiedToUseOperand(i)) continue;
+
+ // Repeat for reg and all subregs.
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ LiveRegs.reset(*SubRegs);
+ }
+
+ // Examine all used registers and set/clear kill flag. When a
+ // register is used multiple times we only set the kill flag on
+ // the first use. Don't set kill flags on undef operands.
+ killedRegs.reset();
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
+ unsigned Reg = MO.getReg();
+ if ((Reg == 0) || MRI.isReserved(Reg)) continue;
+
+ bool kill = false;
+ if (!killedRegs.test(Reg)) {
+ kill = true;
+ // A register is not killed if any subregs are live...
+ for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
+ if (LiveRegs.test(*SubRegs)) {
+ kill = false;
+ break;
+ }
+ }
+
+ // If no subreg is live, then the register is killed if it became
+ // live in this instruction.
+ if (kill)
+ kill = !LiveRegs.test(Reg);
+ }
+
+ if (MO.isKill() != kill) {
+ DEBUG(dbgs() << "Fixing " << MO << " in ");
+ // Warning: toggleKillFlag may invalidate MO.
+ toggleKillFlag(&MI, MO);
+ DEBUG(MI.dump());
+ DEBUG({
+ if (MI.getOpcode() == TargetOpcode::BUNDLE) {
+ MachineBasicBlock::instr_iterator Begin = MI.getIterator();
+ MachineBasicBlock::instr_iterator End = getBundleEnd(MI);
+ while (++Begin != End)
+ DEBUG(Begin->dump());
+ }
+ });
+ }
+
+ killedRegs.set(Reg);
+ }
+
+ // Mark any used register (that is not using undef) and subregs as
+ // now live...
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.isUndef()) continue;
+ unsigned Reg = MO.getReg();
+ if ((Reg == 0) || MRI.isReserved(Reg)) continue;
+
+ for (MCSubRegIterator SubRegs(Reg, TRI, /*IncludeSelf=*/true);
+ SubRegs.isValid(); ++SubRegs)
+ LiveRegs.set(*SubRegs);
+ }
+ }
+}
+
+void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ SU->getInstr()->dump();
+#endif
+}
+
+std::string ScheduleDAGInstrs::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream oss(s);
+ if (SU == &EntrySU)
+ oss << "<entry>";
+ else if (SU == &ExitSU)
+ oss << "<exit>";
+ else
+ SU->getInstr()->print(oss, /*SkipOpers=*/true);
+ return oss.str();
+}
+
+/// Return the basic block label. It is not necessarily unique because a block
+/// contains multiple scheduling regions. But it is fine for visualization.
+std::string ScheduleDAGInstrs::getDAGName() const {
+ return "dag." + BB->getFullName();
+}
+
+//===----------------------------------------------------------------------===//
+// SchedDFSResult Implementation
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+/// \brief Internal state used to compute SchedDFSResult.
+class SchedDFSImpl {
+ SchedDFSResult &R;
+
+ /// Join DAG nodes into equivalence classes by their subtree.
+ IntEqClasses SubtreeClasses;
+ /// List PredSU, SuccSU pairs that represent data edges between subtrees.
+ std::vector<std::pair<const SUnit*, const SUnit*> > ConnectionPairs;
+
+ struct RootData {
+ unsigned NodeID;
+ unsigned ParentNodeID; // Parent node (member of the parent subtree).
+ unsigned SubInstrCount; // Instr count in this tree only, not children.
+
+ RootData(unsigned id): NodeID(id),
+ ParentNodeID(SchedDFSResult::InvalidSubtreeID),
+ SubInstrCount(0) {}
+
+ unsigned getSparseSetIndex() const { return NodeID; }
+ };
+
+ SparseSet<RootData> RootSet;
+
+public:
+ SchedDFSImpl(SchedDFSResult &r): R(r), SubtreeClasses(R.DFSNodeData.size()) {
+ RootSet.setUniverse(R.DFSNodeData.size());
+ }
+
+ /// Return true if this node has been visited by the DFS traversal.
+ ///
+ /// During visitPostorderNode the Node's SubtreeID is assigned to the Node
+ /// ID. Later, SubtreeID is updated but remains valid.
+ bool isVisited(const SUnit *SU) const {
+ return R.DFSNodeData[SU->NodeNum].SubtreeID
+ != SchedDFSResult::InvalidSubtreeID;
+ }
+
+ /// Initialize this node's instruction count. We don't need to flag the node
+ /// visited until visitPostorder because the DAG cannot have cycles.
+ void visitPreorder(const SUnit *SU) {
+ R.DFSNodeData[SU->NodeNum].InstrCount =
+ SU->getInstr()->isTransient() ? 0 : 1;
+ }
+
+ /// Called once for each node after all predecessors are visited. Revisit this
+ /// node's predecessors and potentially join them now that we know the ILP of
+ /// the other predecessors.
+ void visitPostorderNode(const SUnit *SU) {
+ // Mark this node as the root of a subtree. It may be joined with its
+ // successors later.
+ R.DFSNodeData[SU->NodeNum].SubtreeID = SU->NodeNum;
+ RootData RData(SU->NodeNum);
+ RData.SubInstrCount = SU->getInstr()->isTransient() ? 0 : 1;
+
+ // If any predecessors are still in their own subtree, they either cannot be
+ // joined or are large enough to remain separate. If this parent node's
+ // total instruction count is not greater than a child subtree by at least
+ // the subtree limit, then try to join it now since splitting subtrees is
+ // only useful if multiple high-pressure paths are possible.
+ unsigned InstrCount = R.DFSNodeData[SU->NodeNum].InstrCount;
+ for (SUnit::const_pred_iterator
+ PI = SU->Preds.begin(), PE = SU->Preds.end(); PI != PE; ++PI) {
+ if (PI->getKind() != SDep::Data)
+ continue;
+ unsigned PredNum = PI->getSUnit()->NodeNum;
+ if ((InstrCount - R.DFSNodeData[PredNum].InstrCount) < R.SubtreeLimit)
+ joinPredSubtree(*PI, SU, /*CheckLimit=*/false);
+
+ // Either link or merge the TreeData entry from the child to the parent.
+ if (R.DFSNodeData[PredNum].SubtreeID == PredNum) {
+ // If the predecessor's parent is invalid, this is a tree edge and the
+ // current node is the parent.
+ if (RootSet[PredNum].ParentNodeID == SchedDFSResult::InvalidSubtreeID)
+ RootSet[PredNum].ParentNodeID = SU->NodeNum;
+ }
+ else if (RootSet.count(PredNum)) {
+ // The predecessor is not a root, but is still in the root set. This
+ // must be the new parent that it was just joined to. Note that
+ // RootSet[PredNum].ParentNodeID may either be invalid or may still be
+ // set to the original parent.
+ RData.SubInstrCount += RootSet[PredNum].SubInstrCount;
+ RootSet.erase(PredNum);
+ }
+ }
+ RootSet[SU->NodeNum] = RData;
+ }
+
+ /// Called once for each tree edge after calling visitPostorderNode on the
+ /// predecessor. Increment the parent node's instruction count and
+ /// preemptively join this subtree to its parent's if it is small enough.
+ void visitPostorderEdge(const SDep &PredDep, const SUnit *Succ) {
+ R.DFSNodeData[Succ->NodeNum].InstrCount
+ += R.DFSNodeData[PredDep.getSUnit()->NodeNum].InstrCount;
+ joinPredSubtree(PredDep, Succ);
+ }
+
+ /// Add a connection for cross edges.
+ void visitCrossEdge(const SDep &PredDep, const SUnit *Succ) {
+ ConnectionPairs.push_back(std::make_pair(PredDep.getSUnit(), Succ));
+ }
+
+ /// Set each node's subtree ID to the representative ID and record connections
+ /// between trees.
+ void finalize() {
+ SubtreeClasses.compress();
+ R.DFSTreeData.resize(SubtreeClasses.getNumClasses());
+ assert(SubtreeClasses.getNumClasses() == RootSet.size()
+ && "number of roots should match trees");
+ for (SparseSet<RootData>::const_iterator
+ RI = RootSet.begin(), RE = RootSet.end(); RI != RE; ++RI) {
+ unsigned TreeID = SubtreeClasses[RI->NodeID];
+ if (RI->ParentNodeID != SchedDFSResult::InvalidSubtreeID)
+ R.DFSTreeData[TreeID].ParentTreeID = SubtreeClasses[RI->ParentNodeID];
+ R.DFSTreeData[TreeID].SubInstrCount = RI->SubInstrCount;
+ // Note that SubInstrCount may be greater than InstrCount if we joined
+ // subtrees across a cross edge. InstrCount will be attributed to the
+ // original parent, while SubInstrCount will be attributed to the joined
+ // parent.
+ }
+ R.SubtreeConnections.resize(SubtreeClasses.getNumClasses());
+ R.SubtreeConnectLevels.resize(SubtreeClasses.getNumClasses());
+ DEBUG(dbgs() << R.getNumSubtrees() << " subtrees:\n");
+ for (unsigned Idx = 0, End = R.DFSNodeData.size(); Idx != End; ++Idx) {
+ R.DFSNodeData[Idx].SubtreeID = SubtreeClasses[Idx];
+ DEBUG(dbgs() << " SU(" << Idx << ") in tree "
+ << R.DFSNodeData[Idx].SubtreeID << '\n');
+ }
+ for (std::vector<std::pair<const SUnit*, const SUnit*> >::const_iterator
+ I = ConnectionPairs.begin(), E = ConnectionPairs.end();
+ I != E; ++I) {
+ unsigned PredTree = SubtreeClasses[I->first->NodeNum];
+ unsigned SuccTree = SubtreeClasses[I->second->NodeNum];
+ if (PredTree == SuccTree)
+ continue;
+ unsigned Depth = I->first->getDepth();
+ addConnection(PredTree, SuccTree, Depth);
+ addConnection(SuccTree, PredTree, Depth);
+ }
+ }
+
+protected:
+ /// Join the predecessor subtree with the successor that is its DFS
+ /// parent. Apply some heuristics before joining.
+ bool joinPredSubtree(const SDep &PredDep, const SUnit *Succ,
+ bool CheckLimit = true) {
+ assert(PredDep.getKind() == SDep::Data && "Subtrees are for data edges");
+
+ // Check if the predecessor is already joined.
+ const SUnit *PredSU = PredDep.getSUnit();
+ unsigned PredNum = PredSU->NodeNum;
+ if (R.DFSNodeData[PredNum].SubtreeID != PredNum)
+ return false;
+
+ // Four is the magic number of successors before a node is considered a
+ // pinch point.
+ unsigned NumDataSucs = 0;
+ for (SUnit::const_succ_iterator SI = PredSU->Succs.begin(),
+ SE = PredSU->Succs.end(); SI != SE; ++SI) {
+ if (SI->getKind() == SDep::Data) {
+ if (++NumDataSucs >= 4)
+ return false;
+ }
+ }
+ if (CheckLimit && R.DFSNodeData[PredNum].InstrCount > R.SubtreeLimit)
+ return false;
+ R.DFSNodeData[PredNum].SubtreeID = Succ->NodeNum;
+ SubtreeClasses.join(Succ->NodeNum, PredNum);
+ return true;
+ }
+
+ /// Called by finalize() to record a connection between trees.
+ void addConnection(unsigned FromTree, unsigned ToTree, unsigned Depth) {
+ if (!Depth)
+ return;
+
+ do {
+ SmallVectorImpl<SchedDFSResult::Connection> &Connections =
+ R.SubtreeConnections[FromTree];
+ for (SmallVectorImpl<SchedDFSResult::Connection>::iterator
+ I = Connections.begin(), E = Connections.end(); I != E; ++I) {
+ if (I->TreeID == ToTree) {
+ I->Level = std::max(I->Level, Depth);
+ return;
+ }
+ }
+ Connections.push_back(SchedDFSResult::Connection(ToTree, Depth));
+ FromTree = R.DFSTreeData[FromTree].ParentTreeID;
+ } while (FromTree != SchedDFSResult::InvalidSubtreeID);
+ }
+};
+} // namespace llvm
+
+namespace {
+/// \brief Manage the stack used by a reverse depth-first search over the DAG.
+class SchedDAGReverseDFS {
+ std::vector<std::pair<const SUnit*, SUnit::const_pred_iterator> > DFSStack;
+public:
+ bool isComplete() const { return DFSStack.empty(); }
+
+ void follow(const SUnit *SU) {
+ DFSStack.push_back(std::make_pair(SU, SU->Preds.begin()));
+ }
+ void advance() { ++DFSStack.back().second; }
+
+ const SDep *backtrack() {
+ DFSStack.pop_back();
+ return DFSStack.empty() ? nullptr : std::prev(DFSStack.back().second);
+ }
+
+ const SUnit *getCurr() const { return DFSStack.back().first; }
+
+ SUnit::const_pred_iterator getPred() const { return DFSStack.back().second; }
+
+ SUnit::const_pred_iterator getPredEnd() const {
+ return getCurr()->Preds.end();
+ }
+};
+} // anonymous
+
+static bool hasDataSucc(const SUnit *SU) {
+ for (SUnit::const_succ_iterator
+ SI = SU->Succs.begin(), SE = SU->Succs.end(); SI != SE; ++SI) {
+ if (SI->getKind() == SDep::Data && !SI->getSUnit()->isBoundaryNode())
+ return true;
+ }
+ return false;
+}
+
+/// Compute an ILP metric for all nodes in the subDAG reachable via depth-first
+/// search from this root.
+void SchedDFSResult::compute(ArrayRef<SUnit> SUnits) {
+ if (!IsBottomUp)
+ llvm_unreachable("Top-down ILP metric is unimplemnted");
+
+ SchedDFSImpl Impl(*this);
+ for (ArrayRef<SUnit>::const_iterator
+ SI = SUnits.begin(), SE = SUnits.end(); SI != SE; ++SI) {
+ const SUnit *SU = &*SI;
+ if (Impl.isVisited(SU) || hasDataSucc(SU))
+ continue;
+
+ SchedDAGReverseDFS DFS;
+ Impl.visitPreorder(SU);
+ DFS.follow(SU);
+ for (;;) {
+ // Traverse the leftmost path as far as possible.
+ while (DFS.getPred() != DFS.getPredEnd()) {
+ const SDep &PredDep = *DFS.getPred();
+ DFS.advance();
+ // Ignore non-data edges.
+ if (PredDep.getKind() != SDep::Data
+ || PredDep.getSUnit()->isBoundaryNode()) {
+ continue;
+ }
+ // An already visited edge is a cross edge, assuming an acyclic DAG.
+ if (Impl.isVisited(PredDep.getSUnit())) {
+ Impl.visitCrossEdge(PredDep, DFS.getCurr());
+ continue;
+ }
+ Impl.visitPreorder(PredDep.getSUnit());
+ DFS.follow(PredDep.getSUnit());
+ }
+ // Visit the top of the stack in postorder and backtrack.
+ const SUnit *Child = DFS.getCurr();
+ const SDep *PredDep = DFS.backtrack();
+ Impl.visitPostorderNode(Child);
+ if (PredDep)
+ Impl.visitPostorderEdge(*PredDep, DFS.getCurr());
+ if (DFS.isComplete())
+ break;
+ }
+ }
+ Impl.finalize();
+}
+
+/// The root of the given SubtreeID was just scheduled. For all subtrees
+/// connected to this tree, record the depth of the connection so that the
+/// nearest connected subtrees can be prioritized.
+void SchedDFSResult::scheduleTree(unsigned SubtreeID) {
+ for (SmallVectorImpl<Connection>::const_iterator
+ I = SubtreeConnections[SubtreeID].begin(),
+ E = SubtreeConnections[SubtreeID].end(); I != E; ++I) {
+ SubtreeConnectLevels[I->TreeID] =
+ std::max(SubtreeConnectLevels[I->TreeID], I->Level);
+ DEBUG(dbgs() << " Tree: " << I->TreeID
+ << " @" << SubtreeConnectLevels[I->TreeID] << '\n');
+ }
+}
+
+LLVM_DUMP_METHOD
+void ILPValue::print(raw_ostream &OS) const {
+ OS << InstrCount << " / " << Length << " = ";
+ if (!Length)
+ OS << "BADILP";
+ else
+ OS << format("%g", ((double)InstrCount / Length));
+}
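+
+// Example of the format above (illustrative): a node with InstrCount = 8 and
+// Length = 4 prints as "8 / 4 = 2", i.e. an ILP value of two instructions per
+// unit of path length; a zero Length prints "BADILP" instead.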
+
+LLVM_DUMP_METHOD
+void ILPValue::dump() const {
+ dbgs() << *this << '\n';
+}
+
+namespace llvm {
+
+LLVM_DUMP_METHOD
+raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val) {
+ Val.print(OS);
+ return OS;
+}
+
+} // namespace llvm
diff --git a/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
new file mode 100644
index 000000000000..ca2881cb91e0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -0,0 +1,100 @@
+//===-- ScheduleDAGPrinter.cpp - Implement ScheduleDAG::viewGraph() -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <fstream>
+using namespace llvm;
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<ScheduleDAG*> : public DefaultDOTGraphTraits {
+
+ DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+
+ static std::string getGraphName(const ScheduleDAG *G) {
+ return G->MF.getName();
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static bool isNodeHidden(const SUnit *Node) {
+ return (Node->NumPreds > 10 || Node->NumSuccs > 10);
+ }
+
+ static std::string getNodeIdentifierLabel(const SUnit *Node,
+ const ScheduleDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+ OS << static_cast<const void *>(Node);
+ return R;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ static std::string getEdgeAttributes(const SUnit *Node,
+ SUnitIterator EI,
+ const ScheduleDAG *Graph) {
+ if (EI.isArtificialDep())
+ return "color=cyan,style=dashed";
+ if (EI.isCtrlDep())
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ std::string getNodeLabel(const SUnit *Node, const ScheduleDAG *Graph);
+ static std::string getNodeAttributes(const SUnit *N,
+ const ScheduleDAG *Graph) {
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(ScheduleDAG *G,
+ GraphWriter<ScheduleDAG*> &GW) {
+ return G->addCustomGraphFeatures(GW);
+ }
+ };
+}
+
+std::string DOTGraphTraits<ScheduleDAG*>::getNodeLabel(const SUnit *SU,
+ const ScheduleDAG *G) {
+ return G->getGraphNodeLabel(SU);
+}
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void ScheduleDAG::viewGraph(const Twine &Name, const Twine &Title) {
+ // This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(this, Name, false, Title);
+#else
+ errs() << "ScheduleDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+/// Out-of-line implementation with no arguments is handy for gdb.
+void ScheduleDAG::viewGraph() {
+ viewGraph(getDAGName(), "Scheduling-Units Graph for " + getDAGName());
+}
diff --git a/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
new file mode 100644
index 000000000000..69c487033015
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -0,0 +1,240 @@
+//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScoreboardHazardRecognizer class, which
+// encapsulates hazard-avoidance heuristics for scheduling, based on the
+// scheduling itineraries specified for the target.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE DebugType
+
+ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
+ const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
+ const char *ParentDebugType)
+ : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
+ DAG(SchedDAG), IssueWidth(0), IssueCount(0) {
+
+ // Determine the maximum depth of any itinerary. This determines the depth of
+ // the scoreboard. We always make the scoreboard at least 1 cycle deep to
+ // avoid dealing with the boundary condition.
+ unsigned ScoreboardDepth = 1;
+ if (ItinData && !ItinData->isEmpty()) {
+ for (unsigned idx = 0; ; ++idx) {
+ if (ItinData->isEndMarker(idx))
+ break;
+
+ const InstrStage *IS = ItinData->beginStage(idx);
+ const InstrStage *E = ItinData->endStage(idx);
+ unsigned CurCycle = 0;
+ unsigned ItinDepth = 0;
+ for (; IS != E; ++IS) {
+ unsigned StageDepth = CurCycle + IS->getCycles();
+ if (ItinDepth < StageDepth) ItinDepth = StageDepth;
+ CurCycle += IS->getNextCycles();
+ }
+
+ // Find the next power-of-2 >= ItinDepth
+ while (ItinDepth > ScoreboardDepth) {
+ ScoreboardDepth *= 2;
+ // Don't set MaxLookAhead until we find at least one nonzero stage.
+ // This way, an itinerary with no stages has MaxLookAhead==0, which
+ // completely bypasses the scoreboard hazard logic.
+ MaxLookAhead = ScoreboardDepth;
+ }
+ }
+ }
+
+ ReservedScoreboard.reset(ScoreboardDepth);
+ RequiredScoreboard.reset(ScoreboardDepth);
+
+ // If MaxLookAhead is not set above, then we are not enabled.
+ if (!isEnabled())
+ DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n");
+ else {
+ // A nonempty itinerary must have a SchedModel.
+ IssueWidth = ItinData->SchedModel.IssueWidth;
+ DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = "
+ << ScoreboardDepth << '\n');
+ }
+}
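+
+// Worked example with made-up numbers: for an itinerary class whose stages
+// have (Cycles, NextCycles) = (2, 1) and (4, 0), the stage depths are
+// 0 + 2 = 2 and 1 + 4 = 5, so ItinDepth = 5 and ScoreboardDepth is doubled
+// from 1 up to 8, the next power of two, which also becomes MaxLookAhead.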
+
+void ScoreboardHazardRecognizer::Reset() {
+ IssueCount = 0;
+ RequiredScoreboard.reset();
+ ReservedScoreboard.reset();
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void ScoreboardHazardRecognizer::Scoreboard::dump() const {
+ dbgs() << "Scoreboard:\n";
+
+ unsigned last = Depth - 1;
+ while ((last > 0) && ((*this)[last] == 0))
+ last--;
+
+ for (unsigned i = 0; i <= last; i++) {
+ unsigned FUs = (*this)[i];
+ dbgs() << "\t";
+ for (int j = 31; j >= 0; j--)
+ dbgs() << ((FUs & (1 << j)) ? '1' : '0');
+ dbgs() << '\n';
+ }
+}
+#endif
+
+bool ScoreboardHazardRecognizer::atIssueLimit() const {
+ if (IssueWidth == 0)
+ return false;
+
+ return IssueCount == IssueWidth;
+}
+
+ScheduleHazardRecognizer::HazardType
+ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
+ if (!ItinData || ItinData->isEmpty())
+ return NoHazard;
+
+ // Note that stalls will be negative for bottom-up scheduling.
+ int cycle = Stalls;
+
+ // Use the itinerary for the underlying instruction to check for
+ // free FU's in the scoreboard at the appropriate future cycles.
+
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ if (!MCID) {
+ // Don't check hazards for non-machineinstr Nodes.
+ return NoHazard;
+ }
+ unsigned idx = MCID->getSchedClass();
+ for (const InstrStage *IS = ItinData->beginStage(idx),
+ *E = ItinData->endStage(idx); IS != E; ++IS) {
+ // We must find one of the stage's units free for every cycle the
+ // stage is occupied. FIXME: it would be more accurate to find the
+ // same unit free in all the cycles.
+ for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+ int StageCycle = cycle + (int)i;
+ if (StageCycle < 0)
+ continue;
+
+ if (StageCycle >= (int)RequiredScoreboard.getDepth()) {
+ assert((StageCycle - Stalls) < (int)RequiredScoreboard.getDepth() &&
+ "Scoreboard depth exceeded!");
+ // This stage was stalled beyond pipeline depth, so cannot conflict.
+ break;
+ }
+
+ unsigned freeUnits = IS->getUnits();
+ switch (IS->getReservationKind()) {
+ case InstrStage::Required:
+ // Required FUs conflict with both reserved and required ones
+ freeUnits &= ~ReservedScoreboard[StageCycle];
+ // FALLTHROUGH
+ case InstrStage::Reserved:
+ // Reserved FUs can conflict only with required ones.
+ freeUnits &= ~RequiredScoreboard[StageCycle];
+ break;
+ }
+
+ if (!freeUnits) {
+ DEBUG(dbgs() << "*** Hazard in cycle +" << StageCycle << ", ");
+ DEBUG(dbgs() << "SU(" << SU->NodeNum << "): ");
+ DEBUG(DAG->dumpNode(SU));
+ return Hazard;
+ }
+ }
+
+ // Advance the cycle to the next stage.
+ cycle += IS->getNextCycles();
+ }
+
+ return NoHazard;
+}
+
+void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
+ if (!ItinData || ItinData->isEmpty())
+ return;
+
+ // Use the itinerary for the underlying instruction to reserve FU's
+ // in the scoreboard at the appropriate future cycles.
+ const MCInstrDesc *MCID = DAG->getInstrDesc(SU);
+ assert(MCID && "The scheduler must filter non-machineinstrs");
+ if (DAG->TII->isZeroCost(MCID->Opcode))
+ return;
+
+ ++IssueCount;
+
+ unsigned cycle = 0;
+
+ unsigned idx = MCID->getSchedClass();
+ for (const InstrStage *IS = ItinData->beginStage(idx),
+ *E = ItinData->endStage(idx); IS != E; ++IS) {
+ // We must reserve one of the stage's units for every cycle the
+ // stage is occupied. FIXME: it would be more accurate to reserve
+ // the same unit free in all the cycles.
+ for (unsigned int i = 0; i < IS->getCycles(); ++i) {
+ assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
+ "Scoreboard depth exceeded!");
+
+ unsigned freeUnits = IS->getUnits();
+ switch (IS->getReservationKind()) {
+ case InstrStage::Required:
+ // Required FUs conflict with both reserved and required ones
+ freeUnits &= ~ReservedScoreboard[cycle + i];
+ // FALLTHROUGH
+ case InstrStage::Reserved:
+ // Reserved FUs can conflict only with required ones.
+ freeUnits &= ~RequiredScoreboard[cycle + i];
+ break;
+ }
+
+ // reduce to a single unit
+ unsigned freeUnit = 0;
+ do {
+ freeUnit = freeUnits;
+ freeUnits = freeUnit & (freeUnit - 1);
+ } while (freeUnits);
+
+ if (IS->getReservationKind() == InstrStage::Required)
+ RequiredScoreboard[cycle + i] |= freeUnit;
+ else
+ ReservedScoreboard[cycle + i] |= freeUnit;
+ }
+
+ // Advance the cycle to the next stage.
+ cycle += IS->getNextCycles();
+ }
+
+ DEBUG(ReservedScoreboard.dump());
+ DEBUG(RequiredScoreboard.dump());
+}
+
+void ScoreboardHazardRecognizer::AdvanceCycle() {
+ IssueCount = 0;
+ ReservedScoreboard[0] = 0; ReservedScoreboard.advance();
+ RequiredScoreboard[0] = 0; RequiredScoreboard.advance();
+}
+
+void ScoreboardHazardRecognizer::RecedeCycle() {
+ IssueCount = 0;
+ ReservedScoreboard[ReservedScoreboard.getDepth()-1] = 0;
+ ReservedScoreboard.recede();
+ RequiredScoreboard[RequiredScoreboard.getDepth()-1] = 0;
+ RequiredScoreboard.recede();
+}
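+
+// A minimal top-down usage sketch (illustrative only; `Itins`, `SchedDAG` and
+// `Sequence` are assumed to be provided by the surrounding scheduler):
+//
+//   ScoreboardHazardRecognizer HR(Itins, SchedDAG, "my-sched");
+//   for (SUnit *SU : Sequence) {
+//     while (HR.atIssueLimit() ||
+//            HR.getHazardType(SU, 0) != ScheduleHazardRecognizer::NoHazard)
+//       HR.AdvanceCycle();
+//     HR.EmitInstruction(SU);
+//   }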
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
new file mode 100644
index 000000000000..d888676583f3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -0,0 +1,15009 @@
+//===-- DAGCombiner.cpp - Implement a DAG node combiner -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass combines dag nodes to form fewer, simpler DAG nodes. It can be run
+// both before and after the DAG is legalized.
+//
+// This pass is not a substitute for the LLVM IR instcombine pass. This pass is
+// primarily intended to handle simplification opportunities that are implicit
+// in the LLVM IR and exposed by the various codegen lowering phases.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "dagcombine"
+
+STATISTIC(NodesCombined , "Number of dag nodes combined");
+STATISTIC(PreIndexedNodes , "Number of pre-indexed nodes created");
+STATISTIC(PostIndexedNodes, "Number of post-indexed nodes created");
+STATISTIC(OpsNarrowed , "Number of load/op/store narrowed");
+STATISTIC(LdStFP2Int , "Number of fp load/store pairs transformed to int");
+STATISTIC(SlicedLoads, "Number of load sliced");
+
+namespace {
+ static cl::opt<bool>
+ CombinerAA("combiner-alias-analysis", cl::Hidden,
+ cl::desc("Enable DAG combiner alias-analysis heuristics"));
+
+ static cl::opt<bool>
+ CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
+ cl::desc("Enable DAG combiner's use of IR alias analysis"));
+
+ static cl::opt<bool>
+ UseTBAA("combiner-use-tbaa", cl::Hidden, cl::init(true),
+ cl::desc("Enable DAG combiner's use of TBAA"));
+
+#ifndef NDEBUG
+ static cl::opt<std::string>
+ CombinerAAOnlyFunc("combiner-aa-only-func", cl::Hidden,
+ cl::desc("Only use DAG-combiner alias analysis in this"
+ " function"));
+#endif
+
+ /// Hidden option to stress test load slicing, i.e., when this option
+ /// is enabled, load slicing bypasses most of its profitability guards.
+ static cl::opt<bool>
+ StressLoadSlicing("combiner-stress-load-slicing", cl::Hidden,
+ cl::desc("Bypass the profitability model of load "
+ "slicing"),
+ cl::init(false));
+
+ static cl::opt<bool>
+ MaySplitLoadIndex("combiner-split-load-index", cl::Hidden, cl::init(true),
+ cl::desc("DAG combiner may split indexing from loads"));
+
+//------------------------------ DAGCombiner ---------------------------------//
+
+ class DAGCombiner {
+ SelectionDAG &DAG;
+ const TargetLowering &TLI;
+ CombineLevel Level;
+ CodeGenOpt::Level OptLevel;
+ bool LegalOperations;
+ bool LegalTypes;
+ bool ForCodeSize;
+
+ /// \brief Worklist of all of the nodes that need to be simplified.
+ ///
+ /// This must behave as a stack -- new nodes to process are pushed onto the
+ /// back and when processing we pop off of the back.
+ ///
+ /// The worklist will not contain duplicates but may contain null entries
+ /// due to nodes being deleted from the underlying DAG.
+ SmallVector<SDNode *, 64> Worklist;
+
+ /// \brief Mapping from an SDNode to its position on the worklist.
+ ///
+ /// This is used to find and remove nodes from the worklist (by nulling
+ /// them) when they are deleted from the underlying DAG. It relies on
+ /// stable indices of nodes within the worklist.
+ DenseMap<SDNode *, unsigned> WorklistMap;
+
+ /// \brief Set of nodes which have been combined (at least once).
+ ///
+ /// This is used to allow us to reliably add any operands of a DAG node
+ /// which have not yet been combined to the worklist.
+ SmallPtrSet<SDNode *, 32> CombinedNodes;
+
+ // AA - Used for DAG load/store alias analysis.
+ AliasAnalysis &AA;
+
+ /// When an instruction is simplified, add all users of the instruction to
+ /// the work lists because they might get more simplified now.
+ void AddUsersToWorklist(SDNode *N) {
+ for (SDNode *Node : N->uses())
+ AddToWorklist(Node);
+ }
+
+ /// Call the node-specific routine that folds each particular type of node.
+ SDValue visit(SDNode *N);
+
+ public:
+ /// Add to the worklist, making sure its instance is at the back (next to
+ /// be processed).
+ void AddToWorklist(SDNode *N) {
+ // Skip handle nodes as they can't usefully be combined and confuse the
+ // zero-use deletion strategy.
+ if (N->getOpcode() == ISD::HANDLENODE)
+ return;
+
+ if (WorklistMap.insert(std::make_pair(N, Worklist.size())).second)
+ Worklist.push_back(N);
+ }
+
+ /// Remove all instances of N from the worklist.
+ void removeFromWorklist(SDNode *N) {
+ CombinedNodes.erase(N);
+
+ auto It = WorklistMap.find(N);
+ if (It == WorklistMap.end())
+ return; // Not in the worklist.
+
+ // Null out the entry rather than erasing it to avoid a linear operation.
+ Worklist[It->second] = nullptr;
+ WorklistMap.erase(It);
+ }
+
+ void deleteAndRecombine(SDNode *N);
+ bool recursivelyDeleteUnusedNodes(SDNode *N);
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo = true);
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, SDValue Res, bool AddTo = true) {
+ return CombineTo(N, &Res, 1, AddTo);
+ }
+
+ /// Replaces all uses of the results of one DAG node with new values.
+ SDValue CombineTo(SDNode *N, SDValue Res0, SDValue Res1,
+ bool AddTo = true) {
+ SDValue To[] = { Res0, Res1 };
+ return CombineTo(N, To, 2, AddTo);
+ }
+
+ void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
+
+ private:
+
+ /// Check the specified integer node value to see if it can be simplified or
+ /// if things it uses can be simplified by bit propagation.
+ /// If so, return true.
+ bool SimplifyDemandedBits(SDValue Op) {
+ unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+ APInt Demanded = APInt::getAllOnesValue(BitWidth);
+ return SimplifyDemandedBits(Op, Demanded);
+ }
+
+ bool SimplifyDemandedBits(SDValue Op, const APInt &Demanded);
+
+ bool CombineToPreIndexedLoadStore(SDNode *N);
+ bool CombineToPostIndexedLoadStore(SDNode *N);
+ SDValue SplitIndexingFromLoad(LoadSDNode *LD);
+ bool SliceUpLoad(SDNode *N);
+
+ /// \brief Replace an ISD::EXTRACT_VECTOR_ELT of a load with a narrowed
+ /// load.
+ ///
+ /// \param EVE ISD::EXTRACT_VECTOR_ELT to be replaced.
+ /// \param InVecVT type of the input vector to EVE with bitcasts resolved.
+ /// \param EltNo index of the vector element to load.
+ /// \param OriginalLoad load that EVE came from to be replaced.
+ /// \returns EVE on success, SDValue() on failure.
+ SDValue ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
+ SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad);
+ void ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad);
+ SDValue PromoteOperand(SDValue Op, EVT PVT, bool &Replace);
+ SDValue SExtPromoteOperand(SDValue Op, EVT PVT);
+ SDValue ZExtPromoteOperand(SDValue Op, EVT PVT);
+ SDValue PromoteIntBinOp(SDValue Op);
+ SDValue PromoteIntShiftOp(SDValue Op);
+ SDValue PromoteExtend(SDValue Op);
+ bool PromoteLoad(SDValue Op);
+
+ void ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs, SDValue Trunc,
+ SDValue ExtLoad, const SDLoc &DL,
+ ISD::NodeType ExtType);
+
+ /// Call the node-specific routine that knows how to fold each
+ /// particular type of node. If that doesn't do anything, try the
+ /// target-specific DAG combines.
+ SDValue combine(SDNode *N);
+
+ // Visitation implementation - Implement dag node combining for different
+ // node types. The semantics are as follows:
+ // Return Value:
+ // SDValue.getNode() == 0 - No change was made
+ // SDValue.getNode() == N - N was replaced, is dead and has been handled.
+ // otherwise - N should be replaced by the returned Operand.
+ //
+ SDValue visitTokenFactor(SDNode *N);
+ SDValue visitMERGE_VALUES(SDNode *N);
+ SDValue visitADD(SDNode *N);
+ SDValue visitSUB(SDNode *N);
+ SDValue visitADDC(SDNode *N);
+ SDValue visitSUBC(SDNode *N);
+ SDValue visitADDE(SDNode *N);
+ SDValue visitSUBE(SDNode *N);
+ SDValue visitMUL(SDNode *N);
+ SDValue useDivRem(SDNode *N);
+ SDValue visitSDIV(SDNode *N);
+ SDValue visitUDIV(SDNode *N);
+ SDValue visitREM(SDNode *N);
+ SDValue visitMULHU(SDNode *N);
+ SDValue visitMULHS(SDNode *N);
+ SDValue visitSMUL_LOHI(SDNode *N);
+ SDValue visitUMUL_LOHI(SDNode *N);
+ SDValue visitSMULO(SDNode *N);
+ SDValue visitUMULO(SDNode *N);
+ SDValue visitIMINMAX(SDNode *N);
+ SDValue visitAND(SDNode *N);
+ SDValue visitANDLike(SDValue N0, SDValue N1, SDNode *LocReference);
+ SDValue visitOR(SDNode *N);
+ SDValue visitORLike(SDValue N0, SDValue N1, SDNode *LocReference);
+ SDValue visitXOR(SDNode *N);
+ SDValue SimplifyVBinOp(SDNode *N);
+ SDValue visitSHL(SDNode *N);
+ SDValue visitSRA(SDNode *N);
+ SDValue visitSRL(SDNode *N);
+ SDValue visitRotate(SDNode *N);
+ SDValue visitBSWAP(SDNode *N);
+ SDValue visitBITREVERSE(SDNode *N);
+ SDValue visitCTLZ(SDNode *N);
+ SDValue visitCTLZ_ZERO_UNDEF(SDNode *N);
+ SDValue visitCTTZ(SDNode *N);
+ SDValue visitCTTZ_ZERO_UNDEF(SDNode *N);
+ SDValue visitCTPOP(SDNode *N);
+ SDValue visitSELECT(SDNode *N);
+ SDValue visitVSELECT(SDNode *N);
+ SDValue visitSELECT_CC(SDNode *N);
+ SDValue visitSETCC(SDNode *N);
+ SDValue visitSETCCE(SDNode *N);
+ SDValue visitSIGN_EXTEND(SDNode *N);
+ SDValue visitZERO_EXTEND(SDNode *N);
+ SDValue visitANY_EXTEND(SDNode *N);
+ SDValue visitSIGN_EXTEND_INREG(SDNode *N);
+ SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
+ SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
+ SDValue visitTRUNCATE(SDNode *N);
+ SDValue visitBITCAST(SDNode *N);
+ SDValue visitBUILD_PAIR(SDNode *N);
+ SDValue visitFADD(SDNode *N);
+ SDValue visitFSUB(SDNode *N);
+ SDValue visitFMUL(SDNode *N);
+ SDValue visitFMA(SDNode *N);
+ SDValue visitFDIV(SDNode *N);
+ SDValue visitFREM(SDNode *N);
+ SDValue visitFSQRT(SDNode *N);
+ SDValue visitFCOPYSIGN(SDNode *N);
+ SDValue visitSINT_TO_FP(SDNode *N);
+ SDValue visitUINT_TO_FP(SDNode *N);
+ SDValue visitFP_TO_SINT(SDNode *N);
+ SDValue visitFP_TO_UINT(SDNode *N);
+ SDValue visitFP_ROUND(SDNode *N);
+ SDValue visitFP_ROUND_INREG(SDNode *N);
+ SDValue visitFP_EXTEND(SDNode *N);
+ SDValue visitFNEG(SDNode *N);
+ SDValue visitFABS(SDNode *N);
+ SDValue visitFCEIL(SDNode *N);
+ SDValue visitFTRUNC(SDNode *N);
+ SDValue visitFFLOOR(SDNode *N);
+ SDValue visitFMINNUM(SDNode *N);
+ SDValue visitFMAXNUM(SDNode *N);
+ SDValue visitBRCOND(SDNode *N);
+ SDValue visitBR_CC(SDNode *N);
+ SDValue visitLOAD(SDNode *N);
+
+ SDValue replaceStoreChain(StoreSDNode *ST, SDValue BetterChain);
+ SDValue replaceStoreOfFPConstant(StoreSDNode *ST);
+
+ SDValue visitSTORE(SDNode *N);
+ SDValue visitINSERT_VECTOR_ELT(SDNode *N);
+ SDValue visitEXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue visitBUILD_VECTOR(SDNode *N);
+ SDValue visitCONCAT_VECTORS(SDNode *N);
+ SDValue visitEXTRACT_SUBVECTOR(SDNode *N);
+ SDValue visitVECTOR_SHUFFLE(SDNode *N);
+ SDValue visitSCALAR_TO_VECTOR(SDNode *N);
+ SDValue visitINSERT_SUBVECTOR(SDNode *N);
+ SDValue visitMLOAD(SDNode *N);
+ SDValue visitMSTORE(SDNode *N);
+ SDValue visitMGATHER(SDNode *N);
+ SDValue visitMSCATTER(SDNode *N);
+ SDValue visitFP_TO_FP16(SDNode *N);
+ SDValue visitFP16_TO_FP(SDNode *N);
+
+ SDValue visitFADDForFMACombine(SDNode *N);
+ SDValue visitFSUBForFMACombine(SDNode *N);
+ SDValue visitFMULForFMACombine(SDNode *N);
+
+ SDValue XformToShuffleWithZero(SDNode *N);
+ SDValue ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue LHS,
+ SDValue RHS);
+
+ SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
+
+ bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
+ SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
+ SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
+ SDValue SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare = false);
+ SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
+ const SDLoc &DL, bool foldBooleans = true);
+
+ bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+ SDValue &CC) const;
+ bool isOneUseSetCC(SDValue N) const;
+
+ SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp);
+ SDValue CombineConsecutiveLoads(SDNode *N, EVT VT);
+ SDValue CombineExtLoad(SDNode *N);
+ SDValue combineRepeatedFPDivisors(SDNode *N);
+ SDValue ConstantFoldBITCASTofBUILD_VECTOR(SDNode *, EVT);
+ SDValue BuildSDIV(SDNode *N);
+ SDValue BuildSDIVPow2(SDNode *N);
+ SDValue BuildUDIV(SDNode *N);
+ SDValue BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags);
+ SDValue buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags, bool Recip);
+ SDValue buildSqrtNROneConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
+ SDValue buildSqrtNRTwoConst(SDValue Op, SDValue Est, unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal);
+ SDValue MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+ bool DemandHighBits = true);
+ SDValue MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1);
+ SDNode *MatchRotatePosNeg(SDValue Shifted, SDValue Pos, SDValue Neg,
+ SDValue InnerPos, SDValue InnerNeg,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL);
+ SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+ SDValue ReduceLoadWidth(SDNode *N);
+ SDValue ReduceLoadOpStoreWidth(SDNode *N);
+ SDValue TransformFPLoadStorePair(SDNode *N);
+ SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+ SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N);
+
+ SDValue GetDemandedBits(SDValue V, const APInt &Mask);
+
+ /// Walk up chain skipping non-aliasing memory nodes,
+ /// looking for aliasing nodes and adding them to the Aliases vector.
+ void GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVectorImpl<SDValue> &Aliases);
+
+ /// Return true if there is any possibility that the two addresses overlap.
+ bool isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const;
+
+ /// Walk up chain skipping non-aliasing memory nodes, looking for a better
+ /// chain (aliasing node).
+ SDValue FindBetterChain(SDNode *N, SDValue Chain);
+
+ /// Try to replace a store and any possibly adjacent stores on
+ /// consecutive chains with better chains. Return true only if St is
+ /// replaced.
+ ///
+ /// Notice that other chains may still be replaced even if the function
+ /// returns false.
+ bool findBetterNeighborChains(StoreSDNode *St);
+
+ /// Match "(X shl/srl V1) & V2" where V2 may not be present.
+ bool MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask);
+
+ /// Holds a pointer to an LSBaseSDNode as well as information on where it
+ /// is located in a sequence of memory operations connected by a chain.
+ struct MemOpLink {
+ MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
+ MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+ // Ptr to the mem node.
+ LSBaseSDNode *MemNode;
+ // Offset from the base ptr.
+ int64_t OffsetFromBase;
+ // The sequence number of this mem node. The lowest mem operand in the
+ // DAG starts at zero.
+ unsigned SequenceNum;
+ };
+
+ /// This is a helper function for visitMUL to check the profitability
+ /// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
+ /// MulNode is the original multiply, AddNode is (add x, c1),
+ /// and ConstNode is c2.
+ bool isMulAddWithConstProfitable(SDNode *MulNode,
+ SDValue &AddNode,
+ SDValue &ConstNode);
+
+ /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
+ /// constant build_vector of the stored constant values in Stores.
+ SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
+ ArrayRef<MemOpLink> Stores,
+ SmallVectorImpl<SDValue> &Chains,
+ EVT Ty) const;
+
+ /// This is a helper function for visitAND and visitZERO_EXTEND. Returns
+ /// true if the (and (load x) c) pattern matches an extload. ExtVT returns
+ /// the type of the loaded value to be extended. LoadedVT returns the type
+ /// of the original loaded value. NarrowLoad returns whether the load would
+ /// need to be narrowed in order to match.
+ bool isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
+ EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
+ bool &NarrowLoad);
+
+ /// This is a helper function for MergeConsecutiveStores. When the source
+ /// elements of the consecutive stores are all constants or all extracted
+ /// vector elements, try to merge them into one larger store.
+ /// \return True if a merged store was created.
+ bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ EVT MemVT, unsigned NumStores,
+ bool IsConstantSrc, bool UseVector);
+
+ /// This is a helper function for MergeConsecutiveStores.
+ /// Stores that may be merged are placed in StoreNodes.
+ /// Loads that may alias with those stores are placed in AliasLoadNodes.
+ void getStoreMergeAndAliasCandidates(
+ StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
+ SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
+
+ /// Helper function for MergeConsecutiveStores. Checks if
+ /// the candidate stores have an indirect dependency through their
+ /// operands. \return True if it is safe to merge.
+ bool checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes);
+
+ /// Merge consecutive store operations into a wide store.
+ /// This optimization uses wide integers or vectors when possible.
+ /// \return True if some memory operations were changed.
+ bool MergeConsecutiveStores(StoreSDNode *N);
+
+ /// \brief Try to transform a truncation where C is a constant:
+ /// (trunc (and X, C)) -> (and (trunc X), (trunc C))
+ ///
+ /// \p N needs to be a truncation and its first operand an AND. Other
+ /// requirements are checked by the function (e.g. that the truncation is
+ /// single-use); if they are not met, an empty SDValue is returned.
+ SDValue distributeTruncateThroughAnd(SDNode *N);
+
+ public:
+ DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
+ : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
+ OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
+ ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
+ }
+
+ /// Runs the dag combiner on all nodes in the worklist.
+ void Run(CombineLevel AtLevel);
+
+ SelectionDAG &getDAG() const { return DAG; }
+
+ /// Returns a type large enough to hold any valid shift amount - before type
+ /// legalization these can be huge.
+ EVT getShiftAmountTy(EVT LHSTy) {
+ assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+ if (LHSTy.isVector())
+ return LHSTy;
+ auto &DL = DAG.getDataLayout();
+ return LegalTypes ? TLI.getScalarShiftAmountTy(DL, LHSTy)
+ : TLI.getPointerTy(DL);
+ }
+
+ /// This method returns true if we are running before type legalization or
+ /// if the specified VT is legal.
+ bool isTypeLegal(const EVT &VT) {
+ if (!LegalTypes) return true;
+ return TLI.isTypeLegal(VT);
+ }
+
+ /// Convenience wrapper around TargetLowering::getSetCCResultType
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+ };
+}
+
+
+namespace {
+/// This class is a DAGUpdateListener that removes any deleted
+/// nodes from the worklist.
+class WorklistRemover : public SelectionDAG::DAGUpdateListener {
+ DAGCombiner &DC;
+public:
+ explicit WorklistRemover(DAGCombiner &dc)
+ : SelectionDAG::DAGUpdateListener(dc.getDAG()), DC(dc) {}
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ DC.removeFromWorklist(N);
+ }
+};
+}
+
+//===----------------------------------------------------------------------===//
+// TargetLowering::DAGCombinerInfo implementation
+//===----------------------------------------------------------------------===//
+
+void TargetLowering::DAGCombinerInfo::AddToWorklist(SDNode *N) {
+ ((DAGCombiner*)DC)->AddToWorklist(N);
+}
+
+void TargetLowering::DAGCombinerInfo::RemoveFromWorklist(SDNode *N) {
+ ((DAGCombiner*)DC)->removeFromWorklist(N);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, ArrayRef<SDValue> To, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, &To[0], To.size(), AddTo);
+}
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res, AddTo);
+}
+
+
+SDValue TargetLowering::DAGCombinerInfo::
+CombineTo(SDNode *N, SDValue Res0, SDValue Res1, bool AddTo) {
+ return ((DAGCombiner*)DC)->CombineTo(N, Res0, Res1, AddTo);
+}
+
+void TargetLowering::DAGCombinerInfo::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ return ((DAGCombiner*)DC)->CommitTargetLoweringOpt(TLO);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::deleteAndRecombine(SDNode *N) {
+ removeFromWorklist(N);
+
+ // If the operands of this node are only used by the node, they will now be
+ // dead. Make sure to re-visit them and recursively delete dead nodes.
+ for (const SDValue &Op : N->ops())
+ // For an operand generating multiple values, one of the values may
+ // become dead allowing further simplification (e.g. split index
+ // arithmetic from an indexed load).
+ if (Op->hasOneUse() || Op->getNumValues() > 1)
+ AddToWorklist(Op.getNode());
+
+ DAG.DeleteNode(N);
+}
+
+/// Return 1 if we can compute the negated form of the specified expression for
+/// the same cost as the expression itself, 2 if we can compute the negated
+/// form more cheaply than the expression itself, and 0 otherwise.
+static char isNegatibleForFree(SDValue Op, bool LegalOperations,
+ const TargetLowering &TLI,
+ const TargetOptions *Options,
+ unsigned Depth = 0) {
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG) return 2;
+
+ // Don't allow anything with multiple uses.
+ if (!Op.hasOneUse()) return 0;
+
+ // Don't recurse exponentially.
+ if (Depth > 6) return 0;
+
+ switch (Op.getOpcode()) {
+ default: return 0;
+ case ISD::ConstantFP:
+ // Don't invert constant FP values after legalize. The negated constant
+ // isn't necessarily legal.
+ return LegalOperations ? 0 : 1;
+ case ISD::FADD:
+ // FIXME: determine better conditions for this xform.
+ if (!Options->UnsafeFPMath) return 0;
+
+ // After operation legalization, it might not be legal to create new FSUBs.
+ if (LegalOperations &&
+ !TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType()))
+ return 0;
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
+ Options, Depth + 1))
+ return V;
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
+ Depth + 1);
+ case ISD::FSUB:
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ if (!Options->UnsafeFPMath) return 0;
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return 1;
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ if (Options->HonorSignDependentRoundingFPMath()) return 0;
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
+ if (char V = isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI,
+ Options, Depth + 1))
+ return V;
+
+ return isNegatibleForFree(Op.getOperand(1), LegalOperations, TLI, Options,
+ Depth + 1);
+
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FSIN:
+ return isNegatibleForFree(Op.getOperand(0), LegalOperations, TLI, Options,
+ Depth + 1);
+ }
+}
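+
+// For example, isNegatibleForFree returns 2 for (fneg X), since negating it
+// simply yields X, and (with unsafe FP math) returns 1 for (fsub A, B), since
+// its negation can be formed as (fsub B, A) at the same cost.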
+
+/// If isNegatibleForFree returned a nonzero value, return the newly negated
+/// expression.
+static SDValue GetNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, unsigned Depth = 0) {
+ const TargetOptions &Options = DAG.getTarget().Options;
+ // fneg is removable even if it has multiple uses.
+ if (Op.getOpcode() == ISD::FNEG) return Op.getOperand(0);
+
+ // Don't allow anything with multiple uses.
+ assert(Op.hasOneUse() && "Unknown reuse!");
+
+ assert(Depth <= 6 && "GetNegatedExpression doesn't match isNegatibleForFree");
+
+ const SDNodeFlags *Flags = Op.getNode()->getFlags();
+
+ switch (Op.getOpcode()) {
+ default: llvm_unreachable("Unknown code");
+ case ISD::ConstantFP: {
+ APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ V.changeSign();
+ return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
+ }
+ case ISD::FADD:
+ // FIXME: determine better conditions for this xform.
+ assert(Options.UnsafeFPMath);
+
+ // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
+ if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
+ DAG.getTargetLoweringInfo(), &Options, Depth+1))
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1), Flags);
+ // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(1), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(0), Flags);
+ case ISD::FSUB:
+ // We can't turn -(A-B) into B-A when we honor signed zeros.
+ assert(Options.UnsafeFPMath);
+
+ // fold (fneg (fsub 0, B)) -> B
+ if (ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(Op.getOperand(0)))
+ if (N0CFP->isZero())
+ return Op.getOperand(1);
+
+ // fold (fneg (fsub A, B)) -> (fsub B, A)
+ return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(0), Flags);
+
+ case ISD::FMUL:
+ case ISD::FDIV:
+ assert(!Options.HonorSignDependentRoundingFPMath());
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+ if (isNegatibleForFree(Op.getOperand(0), LegalOperations,
+ DAG.getTargetLoweringInfo(), &Options, Depth+1))
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1), Flags);
+
+ // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ Op.getOperand(0),
+ GetNegatedExpression(Op.getOperand(1), DAG,
+ LegalOperations, Depth+1), Flags);
+
+ case ISD::FP_EXTEND:
+ case ISD::FSIN:
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1));
+ case ISD::FP_ROUND:
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
+ GetNegatedExpression(Op.getOperand(0), DAG,
+ LegalOperations, Depth+1),
+ Op.getOperand(1));
+ }
+}
+
+// Return true if this node is a setcc, or is a select_cc
+// that selects between the target values used for true and false, making it
+// equivalent to a setcc. Also, set the incoming LHS, RHS, and CC references to
+// the appropriate nodes based on the type of node we are checking. This
+// simplifies life a bit for the callers.
+bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
+ SDValue &CC) const {
+ if (N.getOpcode() == ISD::SETCC) {
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(2);
+ return true;
+ }
+
+ if (N.getOpcode() != ISD::SELECT_CC ||
+ !TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
+ !TLI.isConstFalseVal(N.getOperand(3).getNode()))
+ return false;
+
+ if (TLI.getBooleanContents(N.getValueType()) ==
+ TargetLowering::UndefinedBooleanContent)
+ return false;
+
+ LHS = N.getOperand(0);
+ RHS = N.getOperand(1);
+ CC = N.getOperand(4);
+ return true;
+}
+
+/// Return true if this is a SetCC-equivalent operation with only one use.
+/// If this is true, it allows the users to invert the operation for free when
+/// it is profitable to do so.
+bool DAGCombiner::isOneUseSetCC(SDValue N) const {
+ SDValue N0, N1, N2;
+ if (isSetCCEquivalent(N, N0, N1, N2) && N.getNode()->hasOneUse())
+ return true;
+ return false;
+}
+
+// \brief Returns the SDNode if it is a constant float BuildVector
+// or constant float.
+static SDNode *isConstantFPBuildVectorOrConstantFP(SDValue N) {
+ if (isa<ConstantFPSDNode>(N))
+ return N.getNode();
+ if (ISD::isBuildVectorOfConstantFPSDNodes(N.getNode()))
+ return N.getNode();
+ return nullptr;
+}
+
+// \brief Returns the SDNode if it is a constant splat BuildVector or constant
+// int.
+static ConstantSDNode *isConstOrConstSplat(SDValue N) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N))
+ return CN;
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
+ BitVector UndefElements;
+ ConstantSDNode *CN = BV->getConstantSplatNode(&UndefElements);
+
+ // BuildVectors can truncate their operands. Ignore that case here.
+ // FIXME: We blindly ignore splats which include undef which is overly
+ // pessimistic.
+ if (CN && UndefElements.none() &&
+ CN->getValueType(0) == N.getValueType().getScalarType())
+ return CN;
+ }
+
+ return nullptr;
+}
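+
+// For example, a splat like (build_vector 7, 7, 7, 7) yields the ConstantSDNode
+// for its splat value, while a splat containing undef elements or truncated
+// operands returns null.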
+
+// \brief Returns the SDNode if it is a constant splat BuildVector or constant
+// float.
+static ConstantFPSDNode *isConstOrConstSplatFP(SDValue N) {
+ if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(N))
+ return CN;
+
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N)) {
+ BitVector UndefElements;
+ ConstantFPSDNode *CN = BV->getConstantFPSplatNode(&UndefElements);
+
+ if (CN && UndefElements.none())
+ return CN;
+ }
+
+ return nullptr;
+}
+
+SDValue DAGCombiner::ReassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
+ SDValue N1) {
+ EVT VT = N0.getValueType();
+ if (N0.getOpcode() == Opc) {
+ if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+ if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ // reassoc. (op (op x, c1), c2) -> (op x, (op c1, c2))
+ if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, L, R))
+ return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
+ return SDValue();
+ }
+ if (N0.hasOneUse()) {
+ // reassoc. (op (op x, c1), y) -> (op (op x, y), c1) iff x+c1 has one
+ // use
+ SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0.getOperand(0), N1);
+ if (!OpNode.getNode())
+ return SDValue();
+ AddToWorklist(OpNode.getNode());
+ return DAG.getNode(Opc, DL, VT, OpNode, N0.getOperand(1));
+ }
+ }
+ }
+
+ if (N1.getOpcode() == Opc) {
+ if (SDNode *R = DAG.isConstantIntBuildVectorOrConstantInt(N1.getOperand(1))) {
+ if (SDNode *L = DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
+ // reassoc. (op c2, (op x, c1)) -> (op x, (op c1, c2))
+ if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, R, L))
+ return DAG.getNode(Opc, DL, VT, N1.getOperand(0), OpNode);
+ return SDValue();
+ }
+ if (N1.hasOneUse()) {
+ // reassoc. (op x, (op y, c1)) -> (op (op x, y), c1) iff x+c1 has one
+ // use
+ SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N0, N1.getOperand(0));
+ if (!OpNode.getNode())
+ return SDValue();
+ AddToWorklist(OpNode.getNode());
+ return DAG.getNode(Opc, DL, VT, OpNode, N1.getOperand(1));
+ }
+ }
+ }
+
+ return SDValue();
+}
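+
+// For example, with Opc == ISD::ADD this rewrites (add (add x, 1), 2) as
+// (add x, 3) by folding the two constants, and (add (add x, c), y) as
+// (add (add x, y), c) when the inner add has a single use.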
+
+SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
+ bool AddTo) {
+ assert(N->getNumValues() == NumTo && "Broken CombineTo call!");
+ ++NodesCombined;
+ DEBUG(dbgs() << "\nReplacing.1 ";
+ N->dump(&DAG);
+ dbgs() << "\nWith: ";
+ To[0].getNode()->dump(&DAG);
+ dbgs() << " and " << NumTo-1 << " other values\n");
+ for (unsigned i = 0, e = NumTo; i != e; ++i)
+ assert((!To[i].getNode() ||
+ N->getValueType(i) == To[i].getValueType()) &&
+ "Cannot combine value to value of different type!");
+
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesWith(N, To);
+ if (AddTo) {
+ // Push the new nodes and any users onto the worklist
+ for (unsigned i = 0, e = NumTo; i != e; ++i) {
+ if (To[i].getNode()) {
+ AddToWorklist(To[i].getNode());
+ AddUsersToWorklist(To[i].getNode());
+ }
+ }
+ }
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (N->use_empty())
+ deleteAndRecombine(N);
+ return SDValue(N, 0);
+}
+
+void DAGCombiner::
+CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ // Replace all uses. If any nodes become isomorphic to other nodes and
+ // are deleted, make sure to remove them from our worklist.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(TLO.Old, TLO.New);
+
+ // Push the new node and any (possibly new) users onto the worklist.
+ AddToWorklist(TLO.New.getNode());
+ AddUsersToWorklist(TLO.New.getNode());
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node.
+ if (TLO.Old.getNode()->use_empty())
+ deleteAndRecombine(TLO.Old.getNode());
+}
+
+/// Check the specified integer node value to see if it can be simplified or if
+/// things it uses can be simplified by bit propagation. If so, return true.
+bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &Demanded) {
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ APInt KnownZero, KnownOne;
+ if (!TLI.SimplifyDemandedBits(Op, Demanded, KnownZero, KnownOne, TLO))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ DEBUG(dbgs() << "\nReplacing.2 ";
+ TLO.Old.getNode()->dump(&DAG);
+ dbgs() << "\nWith: ";
+ TLO.New.getNode()->dump(&DAG);
+ dbgs() << '\n');
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
+}
+
+void DAGCombiner::ReplaceLoadWithPromotedLoad(SDNode *Load, SDNode *ExtLoad) {
+ SDLoc dl(Load);
+ EVT VT = Load->getValueType(0);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, VT, SDValue(ExtLoad, 0));
+
+ DEBUG(dbgs() << "\nReplacing.9 ";
+ Load->dump(&DAG);
+ dbgs() << "\nWith: ";
+ Trunc.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 0), Trunc);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Load, 1), SDValue(ExtLoad, 1));
+ deleteAndRecombine(Load);
+ AddToWorklist(Trunc.getNode());
+}
+
+SDValue DAGCombiner::PromoteOperand(SDValue Op, EVT PVT, bool &Replace) {
+ Replace = false;
+ SDLoc dl(Op);
+ if (ISD::isUNINDEXEDLoad(Op.getNode())) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ EVT MemVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+ ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD)
+ : LD->getExtensionType();
+ Replace = true;
+ return DAG.getExtLoad(ExtType, dl, PVT,
+ LD->getChain(), LD->getBasePtr(),
+ MemVT, LD->getMemOperand());
+ }
+
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ default: break;
+ case ISD::AssertSext:
+ return DAG.getNode(ISD::AssertSext, dl, PVT,
+ SExtPromoteOperand(Op.getOperand(0), PVT),
+ Op.getOperand(1));
+ case ISD::AssertZext:
+ return DAG.getNode(ISD::AssertZext, dl, PVT,
+ ZExtPromoteOperand(Op.getOperand(0), PVT),
+ Op.getOperand(1));
+ case ISD::Constant: {
+ unsigned ExtOpc =
+ Op.getValueType().isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, dl, PVT, Op);
+ }
+ }
+
+ if (!TLI.isOperationLegal(ISD::ANY_EXTEND, PVT))
+ return SDValue();
+ return DAG.getNode(ISD::ANY_EXTEND, dl, PVT, Op);
+}
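+
+// For example, promoting an unindexed, non-extending i16 load to i32 yields an
+// i32 zextload of the i16 memory value when ZEXTLOAD is legal for that
+// combination, and an i32 extload otherwise; plain values fall back to an
+// ANY_EXTEND when the target supports it.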
+
+SDValue DAGCombiner::SExtPromoteOperand(SDValue Op, EVT PVT) {
+ if (!TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, PVT))
+ return SDValue();
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ bool Replace = false;
+ SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+ if (!NewOp.getNode())
+ return SDValue();
+ AddToWorklist(NewOp.getNode());
+
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NewOp.getValueType(), NewOp,
+ DAG.getValueType(OldVT));
+}
+
+SDValue DAGCombiner::ZExtPromoteOperand(SDValue Op, EVT PVT) {
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ bool Replace = false;
+ SDValue NewOp = PromoteOperand(Op, PVT, Replace);
+ if (!NewOp.getNode())
+ return SDValue();
+ AddToWorklist(NewOp.getNode());
+
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getNode(), NewOp.getNode());
+ return DAG.getZeroExtendInReg(NewOp, dl, OldVT);
+}
+
+/// Promote the specified integer binary operation if the target indicates it is
+/// beneficial; e.g. on x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ bool Replace0 = false;
+ SDValue N0 = Op.getOperand(0);
+ SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
+ if (!NN0.getNode())
+ return SDValue();
+
+ bool Replace1 = false;
+ SDValue N1 = Op.getOperand(1);
+ SDValue NN1;
+ if (N0 == N1)
+ NN1 = NN0;
+ else {
+ NN1 = PromoteOperand(N1, PVT, Replace1);
+ if (!NN1.getNode())
+ return SDValue();
+ }
+
+ AddToWorklist(NN0.getNode());
+ if (NN1.getNode())
+ AddToWorklist(NN1.getNode());
+
+ if (Replace0)
+ ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
+ if (Replace1)
+ ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+
+ DEBUG(dbgs() << "\nPromoting ";
+ Op.getNode()->dump(&DAG));
+ SDLoc dl(Op);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Opc, dl, PVT, NN0, NN1));
+ }
+ return SDValue();
+}
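+
+// For example, if the target reports that i16 ADD is undesirable and should be
+// promoted to i32, an (i16 add a, b) becomes
+// (i16 trunc (i32 add (promote a), (promote b))), where each operand is
+// widened via PromoteOperand (typically an ANY_EXTEND, or an extending load).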
+
+/// Promote the specified integer shift operation if the target indicates it is
+/// beneficial; e.g. on x86, it's usually better to promote i16 operations to
+/// i32 since i16 instructions are longer.
+SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ bool Replace = false;
+ SDValue N0 = Op.getOperand(0);
+ if (Opc == ISD::SRA)
+ N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+ else if (Opc == ISD::SRL)
+ N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+ else
+ N0 = PromoteOperand(N0, PVT, Replace);
+ if (!N0.getNode())
+ return SDValue();
+
+ AddToWorklist(N0.getNode());
+ if (Replace)
+ ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
+
+ DEBUG(dbgs() << "\nPromoting ";
+ Op.getNode()->dump(&DAG));
+ SDLoc dl(Op);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(Opc, dl, PVT, N0, Op.getOperand(1)));
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::PromoteExtend(SDValue Op) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return SDValue();
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+ // fold (aext (aext x)) -> (aext x)
+ // fold (aext (zext x)) -> (zext x)
+ // fold (aext (sext x)) -> (sext x)
+ DEBUG(dbgs() << "\nPromoting ";
+ Op.getNode()->dump(&DAG));
+ return DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, Op.getOperand(0));
+ }
+ return SDValue();
+}
+
+bool DAGCombiner::PromoteLoad(SDValue Op) {
+ if (!LegalOperations)
+ return false;
+
+ if (!ISD::isUNINDEXEDLoad(Op.getNode()))
+ return false;
+
+ EVT VT = Op.getValueType();
+ if (VT.isVector() || !VT.isInteger())
+ return false;
+
+ // If operation type is 'undesirable', e.g. i16 on x86, consider
+ // promoting it.
+ unsigned Opc = Op.getOpcode();
+ if (TLI.isTypeDesirableForOp(Opc, VT))
+ return false;
+
+ EVT PVT = VT;
+ // Consult target whether it is a good idea to promote this operation and
+ // what's the right type to promote it to.
+ if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
+ assert(PVT != VT && "Don't know what type to promote to!");
+
+ SDLoc dl(Op);
+ SDNode *N = Op.getNode();
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT MemVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = ISD::isNON_EXTLoad(LD)
+ ? (TLI.isLoadExtLegal(ISD::ZEXTLOAD, PVT, MemVT) ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD)
+ : LD->getExtensionType();
+ SDValue NewLD = DAG.getExtLoad(ExtType, dl, PVT,
+ LD->getChain(), LD->getBasePtr(),
+ MemVT, LD->getMemOperand());
+ SDValue Result = DAG.getNode(ISD::TRUNCATE, dl, VT, NewLD);
+
+ DEBUG(dbgs() << "\nPromoting ";
+ N->dump(&DAG);
+ dbgs() << "\nTo: ";
+ Result.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), NewLD.getValue(1));
+ deleteAndRecombine(N);
+ AddToWorklist(Result.getNode());
+ return true;
+ }
+ return false;
+}
+
+/// \brief Recursively delete a node which has no uses and any operands for
+/// which it is the only use.
+///
+/// Note that this both deletes the nodes and removes them from the worklist.
+/// It also adds any nodes that have had a user deleted to the worklist, as
+/// they may now have only one use and be subject to other combines.
+bool DAGCombiner::recursivelyDeleteUnusedNodes(SDNode *N) {
+ if (!N->use_empty())
+ return false;
+
+ SmallSetVector<SDNode *, 16> Nodes;
+ Nodes.insert(N);
+ do {
+ N = Nodes.pop_back_val();
+ if (!N)
+ continue;
+
+ if (N->use_empty()) {
+ for (const SDValue &ChildN : N->op_values())
+ Nodes.insert(ChildN.getNode());
+
+ removeFromWorklist(N);
+ DAG.DeleteNode(N);
+ } else {
+ AddToWorklist(N);
+ }
+ } while (!Nodes.empty());
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Main DAG Combiner implementation
+//===----------------------------------------------------------------------===//
+
+void DAGCombiner::Run(CombineLevel AtLevel) {
+ // Set the instance variables so that the various visit routines may use them.
+ Level = AtLevel;
+ LegalOperations = Level >= AfterLegalizeVectorOps;
+ LegalTypes = Level >= AfterLegalizeTypes;
+
+ // Add all the dag nodes to the worklist.
+ for (SDNode &Node : DAG.allnodes())
+ AddToWorklist(&Node);
+
+ // Create a dummy node (which is not added to allnodes) that adds a reference
+ // to the root node, preventing it from being deleted and tracking any
+ // changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+
+ // While the worklist isn't empty, find a node and try to combine it.
+ while (!WorklistMap.empty()) {
+ SDNode *N;
+ // The Worklist holds the SDNodes in order, but it may contain null entries.
+ do {
+ N = Worklist.pop_back_val();
+ } while (!N);
+
+ bool GoodWorklistEntry = WorklistMap.erase(N);
+ (void)GoodWorklistEntry;
+ assert(GoodWorklistEntry &&
+ "Found a worklist entry without a corresponding map entry!");
+
+ // If N has no uses, it is dead. Make sure to revisit all N's operands once
+ // N is deleted from the DAG, since they too may now be dead or may have a
+ // reduced number of uses, allowing other xforms.
+ if (recursivelyDeleteUnusedNodes(N))
+ continue;
+
+ WorklistRemover DeadNodes(*this);
+
+ // If this combine is running after legalizing the DAG, re-legalize any
+ // nodes pulled off the worklist.
+ if (Level == AfterLegalizeDAG) {
+ SmallSetVector<SDNode *, 16> UpdatedNodes;
+ bool NIsValid = DAG.LegalizeOp(N, UpdatedNodes);
+
+ for (SDNode *LN : UpdatedNodes) {
+ AddToWorklist(LN);
+ AddUsersToWorklist(LN);
+ }
+ if (!NIsValid)
+ continue;
+ }
+
+ DEBUG(dbgs() << "\nCombining: "; N->dump(&DAG));
+
+ // Add any operands of the new node which have not yet been combined to the
+ // worklist as well. Because the worklist uniques things already, this
+ // won't repeatedly process the same operand.
+ CombinedNodes.insert(N);
+ for (const SDValue &ChildN : N->op_values())
+ if (!CombinedNodes.count(ChildN.getNode()))
+ AddToWorklist(ChildN.getNode());
+
+ SDValue RV = combine(N);
+
+ if (!RV.getNode())
+ continue;
+
+ ++NodesCombined;
+
+ // If we get back the same node we passed in, rather than a new node or
+ // zero, we know that the node must have defined multiple values and
+ // CombineTo was used. Since CombineTo takes care of the worklist
+ // mechanics for us, we have no work to do in this case.
+ if (RV.getNode() == N)
+ continue;
+
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ RV.getNode()->getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned new node!");
+
+ DEBUG(dbgs() << " ... into: ";
+ RV.getNode()->dump(&DAG));
+
+ if (N->getNumValues() == RV.getNode()->getNumValues())
+ DAG.ReplaceAllUsesWith(N, RV.getNode());
+ else {
+ assert(N->getValueType(0) == RV.getValueType() &&
+ N->getNumValues() == 1 && "Type mismatch");
+ SDValue OpV = RV;
+ DAG.ReplaceAllUsesWith(N, &OpV);
+ }
+
+ // Push the new node and any users onto the worklist
+ AddToWorklist(RV.getNode());
+ AddUsersToWorklist(RV.getNode());
+
+ // Finally, if the node is now dead, remove it from the graph. The node
+ // may not be dead if the replacement process recursively simplified to
+ // something else needing this node. This will also take care of adding any
+ // operands which have lost a user to the worklist.
+ recursivelyDeleteUnusedNodes(N);
+ }
+
+ // If the root changed (e.g. it was a dead load), update the root.
+ DAG.setRoot(Dummy.getValue());
+ DAG.RemoveDeadNodes();
+}
+
+SDValue DAGCombiner::visit(SDNode *N) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: return visitTokenFactor(N);
+ case ISD::MERGE_VALUES: return visitMERGE_VALUES(N);
+ case ISD::ADD: return visitADD(N);
+ case ISD::SUB: return visitSUB(N);
+ case ISD::ADDC: return visitADDC(N);
+ case ISD::SUBC: return visitSUBC(N);
+ case ISD::ADDE: return visitADDE(N);
+ case ISD::SUBE: return visitSUBE(N);
+ case ISD::MUL: return visitMUL(N);
+ case ISD::SDIV: return visitSDIV(N);
+ case ISD::UDIV: return visitUDIV(N);
+ case ISD::SREM:
+ case ISD::UREM: return visitREM(N);
+ case ISD::MULHU: return visitMULHU(N);
+ case ISD::MULHS: return visitMULHS(N);
+ case ISD::SMUL_LOHI: return visitSMUL_LOHI(N);
+ case ISD::UMUL_LOHI: return visitUMUL_LOHI(N);
+ case ISD::SMULO: return visitSMULO(N);
+ case ISD::UMULO: return visitUMULO(N);
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: return visitIMINMAX(N);
+ case ISD::AND: return visitAND(N);
+ case ISD::OR: return visitOR(N);
+ case ISD::XOR: return visitXOR(N);
+ case ISD::SHL: return visitSHL(N);
+ case ISD::SRA: return visitSRA(N);
+ case ISD::SRL: return visitSRL(N);
+ case ISD::ROTR:
+ case ISD::ROTL: return visitRotate(N);
+ case ISD::BSWAP: return visitBSWAP(N);
+ case ISD::BITREVERSE: return visitBITREVERSE(N);
+ case ISD::CTLZ: return visitCTLZ(N);
+ case ISD::CTLZ_ZERO_UNDEF: return visitCTLZ_ZERO_UNDEF(N);
+ case ISD::CTTZ: return visitCTTZ(N);
+ case ISD::CTTZ_ZERO_UNDEF: return visitCTTZ_ZERO_UNDEF(N);
+ case ISD::CTPOP: return visitCTPOP(N);
+ case ISD::SELECT: return visitSELECT(N);
+ case ISD::VSELECT: return visitVSELECT(N);
+ case ISD::SELECT_CC: return visitSELECT_CC(N);
+ case ISD::SETCC: return visitSETCC(N);
+ case ISD::SETCCE: return visitSETCCE(N);
+ case ISD::SIGN_EXTEND: return visitSIGN_EXTEND(N);
+ case ISD::ZERO_EXTEND: return visitZERO_EXTEND(N);
+ case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
+ case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
+ case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
+ case ISD::TRUNCATE: return visitTRUNCATE(N);
+ case ISD::BITCAST: return visitBITCAST(N);
+ case ISD::BUILD_PAIR: return visitBUILD_PAIR(N);
+ case ISD::FADD: return visitFADD(N);
+ case ISD::FSUB: return visitFSUB(N);
+ case ISD::FMUL: return visitFMUL(N);
+ case ISD::FMA: return visitFMA(N);
+ case ISD::FDIV: return visitFDIV(N);
+ case ISD::FREM: return visitFREM(N);
+ case ISD::FSQRT: return visitFSQRT(N);
+ case ISD::FCOPYSIGN: return visitFCOPYSIGN(N);
+ case ISD::SINT_TO_FP: return visitSINT_TO_FP(N);
+ case ISD::UINT_TO_FP: return visitUINT_TO_FP(N);
+ case ISD::FP_TO_SINT: return visitFP_TO_SINT(N);
+ case ISD::FP_TO_UINT: return visitFP_TO_UINT(N);
+ case ISD::FP_ROUND: return visitFP_ROUND(N);
+ case ISD::FP_ROUND_INREG: return visitFP_ROUND_INREG(N);
+ case ISD::FP_EXTEND: return visitFP_EXTEND(N);
+ case ISD::FNEG: return visitFNEG(N);
+ case ISD::FABS: return visitFABS(N);
+ case ISD::FFLOOR: return visitFFLOOR(N);
+ case ISD::FMINNUM: return visitFMINNUM(N);
+ case ISD::FMAXNUM: return visitFMAXNUM(N);
+ case ISD::FCEIL: return visitFCEIL(N);
+ case ISD::FTRUNC: return visitFTRUNC(N);
+ case ISD::BRCOND: return visitBRCOND(N);
+ case ISD::BR_CC: return visitBR_CC(N);
+ case ISD::LOAD: return visitLOAD(N);
+ case ISD::STORE: return visitSTORE(N);
+ case ISD::INSERT_VECTOR_ELT: return visitINSERT_VECTOR_ELT(N);
+ case ISD::EXTRACT_VECTOR_ELT: return visitEXTRACT_VECTOR_ELT(N);
+ case ISD::BUILD_VECTOR: return visitBUILD_VECTOR(N);
+ case ISD::CONCAT_VECTORS: return visitCONCAT_VECTORS(N);
+ case ISD::EXTRACT_SUBVECTOR: return visitEXTRACT_SUBVECTOR(N);
+ case ISD::VECTOR_SHUFFLE: return visitVECTOR_SHUFFLE(N);
+ case ISD::SCALAR_TO_VECTOR: return visitSCALAR_TO_VECTOR(N);
+ case ISD::INSERT_SUBVECTOR: return visitINSERT_SUBVECTOR(N);
+ case ISD::MGATHER: return visitMGATHER(N);
+ case ISD::MLOAD: return visitMLOAD(N);
+ case ISD::MSCATTER: return visitMSCATTER(N);
+ case ISD::MSTORE: return visitMSTORE(N);
+ case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
+ case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
+ }
+ return SDValue();
+}
+
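+/// Try to simplify \p N: run the generic visit combines first, then any
+/// target-specific combines, then integer-promotion rewrites, and finally try
+/// a commuted-operand CSE lookup for commutative binary nodes.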
+SDValue DAGCombiner::combine(SDNode *N) {
+ SDValue RV = visit(N);
+
+ // If nothing happened, try a target-specific DAG combine.
+ if (!RV.getNode()) {
+ assert(N->getOpcode() != ISD::DELETED_NODE &&
+ "Node was deleted but visit returned NULL!");
+
+ if (N->getOpcode() >= ISD::BUILTIN_OP_END ||
+ TLI.hasTargetDAGCombine((ISD::NodeType)N->getOpcode())) {
+
+ // Expose the DAG combiner to the target combiner impls.
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level, false, this);
+
+ RV = TLI.PerformDAGCombine(N, DagCombineInfo);
+ }
+ }
+
+ // If nothing happened still, try promoting the operation.
+ if (!RV.getNode()) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ RV = PromoteIntBinOp(SDValue(N, 0));
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ RV = PromoteIntShiftOp(SDValue(N, 0));
+ break;
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ RV = PromoteExtend(SDValue(N, 0));
+ break;
+ case ISD::LOAD:
+ if (PromoteLoad(SDValue(N, 0)))
+ RV = SDValue(N, 0);
+ break;
+ }
+ }
+
+ // If N is a commutative binary node, try commuting it to enable more
+ // sdisel CSE.
+ if (!RV.getNode() && SelectionDAG::isCommutativeBinOp(N->getOpcode()) &&
+ N->getNumValues() == 1) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Constant operands are canonicalized to RHS.
+ if (isa<ConstantSDNode>(N0) || !isa<ConstantSDNode>(N1)) {
+ SDValue Ops[] = {N1, N0};
+ SDNode *CSENode = DAG.getNodeIfExists(N->getOpcode(), N->getVTList(), Ops,
+ N->getFlags());
+ if (CSENode)
+ return SDValue(CSENode, 0);
+ }
+ }
+
+ return RV;
+}
+
+/// Given a node, return its input chain if it has one, otherwise return a
+/// null SDValue.
+static SDValue getInputChainForNode(SDNode *N) {
+ if (unsigned NumOps = N->getNumOperands()) {
+ if (N->getOperand(0).getValueType() == MVT::Other)
+ return N->getOperand(0);
+ if (N->getOperand(NumOps-1).getValueType() == MVT::Other)
+ return N->getOperand(NumOps-1);
+ for (unsigned i = 1; i < NumOps-1; ++i)
+ if (N->getOperand(i).getValueType() == MVT::Other)
+ return N->getOperand(i);
+ }
+ return SDValue();
+}
+
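+/// Flatten a tree of TokenFactor nodes: nested single-use token factors are
+/// merged into this one, while entry tokens and duplicate operands are
+/// dropped, so the resulting TokenFactor carries each unique chain once.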
+SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
+ // If N has two operands, where one has an input chain equal to the other,
+ // the 'other' chain is redundant.
+ if (N->getNumOperands() == 2) {
+ if (getInputChainForNode(N->getOperand(0).getNode()) == N->getOperand(1))
+ return N->getOperand(0);
+ if (getInputChainForNode(N->getOperand(1).getNode()) == N->getOperand(0))
+ return N->getOperand(1);
+ }
+
+ SmallVector<SDNode *, 8> TFs; // List of token factors to visit.
+ SmallVector<SDValue, 8> Ops; // Ops for replacing token factor.
+ SmallPtrSet<SDNode*, 16> SeenOps;
+ bool Changed = false; // If we should replace this token factor.
+
+ // Start out with this token factor.
+ TFs.push_back(N);
+
+ // Iterate through token factors. The TFs list grows when new token factors
+ // are encountered.
+ for (unsigned i = 0; i < TFs.size(); ++i) {
+ SDNode *TF = TFs[i];
+
+ // Check each of the operands.
+ for (const SDValue &Op : TF->op_values()) {
+
+ switch (Op.getOpcode()) {
+ case ISD::EntryToken:
+ // Entry tokens don't need to be added to the list. They are
+ // redundant.
+ Changed = true;
+ break;
+
+ case ISD::TokenFactor:
+ if (Op.hasOneUse() &&
+ std::find(TFs.begin(), TFs.end(), Op.getNode()) == TFs.end()) {
+ // Queue up for processing.
+ TFs.push_back(Op.getNode());
+ // Clean up in case the token factor is removed.
+ AddToWorklist(Op.getNode());
+ Changed = true;
+ break;
+ }
+ // Fall through.
+
+ default:
+ // Only add if it isn't already in the list.
+ if (SeenOps.insert(Op.getNode()).second)
+ Ops.push_back(Op);
+ else
+ Changed = true;
+ break;
+ }
+ }
+ }
+
+ SDValue Result;
+
+ // If we've changed things around then replace token factor.
+ if (Changed) {
+ if (Ops.empty()) {
+ // The entry token is the only possible outcome.
+ Result = DAG.getEntryNode();
+ } else {
+ // New and improved token factor.
+ Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+ }
+
+ // Add users to worklist if AA is enabled, since it may introduce
+ // a lot of new chained token factors while removing memory deps.
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+ return CombineTo(N, Result, UseAA /*add to worklist*/);
+ }
+
+ return Result;
+}
+
+/// MERGE_VALUES can always be eliminated.
+SDValue DAGCombiner::visitMERGE_VALUES(SDNode *N) {
+ WorklistRemover DeadNodes(*this);
+ // Replacing results may cause a different MERGE_VALUES to suddenly
+ // be CSE'd with N, and carry its uses with it. Iterate until no
+ // uses remain, to ensure that the node can be safely deleted.
+ // First add the users of this node to the work list so that they
+ // can be tried again once they have new operands.
+ AddUsersToWorklist(N);
+ do {
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, i), N->getOperand(i));
+ } while (!N->use_empty());
+ deleteAndRecombine(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+/// If \p N is a non-opaque ConstantSDNode, return it cast to a
+/// ConstantSDNode pointer; otherwise return nullptr.
+static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N);
+ return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
+}
+
+SDValue DAGCombiner::visitADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (add x, 0) -> x, vector edition
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N0;
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N1;
+ }
+
+ // fold (add x, undef) -> undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) {
+ // canonicalize constant to RHS
+ if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, N1, N0);
+ // fold (add c1, c2) -> c1+c2
+ return DAG.FoldConstantArithmetic(ISD::ADD, SDLoc(N), VT,
+ N0.getNode(), N1.getNode());
+ }
+ // fold (add x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+ // fold ((c1-A)+c2) -> (c1+c2)-A
+ if (ConstantSDNode *N1C = getAsNonOpaqueConstant(N1)) {
+ if (N0.getOpcode() == ISD::SUB)
+ if (ConstantSDNode *N0C = getAsNonOpaqueConstant(N0.getOperand(0))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(N1C->getAPIntValue()+
+ N0C->getAPIntValue(), DL, VT),
+ N0.getOperand(1));
+ }
+ }
+ // reassociate add
+ if (SDValue RADD = ReassociateOps(ISD::ADD, SDLoc(N), N0, N1))
+ return RADD;
+ // fold ((0-A) + B) -> B-A
+ if (N0.getOpcode() == ISD::SUB && isNullConstant(N0.getOperand(0)))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1, N0.getOperand(1));
+ // fold (A + (0-B)) -> A-B
+ if (N1.getOpcode() == ISD::SUB && isNullConstant(N1.getOperand(0)))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0, N1.getOperand(1));
+ // fold (A+(B-A)) -> B
+ if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(1))
+ return N1.getOperand(0);
+ // fold ((B-A)+A) -> B
+ if (N0.getOpcode() == ISD::SUB && N1 == N0.getOperand(1))
+ return N0.getOperand(0);
+ // fold (A+(B-(A+C))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(0))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(1));
+ // fold (A+(B-(C+A))) to (B-C)
+ if (N1.getOpcode() == ISD::SUB && N1.getOperand(1).getOpcode() == ISD::ADD &&
+ N0 == N1.getOperand(1).getOperand(1))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1.getOperand(0),
+ N1.getOperand(1).getOperand(0));
+ // fold (A+((B-A)+or-C)) to (B+or-C)
+ if ((N1.getOpcode() == ISD::SUB || N1.getOpcode() == ISD::ADD) &&
+ N1.getOperand(0).getOpcode() == ISD::SUB &&
+ N0 == N1.getOperand(0).getOperand(1))
+ return DAG.getNode(N1.getOpcode(), SDLoc(N), VT,
+ N1.getOperand(0).getOperand(0), N1.getOperand(1));
+
+ // fold (A-B)+(C-D) to (A+C)-(B+D) when A or C is constant
+ if (N0.getOpcode() == ISD::SUB && N1.getOpcode() == ISD::SUB) {
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDValue N10 = N1.getOperand(0);
+ SDValue N11 = N1.getOperand(1);
+
+ if (isa<ConstantSDNode>(N00) || isa<ConstantSDNode>(N10))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT,
+ DAG.getNode(ISD::ADD, SDLoc(N0), VT, N00, N10),
+ DAG.getNode(ISD::ADD, SDLoc(N1), VT, N01, N11));
+ }
+
+ if (!VT.isVector() && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (a+b) -> (a|b) iff a and b share no bits.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::OR, VT)) &&
+ VT.isInteger() && !VT.isVector() && DAG.haveNoCommonBitsSet(N0, N1))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1);
+
+ // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
+ if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
+ isNullConstant(N1.getOperand(0).getOperand(0)))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N0,
+ DAG.getNode(ISD::SHL, SDLoc(N), VT,
+ N1.getOperand(0).getOperand(1),
+ N1.getOperand(1)));
+ if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB &&
+ isNullConstant(N0.getOperand(0).getOperand(0)))
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT, N1,
+ DAG.getNode(ISD::SHL, SDLoc(N), VT,
+ N0.getOperand(0).getOperand(1),
+ N0.getOperand(1)));
+
+ if (N1.getOpcode() == ISD::AND) {
+ SDValue AndOp0 = N1.getOperand(0);
+ unsigned NumSignBits = DAG.ComputeNumSignBits(AndOp0);
+ unsigned DestBits = VT.getScalarType().getSizeInBits();
+
+ // (add z, (and (sbbl x, x), 1)) -> (sub z, (sbbl x, x))
+ // and similar xforms where the inner op is either ~0 or 0.
+ if (NumSignBits == DestBits && isOneConstant(N1->getOperand(1))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
+ }
+ }
+
+ // add (sext i1), X -> sub X, (zext i1)
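+ // (sext i1) is 0 or -1 while (zext i1) is 0 or 1, so adding the
+ // sign-extended value is the same as subtracting the zero-extended one.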
+ if (N0.getOpcode() == ISD::SIGN_EXTEND &&
+ N0.getOperand(0).getValueType() == MVT::i1 &&
+ !TLI.isOperationLegal(ISD::SIGN_EXTEND, MVT::i1)) {
+ SDLoc DL(N);
+ SDValue ZExt = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SUB, DL, VT, N1, ZExt);
+ }
+
+ // add X, (sextinreg Y i1) -> sub X, (and Y 1)
+ if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+ if (TN->getVT() == MVT::i1) {
+ SDLoc DL(N);
+ SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+ DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::SUB, DL, VT, N0, ZExt);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // If the flag result is dead, turn this into an ADD.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE,
+ SDLoc(N), MVT::Glue));
+
+ // canonicalize constant to RHS.
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
+
+ // fold (addc x, 0) -> x + no carry out
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
+ SDLoc(N), MVT::Glue));
+
+ // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
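+ // LHSZero/RHSZero are the bits known to be zero in each operand; if every
+ // bit that may be set in one operand is known zero in the other, no bit
+ // position can carry, so the add is an OR and the carry out is false.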
+ APInt LHSZero, LHSOne;
+ APInt RHSZero, RHSOne;
+ DAG.computeKnownBits(N0, LHSZero, LHSOne);
+
+ if (LHSZero.getBoolValue()) {
+ DAG.computeKnownBits(N1, RHSZero, RHSOne);
+
+ // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
+ // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
+ if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
+ return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE,
+ SDLoc(N), MVT::Glue));
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitADDE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // canonicalize constant to RHS
+ ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && !N1C)
+ return DAG.getNode(ISD::ADDE, SDLoc(N), N->getVTList(),
+ N1, N0, CarryIn);
+
+ // fold (adde x, y, false) -> (addc x, y)
+ if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N0, N1);
+
+ return SDValue();
+}
+
+// Since it may not be valid to emit a fold to zero for vector initializers,
+// check if we can before folding.
+static SDValue tryFoldToZero(const SDLoc &DL, const TargetLowering &TLI, EVT VT,
+ SelectionDAG &DAG, bool LegalOperations,
+ bool LegalTypes) {
+ if (!VT.isVector())
+ return DAG.getConstant(0, DL, VT);
+ if (!LegalOperations || TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return DAG.getConstant(0, DL, VT);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (sub x, 0) -> x, vector edition
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (sub x, x) -> 0
+ // FIXME: Refactor this and xor and other similar operations together.
+ if (N0 == N1)
+ return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ // fold (sub c1, c2) -> c1-c2
+ return DAG.FoldConstantArithmetic(ISD::SUB, SDLoc(N), VT,
+ N0.getNode(), N1.getNode());
+ }
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+ // fold (sub x, c) -> (add x, -c)
+ if (N1C) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::ADD, DL, VT, N0,
+ DAG.getConstant(-N1C->getAPIntValue(), DL, VT));
+ }
+ // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1)
+ if (isAllOnesConstant(N0))
+ return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
+ // fold A-(A-B) -> B
+ if (N1.getOpcode() == ISD::SUB && N0 == N1.getOperand(0))
+ return N1.getOperand(1);
+ // fold (A+B)-A -> B
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N1)
+ return N0.getOperand(1);
+ // fold (A+B)-B -> A
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1)
+ return N0.getOperand(0);
+ // fold C2-(A+C1) -> (C2-C1)-A
+ ConstantSDNode *N1C1 = N1.getOpcode() != ISD::ADD ? nullptr :
+ dyn_cast<ConstantSDNode>(N1.getOperand(1).getNode());
+ if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
+ SDLoc DL(N);
+ SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
+ DL, VT);
+ return DAG.getNode(ISD::SUB, DL, VT, NewC,
+ N1.getOperand(0));
+ }
+ // fold ((A+(B+or-C))-B) -> A+or-C
+ if (N0.getOpcode() == ISD::ADD &&
+ (N0.getOperand(1).getOpcode() == ISD::SUB ||
+ N0.getOperand(1).getOpcode() == ISD::ADD) &&
+ N0.getOperand(1).getOperand(0) == N1)
+ return DAG.getNode(N0.getOperand(1).getOpcode(), SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(1));
+ // fold ((A+(C+B))-B) -> A+C
+ if (N0.getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOpcode() == ISD::ADD &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(0));
+ // fold ((A-(B-C))-C) -> A-B
+ if (N0.getOpcode() == ISD::SUB &&
+ N0.getOperand(1).getOpcode() == ISD::SUB &&
+ N0.getOperand(1).getOperand(1) == N1)
+ return DAG.getNode(ISD::SUB, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1).getOperand(0));
+
+ // If either operand of a sub is undef, the result is undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+
+ // If the relocation model supports it, consider symbol offsets.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N0))
+ if (!LegalOperations && TLI.isOffsetFoldingLegal(GA)) {
+ // fold (sub Sym, c) -> Sym-c
+ if (N1C && GA->getOpcode() == ISD::GlobalAddress)
+ return DAG.getGlobalAddress(GA->getGlobal(), SDLoc(N1C), VT,
+ GA->getOffset() -
+ (uint64_t)N1C->getSExtValue());
+ // fold (sub Sym+c1, Sym+c2) -> c1-c2
+ if (GlobalAddressSDNode *GB = dyn_cast<GlobalAddressSDNode>(N1))
+ if (GA->getGlobal() == GB->getGlobal())
+ return DAG.getConstant((uint64_t)GA->getOffset() - GB->getOffset(),
+ SDLoc(N), VT);
+ }
+
+ // sub X, (sextinreg Y i1) -> add X, (and Y 1)
+ if (N1.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ VTSDNode *TN = cast<VTSDNode>(N1.getOperand(1));
+ if (TN->getVT() == MVT::i1) {
+ SDLoc DL(N);
+ SDValue ZExt = DAG.getNode(ISD::AND, DL, VT, N1.getOperand(0),
+ DAG.getConstant(1, DL, VT));
+ return DAG.getNode(ISD::ADD, DL, VT, N0, ZExt);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ SDLoc DL(N);
+
+ // If the flag result is dead, turn this into a SUB.
+ if (!N->hasAnyUseOfValue(1))
+ return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // fold (subc x, x) -> 0 + no borrow
+ if (N0 == N1)
+ return CombineTo(N, DAG.getConstant(0, DL, VT),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // fold (subc x, 0) -> x + no borrow
+ if (isNullConstant(N1))
+ return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ // Canonicalize (sub -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+ if (isAllOnesConstant(N0))
+ return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+ DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSUBE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue CarryIn = N->getOperand(2);
+
+ // fold (sube x, y, false) -> (subc x, y)
+ if (CarryIn.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::SUBC, SDLoc(N), N->getVTList(), N0, N1);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // fold (mul x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ bool N0IsConst = false;
+ bool N1IsConst = false;
+ bool N1IsOpaqueConst = false;
+ bool N0IsOpaqueConst = false;
+ APInt ConstValue0, ConstValue1;
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
+ N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
+ } else {
+ N0IsConst = isa<ConstantSDNode>(N0);
+ if (N0IsConst) {
+ ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue();
+ N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque();
+ }
+ N1IsConst = isa<ConstantSDNode>(N1);
+ if (N1IsConst) {
+ ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
+ N1IsOpaqueConst = cast<ConstantSDNode>(N1)->isOpaque();
+ }
+ }
+
+ // fold (mul c1, c2) -> c1*c2
+ if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst)
+ return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT,
+ N0.getNode(), N1.getNode());
+
+ // canonicalize constant to RHS (vector doesn't have to splat)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
+ // fold (mul x, 0) -> 0
+ if (N1IsConst && ConstValue1 == 0)
+ return N1;
+ // We require a splat of the entire scalar bit width for non-contiguous
+ // bit patterns.
+ bool IsFullSplat =
+ ConstValue1.getBitWidth() == VT.getScalarType().getSizeInBits();
+ // fold (mul x, 1) -> x
+ if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
+ return N0;
+ // fold (mul x, -1) -> 0-x
+ if (N1IsConst && ConstValue1.isAllOnesValue()) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), N0);
+ }
+ // fold (mul x, (1 << c)) -> x << c
+ if (N1IsConst && !N1IsOpaqueConst && ConstValue1.isPowerOf2() &&
+ IsFullSplat) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(ConstValue1.logBase2(), DL,
+ getShiftAmountTy(N0.getValueType())));
+ }
+ // fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
+ if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2() &&
+ IsFullSplat) {
+ unsigned Log2Val = (-ConstValue1).logBase2();
+ SDLoc DL(N);
+ // FIXME: If the input is something that is easily negated (e.g. a
+ // single-use add), we should put the negate there.
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT),
+ DAG.getNode(ISD::SHL, DL, VT, N0,
+ DAG.getConstant(Log2Val, DL,
+ getShiftAmountTy(N0.getValueType()))));
+ }
+
+ APInt Val;
+ // (mul (shl X, c1), c2) -> (mul X, c2 << c1)
+ if (N1IsConst && N0.getOpcode() == ISD::SHL &&
+ (ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+ isa<ConstantSDNode>(N0.getOperand(1)))) {
+ SDValue C3 = DAG.getNode(ISD::SHL, SDLoc(N), VT, N1, N0.getOperand(1));
+ AddToWorklist(C3.getNode());
+ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), C3);
+ }
+
+ // Change (mul (shl X, C), Y) -> (shl (mul X, Y), C) when the shift has one
+ // use.
+ {
+ SDValue Sh(nullptr, 0), Y(nullptr, 0);
+ // Check for both (mul (shl X, C), Y) and (mul Y, (shl X, C)).
+ if (N0.getOpcode() == ISD::SHL &&
+ (ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val) ||
+ isa<ConstantSDNode>(N0.getOperand(1))) &&
+ N0.getNode()->hasOneUse()) {
+ Sh = N0; Y = N1;
+ } else if (N1.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(N1.getOperand(1)) &&
+ N1.getNode()->hasOneUse()) {
+ Sh = N1; Y = N0;
+ }
+
+ if (Sh.getNode()) {
+ SDValue Mul = DAG.getNode(ISD::MUL, SDLoc(N), VT, Sh.getOperand(0), Y);
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, Mul, Sh.getOperand(1));
+ }
+ }
+
+ // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2)
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1) &&
+ N0.getOpcode() == ISD::ADD &&
+ DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1)) &&
+ isMulAddWithConstProfitable(N, N0, N1))
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT,
+ DAG.getNode(ISD::MUL, SDLoc(N0), VT,
+ N0.getOperand(0), N1),
+ DAG.getNode(ISD::MUL, SDLoc(N1), VT,
+ N0.getOperand(1), N1));
+
+ // reassociate mul
+ if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1))
+ return RMUL;
+
+ return SDValue();
+}
+
+/// Return true if divmod libcall is available.
+static bool isDivRemLibcallAvailable(SDNode *Node, bool isSigned,
+ const TargetLowering &TLI) {
+ RTLIB::Libcall LC;
+ EVT NodeType = Node->getValueType(0);
+ if (!NodeType.isSimple())
+ return false;
+ switch (NodeType.getSimpleVT().SimpleTy) {
+ default: return false; // No libcall for vector types.
+ case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
+ case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+ case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+ case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+ case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+ }
+
+ return TLI.getLibcallName(LC) != nullptr;
+}
+
+/// Issue divrem if both quotient and remainder are needed.
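+/// Walk the users of the first operand; any matching div/rem of the same
+/// operands is rewritten to use the corresponding result of a single
+/// [SU]DIVREM node instead of being computed separately.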
+SDValue DAGCombiner::useDivRem(SDNode *Node) {
+ if (Node->use_empty())
+ return SDValue(); // This is a dead node, leave it alone.
+
+ unsigned Opcode = Node->getOpcode();
+ bool isSigned = (Opcode == ISD::SDIV) || (Opcode == ISD::SREM);
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+
+ // DivMod lib calls can still work on non-legal types if using lib-calls.
+ EVT VT = Node->getValueType(0);
+ if (VT.isVector() || !VT.isInteger())
+ return SDValue();
+
+ if (!TLI.isTypeLegal(VT) && !TLI.isOperationCustom(DivRemOpc, VT))
+ return SDValue();
+
+ // If DIVREM is going to get expanded into a libcall,
+ // but there is no libcall available, then don't combine.
+ if (!TLI.isOperationLegalOrCustom(DivRemOpc, VT) &&
+ !isDivRemLibcallAvailable(Node, isSigned, TLI))
+ return SDValue();
+
+ // If div is legal, it's better to do the normal expansion
+ unsigned OtherOpcode = 0;
+ if ((Opcode == ISD::SDIV) || (Opcode == ISD::UDIV)) {
+ OtherOpcode = isSigned ? ISD::SREM : ISD::UREM;
+ if (TLI.isOperationLegalOrCustom(Opcode, VT))
+ return SDValue();
+ } else {
+ OtherOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+ if (TLI.isOperationLegalOrCustom(OtherOpcode, VT))
+ return SDValue();
+ }
+
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ SDValue combined;
+ for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
+ UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == Node || User->use_empty())
+ continue;
+ // Convert the other matching node(s), too;
+ // otherwise, the DIVREM may get target-legalized into something
+ // target-specific that we won't be able to recognize.
+ unsigned UserOpc = User->getOpcode();
+ if ((UserOpc == Opcode || UserOpc == OtherOpcode || UserOpc == DivRemOpc) &&
+ User->getOperand(0) == Op0 &&
+ User->getOperand(1) == Op1) {
+ if (!combined) {
+ if (UserOpc == OtherOpcode) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ combined = DAG.getNode(DivRemOpc, SDLoc(Node), VTs, Op0, Op1);
+ } else if (UserOpc == DivRemOpc) {
+ combined = SDValue(User, 0);
+ } else {
+ assert(UserOpc == Opcode);
+ continue;
+ }
+ }
+ if (UserOpc == ISD::SDIV || UserOpc == ISD::UDIV)
+ CombineTo(User, combined);
+ else if (UserOpc == ISD::SREM || UserOpc == ISD::UREM)
+ CombineTo(User, combined.getValue(1));
+ }
+ }
+ return combined;
+}
+
+SDValue DAGCombiner::visitSDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ SDLoc DL(N);
+
+ // fold (sdiv c1, c2) -> c1/c2
+ ConstantSDNode *N0C = isConstOrConstSplat(N0);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C);
+ // fold (sdiv X, 1) -> X
+ if (N1C && N1C->isOne())
+ return N0;
+ // fold (sdiv X, -1) -> 0-X
+ if (N1C && N1C->isAllOnesValue())
+ return DAG.getNode(ISD::SUB, DL, VT,
+ DAG.getConstant(0, DL, VT), N0);
+
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // udiv instead. Handles (X&15) /s 4 -> X&15 >> 2
+ if (!VT.isVector()) {
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UDIV, DL, N1.getValueType(), N0, N1);
+ }
+
+ // fold (sdiv X, pow2) -> simple ops after legalize
+ // FIXME: We check for the exact bit here because the generic lowering gives
+ // better results in that case. The target-specific lowering should learn how
+ // to handle exact sdivs efficiently.
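+ // The generic sequence below computes x / +/-2^lg2, rounded toward zero:
+ //   sgn = x >>s (bits-1)             (0 or all ones)
+ //   add = x + (sgn >>u (bits-lg2))   (adds 2^lg2-1 only when x < 0)
+ //   res = add >>s lg2                (negated at the end if the divisor < 0)
+ // e.g. for i32, -7 / 4: (-7 + 3) >>s 2 == -1.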
+ if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
+ !cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact() &&
+ (N1C->getAPIntValue().isPowerOf2() ||
+ (-N1C->getAPIntValue()).isPowerOf2())) {
+ // Target-specific implementation of sdiv x, pow2.
+ if (SDValue Res = BuildSDIVPow2(N))
+ return Res;
+
+ unsigned lg2 = N1C->getAPIntValue().countTrailingZeros();
+
+ // Splat the sign bit into the register
+ SDValue SGN =
+ DAG.getNode(ISD::SRA, DL, VT, N0,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, DL,
+ getShiftAmountTy(N0.getValueType())));
+ AddToWorklist(SGN.getNode());
+
+ // Add (N0 < 0) ? abs2 - 1 : 0;
+ SDValue SRL =
+ DAG.getNode(ISD::SRL, DL, VT, SGN,
+ DAG.getConstant(VT.getScalarSizeInBits() - lg2, DL,
+ getShiftAmountTy(SGN.getValueType())));
+ SDValue ADD = DAG.getNode(ISD::ADD, DL, VT, N0, SRL);
+ AddToWorklist(SRL.getNode());
+ AddToWorklist(ADD.getNode()); // Divide by pow2
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, VT, ADD,
+ DAG.getConstant(lg2, DL,
+ getShiftAmountTy(ADD.getValueType())));
+
+ // If we're dividing by a positive value, we're done. Otherwise, we must
+ // negate the result.
+ if (N1C->getAPIntValue().isNonNegative())
+ return SRA;
+
+ AddToWorklist(SRA.getNode());
+ return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), SRA);
+ }
+
+ // If integer divide is expensive and we satisfy the requirements, emit an
+ // alternate sequence. Targets may check function attributes for size/speed
+ // trade-offs.
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildSDIV(N))
+ return Op;
+
+ // sdiv, srem -> sdivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ // undef / X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+ // X / undef -> undef
+ if (N1.isUndef())
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ SDLoc DL(N);
+
+ // fold (udiv c1, c2) -> c1/c2
+ ConstantSDNode *N0C = isConstOrConstSplat(N0);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N0C && N1C)
+ if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT,
+ N0C, N1C))
+ return Folded;
+ // fold (udiv x, (1 << c)) -> x >>u c
+ if (N1C && !N1C->isOpaque() && N1C->getAPIntValue().isPowerOf2())
+ return DAG.getNode(ISD::SRL, DL, VT, N0,
+ DAG.getConstant(N1C->getAPIntValue().logBase2(), DL,
+ getShiftAmountTy(N0.getValueType())));
+
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (N1.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0))) {
+ if (SHC->getAPIntValue().isPowerOf2()) {
+ EVT ADDVT = N1.getOperand(1).getValueType();
+ SDValue Add = DAG.getNode(ISD::ADD, DL, ADDVT,
+ N1.getOperand(1),
+ DAG.getConstant(SHC->getAPIntValue()
+ .logBase2(),
+ DL, ADDVT));
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::SRL, DL, VT, N0, Add);
+ }
+ }
+ }
+
+ // fold (udiv x, c) -> alternate
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue Op = BuildUDIV(N))
+ return Op;
+
+ // udiv, urem -> udivrem
+ // If the divisor is constant, then return DIVREM only if isIntDivCheap() is
+ // true. Otherwise, we break the simplification logic in visitREM().
+ if (!N1C || TLI.isIntDivCheap(N->getValueType(0), Attr))
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem;
+
+ // undef / X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+ // X / undef -> undef
+ if (N1.isUndef())
+ return N1;
+
+ return SDValue();
+}
+
+// handles ISD::SREM and ISD::UREM
+SDValue DAGCombiner::visitREM(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ bool isSigned = (Opcode == ISD::SREM);
+ SDLoc DL(N);
+
+ // fold (rem c1, c2) -> c1%c2
+ ConstantSDNode *N0C = isConstOrConstSplat(N0);
+ ConstantSDNode *N1C = isConstOrConstSplat(N1);
+ if (N0C && N1C)
+ if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
+ return Folded;
+
+ if (isSigned) {
+ // If we know the sign bits of both operands are zero, strength reduce to a
+ // urem instead. Handles (X & 0x0FFFFFFF) %s 16 -> X&15
+ if (!VT.isVector()) {
+ if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
+ }
+ } else {
+ // fold (urem x, pow2) -> (and x, pow2-1)
+ if (N1C && !N1C->isNullValue() && !N1C->isOpaque() &&
+ N1C->getAPIntValue().isPowerOf2()) {
+ return DAG.getNode(ISD::AND, DL, VT, N0,
+ DAG.getConstant(N1C->getAPIntValue() - 1, DL, VT));
+ }
+ // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+ if (N1.getOpcode() == ISD::SHL) {
+ ConstantSDNode *SHC = getAsNonOpaqueConstant(N1.getOperand(0));
+ if (SHC && SHC->getAPIntValue().isPowerOf2()) {
+ APInt NegOne = APInt::getAllOnesValue(VT.getSizeInBits());
+ SDValue Add =
+ DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::AND, DL, VT, N0, Add);
+ }
+ }
+ }
+
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+
+ // If X/C can be simplified by the division-by-constant logic, lower
+ // X%C to the equivalent of X-X/C*C.
+ // To avoid mangling nodes, this simplification requires that the combine()
+ // call for the speculative DIV must not cause a DIVREM conversion. We guard
+ // against this by skipping the simplification if isIntDivCheap(). When
+ // div is not cheap, combine will not return a DIVREM. Regardless,
+ // checking cheapness here makes sense since the simplification results in
+ // fatter code.
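+ // For example, x %u 10 becomes x - (x /u 10) * 10 when the speculative udiv
+ // is simplified (typically into a multiply-by-magic-constant sequence).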
+ if (N1C && !N1C->isNullValue() && !TLI.isIntDivCheap(VT, Attr)) {
+ unsigned DivOpcode = isSigned ? ISD::SDIV : ISD::UDIV;
+ SDValue Div = DAG.getNode(DivOpcode, DL, VT, N0, N1);
+ AddToWorklist(Div.getNode());
+ SDValue OptimizedDiv = combine(Div.getNode());
+ if (OptimizedDiv.getNode() && OptimizedDiv.getNode() != Div.getNode()) {
+ assert((OptimizedDiv.getOpcode() != ISD::UDIVREM) &&
+ (OptimizedDiv.getOpcode() != ISD::SDIVREM));
+ SDValue Mul = DAG.getNode(ISD::MUL, DL, VT, OptimizedDiv, N1);
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, VT, N0, Mul);
+ AddToWorklist(Mul.getNode());
+ return Sub;
+ }
+ }
+
+ // sdiv, srem -> sdivrem; udiv, urem -> udivrem
+ if (SDValue DivRem = useDivRem(N))
+ return DivRem.getValue(1);
+
+ // undef % X -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, DL, VT);
+ // X % undef -> undef
+ if (N1.isUndef())
+ return N1;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (mulhs x, 0) -> 0
+ if (isNullConstant(N1))
+ return N1;
+ // fold (mulhs x, 1) -> (sra x, size(x)-1)
+ if (isOneConstant(N1)) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SRA, DL, N0.getValueType(), N0,
+ DAG.getConstant(N0.getValueType().getSizeInBits() - 1,
+ DL,
+ getShiftAmountTy(N0.getValueType())));
+ }
+ // fold (mulhs x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // If the type twice as wide is legal, transform the mulhs to a wider multiply
+ // plus a shift.
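+ // e.g. for i32 with a legal i64 multiply:
+ //   mulhs(a, b) == trunc_i32((sext_i64(a) * sext_i64(b)) >>u 32)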
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ N0 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N0);
+ N1 = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N1);
+ N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+ N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(N1.getValueType())));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMULHU(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // fold (mulhu x, 0) -> 0
+ if (isNullConstant(N1))
+ return N1;
+ // fold (mulhu x, 1) -> 0
+ if (isOneConstant(N1))
+ return DAG.getConstant(0, DL, N0.getValueType());
+ // fold (mulhu x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, DL, VT);
+
+ // If the type twice as wide is legal, transform the mulhu to a wider multiply
+ // plus a shift.
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N0);
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N1);
+ N1 = DAG.getNode(ISD::MUL, DL, NewVT, N0, N1);
+ N1 = DAG.getNode(ISD::SRL, DL, NewVT, N1,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(N1.getValueType())));
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, N1);
+ }
+ }
+
+ return SDValue();
+}
+
+/// Perform optimizations common to nodes that compute two values. LoOp and
+/// HiOp give the opcodes for the two computations that are being performed.
+/// Return the combined value if a simplification was made.
+SDValue DAGCombiner::SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
+ unsigned HiOp) {
+ // If the high half is not needed, just compute the low half.
+ bool HiExists = N->hasAnyUseOfValue(1);
+ if (!HiExists &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(LoOp, N->getValueType(0)))) {
+ SDValue Res = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If the low half is not needed, just compute the high half.
+ bool LoExists = N->hasAnyUseOfValue(0);
+ if (!LoExists &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(HiOp, N->getValueType(1)))) {
+ SDValue Res = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
+ return CombineTo(N, Res, Res);
+ }
+
+ // If both halves are used, return as it is.
+ if (LoExists && HiExists)
+ return SDValue();
+
+ // If the two computed results can be simplified separately, separate them.
+ if (LoExists) {
+ SDValue Lo = DAG.getNode(LoOp, SDLoc(N), N->getValueType(0), N->ops());
+ AddToWorklist(Lo.getNode());
+ SDValue LoOpt = combine(Lo.getNode());
+ if (LoOpt.getNode() && LoOpt.getNode() != Lo.getNode() &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(LoOpt.getOpcode(), LoOpt.getValueType())))
+ return CombineTo(N, LoOpt, LoOpt);
+ }
+
+ if (HiExists) {
+ SDValue Hi = DAG.getNode(HiOp, SDLoc(N), N->getValueType(1), N->ops());
+ AddToWorklist(Hi.getNode());
+ SDValue HiOpt = combine(Hi.getNode());
+ if (HiOpt.getNode() && HiOpt != Hi &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(HiOpt.getOpcode(), HiOpt.getValueType())))
+ return CombineTo(N, HiOpt, HiOpt);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSMUL_LOHI(SDNode *N) {
+ if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHS))
+ return Res;
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // If the type twice as wide is legal, transform the multiply into a wider
+ // multiply plus a shift.
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ SDValue Lo = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::SIGN_EXTEND, DL, NewVT, N->getOperand(1));
+ Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+ // Compute the high part as N1.
+ Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(Lo.getValueType())));
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+ // Compute the low part as N0.
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+ return CombineTo(N, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUMUL_LOHI(SDNode *N) {
+ if (SDValue Res = SimplifyNodeWithTwoResults(N, ISD::MUL, ISD::MULHU))
+ return Res;
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // If the type twice as wide is legal, transform the multiply into a wider
+ // multiply plus a shift.
+ if (VT.isSimple() && !VT.isVector()) {
+ MVT Simple = VT.getSimpleVT();
+ unsigned SimpleSize = Simple.getSizeInBits();
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
+ if (TLI.isOperationLegal(ISD::MUL, NewVT)) {
+ SDValue Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(0));
+ SDValue Hi = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, N->getOperand(1));
+ Lo = DAG.getNode(ISD::MUL, DL, NewVT, Lo, Hi);
+ // Compute the high part as N1.
+ Hi = DAG.getNode(ISD::SRL, DL, NewVT, Lo,
+ DAG.getConstant(SimpleSize, DL,
+ getShiftAmountTy(Lo.getValueType())));
+ Hi = DAG.getNode(ISD::TRUNCATE, DL, VT, Hi);
+ // Compute the low part as N0.
+ Lo = DAG.getNode(ISD::TRUNCATE, DL, VT, Lo);
+ return CombineTo(N, Lo, Hi);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSMULO(SDNode *N) {
+ // (smulo x, 2) -> (saddo x, x)
+ if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ if (C2->getAPIntValue() == 2)
+ return DAG.getNode(ISD::SADDO, SDLoc(N), N->getVTList(),
+ N->getOperand(0), N->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUMULO(SDNode *N) {
+ // (umulo x, 2) -> (uaddo x, x)
+ if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ if (C2->getAPIntValue() == 2)
+ return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(),
+ N->getOperand(0), N->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (smin/smax/umin/umax c1, c2) -> folded constant
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C);
+
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
+
+ return SDValue();
+}
+
+/// If this is a binary operator with two operands of the same opcode, try to
+/// simplify it.
+SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
+ SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ assert(N0.getOpcode() == N1.getOpcode() && "Bad input!");
+
+ // Bail early if none of these transforms apply.
+ if (N0.getNode()->getNumOperands() == 0) return SDValue();
+
+ // For each of OP in AND/OR/XOR:
+ // fold (OP (zext x), (zext y)) -> (zext (OP x, y))
+ // fold (OP (sext x), (sext y)) -> (sext (OP x, y))
+ // fold (OP (aext x), (aext y)) -> (aext (OP x, y))
+ // fold (OP (bswap x), (bswap y)) -> (bswap (OP x, y))
+ // fold (OP (trunc x), (trunc y)) -> (trunc (OP x, y)) (if trunc isn't free)
+ //
+ // do not sink logical op inside of a vector extend, since it may combine
+ // into a vsetcc.
+ EVT Op0VT = N0.getOperand(0).getValueType();
+ if ((N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::BSWAP ||
+ // Avoid infinite looping with PromoteIntBinOp.
+ (N0.getOpcode() == ISD::ANY_EXTEND &&
+ (!LegalTypes || TLI.isTypeDesirableForOp(N->getOpcode(), Op0VT))) ||
+ (N0.getOpcode() == ISD::TRUNCATE &&
+ (!TLI.isZExtFree(VT, Op0VT) ||
+ !TLI.isTruncateFree(Op0VT, VT)) &&
+ TLI.isTypeLegal(Op0VT))) &&
+ !VT.isVector() &&
+ Op0VT == N1.getOperand(0).getValueType() &&
+ (!LegalOperations || TLI.isOperationLegal(N->getOpcode(), Op0VT))) {
+ SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
+ N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0));
+ AddToWorklist(ORNode.getNode());
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, ORNode);
+ }
+
+ // For each of OP in SHL/SRL/SRA/AND...
+ // fold (and (OP x, z), (OP y, z)) -> (OP (and x, y), z)
+ // fold (or (OP x, z), (OP y, z)) -> (OP (or x, y), z)
+ // fold (xor (OP x, z), (OP y, z)) -> (OP (xor x, y), z)
+ if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL ||
+ N0.getOpcode() == ISD::SRA || N0.getOpcode() == ISD::AND) &&
+ N0.getOperand(1) == N1.getOperand(1)) {
+ SDValue ORNode = DAG.getNode(N->getOpcode(), SDLoc(N0),
+ N0.getOperand(0).getValueType(),
+ N0.getOperand(0), N1.getOperand(0));
+ AddToWorklist(ORNode.getNode());
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT,
+ ORNode, N0.getOperand(1));
+ }
+
+ // Simplify xor/and/or (bitcast(A), bitcast(B)) -> bitcast(op (A,B))
+ // Only perform this optimization up until type legalization, before
+ // LegalizeVectorOps. LegalizeVectorOps promotes vector operations by
+ // adding bitcasts. For example (xor v4i32) is promoted to (v2i64), and
+ // we don't want to undo this promotion.
+ // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
+ // on scalars.
+ if ((N0.getOpcode() == ISD::BITCAST ||
+ N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
+ Level <= AfterLegalizeTypes) {
+ SDValue In0 = N0.getOperand(0);
+ SDValue In1 = N1.getOperand(0);
+ EVT In0Ty = In0.getValueType();
+ EVT In1Ty = In1.getValueType();
+ SDLoc DL(N);
+ // If both incoming values are integers, and the original types are the
+ // same.
+ if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
+ SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
+ SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
+ AddToWorklist(Op.getNode());
+ return BC;
+ }
+ }
+
+ // Xor/and/or are indifferent to the swizzle operation (shuffle of one value).
+ // Simplify xor/and/or (shuff(A), shuff(B)) -> shuff(op (A,B))
+ // If both shuffles use the same mask, and both shuffle within a single
+ // vector, then it is worthwhile to move the swizzle after the operation.
+ // The type-legalizer generates this pattern when loading illegal
+ // vector types from memory. In many cases this allows additional shuffle
+ // optimizations.
+ // There are other cases where moving the shuffle after the xor/and/or
+ // is profitable even if shuffles don't perform a swizzle.
+ // If both shuffles use the same mask, and both shuffles have the same first
+ // or second operand, then it might still be profitable to move the shuffle
+ // after the xor/and/or operation.
+ if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG) {
+ ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(N0);
+ ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(N1);
+
+ assert(N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType() &&
+ "Inputs to shuffles are not the same type");
+
+ // Check that both shuffles use the same mask. The masks are known to be of
+ // the same length because the result vector type is the same.
+ // Check also that shuffles have only one use to avoid introducing extra
+ // instructions.
+ if (SVN0->hasOneUse() && SVN1->hasOneUse() &&
+ SVN0->getMask().equals(SVN1->getMask())) {
+ SDValue ShOp = N0->getOperand(1);
+
+ // Don't try to fold this node if it requires introducing a
+ // build vector of all zeros that might be illegal at this stage.
+ if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
+ if (!LegalTypes)
+ ShOp = DAG.getConstant(0, SDLoc(N), VT);
+ else
+ ShOp = SDValue();
+ }
+
+ // (AND (shuf (A, C), shuf (B, C))) -> shuf (AND (A, B), C)
+ // (OR (shuf (A, C), shuf (B, C))) -> shuf (OR (A, B), C)
+ // (XOR (shuf (A, C), shuf (B, C))) -> shuf (XOR (A, B), V_0)
+ if (N0.getOperand(1) == N1.getOperand(1) && ShOp.getNode()) {
+ SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+ N0->getOperand(0), N1->getOperand(0));
+ AddToWorklist(NewNode.getNode());
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewNode, ShOp,
+ SVN0->getMask());
+ }
+
+ // Don't try to fold this node if it requires introducing a
+ // build vector of all zeros that might be illegal at this stage.
+ ShOp = N0->getOperand(0);
+ if (N->getOpcode() == ISD::XOR && !ShOp.isUndef()) {
+ if (!LegalTypes)
+ ShOp = DAG.getConstant(0, SDLoc(N), VT);
+ else
+ ShOp = SDValue();
+ }
+
+ // (AND (shuf (C, A), shuf (C, B))) -> shuf (C, AND (A, B))
+ // (OR (shuf (C, A), shuf (C, B))) -> shuf (C, OR (A, B))
+ // (XOR (shuf (C, A), shuf (C, B))) -> shuf (V_0, XOR (A, B))
+ if (N0->getOperand(0) == N1->getOperand(0) && ShOp.getNode()) {
+ SDValue NewNode = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+ N0->getOperand(1), N1->getOperand(1));
+ AddToWorklist(NewNode.getNode());
+ return DAG.getVectorShuffle(VT, SDLoc(N), ShOp, NewNode,
+ SVN0->getMask());
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// This contains all DAGCombine rules which reduce two values combined by
+/// an And operation to a single value. This makes them reusable in the context
+/// of visitSELECT(). Rules involving constants are not included as
+/// visitSELECT() already handles those cases.
+SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
+ SDNode *LocReference) {
+ EVT VT = N1.getValueType();
+
+ // fold (and x, undef) -> 0
+ if (N0.isUndef() || N1.isUndef())
+ return DAG.getConstant(0, SDLoc(LocReference), VT);
+ // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
+ SDValue LL, LR, RL, RR, CC0, CC1;
+ if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+ ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+ ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+ if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
+ LL.getValueType().isInteger()) {
+ // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
+ if (isNullConstant(LR) && Op1 == ISD::SETEQ) {
+ SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
+ LR.getValueType(), LL, RL);
+ AddToWorklist(ORNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
+ }
+ if (isAllOnesConstant(LR)) {
+ // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
+ if (Op1 == ISD::SETEQ) {
+ SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
+ LR.getValueType(), LL, RL);
+ AddToWorklist(ANDNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
+ }
+ // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1)
+ if (Op1 == ISD::SETGT) {
+ SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
+ LR.getValueType(), LL, RL);
+ AddToWorklist(ORNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
+ }
+ }
+ }
+ // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
+ if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
+ Op0 == Op1 && LL.getValueType().isInteger() &&
+ Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
+ (isAllOnesConstant(LR) && isNullConstant(RR)))) {
+ SDLoc DL(N0);
+ SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(),
+ LL, DAG.getConstant(1, DL,
+ LL.getValueType()));
+ AddToWorklist(ADDNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode,
+ DAG.getConstant(2, DL, LL.getValueType()),
+ ISD::SETUGE);
+ }
+ // canonicalize equivalent to ll == rl
+ if (LL == RR && LR == RL) {
+ Op1 = ISD::getSetCCSwappedOperands(Op1);
+ std::swap(RL, RR);
+ }
+ if (LL == RL && LR == RR) {
+ bool isInteger = LL.getValueType().isInteger();
+ ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
+ if (Result != ISD::SETCC_INVALID &&
+ (!LegalOperations ||
+ (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
+ TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
+ EVT CCVT = getSetCCResultType(LL.getValueType());
+ if (N0.getValueType() == CCVT ||
+ (!LegalOperations && N0.getValueType() == MVT::i1))
+ return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
+ LL, LR, Result);
+ }
+ }
+ }
+
+ if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
+ VT.getSizeInBits() <= 64) {
+ if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ APInt ADDC = ADDI->getAPIntValue();
+ if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal
+ // immediate for an add, but it is legal if its top c2 bits are set,
+ // transform the ADD so the immediate doesn't need to be materialized
+ // in a register.
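+ // This is safe because (lshr y, c2) has its top c2 bits known zero, so any
+ // change the new constant makes to the top c2 bits of the sum is cleared
+ // again by the AND.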
+ if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) {
+ APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(),
+ SRLI->getZExtValue());
+ if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
+ ADDC |= Mask;
+ if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
+ SDLoc DL(N0);
+ SDValue NewAdd =
+ DAG.getNode(ISD::ADD, DL, VT,
+ N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
+ CombineTo(N0.getNode(), NewAdd);
+ // Return N so it doesn't get rechecked!
+ return SDValue(LocReference, 0);
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Reduce bit extract of low half of an integer to the narrower type.
+ // (and (srl i64:x, K), KMask) ->
+ // (i64 zero_extend (and (srl (i32 (trunc i64:x)), K)), KMask)
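+ // i.e. when the extracted field lies entirely within the low half, the
+ // shift and mask can be done in the half-width type and zero-extended.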
+ if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+ if (ConstantSDNode *CAnd = dyn_cast<ConstantSDNode>(N1)) {
+ if (ConstantSDNode *CShift = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ unsigned Size = VT.getSizeInBits();
+ const APInt &AndMask = CAnd->getAPIntValue();
+ unsigned ShiftBits = CShift->getZExtValue();
+ unsigned MaskBits = AndMask.countTrailingOnes();
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
+
+ if (APIntOps::isMask(AndMask) &&
+ // Required bits must not span the two halves of the integer and
+ // must fit in the half size type.
+ (ShiftBits + MaskBits <= Size / 2) &&
+ TLI.isNarrowingProfitable(VT, HalfVT) &&
+ TLI.isTypeDesirableForOp(ISD::AND, HalfVT) &&
+ TLI.isTypeDesirableForOp(ISD::SRL, HalfVT) &&
+ TLI.isTruncateFree(VT, HalfVT) &&
+ TLI.isZExtFree(HalfVT, VT)) {
+ // The isNarrowingProfitable is to avoid regressions on PPC and
+ // AArch64 which match a few 64-bit bit insert / bit extract patterns
+ // on downstream users of this. Those patterns could probably be
+ // extended to handle extensions mixed in.
+
+ SDValue SL(N0);
+ assert(ShiftBits != 0 && MaskBits <= Size);
+
+ // Extracting the highest bit of the low half.
+ EVT ShiftVT = TLI.getShiftAmountTy(HalfVT, DAG.getDataLayout());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, HalfVT,
+ N0.getOperand(0));
+
+ SDValue NewMask = DAG.getConstant(AndMask.trunc(Size / 2), SL, HalfVT);
+ SDValue ShiftK = DAG.getConstant(ShiftBits, SL, ShiftVT);
+ SDValue Shift = DAG.getNode(ISD::SRL, SL, HalfVT, Trunc, ShiftK);
+ SDValue And = DAG.getNode(ISD::AND, SL, HalfVT, Shift, NewMask);
+ return DAG.getNode(ISD::ZERO_EXTEND, SL, VT, And);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
+ EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
+ bool &NarrowLoad) {
+ uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
+
+ if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue()))
+ return false;
+
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
+ LoadedVT = LoadN->getMemoryVT();
+
+ if (ExtVT == LoadedVT &&
+ (!LegalOperations ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))) {
+ // ZEXTLOAD will match without needing to change the size of the value being
+ // loaded.
+ NarrowLoad = false;
+ return true;
+ }
+
+ // Do not change the width of a volatile load.
+ if (LoadN->isVolatile())
+ return false;
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!LoadedVT.bitsGT(ExtVT) || !ExtVT.isRound())
+ return false;
+
+ if (LegalOperations &&
+ !TLI.isLoadExtLegal(ISD::ZEXTLOAD, LoadResultTy, ExtVT))
+ return false;
+
+ if (!TLI.shouldReduceLoadWidth(LoadN, ISD::ZEXTLOAD, ExtVT))
+ return false;
+
+ NarrowLoad = true;
+ return true;
+}
+
+SDValue DAGCombiner::visitAND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (and x, 0) -> 0, vector edition
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ // do not return N0, because undef node may exist in N0
+ return DAG.getConstant(
+ APInt::getNullValue(
+ N0.getValueType().getScalarType().getSizeInBits()),
+ SDLoc(N), N0.getValueType());
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ // do not return N1, because undef node may exist in N1
+ return DAG.getConstant(
+ APInt::getNullValue(
+ N1.getValueType().getScalarType().getSizeInBits()),
+ SDLoc(N), N1.getValueType());
+
+ // fold (and x, -1) -> x, vector edition
+ if (ISD::isBuildVectorAllOnes(N0.getNode()))
+ return N1;
+ if (ISD::isBuildVectorAllOnes(N1.getNode()))
+ return N0;
+ }
+
+ // fold (and c1, c2) -> c1&c2
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && N1C && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
+ // fold (and x, -1) -> x
+ if (isAllOnesConstant(N1))
+ return N0;
+ // if (and x, c) is known to be zero, return 0
+ unsigned BitWidth = VT.getScalarType().getSizeInBits();
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(BitWidth)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+ // reassociate and
+ if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
+ return RAND;
+ // fold (and (or x, C), D) -> D if (C & D) == D
+ if (N1C && N0.getOpcode() == ISD::OR)
+ if (ConstantSDNode *ORI = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+ if ((ORI->getAPIntValue() & N1C->getAPIntValue()) == N1C->getAPIntValue())
+ return N1;
+ // fold (and (any_ext V), c) -> (zero_ext V) if 'and' only clears top bits.
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N0Op0 = N0.getOperand(0);
+ APInt Mask = ~N1C->getAPIntValue();
+ Mask = Mask.trunc(N0Op0.getValueSizeInBits());
+ if (DAG.MaskedValueIsZero(N0Op0, Mask)) {
+ SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N),
+ N0.getValueType(), N0Op0);
+
+ // Replace uses of the AND with uses of the Zero extend node.
+ CombineTo(N, Zext);
+
+ // We actually want to replace all uses of the any_extend with the
+ // zero_extend, to avoid duplicating things. This will later cause this
+ // AND to be folded.
+ CombineTo(N0.getNode(), Zext);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ // similarly fold (and (X (load ([non_ext|any_ext|zero_ext] V))), c) ->
+ // (X (load ([non_ext|zero_ext] V))) if 'and' only clears top bits which must
+ // already be zero by virtue of the width of the base type of the load.
+ //
+ // the 'X' node here can either be nothing or an extract_vector_elt to catch
+ // more cases.
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getValueSizeInBits() == N0.getOperand(0).getScalarValueSizeInBits() &&
+ N0.getOperand(0).getOpcode() == ISD::LOAD &&
+ N0.getOperand(0).getResNo() == 0) ||
+ (N0.getOpcode() == ISD::LOAD && N0.getResNo() == 0)) {
+ LoadSDNode *Load = cast<LoadSDNode>( (N0.getOpcode() == ISD::LOAD) ?
+ N0 : N0.getOperand(0) );
+
+ // Get the constant (if applicable) the zero'th operand is being ANDed with.
+ // This can be a pure constant or a vector splat, in which case we treat the
+ // vector as a scalar and use the splat value.
+ APInt Constant = APInt::getNullValue(1);
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N1)) {
+ Constant = C->getAPIntValue();
+ } else if (BuildVectorSDNode *Vector = dyn_cast<BuildVectorSDNode>(N1)) {
+ APInt SplatValue, SplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ bool IsSplat = Vector->isConstantSplat(SplatValue, SplatUndef,
+ SplatBitSize, HasAnyUndefs);
+ if (IsSplat) {
+ // Undef bits can contribute to a possible optimisation if set, so
+ // set them.
+ SplatValue |= SplatUndef;
+
+ // The splat value may be something like "0x00FFFFFF", which means 0 for
+ // the first vector value and FF for the rest, repeating. We need a mask
+ // that will apply equally to all members of the vector, so AND all the
+ // lanes of the constant together.
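+ //
+ // For instance (illustrative values): with 8-bit lanes and a splat
+ // reported as 0x00FF over 16 bits (SplatBitSize == 16, BitWidth == 8),
+ // the lanes 0xFF and 0x00 are ANDed together, so 'Constant' becomes 0
+ // and the AND below is correctly kept.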
+ EVT VT = Vector->getValueType(0);
+ unsigned BitWidth = VT.getVectorElementType().getSizeInBits();
+
+ // If the splat value has been compressed to a bitlength lower
+ // than the size of the vector lane, we need to re-expand it to
+ // the lane size.
+ if (BitWidth > SplatBitSize)
+ for (SplatValue = SplatValue.zextOrTrunc(BitWidth);
+ SplatBitSize < BitWidth;
+ SplatBitSize = SplatBitSize * 2)
+ SplatValue |= SplatValue.shl(SplatBitSize);
+
+ // Make sure that variable 'Constant' is only set if 'SplatBitSize' is a
+ // multiple of 'BitWidth'. Otherwise, we could propagate a wrong value.
+ if (SplatBitSize % BitWidth == 0) {
+ Constant = APInt::getAllOnesValue(BitWidth);
+ for (unsigned i = 0, n = SplatBitSize/BitWidth; i < n; ++i)
+ Constant &= SplatValue.lshr(i*BitWidth).zextOrTrunc(BitWidth);
+ }
+ }
+ }
+
+ // If we want to change an EXTLOAD to a ZEXTLOAD, ensure a ZEXTLOAD is
+ // actually legal and isn't going to get expanded, else this is a false
+ // optimisation.
+ bool CanZextLoadProfitably = TLI.isLoadExtLegal(ISD::ZEXTLOAD,
+ Load->getValueType(0),
+ Load->getMemoryVT());
+
+ // Resize the constant to the same size as the original memory access before
+ // extension. If it is still the AllOnesValue then this AND is completely
+ // unneeded.
+ Constant =
+ Constant.zextOrTrunc(Load->getMemoryVT().getScalarType().getSizeInBits());
+
+ bool B;
+ switch (Load->getExtensionType()) {
+ default: B = false; break;
+ case ISD::EXTLOAD: B = CanZextLoadProfitably; break;
+ case ISD::ZEXTLOAD:
+ case ISD::NON_EXTLOAD: B = true; break;
+ }
+
+ if (B && Constant.isAllOnesValue()) {
+ // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
+ // preserve semantics once we get rid of the AND.
+ SDValue NewLoad(Load, 0);
+ if (Load->getExtensionType() == ISD::EXTLOAD) {
+ NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
+ Load->getValueType(0), SDLoc(Load),
+ Load->getChain(), Load->getBasePtr(),
+ Load->getOffset(), Load->getMemoryVT(),
+ Load->getMemOperand());
+ // Replace uses of the EXTLOAD with the new ZEXTLOAD.
+ if (Load->getNumValues() == 3) {
+ // PRE/POST_INC loads have 3 values.
+ SDValue To[] = { NewLoad.getValue(0), NewLoad.getValue(1),
+ NewLoad.getValue(2) };
+ CombineTo(Load, To, 3, true);
+ } else {
+ CombineTo(Load, NewLoad.getValue(0), NewLoad.getValue(1));
+ }
+ }
+
+ // Fold the AND away, taking care not to fold to the old load node if we
+ // replaced it.
+ CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (and (load x), 255) -> (zextload x, i8)
+ // fold (and (extload x, i16), 255) -> (zextload x, i8)
+ // fold (and (any_ext (extload x, i16)), 255) -> (zextload x, i8)
+ if (N1C && (N0.getOpcode() == ISD::LOAD ||
+ (N0.getOpcode() == ISD::ANY_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::LOAD))) {
+ bool HasAnyExt = N0.getOpcode() == ISD::ANY_EXTEND;
+ LoadSDNode *LN0 = HasAnyExt
+ ? cast<LoadSDNode>(N0.getOperand(0))
+ : cast<LoadSDNode>(N0);
+ if (LN0->getExtensionType() != ISD::SEXTLOAD &&
+ LN0->isUnindexed() && N0.hasOneUse() && SDValue(LN0, 0).hasOneUse()) {
+ auto NarrowLoad = false;
+ EVT LoadResultTy = HasAnyExt ? LN0->getValueType(0) : VT;
+ EVT ExtVT, LoadedVT;
+ if (isAndLoadExtLoad(N1C, LN0, LoadResultTy, ExtVT, LoadedVT,
+ NarrowLoad)) {
+ if (!NarrowLoad) {
+ SDValue NewLoad =
+ DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy,
+ LN0->getChain(), LN0->getBasePtr(), ExtVT,
+ LN0->getMemOperand());
+ AddToWorklist(N);
+ CombineTo(LN0, NewLoad, NewLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ } else {
+ EVT PtrType = LN0->getOperand(1).getValueType();
+
+ unsigned Alignment = LN0->getAlignment();
+ SDValue NewPtr = LN0->getBasePtr();
+
+ // For big endian targets, we need to add an offset to the pointer
+ // to load the correct bytes. For little endian systems, we merely
+ // need to read fewer bytes from the same pointer.
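+ // Concretely (an assumed example): narrowing an i32 load to an i8
+ // zextload on a big-endian target gives LVTStoreBytes == 4 and
+ // EVTStoreBytes == 1, so PtrOff == 3 and the byte is read from
+ // BasePtr + 3; a little-endian target keeps the original pointer.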
+ if (DAG.getDataLayout().isBigEndian()) {
+ unsigned LVTStoreBytes = LoadedVT.getStoreSize();
+ unsigned EVTStoreBytes = ExtVT.getStoreSize();
+ unsigned PtrOff = LVTStoreBytes - EVTStoreBytes;
+ SDLoc DL(LN0);
+ NewPtr = DAG.getNode(ISD::ADD, DL, PtrType,
+ NewPtr, DAG.getConstant(PtrOff, DL, PtrType));
+ Alignment = MinAlign(Alignment, PtrOff);
+ }
+
+ AddToWorklist(NewPtr.getNode());
+
+ SDValue Load = DAG.getExtLoad(
+ ISD::ZEXTLOAD, SDLoc(LN0), LoadResultTy, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo(), ExtVT, Alignment,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ AddToWorklist(N);
+ CombineTo(LN0, Load, Load.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+ }
+
+ if (SDValue Combined = visitANDLike(N0, N1, N))
+ return Combined;
+
+ // Simplify: (and (op x...), (op y...)) -> (op (and x, y))
+ if (N0.getOpcode() == N1.getOpcode())
+ if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+ return Tmp;
+
+ // fold (and (sign_extend_inreg x, i16 to i32), 1) -> (and x, 1)
+ // fold (and (sra)) -> (and (srl)) when possible.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (zext_inreg (extload x)) -> (zextload x)
+ if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ // If we zero all the possible extended bits, then we can turn this into
+ // a zextload if we are running before legalize or the operation is legal.
+ unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+ if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+ BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+ ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
+ AddToWorklist(N);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ // fold (zext_inreg (sextload x)) -> (zextload x) iff load has one use
+ if (ISD::isSEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ // If we zero all the possible extended bits, then we can turn this into
+ // a zextload if we are running before legalize or the operation is legal.
+ unsigned BitWidth = N1.getValueType().getScalarType().getSizeInBits();
+ if (DAG.MaskedValueIsZero(N1, APInt::getHighBitsSet(BitWidth,
+ BitWidth - MemVT.getScalarType().getSizeInBits())) &&
+ ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT))) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N0), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
+ AddToWorklist(N);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ // fold (and (or (srl N, 8), (shl N, 8)), 0xffff) -> (srl (bswap N), const)
+ if (N1C && N1C->getAPIntValue() == 0xffff && N0.getOpcode() == ISD::OR) {
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
+ return BSwap;
+ }
+
+ return SDValue();
+}
+
+/// Match (a >> 8) | (a << 8) as (bswap a) >> 16.
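+ /// For example (illustrative): with a == 0x00001234 in i32, the matched
+ /// ((a & 0xff) << 8) | ((a & 0xff00) >> 8) equals 0x00003412, and
+ /// (bswap a) >> 16 == 0x34120000 >> 16 == 0x00003412 as well.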
+SDValue DAGCombiner::MatchBSwapHWordLow(SDNode *N, SDValue N0, SDValue N1,
+ bool DemandHighBits) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i64 && VT != MVT::i32 && VT != MVT::i16)
+ return SDValue();
+ if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+ return SDValue();
+
+ // Recognize (and (shl a, 8), 0xff), (and (srl a, 8), 0xff00)
+ bool LookPassAnd0 = false;
+ bool LookPassAnd1 = false;
+ if (N0.getOpcode() == ISD::AND && N0.getOperand(0).getOpcode() == ISD::SRL)
+ std::swap(N0, N1);
+ if (N1.getOpcode() == ISD::AND && N1.getOperand(0).getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() == ISD::AND) {
+ if (!N0.getNode()->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!N01C || N01C->getZExtValue() != 0xFF00)
+ return SDValue();
+ N0 = N0.getOperand(0);
+ LookPassAnd0 = true;
+ }
+
+ if (N1.getOpcode() == ISD::AND) {
+ if (!N1.getNode()->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!N11C || N11C->getZExtValue() != 0xFF)
+ return SDValue();
+ N1 = N1.getOperand(0);
+ LookPassAnd1 = true;
+ }
+
+ if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
+ std::swap(N0, N1);
+ if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
+ return SDValue();
+ if (!N0.getNode()->hasOneUse() ||
+ !N1.getNode()->hasOneUse())
+ return SDValue();
+
+ ConstantSDNode *N01C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ ConstantSDNode *N11C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
+ if (!N01C || !N11C)
+ return SDValue();
+ if (N01C->getZExtValue() != 8 || N11C->getZExtValue() != 8)
+ return SDValue();
+
+ // Look for (shl (and a, 0xff), 8), (srl (and a, 0xff00), 8)
+ SDValue N00 = N0->getOperand(0);
+ if (!LookPassAnd0 && N00.getOpcode() == ISD::AND) {
+ if (!N00.getNode()->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N001C = dyn_cast<ConstantSDNode>(N00.getOperand(1));
+ if (!N001C || N001C->getZExtValue() != 0xFF)
+ return SDValue();
+ N00 = N00.getOperand(0);
+ LookPassAnd0 = true;
+ }
+
+ SDValue N10 = N1->getOperand(0);
+ if (!LookPassAnd1 && N10.getOpcode() == ISD::AND) {
+ if (!N10.getNode()->hasOneUse())
+ return SDValue();
+ ConstantSDNode *N101C = dyn_cast<ConstantSDNode>(N10.getOperand(1));
+ if (!N101C || N101C->getZExtValue() != 0xFF00)
+ return SDValue();
+ N10 = N10.getOperand(0);
+ LookPassAnd1 = true;
+ }
+
+ if (N00 != N10)
+ return SDValue();
+
+ // Make sure everything beyond the low halfword gets set to zero since the SRL
+ // 16 will clear the top bits.
+ unsigned OpSizeInBits = VT.getSizeInBits();
+ if (DemandHighBits && OpSizeInBits > 16) {
+ // If the left-shift isn't masked out then the only way this is a bswap is
+ // if all bits beyond the low 8 are 0. In that case the entire pattern
+ // reduces to a left shift anyway: leave it for other parts of the combiner.
+ if (!LookPassAnd0)
+ return SDValue();
+
+ // However, if the right shift isn't masked out then it might be because
+ // it's not needed. See if we can spot that too.
+ if (!LookPassAnd1 &&
+ !DAG.MaskedValueIsZero(
+ N10, APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - 16)))
+ return SDValue();
+ }
+
+ SDValue Res = DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N00);
+ if (OpSizeInBits > 16) {
+ SDLoc DL(N);
+ Res = DAG.getNode(ISD::SRL, DL, VT, Res,
+ DAG.getConstant(OpSizeInBits - 16, DL,
+ getShiftAmountTy(VT)));
+ }
+ return Res;
+}
+
+/// Return true if the specified node is an element that makes up a 32-bit
+/// packed halfword byteswap.
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
+static bool isBSwapHWordElement(SDValue N, MutableArrayRef<SDNode *> Parts) {
+ if (!N.getNode()->hasOneUse())
+ return false;
+
+ unsigned Opc = N.getOpcode();
+ if (Opc != ISD::AND && Opc != ISD::SHL && Opc != ISD::SRL)
+ return false;
+
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!N1C)
+ return false;
+
+ unsigned Num;
+ switch (N1C->getZExtValue()) {
+ default:
+ return false;
+ case 0xFF: Num = 0; break;
+ case 0xFF00: Num = 1; break;
+ case 0xFF0000: Num = 2; break;
+ case 0xFF000000: Num = 3; break;
+ }
+
+ // Look for (x & 0xff) << 8 as well as ((x << 8) & 0xff00).
+ SDValue N0 = N.getOperand(0);
+ if (Opc == ISD::AND) {
+ if (Num == 0 || Num == 2) {
+ // (x >> 8) & 0xff
+ // (x >> 8) & 0xff0000
+ if (N0.getOpcode() != ISD::SRL)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ } else {
+ // (x << 8) & 0xff00
+ // (x << 8) & 0xff000000
+ if (N0.getOpcode() != ISD::SHL)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ }
+ } else if (Opc == ISD::SHL) {
+ // (x & 0xff) << 8
+ // (x & 0xff0000) << 8
+ if (Num != 0 && Num != 2)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ } else { // Opc == ISD::SRL
+ // (x & 0xff00) >> 8
+ // (x & 0xff000000) >> 8
+ if (Num != 1 && Num != 3)
+ return false;
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N.getOperand(1));
+ if (!C || C->getZExtValue() != 8)
+ return false;
+ }
+
+ if (Parts[Num])
+ return false;
+
+ Parts[Num] = N0.getOperand(0).getNode();
+ return true;
+}
+
+/// Match a 32-bit packed halfword bswap. That is
+/// ((x & 0x000000ff) << 8) |
+/// ((x & 0x0000ff00) >> 8) |
+/// ((x & 0x00ff0000) << 8) |
+/// ((x & 0xff000000) >> 8)
+/// => (rotl (bswap x), 16)
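+ /// As a sanity check (illustrative): for x == 0x11223344 the four masked
+ /// terms are 0x00004400, 0x00000033, 0x22000000 and 0x00110000, whose OR
+ /// is 0x22114433; bswap(x) == 0x44332211 and rotl(0x44332211, 16) is
+ /// likewise 0x22114433.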
+SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
+ if (!LegalOperations)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::i32)
+ return SDValue();
+ if (!TLI.isOperationLegal(ISD::BSWAP, VT))
+ return SDValue();
+
+ // Look for either
+ // (or (or (and), (and)), (or (and), (and)))
+ // (or (or (or (and), (and)), (and)), (and))
+ if (N0.getOpcode() != ISD::OR)
+ return SDValue();
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ SDNode *Parts[4] = {};
+
+ if (N1.getOpcode() == ISD::OR &&
+ N00.getNumOperands() == 2 && N01.getNumOperands() == 2) {
+ // (or (or (and), (and)), (or (and), (and)))
+ SDValue N000 = N00.getOperand(0);
+ if (!isBSwapHWordElement(N000, Parts))
+ return SDValue();
+
+ SDValue N001 = N00.getOperand(1);
+ if (!isBSwapHWordElement(N001, Parts))
+ return SDValue();
+ SDValue N010 = N01.getOperand(0);
+ if (!isBSwapHWordElement(N010, Parts))
+ return SDValue();
+ SDValue N011 = N01.getOperand(1);
+ if (!isBSwapHWordElement(N011, Parts))
+ return SDValue();
+ } else {
+ // (or (or (or (and), (and)), (and)), (and))
+ if (!isBSwapHWordElement(N1, Parts))
+ return SDValue();
+ if (!isBSwapHWordElement(N01, Parts))
+ return SDValue();
+ if (N00.getOpcode() != ISD::OR)
+ return SDValue();
+ SDValue N000 = N00.getOperand(0);
+ if (!isBSwapHWordElement(N000, Parts))
+ return SDValue();
+ SDValue N001 = N00.getOperand(1);
+ if (!isBSwapHWordElement(N001, Parts))
+ return SDValue();
+ }
+
+ // Make sure the parts are all coming from the same node.
+ if (Parts[0] != Parts[1] || Parts[0] != Parts[2] || Parts[0] != Parts[3])
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT,
+ SDValue(Parts[0], 0));
+
+ // Result of the bswap should be rotated by 16. If it's not legal, then
+ // do (x << 16) | (x >> 16).
+ SDValue ShAmt = DAG.getConstant(16, DL, getShiftAmountTy(VT));
+ if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT))
+ return DAG.getNode(ISD::ROTL, DL, VT, BSwap, ShAmt);
+ if (TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+ return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
+ return DAG.getNode(ISD::OR, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, BSwap, ShAmt),
+ DAG.getNode(ISD::SRL, DL, VT, BSwap, ShAmt));
+}
+
+/// This contains all DAGCombine rules which reduce two values combined by
+/// an Or operation to a single value \see visitANDLike().
+SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+ EVT VT = N1.getValueType();
+ // fold (or x, undef) -> -1
+ if (!LegalOperations &&
+ (N0.isUndef() || N1.isUndef())) {
+ EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
+ return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
+ SDLoc(LocReference), VT);
+ }
+ // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
+ SDValue LL, LR, RL, RR, CC0, CC1;
+ if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
+ ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
+ ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
+
+ if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) {
+ // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
+ // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
+ if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
+ SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
+ LR.getValueType(), LL, RL);
+ AddToWorklist(ORNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
+ }
+ // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
+ // fold (or (setgt X, -1), (setgt Y -1)) -> (setgt (and X, Y), -1)
+ if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
+ SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
+ LR.getValueType(), LL, RL);
+ AddToWorklist(ANDNode.getNode());
+ return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
+ }
+ }
+ // canonicalize equivalent to ll == rl
+ if (LL == RR && LR == RL) {
+ Op1 = ISD::getSetCCSwappedOperands(Op1);
+ std::swap(RL, RR);
+ }
+ if (LL == RL && LR == RR) {
+ bool isInteger = LL.getValueType().isInteger();
+ ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
+ if (Result != ISD::SETCC_INVALID &&
+ (!LegalOperations ||
+ (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
+ TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
+ EVT CCVT = getSetCCResultType(LL.getValueType());
+ if (N0.getValueType() == CCVT ||
+ (!LegalOperations && N0.getValueType() == MVT::i1))
+ return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
+ LL, LR, Result);
+ }
+ }
+ }
+
+ // (or (and X, C1), (and Y, C2)) -> (and (or X, Y), C3) if possible.
+ if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
+ // Don't increase # computations.
+ (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+ // We can only do this xform if we know that bits from X that are set in C2
+ // but not in C1 are already zero. Likewise for Y.
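+ // Illustration (assumed constants): with C1 == 0xF0 and C2 == 0x0F the
+ // result is (and (or X, Y), 0xFF), valid when X is known zero in the
+ // 0x0F bits and Y is known zero in the 0xF0 bits.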
+ if (const ConstantSDNode *N0O1C =
+ getAsNonOpaqueConstant(N0.getOperand(1))) {
+ if (const ConstantSDNode *N1O1C =
+ getAsNonOpaqueConstant(N1.getOperand(1))) {
+ const APInt &LHSMask = N0O1C->getAPIntValue();
+ const APInt &RHSMask = N1O1C->getAPIntValue();
+
+ if (DAG.MaskedValueIsZero(N0.getOperand(0), RHSMask&~LHSMask) &&
+ DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
+ SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+ N0.getOperand(0), N1.getOperand(0));
+ SDLoc DL(LocReference);
+ return DAG.getNode(ISD::AND, DL, VT, X,
+ DAG.getConstant(LHSMask | RHSMask, DL, VT));
+ }
+ }
+ }
+ }
+
+ // (or (and X, M), (and X, N)) -> (and X, (or M, N))
+ if (N0.getOpcode() == ISD::AND &&
+ N1.getOpcode() == ISD::AND &&
+ N0.getOperand(0) == N1.getOperand(0) &&
+ // Don't increase # computations.
+ (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
+ SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
+ N0.getOperand(1), N1.getOperand(1));
+ return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N1.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (or x, 0) -> x, vector edition
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N1;
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N0;
+
+ // fold (or x, -1) -> -1, vector edition
+ if (ISD::isBuildVectorAllOnes(N0.getNode()))
+ // do not return N0, because undef node may exist in N0
+ return DAG.getConstant(
+ APInt::getAllOnesValue(
+ N0.getValueType().getScalarType().getSizeInBits()),
+ SDLoc(N), N0.getValueType());
+ if (ISD::isBuildVectorAllOnes(N1.getNode()))
+ // do not return N1, because undef node may exist in N1
+ return DAG.getConstant(
+ APInt::getAllOnesValue(
+ N1.getValueType().getScalarType().getSizeInBits()),
+ SDLoc(N), N1.getValueType());
+
+ // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
+ // Do this only if the resulting shuffle is legal.
+ if (isa<ShuffleVectorSDNode>(N0) &&
+ isa<ShuffleVectorSDNode>(N1) &&
+ // Avoid folding a node with illegal type.
+ TLI.isTypeLegal(VT)) {
+ bool ZeroN00 = ISD::isBuildVectorAllZeros(N0.getOperand(0).getNode());
+ bool ZeroN01 = ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode());
+ bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
+ bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
+ // Ensure both shuffles have a zero input.
+ if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) {
+ assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
+ assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
+ const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
+ const ShuffleVectorSDNode *SV1 = cast<ShuffleVectorSDNode>(N1);
+ bool CanFold = true;
+ int NumElts = VT.getVectorNumElements();
+ SmallVector<int, 4> Mask(NumElts);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M0 = SV0->getMaskElt(i);
+ int M1 = SV1->getMaskElt(i);
+
+ // Determine if either index is pointing to a zero vector.
+ bool M0Zero = M0 < 0 || (ZeroN00 == (M0 < NumElts));
+ bool M1Zero = M1 < 0 || (ZeroN10 == (M1 < NumElts));
+
+ // If one element is zero and the other side is undef, keep undef.
+ // This also handles the case that both are undef.
+ if ((M0Zero && M1 < 0) || (M1Zero && M0 < 0)) {
+ Mask[i] = -1;
+ continue;
+ }
+
+ // Make sure only one of the elements is zero.
+ if (M0Zero == M1Zero) {
+ CanFold = false;
+ break;
+ }
+
+ assert((M0 >= 0 || M1 >= 0) && "Undef index!");
+
+ // We have a zero and non-zero element. If the non-zero came from
+ // SV0 make the index a LHS index. If it came from SV1, make it
+ // a RHS index. We need to mod by NumElts because we don't care
+ // which operand it came from in the original shuffles.
+ Mask[i] = M1Zero ? M0 % NumElts : (M1 % NumElts) + NumElts;
+ }
+
+ if (CanFold) {
+ SDValue NewLHS = ZeroN00 ? N0.getOperand(1) : N0.getOperand(0);
+ SDValue NewRHS = ZeroN10 ? N1.getOperand(1) : N1.getOperand(0);
+
+ bool LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
+ if (!LegalMask) {
+ std::swap(NewLHS, NewRHS);
+ ShuffleVectorSDNode::commuteMask(Mask);
+ LegalMask = TLI.isShuffleMaskLegal(Mask, VT);
+ }
+
+ if (LegalMask)
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewLHS, NewRHS, Mask);
+ }
+ }
+ }
+ }
+
+ // fold (or c1, c2) -> c1|c2
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (N0C && N1C && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
+ // fold (or x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+ // fold (or x, -1) -> -1
+ if (isAllOnesConstant(N1))
+ return N1;
+ // fold (or x, c) -> c iff (x & ~c) == 0
+ if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
+ return N1;
+
+ if (SDValue Combined = visitORLike(N0, N1, N))
+ return Combined;
+
+ // Recognize halfword bswaps as (bswap + rotl 16) or (bswap + shl 16)
+ if (SDValue BSwap = MatchBSwapHWord(N, N0, N1))
+ return BSwap;
+ if (SDValue BSwap = MatchBSwapHWordLow(N, N0, N1))
+ return BSwap;
+
+ // reassociate or
+ if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
+ return ROR;
+ // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
+ // iff (c1 & c2) == 0.
+ if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
+ if ((C1->getAPIntValue() & N1C->getAPIntValue()) != 0) {
+ if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
+ N1C, C1))
+ return DAG.getNode(
+ ISD::AND, SDLoc(N), VT,
+ DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1), COR);
+ return SDValue();
+ }
+ }
+ // Simplify: (or (op x...), (op y...)) -> (op (or x, y))
+ if (N0.getOpcode() == N1.getOpcode())
+ if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+ return Tmp;
+
+ // See if this is some rotate idiom.
+ if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
+ return SDValue(Rot, 0);
+
+ // Simplify the operands using demanded-bits information.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// Match "(X shl/srl V1) & V2" where V2 may not be present.
+bool DAGCombiner::MatchRotateHalf(SDValue Op, SDValue &Shift, SDValue &Mask) {
+ if (Op.getOpcode() == ISD::AND) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(Op.getOperand(1))) {
+ Mask = Op.getOperand(1);
+ Op = Op.getOperand(0);
+ } else {
+ return false;
+ }
+ }
+
+ if (Op.getOpcode() == ISD::SRL || Op.getOpcode() == ISD::SHL) {
+ Shift = Op;
+ return true;
+ }
+
+ return false;
+}
+
+// Return true if we can prove that, whenever Neg and Pos are both in the
+// range [0, EltSize), Neg == (Pos == 0 ? 0 : EltSize - Pos). This means that
+ // for two opposing shifts shift1 and shift2 and a value X with EltSize bits:
+//
+// (or (shift1 X, Neg), (shift2 X, Pos))
+//
+// reduces to a rotate in direction shift2 by Pos or (equivalently) a rotate
+// in direction shift1 by Neg. The range [0, EltSize) means that we only need
+// to consider shift amounts with defined behavior.
+static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize) {
+ // If EltSize is a power of 2 then:
+ //
+ // (a) (Pos == 0 ? 0 : EltSize - Pos) == (EltSize - Pos) & (EltSize - 1)
+ // (b) Neg == Neg & (EltSize - 1) whenever Neg is in [0, EltSize).
+ //
+ // So if EltSize is a power of 2 and Neg is (and Neg', EltSize-1), we check
+ // for the stronger condition:
+ //
+ // Neg & (EltSize - 1) == (EltSize - Pos) & (EltSize - 1) [A]
+ //
+ // for all Neg and Pos. Since Neg & (EltSize - 1) == Neg' & (EltSize - 1)
+ // we can just replace Neg with Neg' for the rest of the function.
+ //
+ // In other cases we check for the even stronger condition:
+ //
+ // Neg == EltSize - Pos [B]
+ //
+ // for all Neg and Pos. Note that the (or ...) then invokes undefined
+ // behavior if Pos == 0 (and consequently Neg == EltSize).
+ //
+ // We could actually use [A] whenever EltSize is a power of 2, but the
+ // only extra cases that it would match are those uninteresting ones
+ // where Neg and Pos are never in range at the same time. E.g. for
+ // EltSize == 32, using [A] would allow a Neg of the form (sub 64, Pos)
+ // as well as (sub 32, Pos), but:
+ //
+ // (or (shift1 X, (sub 64, Pos)), (shift2 X, Pos))
+ //
+ // always invokes undefined behavior for 32-bit X.
+ //
+ // Below, Mask == EltSize - 1 when using [A] and is all-ones otherwise.
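+ //
+ // A concrete instance (illustrative only): with EltSize == 32,
+ // Neg == (and (sub 0, Pos), 31) strips the mask (MaskLoBits == 5) and
+ // leaves NegC == 0 and NegOp1 == Pos, so Width == 0 and its low 5 bits
+ // are zero, i.e. the OR of the two shifts is a rotate. Likewise
+ // Neg == (sub 32, Pos) satisfies [B] directly with Width == 32.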
+ unsigned MaskLoBits = 0;
+ if (Neg.getOpcode() == ISD::AND && isPowerOf2_64(EltSize)) {
+ if (ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(1))) {
+ if (NegC->getAPIntValue() == EltSize - 1) {
+ Neg = Neg.getOperand(0);
+ MaskLoBits = Log2_64(EltSize);
+ }
+ }
+ }
+
+ // Check whether Neg has the form (sub NegC, NegOp1) for some NegC and NegOp1.
+ if (Neg.getOpcode() != ISD::SUB)
+ return false;
+ ConstantSDNode *NegC = isConstOrConstSplat(Neg.getOperand(0));
+ if (!NegC)
+ return false;
+ SDValue NegOp1 = Neg.getOperand(1);
+
+ // On the RHS of [A], if Pos is Pos' & (EltSize - 1), just replace Pos with
+ // Pos'. The truncation is redundant for the purpose of the equality.
+ if (MaskLoBits && Pos.getOpcode() == ISD::AND)
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+ if (PosC->getAPIntValue() == EltSize - 1)
+ Pos = Pos.getOperand(0);
+
+ // The condition we need is now:
+ //
+ // (NegC - NegOp1) & Mask == (EltSize - Pos) & Mask
+ //
+ // If NegOp1 == Pos then we need:
+ //
+ // EltSize & Mask == NegC & Mask
+ //
+ // (because "x & Mask" is a truncation and distributes through subtraction).
+ APInt Width;
+ if (Pos == NegOp1)
+ Width = NegC->getAPIntValue();
+
+ // Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
+ // Then the condition we want to prove becomes:
+ //
+ // (NegC - NegOp1) & Mask == (EltSize - (NegOp1 + PosC)) & Mask
+ //
+ // which, again because "x & Mask" is a truncation, becomes:
+ //
+ // NegC & Mask == (EltSize - PosC) & Mask
+ // EltSize & Mask == (NegC + PosC) & Mask
+ else if (Pos.getOpcode() == ISD::ADD && Pos.getOperand(0) == NegOp1) {
+ if (ConstantSDNode *PosC = isConstOrConstSplat(Pos.getOperand(1)))
+ Width = PosC->getAPIntValue() + NegC->getAPIntValue();
+ else
+ return false;
+ } else
+ return false;
+
+ // Now we just need to check that EltSize & Mask == Width & Mask.
+ if (MaskLoBits)
+ // EltSize & Mask is 0 since Mask is EltSize - 1.
+ return Width.getLoBits(MaskLoBits) == 0;
+ return Width == EltSize;
+}
+
+// A subroutine of MatchRotate used once we have found an OR of two opposite
+// shifts of Shifted. If Neg == <operand size> - Pos then the OR reduces
+// to both (PosOpcode Shifted, Pos) and (NegOpcode Shifted, Neg), with the
+// former being preferred if supported. InnerPos and InnerNeg are Pos and
+// Neg with outer conversions stripped away.
+SDNode *DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
+ SDValue Neg, SDValue InnerPos,
+ SDValue InnerNeg, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
+ // fold (or (shl x, (*ext y)),
+ // (srl x, (*ext (sub 32, y)))) ->
+ // (rotl x, y) or (rotr x, (sub 32, y))
+ //
+ // fold (or (shl x, (*ext (sub 32, y))),
+ // (srl x, (*ext y))) ->
+ // (rotr x, y) or (rotl x, (sub 32, y))
+ EVT VT = Shifted.getValueType();
+ if (matchRotateSub(InnerPos, InnerNeg, VT.getScalarSizeInBits())) {
+ bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
+ return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, Shifted,
+ HasPos ? Pos : Neg).getNode();
+ }
+
+ return nullptr;
+}
+
+// MatchRotate - Handle an 'or' of two operands. If this is one of the many
+// idioms for rotate, and if the target supports rotation instructions, generate
+// a rot[lr].
+SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
+ // Must be a legal type. Expanded 'n promoted things won't work with rotates.
+ EVT VT = LHS.getValueType();
+ if (!TLI.isTypeLegal(VT)) return nullptr;
+
+ // The target must have at least one rotate flavor.
+ bool HasROTL = TLI.isOperationLegalOrCustom(ISD::ROTL, VT);
+ bool HasROTR = TLI.isOperationLegalOrCustom(ISD::ROTR, VT);
+ if (!HasROTL && !HasROTR) return nullptr;
+
+ // Match "(X shl/srl V1) & V2" where V2 may not be present.
+ SDValue LHSShift; // The shift.
+ SDValue LHSMask; // AND value if any.
+ if (!MatchRotateHalf(LHS, LHSShift, LHSMask))
+ return nullptr; // Not part of a rotate.
+
+ SDValue RHSShift; // The shift.
+ SDValue RHSMask; // AND value if any.
+ if (!MatchRotateHalf(RHS, RHSShift, RHSMask))
+ return nullptr; // Not part of a rotate.
+
+ if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
+ return nullptr; // Not shifting the same value.
+
+ if (LHSShift.getOpcode() == RHSShift.getOpcode())
+ return nullptr; // Shifts must disagree.
+
+ // Canonicalize shl to left side in a shl/srl pair.
+ if (RHSShift.getOpcode() == ISD::SHL) {
+ std::swap(LHS, RHS);
+ std::swap(LHSShift, RHSShift);
+ std::swap(LHSMask, RHSMask);
+ }
+
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue LHSShiftArg = LHSShift.getOperand(0);
+ SDValue LHSShiftAmt = LHSShift.getOperand(1);
+ SDValue RHSShiftArg = RHSShift.getOperand(0);
+ SDValue RHSShiftAmt = RHSShift.getOperand(1);
+
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
+ // fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+ if (isConstOrConstSplat(LHSShiftAmt) && isConstOrConstSplat(RHSShiftAmt)) {
+ uint64_t LShVal = isConstOrConstSplat(LHSShiftAmt)->getZExtValue();
+ uint64_t RShVal = isConstOrConstSplat(RHSShiftAmt)->getZExtValue();
+ if ((LShVal + RShVal) != EltSizeInBits)
+ return nullptr;
+
+ SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
+ LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
+
+ // If there is an AND of either shifted operand, apply it to the result.
+ if (LHSMask.getNode() || RHSMask.getNode()) {
+ APInt AllBits = APInt::getAllOnesValue(EltSizeInBits);
+ SDValue Mask = DAG.getConstant(AllBits, DL, VT);
+
+ if (LHSMask.getNode()) {
+ APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
+ Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
+ DAG.getNode(ISD::OR, DL, VT, LHSMask,
+ DAG.getConstant(RHSBits, DL, VT)));
+ }
+ if (RHSMask.getNode()) {
+ APInt LHSBits = APInt::getHighBitsSet(EltSizeInBits, RShVal);
+ Mask = DAG.getNode(ISD::AND, DL, VT, Mask,
+ DAG.getNode(ISD::OR, DL, VT, RHSMask,
+ DAG.getConstant(LHSBits, DL, VT)));
+ }
+
+ Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
+ }
+
+ return Rot.getNode();
+ }
+
+ // If there is a mask here, and we have a variable shift, we can't be sure
+ // that we're masking out the right stuff.
+ if (LHSMask.getNode() || RHSMask.getNode())
+ return nullptr;
+
+ // If the shift amount is sign/zext/any-extended just peel it off.
+ SDValue LExtOp0 = LHSShiftAmt;
+ SDValue RExtOp0 = RHSShiftAmt;
+ if ((LHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ LHSShiftAmt.getOpcode() == ISD::TRUNCATE) &&
+ (RHSShiftAmt.getOpcode() == ISD::SIGN_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ZERO_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::ANY_EXTEND ||
+ RHSShiftAmt.getOpcode() == ISD::TRUNCATE)) {
+ LExtOp0 = LHSShiftAmt.getOperand(0);
+ RExtOp0 = RHSShiftAmt.getOperand(0);
+ }
+
+ SDNode *TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
+ if (TryL)
+ return TryL;
+
+ SDNode *TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
+ if (TryR)
+ return TryR;
+
+ return nullptr;
+}
+
+SDValue DAGCombiner::visitXOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // fold vector ops
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (xor x, 0) -> x, vector edition
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N1;
+ if (ISD::isBuildVectorAllZeros(N1.getNode()))
+ return N0;
+ }
+
+ // fold (xor undef, undef) -> 0. This is a common idiom (misuse).
+ if (N0.isUndef() && N1.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+ // fold (xor x, undef) -> undef
+ if (N0.isUndef())
+ return N0;
+ if (N1.isUndef())
+ return N1;
+ // fold (xor c1, c2) -> c1^c2
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
+ if (N0C && N1C)
+ return DAG.FoldConstantArithmetic(ISD::XOR, SDLoc(N), VT, N0C, N1C);
+ // canonicalize constant to RHS
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ return DAG.getNode(ISD::XOR, SDLoc(N), VT, N1, N0);
+ // fold (xor x, 0) -> x
+ if (isNullConstant(N1))
+ return N0;
+ // reassociate xor
+ if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
+ return RXOR;
+
+ // fold !(x cc y) -> (x !cc y)
+ SDValue LHS, RHS, CC;
+ if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+ bool isInt = LHS.getValueType().isInteger();
+ ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
+ isInt);
+
+ if (!LegalOperations ||
+ TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
+ switch (N0.getOpcode()) {
+ default:
+ llvm_unreachable("Unhandled SetCC Equivalent!");
+ case ISD::SETCC:
+ return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC);
+ case ISD::SELECT_CC:
+ return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2),
+ N0.getOperand(3), NotCC);
+ }
+ }
+ }
+
+ // fold (not (zext (setcc x, y))) -> (zext (not (setcc x, y)))
+ if (isOneConstant(N1) && N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getNode()->hasOneUse() &&
+ isSetCCEquivalent(N0.getOperand(0), LHS, RHS, CC)){
+ SDValue V = N0.getOperand(0);
+ SDLoc DL(N0);
+ V = DAG.getNode(ISD::XOR, DL, V.getValueType(), V,
+ DAG.getConstant(1, DL, V.getValueType()));
+ AddToWorklist(V.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, V);
+ }
+
+ // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+ if (isOneConstant(N1) && VT == MVT::i1 &&
+ (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ if (isOneUseSetCC(RHS) || isOneUseSetCC(LHS)) {
+ unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+ LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
+ RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
+ AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+ }
+ }
+ // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are constants
+ if (isAllOnesConstant(N1) &&
+ (N0.getOpcode() == ISD::OR || N0.getOpcode() == ISD::AND)) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ if (isa<ConstantSDNode>(RHS) || isa<ConstantSDNode>(LHS)) {
+ unsigned NewOpcode = N0.getOpcode() == ISD::AND ? ISD::OR : ISD::AND;
+ LHS = DAG.getNode(ISD::XOR, SDLoc(LHS), VT, LHS, N1); // LHS = ~LHS
+ RHS = DAG.getNode(ISD::XOR, SDLoc(RHS), VT, RHS, N1); // RHS = ~RHS
+ AddToWorklist(LHS.getNode()); AddToWorklist(RHS.getNode());
+ return DAG.getNode(NewOpcode, SDLoc(N), VT, LHS, RHS);
+ }
+ }
+ // fold (xor (and x, y), y) -> (and (not x), y)
+ if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
+ N0->getOperand(1) == N1) {
+ SDValue X = N0->getOperand(0);
+ SDValue NotX = DAG.getNOT(SDLoc(X), X, VT);
+ AddToWorklist(NotX.getNode());
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, NotX, N1);
+ }
+ // fold (xor (xor x, c1), c2) -> (xor x, (xor c1, c2))
+ if (N1C && N0.getOpcode() == ISD::XOR) {
+ if (const ConstantSDNode *N00C = getAsNonOpaqueConstant(N0.getOperand(0))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(1),
+ DAG.getConstant(N1C->getAPIntValue() ^
+ N00C->getAPIntValue(), DL, VT));
+ }
+ if (const ConstantSDNode *N01C = getAsNonOpaqueConstant(N0.getOperand(1))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::XOR, DL, VT, N0.getOperand(0),
+ DAG.getConstant(N1C->getAPIntValue() ^
+ N01C->getAPIntValue(), DL, VT));
+ }
+ }
+ // fold (xor x, x) -> 0
+ if (N0 == N1)
+ return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
+
+ // fold (xor (shl 1, x), -1) -> (rotl ~1, x)
+ // Here is a concrete example of this equivalence:
+ // i16 x == 14
+ // i16 shl == 1 << 14 == 16384 == 0b0100000000000000
+ // i16 xor == ~(1 << 14) == 49151 == 0b1011111111111111
+ //
+ // =>
+ //
+ // i16 ~1 == 0b1111111111111110
+ // i16 rol(~1, 14) == 0b1011111111111111
+ //
+ // Some additional tips to help conceptualize this transform:
+ // - Try to see the operation as placing a single zero in a value of all ones.
+ // - There exists no value for x which would allow the result to contain zero.
+ // - Values of x larger than the bitwidth are undefined and do not require a
+ // consistent result.
+ // - Pushing the zero left requires shifting one bits in from the right.
+ // A rotate left of ~1 is a nice way of achieving the desired result.
+ if (TLI.isOperationLegalOrCustom(ISD::ROTL, VT) && N0.getOpcode() == ISD::SHL
+ && isAllOnesConstant(N1) && isOneConstant(N0.getOperand(0))) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::ROTL, DL, VT, DAG.getConstant(~1, DL, VT),
+ N0.getOperand(1));
+ }
+
+ // Simplify: xor (op x...), (op y...) -> (op (xor x, y))
+ if (N0.getOpcode() == N1.getOpcode())
+ if (SDValue Tmp = SimplifyBinOpWithSameOpcodeHands(N))
+ return Tmp;
+
+ // Simplify the expression using non-local knowledge.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+/// Handle transforms common to the three shifts, when the shift amount is a
+/// constant.
+SDValue DAGCombiner::visitShiftByConstant(SDNode *N, ConstantSDNode *Amt) {
+ SDNode *LHS = N->getOperand(0).getNode();
+ if (!LHS->hasOneUse()) return SDValue();
+
+ // We want to pull some binops through shifts, so that we have (and (shift))
+ // instead of (shift (and)), likewise for add, or, xor, etc. This sort of
+ // thing happens with address calculations, so it's important to canonicalize
+ // it.
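+ //
+ // A hypothetical instance of the canonicalization this enables:
+ // (shl (and (srl x, 4), 0xF0), 2) -> (and (shl (srl x, 4), 2), 0x3C0),
+ // i.e. the AND mask is shifted along with the value.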
+ bool HighBitSet = false; // Can we transform this if the high bit is set?
+
+ switch (LHS->getOpcode()) {
+ default: return SDValue();
+ case ISD::OR:
+ case ISD::XOR:
+ HighBitSet = false; // We can only transform sra if the high bit is clear.
+ break;
+ case ISD::AND:
+ HighBitSet = true; // We can only transform sra if the high bit is set.
+ break;
+ case ISD::ADD:
+ if (N->getOpcode() != ISD::SHL)
+ return SDValue(); // only shl(add) not sr[al](add).
+ HighBitSet = false; // We can only transform sra if the high bit is clear.
+ break;
+ }
+
+ // We require the RHS of the binop to be a constant and not opaque as well.
+ ConstantSDNode *BinOpCst = getAsNonOpaqueConstant(LHS->getOperand(1));
+ if (!BinOpCst) return SDValue();
+
+ // FIXME: disable this unless the input to the binop is a shift by a constant.
+ // If it is not a shift, it pessimizes some common cases like:
+ //
+ // void foo(int *X, int i) { X[i & 1235] = 1; }
+ // int bar(int *X, int i) { return X[i & 255]; }
+ SDNode *BinOpLHSVal = LHS->getOperand(0).getNode();
+ if ((BinOpLHSVal->getOpcode() != ISD::SHL &&
+ BinOpLHSVal->getOpcode() != ISD::SRA &&
+ BinOpLHSVal->getOpcode() != ISD::SRL) ||
+ !isa<ConstantSDNode>(BinOpLHSVal->getOperand(1)))
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // If this is a signed shift right, and the high bit is modified by the
+ // logical operation, do not perform the transformation. The highBitSet
+ // boolean indicates the value of the high bit of the constant which would
+ // cause it to be modified for this operation.
+ if (N->getOpcode() == ISD::SRA) {
+ bool BinOpRHSSignSet = BinOpCst->getAPIntValue().isNegative();
+ if (BinOpRHSSignSet != HighBitSet)
+ return SDValue();
+ }
+
+ if (!TLI.isDesirableToCommuteWithShift(LHS))
+ return SDValue();
+
+ // Fold the constants, shifting the binop RHS by the shift amount.
+ SDValue NewRHS = DAG.getNode(N->getOpcode(), SDLoc(LHS->getOperand(1)),
+ N->getValueType(0),
+ LHS->getOperand(1), N->getOperand(1));
+ assert(isa<ConstantSDNode>(NewRHS) && "Folding was not successful!");
+
+ // Create the new shift.
+ SDValue NewShift = DAG.getNode(N->getOpcode(),
+ SDLoc(LHS->getOperand(0)),
+ VT, LHS->getOperand(0), N->getOperand(1));
+
+ // Create the new binop.
+ return DAG.getNode(LHS->getOpcode(), SDLoc(N), VT, NewShift, NewRHS);
+}
+
+SDValue DAGCombiner::distributeTruncateThroughAnd(SDNode *N) {
+ assert(N->getOpcode() == ISD::TRUNCATE);
+ assert(N->getOperand(0).getOpcode() == ISD::AND);
+
+ // (truncate:TruncVT (and N00, N01C)) -> (and (truncate:TruncVT N00), TruncC)
+ if (N->hasOneUse() && N->getOperand(0).hasOneUse()) {
+ SDValue N01 = N->getOperand(0).getOperand(1);
+
+ if (ConstantSDNode *N01C = isConstOrConstSplat(N01)) {
+ if (!N01C->isOpaque()) {
+ EVT TruncVT = N->getValueType(0);
+ SDValue N00 = N->getOperand(0).getOperand(0);
+ APInt TruncC = N01C->getAPIntValue();
+ TruncC = TruncC.trunc(TruncVT.getScalarSizeInBits());
+ SDLoc DL(N);
+
+ return DAG.getNode(ISD::AND, DL, TruncVT,
+ DAG.getNode(ISD::TRUNCATE, DL, TruncVT, N00),
+ DAG.getConstant(TruncC, DL, TruncVT));
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitRotate(SDNode *N) {
+ // fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
+ if (N->getOperand(1).getOpcode() == ISD::TRUNCATE &&
+ N->getOperand(1).getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 =
+ distributeTruncateThroughAnd(N->getOperand(1).getNode()))
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ N->getOperand(0), NewOp1);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSHL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarSizeInBits();
+
+ // fold vector ops
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ BuildVectorSDNode *N1CV = dyn_cast<BuildVectorSDNode>(N1);
+ // If setcc produces all-one true value then:
+ // (shl (and (setcc) N01CV) N1CV) -> (and (setcc) N01CV<<N1CV)
+ if (N1CV && N1CV->isConstant()) {
+ if (N0.getOpcode() == ISD::AND) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ BuildVectorSDNode *N01CV = dyn_cast<BuildVectorSDNode>(N01);
+
+ if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
+ TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT,
+ N01CV, N1CV))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
+ }
+ } else {
+ N1C = isConstOrConstSplat(N1);
+ }
+ }
+ }
+
+ // fold (shl c1, c2) -> c1<<c2
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ if (N0C && N1C && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
+ // fold (shl 0, x) -> 0
+ if (isNullConstant(N0))
+ return N0;
+ // fold (shl x, c >= size(x)) -> undef
+ if (N1C && N1C->getAPIntValue().uge(OpSizeInBits))
+ return DAG.getUNDEF(VT);
+ // fold (shl x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // fold (shl undef, x) -> 0
+ if (N0.isUndef())
+ return DAG.getConstant(0, SDLoc(N), VT);
+ // if (shl x, c) is known to be zero, return 0
+ if (DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(OpSizeInBits)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+ // fold (shl x, (trunc (and y, c))) -> (shl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
+ if (N1C && N0.getOpcode() == ISD::SHL) {
+ if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t c1 = N0C1->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ SDLoc DL(N);
+ if (c1 + c2 >= OpSizeInBits)
+ return DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(c1 + c2, DL, N1.getValueType()));
+ }
+ }
+
+ // fold (shl (ext (shl x, c1)), c2) -> (ext (shl x, (add c1, c2)))
+ // For this to be valid, the second form must not preserve any of the bits
+ // that are shifted out by the inner shift in the first form. This means
+ // the outer shift size must be >= the number of bits added by the ext.
+ // As a corollary, we don't care what kind of ext it is.
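+ //
+ // For instance (illustrative): (shl (zext (shl i16:x, 4) to i32), 20)
+ // has c2 == 20 >= 32 - 16, so it becomes (shl (zext i16:x to i32), 24);
+ // every bit the inner i16 shift discarded would have been shifted out
+ // of the i32 result anyway.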
+ if (N1C && (N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND) &&
+ N0.getOperand(0).getOpcode() == ISD::SHL) {
+ SDValue N0Op0 = N0.getOperand(0);
+ if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
+ uint64_t c1 = N0Op0C1->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ EVT InnerShiftVT = N0Op0.getValueType();
+ uint64_t InnerShiftSize = InnerShiftVT.getScalarSizeInBits();
+ if (c2 >= OpSizeInBits - InnerShiftSize) {
+ SDLoc DL(N0);
+ if (c1 + c2 >= OpSizeInBits)
+ return DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SHL, DL, VT,
+ DAG.getNode(N0.getOpcode(), DL, VT,
+ N0Op0->getOperand(0)),
+ DAG.getConstant(c1 + c2, DL, N1.getValueType()));
+ }
+ }
+ }
+
+ // fold (shl (zext (srl x, C)), C) -> (zext (shl (srl x, C), C))
+ // Only fold this if the inner zext has no other uses to avoid increasing
+ // the total number of instructions.
+ if (N1C && N0.getOpcode() == ISD::ZERO_EXTEND && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::SRL) {
+ SDValue N0Op0 = N0.getOperand(0);
+ if (ConstantSDNode *N0Op0C1 = isConstOrConstSplat(N0Op0.getOperand(1))) {
+ uint64_t c1 = N0Op0C1->getZExtValue();
+ if (c1 < VT.getScalarSizeInBits()) {
+ uint64_t c2 = N1C->getZExtValue();
+ if (c1 == c2) {
+ SDValue NewOp0 = N0.getOperand(0);
+ EVT CountVT = NewOp0.getOperand(1).getValueType();
+ SDLoc DL(N);
+ SDValue NewSHL = DAG.getNode(ISD::SHL, DL, NewOp0.getValueType(),
+ NewOp0,
+ DAG.getConstant(c2, DL, CountVT));
+ AddToWorklist(NewSHL.getNode());
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N0), VT, NewSHL);
+ }
+ }
+ }
+ }
+
+ // fold (shl (sr[la] exact X, C1), C2) -> (shl X, (C2-C1)) if C1 <= C2
+ // fold (shl (sr[la] exact X, C1), C2) -> (sr[la] X, (C2-C1)) if C1 > C2
+ if (N1C && (N0.getOpcode() == ISD::SRL || N0.getOpcode() == ISD::SRA) &&
+ cast<BinaryWithFlagsSDNode>(N0)->Flags.hasExact()) {
+ if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t C1 = N0C1->getZExtValue();
+ uint64_t C2 = N1C->getZExtValue();
+ SDLoc DL(N);
+ if (C1 <= C2)
+ return DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(C2 - C1, DL, N1.getValueType()));
+ return DAG.getNode(N0.getOpcode(), DL, VT, N0.getOperand(0),
+ DAG.getConstant(C1 - C2, DL, N1.getValueType()));
+ }
+ }
+
+ // fold (shl (srl x, c1), c2) -> (and (shl x, (sub c2, c1)), MASK) or
+ // (and (srl x, (sub c1, c2)), MASK)
+ // Only fold this if the inner shift has no other uses -- if it does, folding
+ // this will increase the total number of instructions.
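+ //
+ // A worked case (illustrative): (shl (srl i32:x, 4), 6) has c2 > c1, so
+ // Mask == 0xFFFFFFF0 shifted left by 2 gives 0xFFFFFFC0 and the result
+ // is (and (shl x, 2), 0xFFFFFFC0).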
+ if (N1C && N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+ if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t c1 = N0C1->getZExtValue();
+ if (c1 < OpSizeInBits) {
+ uint64_t c2 = N1C->getZExtValue();
+ APInt Mask = APInt::getHighBitsSet(OpSizeInBits, OpSizeInBits - c1);
+ SDValue Shift;
+ if (c2 > c1) {
+ Mask = Mask.shl(c2 - c1);
+ SDLoc DL(N);
+ Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(c2 - c1, DL, N1.getValueType()));
+ } else {
+ Mask = Mask.lshr(c1 - c2);
+ SDLoc DL(N);
+ Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(c1 - c2, DL, N1.getValueType()));
+ }
+ SDLoc DL(N0);
+ return DAG.getNode(ISD::AND, DL, VT, Shift,
+ DAG.getConstant(Mask, DL, VT));
+ }
+ }
+ }
+ // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
+ if (N1C && N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1)) {
+ unsigned BitSize = VT.getScalarSizeInBits();
+ SDLoc DL(N);
+ SDValue HiBitsMask =
+ DAG.getConstant(APInt::getHighBitsSet(BitSize,
+ BitSize - N1C->getZExtValue()),
+ DL, VT);
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0),
+ HiBitsMask);
+ }
+
+ // fold (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+ // Variant of version done on multiply, except mul by a power of 2 is turned
+ // into a shift.
+ APInt Val;
+ if (N1C && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() &&
+ (isa<ConstantSDNode>(N0.getOperand(1)) ||
+ ISD::isConstantSplatVector(N0.getOperand(1).getNode(), Val))) {
+ SDValue Shl0 = DAG.getNode(ISD::SHL, SDLoc(N0), VT, N0.getOperand(0), N1);
+ SDValue Shl1 = DAG.getNode(ISD::SHL, SDLoc(N1), VT, N0.getOperand(1), N1);
+ return DAG.getNode(ISD::ADD, SDLoc(N), VT, Shl0, Shl1);
+ }
+
+ // fold (shl (mul x, c1), c2) -> (mul x, c1 << c2)
+ if (N1C && N0.getOpcode() == ISD::MUL && N0.getNode()->hasOneUse()) {
+ if (ConstantSDNode *N0C1 = isConstOrConstSplat(N0.getOperand(1))) {
+ if (SDValue Folded =
+ DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N1), VT, N0C1, N1C))
+ return DAG.getNode(ISD::MUL, SDLoc(N), VT, N0.getOperand(0), Folded);
+ }
+ }
+
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSHL = visitShiftByConstant(N, N1C))
+ return NewSHL;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSRA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+ // fold vector ops
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ N1C = isConstOrConstSplat(N1);
+ }
+
+ // fold (sra c1, c2) -> c1>>c2 (arithmetic shift)
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ if (N0C && N1C && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
+ // fold (sra 0, x) -> 0
+ if (isNullConstant(N0))
+ return N0;
+ // fold (sra -1, x) -> -1
+ if (isAllOnesConstant(N0))
+ return N0;
+ // fold (sra x, c >= size(x)) -> undef
+ if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+ return DAG.getUNDEF(VT);
+ // fold (sra x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
+ // sext_inreg.
+ if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
+ unsigned LowBits = OpSizeInBits - (unsigned)N1C->getZExtValue();
+ EVT ExtVT = EVT::getIntegerVT(*DAG.getContext(), LowBits);
+ if (VT.isVector())
+ ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ ExtVT, VT.getVectorNumElements());
+ if ((!LegalOperations ||
+ TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
+ N0.getOperand(0), DAG.getValueType(ExtVT));
+ }
+
+ // fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
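+ // e.g. (sra (sra x, 3), 5) -> (sra x, 8); the sum is clamped to bitwidth - 1.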
+ if (N1C && N0.getOpcode() == ISD::SRA) {
+ if (ConstantSDNode *C1 = isConstOrConstSplat(N0.getOperand(1))) {
+ unsigned Sum = N1C->getZExtValue() + C1->getZExtValue();
+ if (Sum >= OpSizeInBits)
+ Sum = OpSizeInBits - 1;
+ SDLoc DL(N);
+ return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
+ DAG.getConstant(Sum, DL, N1.getValueType()));
+ }
+ }
+
+ // fold (sra (shl X, m), (sub result_size, n))
+ // -> (sign_extend (trunc (srl X, (sub (sub result_size, n), m)))) for
+ // result_size - n != m.
+ // If truncate is free for the target, sext(shl) is likely to result in better
+ // code.
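+ // e.g. for i32 with m = 8 and a shift amount of 24 (n = 8), assuming the i8
+ // truncate is free: (sra (shl X, 8), 24) -> (sign_extend (trunc i8 (srl X, 16)))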
+ if (N0.getOpcode() == ISD::SHL && N1C) {
+ // Get the two constants of the shifts: N01C = m and N1C = result_size - n.
+ const ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1));
+ if (N01C) {
+ LLVMContext &Ctx = *DAG.getContext();
+ // Determine what the truncate's result bitsize and type would be.
+ EVT TruncVT = EVT::getIntegerVT(Ctx, OpSizeInBits - N1C->getZExtValue());
+
+ if (VT.isVector())
+ TruncVT = EVT::getVectorVT(Ctx, TruncVT, VT.getVectorNumElements());
+
+ // Determine the residual right-shift amount.
+ int ShiftAmt = N1C->getZExtValue() - N01C->getZExtValue();
+
+ // If the shift is not a no-op (in which case this should be just a sign
+ // extend already), sign_extend is legal on the truncated type, and the
+ // truncate to that type is both legal and free, perform the transform.
+ if ((ShiftAmt > 0) &&
+ TLI.isOperationLegalOrCustom(ISD::SIGN_EXTEND, TruncVT) &&
+ TLI.isOperationLegalOrCustom(ISD::TRUNCATE, VT) &&
+ TLI.isTruncateFree(VT, TruncVT)) {
+
+ SDLoc DL(N);
+ SDValue Amt = DAG.getConstant(ShiftAmt, DL,
+ getShiftAmountTy(N0.getOperand(0).getValueType()));
+ SDValue Shift = DAG.getNode(ISD::SRL, DL, VT,
+ N0.getOperand(0), Amt);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, TruncVT,
+ Shift);
+ return DAG.getNode(ISD::SIGN_EXTEND, DL,
+ N->getValueType(0), Trunc);
+ }
+ }
+ }
+
+ // fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ // fold (sra (trunc (srl x, c1)), c2) -> (trunc (sra x, c1 + c2))
+ // if c1 is equal to the number of bits the trunc removes
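+ // e.g. (sra (trunc i64->i32 (srl x, 32)), 5) -> (trunc i64->i32 (sra x, 37))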
+ if (N0.getOpcode() == ISD::TRUNCATE &&
+ (N0.getOperand(0).getOpcode() == ISD::SRL ||
+ N0.getOperand(0).getOpcode() == ISD::SRA) &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getOperand(1).hasOneUse() &&
+ N1C) {
+ SDValue N0Op0 = N0.getOperand(0);
+ if (ConstantSDNode *LargeShift = isConstOrConstSplat(N0Op0.getOperand(1))) {
+ unsigned LargeShiftVal = LargeShift->getZExtValue();
+ EVT LargeVT = N0Op0.getValueType();
+
+ if (LargeVT.getScalarSizeInBits() - OpSizeInBits == LargeShiftVal) {
+ SDLoc DL(N);
+ SDValue Amt =
+ DAG.getConstant(LargeShiftVal + N1C->getZExtValue(), DL,
+ getShiftAmountTy(N0Op0.getOperand(0).getValueType()));
+ SDValue SRA = DAG.getNode(ISD::SRA, DL, LargeVT,
+ N0Op0.getOperand(0), Amt);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, SRA);
+ }
+ }
+ }
+
+ // Simplify, based on bits shifted out of the LHS.
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // If the sign bit is known to be zero, switch this to a SRL.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1);
+
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSRA = visitShiftByConstant(N, N1C))
+ return NewSRA;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSRL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+ unsigned OpSizeInBits = VT.getScalarType().getSizeInBits();
+
+ // fold vector ops
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ if (VT.isVector()) {
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ N1C = isConstOrConstSplat(N1);
+ }
+
+ // fold (srl c1, c2) -> c1 >>u c2
+ ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
+ if (N0C && N1C && !N1C->isOpaque())
+ return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
+ // fold (srl 0, x) -> 0
+ if (isNullConstant(N0))
+ return N0;
+ // fold (srl x, c >= size(x)) -> undef
+ if (N1C && N1C->getZExtValue() >= OpSizeInBits)
+ return DAG.getUNDEF(VT);
+ // fold (srl x, 0) -> x
+ if (N1C && N1C->isNullValue())
+ return N0;
+ // if (srl x, c) is known to be zero, return 0
+ if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
+ APInt::getAllOnesValue(OpSizeInBits)))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
+ // fold (srl (srl x, c1), c2) -> 0 or (srl x, (add c1, c2))
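+ // e.g. for i32: (srl (srl x, 3), 5) -> (srl x, 8), and any combination whose
+ // sum reaches 32 or more folds to 0.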
+ if (N1C && N0.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *N01C = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t c1 = N01C->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ SDLoc DL(N);
+ if (c1 + c2 >= OpSizeInBits)
+ return DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0),
+ DAG.getConstant(c1 + c2, DL, N1.getValueType()));
+ }
+ }
+
+ // fold (srl (trunc (srl x, c1)), c2) -> 0 or (trunc (srl x, (add c1, c2)))
+ if (N1C && N0.getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(0).getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(N0.getOperand(0)->getOperand(1))) {
+ uint64_t c1 =
+ cast<ConstantSDNode>(N0.getOperand(0)->getOperand(1))->getZExtValue();
+ uint64_t c2 = N1C->getZExtValue();
+ EVT InnerShiftVT = N0.getOperand(0).getValueType();
+ EVT ShiftCountVT = N0.getOperand(0)->getOperand(1).getValueType();
+ uint64_t InnerShiftSize = InnerShiftVT.getScalarType().getSizeInBits();
+ // This is only valid if OpSizeInBits + c1 == the size of the inner shift.
+ if (c1 + OpSizeInBits == InnerShiftSize) {
+ SDLoc DL(N0);
+ if (c1 + c2 >= InnerShiftSize)
+ return DAG.getConstant(0, DL, VT);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(ISD::SRL, DL, InnerShiftVT,
+ N0.getOperand(0)->getOperand(0),
+ DAG.getConstant(c1 + c2, DL,
+ ShiftCountVT)));
+ }
+ }
+
+ // fold (srl (shl x, c), c) -> (and x, cst2)
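+ // e.g. for i32, c = 5: (srl (shl x, 5), 5) -> (and x, 0x07FFFFFF)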
+ if (N1C && N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1) {
+ unsigned BitSize = N0.getScalarValueSizeInBits();
+ if (BitSize <= 64) {
+ uint64_t ShAmt = N1C->getZExtValue() + 64 - BitSize;
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0),
+ DAG.getConstant(~0ULL >> ShAmt, DL, VT));
+ }
+ }
+
+ // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
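+ // e.g. (srl (any_extend i8->i32 x), 2)
+ //   -> (and (any_extend i8->i32 (srl x, 2)), 0x3FFFFFFF)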
+ if (N1C && N0.getOpcode() == ISD::ANY_EXTEND) {
+ // Shifting in all undef bits?
+ EVT SmallVT = N0.getOperand(0).getValueType();
+ unsigned BitSize = SmallVT.getScalarSizeInBits();
+ if (N1C->getZExtValue() >= BitSize)
+ return DAG.getUNDEF(VT);
+
+ if (!LegalTypes || TLI.isTypeDesirableForOp(ISD::SRL, SmallVT)) {
+ uint64_t ShiftAmt = N1C->getZExtValue();
+ SDLoc DL0(N0);
+ SDValue SmallShift = DAG.getNode(ISD::SRL, DL0, SmallVT,
+ N0.getOperand(0),
+ DAG.getConstant(ShiftAmt, DL0,
+ getShiftAmountTy(SmallVT)));
+ AddToWorklist(SmallShift.getNode());
+ APInt Mask = APInt::getAllOnesValue(OpSizeInBits).lshr(ShiftAmt);
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT,
+ DAG.getNode(ISD::ANY_EXTEND, DL, VT, SmallShift),
+ DAG.getConstant(Mask, DL, VT));
+ }
+ }
+
+ // fold (srl (sra X, Y), 31) -> (srl X, 31). This srl only looks at the sign
+ // bit, which is unmodified by sra.
+ if (N1C && N1C->getZExtValue() + 1 == OpSizeInBits) {
+ if (N0.getOpcode() == ISD::SRA)
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0.getOperand(0), N1);
+ }
+
+ // fold (srl (ctlz x), "5") -> x iff x has one bit set (the low bit).
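+ // e.g. for i32, (srl (ctlz x), 5) is 1 iff x == 0; if only a single bit of x,
+ // say bit k, can be nonzero, this becomes (xor (srl x, k), 1).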
+ if (N1C && N0.getOpcode() == ISD::CTLZ &&
+ N1C->getAPIntValue() == Log2_32(OpSizeInBits)) {
+ APInt KnownZero, KnownOne;
+ DAG.computeKnownBits(N0.getOperand(0), KnownZero, KnownOne);
+
+ // If any of the input bits are KnownOne, then the input couldn't be all
+ // zeros, thus the result of the srl will always be zero.
+ if (KnownOne.getBoolValue()) return DAG.getConstant(0, SDLoc(N0), VT);
+
+ // If all of the bits input to the ctlz node are known to be zero, then
+ // the result of the ctlz is "32" and the result of the shift is one.
+ APInt UnknownBits = ~KnownZero;
+ if (UnknownBits == 0) return DAG.getConstant(1, SDLoc(N0), VT);
+
+ // Otherwise, check to see if there is exactly one bit input to the ctlz.
+ if ((UnknownBits & (UnknownBits - 1)) == 0) {
+ // Okay, we know that only the single bit specified by UnknownBits
+ // could be set on input to the CTLZ node. If this bit is set, the SRL
+ // will return 0; if it is clear, it returns 1. Change the CTLZ/SRL pair
+ // to an SRL/XOR pair, which is likely to simplify more.
+ unsigned ShAmt = UnknownBits.countTrailingZeros();
+ SDValue Op = N0.getOperand(0);
+
+ if (ShAmt) {
+ SDLoc DL(N0);
+ Op = DAG.getNode(ISD::SRL, DL, VT, Op,
+ DAG.getConstant(ShAmt, DL,
+ getShiftAmountTy(Op.getValueType())));
+ AddToWorklist(Op.getNode());
+ }
+
+ SDLoc DL(N);
+ return DAG.getNode(ISD::XOR, DL, VT,
+ Op, DAG.getConstant(1, DL, VT));
+ }
+ }
+
+ // fold (srl x, (trunc (and y, c))) -> (srl x, (and (trunc y), (trunc c))).
+ if (N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getOpcode() == ISD::AND) {
+ if (SDValue NewOp1 = distributeTruncateThroughAnd(N1.getNode()))
+ return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, NewOp1);
+ }
+
+ // fold operands of srl based on knowledge that the low bits are not
+ // demanded.
+ if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ if (N1C && !N1C->isOpaque())
+ if (SDValue NewSRL = visitShiftByConstant(N, N1C))
+ return NewSRL;
+
+ // Attempt to convert a srl of a load into a narrower zero-extending load.
+ if (SDValue NarrowLoad = ReduceLoadWidth(N))
+ return NarrowLoad;
+
+ // Here is a common situation. We want to optimize:
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = srl i32 %b, 1
+ // brcond i32 %c ...
+ //
+ // into
+ //
+ // %a = ...
+ // %b = and %a, 2
+ // %c = setcc eq %b, 0
+ // brcond %c ...
+ //
+ // However, after the source operand of SRL is optimized into AND, the SRL
+ // itself may not be optimized further. Look for it and add the BRCOND to
+ // the worklist.
+ if (N->hasOneUse()) {
+ SDNode *Use = *N->use_begin();
+ if (Use->getOpcode() == ISD::BRCOND)
+ AddToWorklist(Use);
+ else if (Use->getOpcode() == ISD::TRUNCATE && Use->hasOneUse()) {
+ // Also look past the truncate.
+ Use = *Use->use_begin();
+ if (Use->getOpcode() == ISD::BRCOND)
+ AddToWorklist(Use);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBSWAP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (bswap c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, N0);
+ // fold (bswap (bswap x)) -> x
+ if (N0.getOpcode() == ISD::BSWAP)
+ return N0->getOperand(0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBITREVERSE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (bitreverse (bitreverse x)) -> x
+ if (N0.getOpcode() == ISD::BITREVERSE)
+ return N0.getOperand(0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctlz c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTLZ, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTLZ_ZERO_UNDEF(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctlz_zero_undef c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTLZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (cttz c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTTZ, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTTZ_ZERO_UNDEF(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (cttz_zero_undef c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTTZ_ZERO_UNDEF, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitCTPOP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ctpop c1) -> c2
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::CTPOP, SDLoc(N), VT, N0);
+ return SDValue();
+}
+
+
+/// \brief Generate Min/Max node
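+/// e.g. (select (setolt LHS, RHS), LHS, RHS) -> (fminnum LHS, RHS) when
+/// FMINNUM is legal for the value type.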
+static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
+ SDValue RHS, SDValue True, SDValue False,
+ ISD::CondCode CC, const TargetLowering &TLI,
+ SelectionDAG &DAG) {
+ if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
+ return SDValue();
+
+ switch (CC) {
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETULT:
+ case ISD::SETULE: {
+ unsigned Opcode = (LHS == True) ? ISD::FMINNUM : ISD::FMAXNUM;
+ if (TLI.isOperationLegal(Opcode, VT))
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ return SDValue();
+ }
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: {
+ unsigned Opcode = (LHS == True) ? ISD::FMAXNUM : ISD::FMINNUM;
+ if (TLI.isOperationLegal(Opcode, VT))
+ return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ return SDValue();
+ }
+ default:
+ return SDValue();
+ }
+}
+
+SDValue DAGCombiner::visitSELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ EVT VT0 = N0.getValueType();
+
+ // fold (select C, X, X) -> X
+ if (N1 == N2)
+ return N1;
+ if (const ConstantSDNode *N0C = dyn_cast<const ConstantSDNode>(N0)) {
+ // fold (select true, X, Y) -> X
+ // fold (select false, X, Y) -> Y
+ return !N0C->isNullValue() ? N1 : N2;
+ }
+ // fold (select C, 1, X) -> (or C, X)
+ if (VT == MVT::i1 && isOneConstant(N1))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
+ // fold (select C, 0, 1) -> (xor C, 1)
+ // We can't do this reliably if integer-based booleans have different contents
+ // from floating-point-based booleans. This is because we can't tell whether we
+ // have an integer-based boolean or a floating-point-based boolean unless we
+ // can find the SETCC that produced it and inspect its operands. This is
+ // fairly easy if C is the SETCC node, but it can potentially be
+ // undiscoverable (or not reasonably discoverable). For example, it could be
+ // in another basic block or it could require searching a complicated
+ // expression.
+ if (VT.isInteger() &&
+ (VT0 == MVT::i1 || (VT0.isInteger() &&
+ TLI.getBooleanContents(false, false) ==
+ TLI.getBooleanContents(false, true) &&
+ TLI.getBooleanContents(false, false) ==
+ TargetLowering::ZeroOrOneBooleanContent)) &&
+ isNullConstant(N1) && isOneConstant(N2)) {
+ SDValue XORNode;
+ if (VT == VT0) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::XOR, DL, VT0,
+ N0, DAG.getConstant(1, DL, VT0));
+ }
+ SDLoc DL0(N0);
+ XORNode = DAG.getNode(ISD::XOR, DL0, VT0,
+ N0, DAG.getConstant(1, DL0, VT0));
+ AddToWorklist(XORNode.getNode());
+ if (VT.bitsGT(VT0))
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, XORNode);
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, XORNode);
+ }
+ // fold (select C, 0, X) -> (and (not C), X)
+ if (VT == VT0 && VT == MVT::i1 && isNullConstant(N1)) {
+ SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
+ AddToWorklist(NOTNode.getNode());
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, NOTNode, N2);
+ }
+ // fold (select C, X, 1) -> (or (not C), X)
+ if (VT == VT0 && VT == MVT::i1 && isOneConstant(N2)) {
+ SDValue NOTNode = DAG.getNOT(SDLoc(N0), N0, VT);
+ AddToWorklist(NOTNode.getNode());
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, NOTNode, N1);
+ }
+ // fold (select C, X, 0) -> (and C, X)
+ if (VT == MVT::i1 && isNullConstant(N2))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
+ // fold (select X, X, Y) -> (or X, Y)
+ // fold (select X, 1, Y) -> (or X, Y)
+ if (VT == MVT::i1 && (N0 == N1 || isOneConstant(N1)))
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N2);
+ // fold (select X, Y, X) -> (and X, Y)
+ // fold (select X, Y, 0) -> (and X, Y)
+ if (VT == MVT::i1 && (N0 == N2 || isNullConstant(N2)))
+ return DAG.getNode(ISD::AND, SDLoc(N), VT, N0, N1);
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N1, N2))
+ return SDValue(N, 0); // Don't revisit N.
+
+ if (VT0 == MVT::i1) {
+ // The code in this block deals with the following 2 equivalences:
+ // select(C0|C1, x, y) <=> select(C0, x, select(C1, x, y))
+ // select(C0&C1, x, y) <=> select(C0, select(C1, x, y), y)
+ // The target can specify its preferred form with the
+ // shouldNormalizeToSelectSequence() callback. However, we always transform
+ // to the right-hand form if the inner select already exists in the DAG, and
+ // we always transform to the left-hand form if we know that we can further
+ // optimize the combination of the conditions.
+ bool normalizeToSequence
+ = TLI.shouldNormalizeToSelectSequence(*DAG.getContext(), VT);
+ // select (and Cond0, Cond1), X, Y
+ // -> select Cond0, (select Cond1, X, Y), Y
+ if (N0->getOpcode() == ISD::AND && N0->hasOneUse()) {
+ SDValue Cond0 = N0->getOperand(0);
+ SDValue Cond1 = N0->getOperand(1);
+ SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
+ N1.getValueType(), Cond1, N1, N2);
+ if (normalizeToSequence || !InnerSelect.use_empty())
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0,
+ InnerSelect, N2);
+ }
+ // select (or Cond0, Cond1), X, Y -> select Cond0, X, (select Cond1, X, Y)
+ if (N0->getOpcode() == ISD::OR && N0->hasOneUse()) {
+ SDValue Cond0 = N0->getOperand(0);
+ SDValue Cond1 = N0->getOperand(1);
+ SDValue InnerSelect = DAG.getNode(ISD::SELECT, SDLoc(N),
+ N1.getValueType(), Cond1, N1, N2);
+ if (normalizeToSequence || !InnerSelect.use_empty())
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Cond0, N1,
+ InnerSelect);
+ }
+
+ // select Cond0, (select Cond1, X, Y), Y -> select (and Cond0, Cond1), X, Y
+ if (N1->getOpcode() == ISD::SELECT && N1->hasOneUse()) {
+ SDValue N1_0 = N1->getOperand(0);
+ SDValue N1_1 = N1->getOperand(1);
+ SDValue N1_2 = N1->getOperand(2);
+ if (N1_2 == N2 && N0.getValueType() == N1_0.getValueType()) {
+ // Create the actual and node if we can generate good code for it.
+ if (!normalizeToSequence) {
+ SDValue And = DAG.getNode(ISD::AND, SDLoc(N), N0.getValueType(),
+ N0, N1_0);
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), And,
+ N1_1, N2);
+ }
+ // Otherwise see if we can optimize the "and" to a better pattern.
+ if (SDValue Combined = visitANDLike(N0, N1_0, N))
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined,
+ N1_1, N2);
+ }
+ }
+ // select Cond0, X, (select Cond1, X, Y) -> select (or Cond0, Cond1), X, Y
+ if (N2->getOpcode() == ISD::SELECT && N2->hasOneUse()) {
+ SDValue N2_0 = N2->getOperand(0);
+ SDValue N2_1 = N2->getOperand(1);
+ SDValue N2_2 = N2->getOperand(2);
+ if (N2_1 == N1 && N0.getValueType() == N2_0.getValueType()) {
+ // Create the actual or node if we can generate good code for it.
+ if (!normalizeToSequence) {
+ SDValue Or = DAG.getNode(ISD::OR, SDLoc(N), N0.getValueType(),
+ N0, N2_0);
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Or,
+ N1, N2_2);
+ }
+ // Otherwise see if we can optimize to a better pattern.
+ if (SDValue Combined = visitORLike(N0, N2_0, N))
+ return DAG.getNode(ISD::SELECT, SDLoc(N), N1.getValueType(), Combined,
+ N1, N2_2);
+ }
+ }
+ }
+
+ // fold selects based on a setcc into other things, such as min/max/abs
+ if (N0.getOpcode() == ISD::SETCC) {
+ // select x, y (fcmp lt x, y) -> fminnum x, y
+ // select x, y (fcmp gt x, y) -> fmaxnum x, y
+ //
+ // This is OK if we don't care about what happens if either operand is a
+ // NaN.
+ //
+
+ // FIXME: Instead of testing for UnsafeFPMath, this should be checking for
+ // no signed zeros as well as no nans.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.UnsafeFPMath &&
+ VT.isFloatingPoint() && N0.hasOneUse() &&
+ DAG.isKnownNeverNaN(N1) && DAG.isKnownNeverNaN(N2)) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+
+ if (SDValue FMinMax = combineMinNumMaxNum(SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1), N1, N2, CC,
+ TLI, DAG))
+ return FMinMax;
+ }
+
+ if ((!LegalOperations &&
+ TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT)) ||
+ TLI.isOperationLegal(ISD::SELECT_CC, VT))
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1),
+ N1, N2, N0.getOperand(2));
+ return SimplifySelect(SDLoc(N), N0, N1, N2);
+ }
+
+ return SDValue();
+}
+
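+// Split a vector SETCC node into low and high halves: both compare operands
+// are split and the original condition code is reused for each half.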
+static
+std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Split the inputs.
+ SDValue Lo, Hi, LL, LH, RL, RH;
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+
+ return std::make_pair(Lo, Hi);
+}
+
+// This function assumes all the vselect's arguments are CONCAT_VECTOR
+// nodes and that the condition is a BV of ConstantSDNodes (or undefs).
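+// e.g. (vselect <0,0,0,0,-1,-1,-1,-1>,
+//               (concat_vectors A, B), (concat_vectors C, D))
+//   -> (concat_vectors C, B)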
+static SDValue ConvertSelectToConcatVector(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ SDValue Cond = N->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT VT = N->getValueType(0);
+ int NumElems = VT.getVectorNumElements();
+ assert(LHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ RHS.getOpcode() == ISD::CONCAT_VECTORS &&
+ Cond.getOpcode() == ISD::BUILD_VECTOR);
+
+ // CONCAT_VECTORS can take an arbitrary number of arguments. We only care about
+ // binary ones here.
+ if (LHS->getNumOperands() != 2 || RHS->getNumOperands() != 2)
+ return SDValue();
+
+ // We're sure we have an even number of elements due to the
+ // concat_vectors we have as arguments to vselect.
+ // Skip BV elements until we find one that's not an UNDEF. Then keep looping
+ // until we get to half the length of the BV and check that all of the
+ // non-undef elements are the same.
+ ConstantSDNode *BottomHalf = nullptr;
+ for (int i = 0; i < NumElems / 2; ++i) {
+ if (Cond->getOperand(i)->isUndef())
+ continue;
+
+ if (BottomHalf == nullptr)
+ BottomHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != BottomHalf)
+ return SDValue();
+ }
+
+ // Do the same for the second half of the BuildVector
+ ConstantSDNode *TopHalf = nullptr;
+ for (int i = NumElems / 2; i < NumElems; ++i) {
+ if (Cond->getOperand(i)->isUndef())
+ continue;
+
+ if (TopHalf == nullptr)
+ TopHalf = cast<ConstantSDNode>(Cond.getOperand(i));
+ else if (Cond->getOperand(i).getNode() != TopHalf)
+ return SDValue();
+ }
+
+ assert(TopHalf && BottomHalf &&
+ "One half of the selector was all UNDEFs and the other was all the "
+ "same value. This should have been addressed before this function.");
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ BottomHalf->isNullValue() ? RHS->getOperand(0) : LHS->getOperand(0),
+ TopHalf->isNullValue() ? RHS->getOperand(1) : LHS->getOperand(1));
+}
+
+SDValue DAGCombiner::visitMSCATTER(SDNode *N) {
+
+ if (Level >= AfterLegalizeTypes)
+ return SDValue();
+
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+ SDValue Mask = MSC->getMask();
+ SDValue Data = MSC->getValue();
+ SDLoc DL(N);
+
+ // If the MSCATTER data type requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and their operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
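+ // For example, on a target where the data type must be split (say v16i32),
+ // a scatter whose mask comes from a SETCC is rewritten here as two v8i32
+ // scatters joined by a TokenFactor.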
+ if (Mask.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+ SDValue MaskLo, MaskHi, Lo, Hi;
+ std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MSC->getValueType(0));
+
+ SDValue Chain = MSC->getChain();
+
+ EVT MemoryVT = MSC->getMemoryVT();
+ unsigned Alignment = MSC->getOriginalAlignment();
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue DataLo, DataHi;
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ SDValue BasePtr = MSC->getBasePtr();
+ SDValue IndexLo, IndexHi;
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(MSC->getIndex(), DL);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MSC->getPointerInfo(),
+ MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+ Alignment, MSC->getAAInfo(), MSC->getRanges());
+
+ SDValue OpsLo[] = { Chain, DataLo, MaskLo, BasePtr, IndexLo };
+ Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
+ DL, OpsLo, MMO);
+
+ SDValue OpsHi[] = {Chain, DataHi, MaskHi, BasePtr, IndexHi};
+ Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
+ DL, OpsHi, MMO);
+
+ AddToWorklist(Lo.getNode());
+ AddToWorklist(Hi.getNode());
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGCombiner::visitMSTORE(SDNode *N) {
+
+ if (Level >= AfterLegalizeTypes)
+ return SDValue();
+
+ MaskedStoreSDNode *MST = dyn_cast<MaskedStoreSDNode>(N);
+ SDValue Mask = MST->getMask();
+ SDValue Data = MST->getValue();
+ SDLoc DL(N);
+
+ // If the MSTORE data type requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and their operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
+ if (Mask.getOpcode() == ISD::SETCC) {
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), Data.getValueType()) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+
+ SDValue MaskLo, MaskHi, Lo, Hi;
+ std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MST->getValueType(0));
+
+ SDValue Chain = MST->getChain();
+ SDValue Ptr = MST->getBasePtr();
+
+ EVT MemoryVT = MST->getMemoryVT();
+ unsigned Alignment = MST->getOriginalAlignment();
+
+ // If Alignment is equal to the vector size,
+ // use half of it for the second part.
+ unsigned SecondHalfAlignment =
+ (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+ Alignment/2 : Alignment;
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue DataLo, DataHi;
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MST->getPointerInfo(),
+ MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+ Alignment, MST->getAAInfo(), MST->getRanges());
+
+ Lo = DAG.getMaskedStore(Chain, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+ MST->isTruncatingStore());
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MST->getPointerInfo(),
+ MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
+ SecondHalfAlignment, MST->getAAInfo(),
+ MST->getRanges());
+
+ Hi = DAG.getMaskedStore(Chain, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+ MST->isTruncatingStore());
+
+ AddToWorklist(Lo.getNode());
+ AddToWorklist(Hi.getNode());
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitMGATHER(SDNode *N) {
+
+ if (Level >= AfterLegalizeTypes)
+ return SDValue();
+
+ MaskedGatherSDNode *MGT = dyn_cast<MaskedGatherSDNode>(N);
+ SDValue Mask = MGT->getMask();
+ SDLoc DL(N);
+
+ // If the MGATHER result requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and their operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
+
+ if (Mask.getOpcode() != ISD::SETCC)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+
+ SDValue MaskLo, MaskHi, Lo, Hi;
+ std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+ SDValue Src0 = MGT->getValue();
+ SDValue Src0Lo, Src0Hi;
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+
+ SDValue Chain = MGT->getChain();
+ EVT MemoryVT = MGT->getMemoryVT();
+ unsigned Alignment = MGT->getOriginalAlignment();
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue BasePtr = MGT->getBasePtr();
+ SDValue Index = MGT->getIndex();
+ SDValue IndexLo, IndexHi;
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MGT->getPointerInfo(),
+ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MGT->getAAInfo(), MGT->getRanges());
+
+ SDValue OpsLo[] = { Chain, Src0Lo, MaskLo, BasePtr, IndexLo };
+ Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, DL, OpsLo,
+ MMO);
+
+ SDValue OpsHi[] = {Chain, Src0Hi, MaskHi, BasePtr, IndexHi};
+ Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, DL, OpsHi,
+ MMO);
+
+ AddToWorklist(Lo.getNode());
+ AddToWorklist(Hi.getNode());
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MGT, 1), Chain);
+
+ SDValue GatherRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+
+ SDValue RetOps[] = { GatherRes, Chain };
+ return DAG.getMergeValues(RetOps, DL);
+}
+
+SDValue DAGCombiner::visitMLOAD(SDNode *N) {
+
+ if (Level >= AfterLegalizeTypes)
+ return SDValue();
+
+ MaskedLoadSDNode *MLD = dyn_cast<MaskedLoadSDNode>(N);
+ SDValue Mask = MLD->getMask();
+ SDLoc DL(N);
+
+ // If the MLOAD result requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and their operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
+
+ if (Mask.getOpcode() == ISD::SETCC) {
+ EVT VT = N->getValueType(0);
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+
+ SDValue MaskLo, MaskHi, Lo, Hi;
+ std::tie(MaskLo, MaskHi) = SplitVSETCC(Mask.getNode(), DAG);
+
+ SDValue Src0 = MLD->getSrc0();
+ SDValue Src0Lo, Src0Hi;
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, DL);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+ SDValue Chain = MLD->getChain();
+ SDValue Ptr = MLD->getBasePtr();
+ EVT MemoryVT = MLD->getMemoryVT();
+ unsigned Alignment = MLD->getOriginalAlignment();
+
+ // If Alignment is equal to the vector size,
+ // use half of it for the second part.
+ unsigned SecondHalfAlignment =
+ (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+ Alignment/2 : Alignment;
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MLD->getPointerInfo(),
+ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+ Lo = DAG.getMaskedLoad(LoVT, DL, Chain, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+ ISD::NON_EXTLOAD);
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MLD->getPointerInfo(),
+ MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
+ SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+ Hi = DAG.getMaskedLoad(HiVT, DL, Chain, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+ ISD::NON_EXTLOAD);
+
+ AddToWorklist(Lo.getNode());
+ AddToWorklist(Hi.getNode());
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MLD, 1), Chain);
+
+ SDValue LoadRes = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+
+ SDValue RetOps[] = { LoadRes, Chain };
+ return DAG.getMergeValues(RetOps, DL);
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitVSELECT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDLoc DL(N);
+
+ // Canonicalize integer abs.
+ // vselect (setg[te] X, 0), X, -X ->
+ // vselect (setgt X, -1), X, -X ->
+ // vselect (setl[te] X, 0), -X, X ->
+ // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
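+ // e.g. for v4i32: (vselect (setgt X, zero), X, (sub zero, X))
+ //   -> (xor (add X, (sra X, 31)), (sra X, 31))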
+ if (N0.getOpcode() == ISD::SETCC) {
+ SDValue LHS = N0.getOperand(0), RHS = N0.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ bool isAbs = false;
+ bool RHSIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+
+ if (((RHSIsAllZeros && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
+ (ISD::isBuildVectorAllOnes(RHS.getNode()) && CC == ISD::SETGT)) &&
+ N1 == LHS && N2.getOpcode() == ISD::SUB && N1 == N2.getOperand(1))
+ isAbs = ISD::isBuildVectorAllZeros(N2.getOperand(0).getNode());
+ else if ((RHSIsAllZeros && (CC == ISD::SETLT || CC == ISD::SETLE)) &&
+ N2 == LHS && N1.getOpcode() == ISD::SUB && N2 == N1.getOperand(1))
+ isAbs = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
+
+ if (isAbs) {
+ EVT VT = LHS.getValueType();
+ SDValue Shift = DAG.getNode(
+ ISD::SRA, DL, VT, LHS,
+ DAG.getConstant(VT.getScalarType().getSizeInBits() - 1, DL, VT));
+ SDValue Add = DAG.getNode(ISD::ADD, DL, VT, LHS, Shift);
+ AddToWorklist(Shift.getNode());
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::XOR, DL, VT, Add, Shift);
+ }
+ }
+
+ if (SimplifySelectOps(N, N1, N2))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // If the VSELECT result requires splitting and the mask is provided by a
+ // SETCC, then split both nodes and their operands before legalization. This
+ // prevents the type legalizer from unrolling SETCC into scalar comparisons
+ // and enables future optimizations (e.g. min/max pattern matching on X86).
+ if (N0.getOpcode() == ISD::SETCC) {
+ EVT VT = N->getValueType(0);
+
+ // Check if any splitting is required.
+ if (TLI.getTypeAction(*DAG.getContext(), VT) !=
+ TargetLowering::TypeSplitVector)
+ return SDValue();
+
+ SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
+ std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
+
+ // Add the new VSELECT nodes to the work list in case they need to be split
+ // again.
+ AddToWorklist(Lo.getNode());
+ AddToWorklist(Hi.getNode());
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
+ }
+
+ // Fold (vselect (build_vector all_ones), N1, N2) -> N1
+ if (ISD::isBuildVectorAllOnes(N0.getNode()))
+ return N1;
+ // Fold (vselect (build_vector all_zeros), N1, N2) -> N2
+ if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ return N2;
+
+ // The ConvertSelectToConcatVector function is assuming both the above
+ // checks for (vselect (build_vector all{ones,zeros}) ...) have been made
+ // and addressed.
+ if (N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N2.getOpcode() == ISD::CONCAT_VECTORS &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())) {
+ if (SDValue CV = ConvertSelectToConcatVector(N, DAG))
+ return CV;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSELECT_CC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDValue N3 = N->getOperand(3);
+ SDValue N4 = N->getOperand(4);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N4)->get();
+
+ // fold select_cc lhs, rhs, x, x, cc -> x
+ if (N2 == N3)
+ return N2;
+
+ // Determine if the condition we're dealing with is constant
+ if (SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()), N0, N1,
+ CC, SDLoc(N), false)) {
+ AddToWorklist(SCC.getNode());
+
+ if (ConstantSDNode *SCCC = dyn_cast<ConstantSDNode>(SCC.getNode())) {
+ if (!SCCC->isNullValue())
+ return N2; // cond always true -> true val
+ else
+ return N3; // cond always false -> false val
+ } else if (SCC->isUndef()) {
+ // When the condition is UNDEF, just return the first operand. This is
+ // consistent with DAG creation: no setcc node is created in this case.
+ return N2;
+ } else if (SCC.getOpcode() == ISD::SETCC) {
+ // Fold to a simpler select_cc
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N2.getValueType(),
+ SCC.getOperand(0), SCC.getOperand(1), N2, N3,
+ SCC.getOperand(2));
+ }
+ }
+
+ // If we can fold this based on the true/false value, do so.
+ if (SimplifySelectOps(N, N2, N3))
+ return SDValue(N, 0); // Don't revisit N.
+
+ // fold select_cc into other things, such as min/max/abs
+ return SimplifySelectCC(SDLoc(N), N0, N1, N2, N3, CC);
+}
+
+SDValue DAGCombiner::visitSETCC(SDNode *N) {
+ return SimplifySetCC(N->getValueType(0), N->getOperand(0), N->getOperand(1),
+ cast<CondCodeSDNode>(N->getOperand(2))->get(),
+ SDLoc(N));
+}
+
+SDValue DAGCombiner::visitSETCCE(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+
+ // If Carry is false, fold to a regular SETCC.
+ if (Carry.getOpcode() == ISD::CARRY_FALSE)
+ return DAG.getNode(ISD::SETCC, SDLoc(N), N->getVTList(), LHS, RHS, Cond);
+
+ return SDValue();
+}
+
+/// Try to fold a sext/zext/aext dag node into a ConstantSDNode or
+/// a build_vector of constants.
+/// This function is called by the DAGCombiner when visiting sext/zext/aext
+/// dag nodes (see for example method DAGCombiner::visitSIGN_EXTEND).
+/// Vector extends are not folded if operations are legal; this is to
+/// avoid introducing illegal build_vector dag nodes.
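+/// e.g. (sext (i8 -1)) -> (i32 -1), and likewise element-wise for a
+/// build_vector of constants.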
+static SDNode *tryToFoldExtendOfConstant(SDNode *N, const TargetLowering &TLI,
+ SelectionDAG &DAG, bool LegalTypes,
+ bool LegalOperations) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ assert((Opcode == ISD::SIGN_EXTEND || Opcode == ISD::ZERO_EXTEND ||
+ Opcode == ISD::ANY_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG ||
+ Opcode == ISD::ZERO_EXTEND_VECTOR_INREG)
+ && "Expected EXTEND dag node in input!");
+
+ // fold (sext c1) -> c1
+ // fold (zext c1) -> c1
+ // fold (aext c1) -> c1
+ if (isa<ConstantSDNode>(N0))
+ return DAG.getNode(Opcode, SDLoc(N), VT, N0).getNode();
+
+ // fold (sext (build_vector AllConstants)) -> (build_vector AllConstants)
+ // fold (zext (build_vector AllConstants)) -> (build_vector AllConstants)
+ // fold (aext (build_vector AllConstants)) -> (build_vector AllConstants)
+ EVT SVT = VT.getScalarType();
+ if (!(VT.isVector() &&
+ (!LegalTypes || (!LegalOperations && TLI.isTypeLegal(SVT))) &&
+ ISD::isBuildVectorOfConstantSDNodes(N0.getNode())))
+ return nullptr;
+
+ // We can fold this node into a build_vector.
+ unsigned VTBits = SVT.getSizeInBits();
+ unsigned EVTBits = N0->getValueType(0).getScalarType().getSizeInBits();
+ SmallVector<SDValue, 8> Elts;
+ unsigned NumElts = VT.getVectorNumElements();
+ SDLoc DL(N);
+
+ for (unsigned i=0; i != NumElts; ++i) {
+ SDValue Op = N0->getOperand(i);
+ if (Op->isUndef()) {
+ Elts.push_back(DAG.getUNDEF(SVT));
+ continue;
+ }
+
+ SDLoc DL(Op);
+ // Get the constant value and if needed trunc it to the size of the type.
+ // Nodes like build_vector might have constants wider than the scalar type.
+ APInt C = cast<ConstantSDNode>(Op)->getAPIntValue().zextOrTrunc(EVTBits);
+ if (Opcode == ISD::SIGN_EXTEND || Opcode == ISD::SIGN_EXTEND_VECTOR_INREG)
+ Elts.push_back(DAG.getConstant(C.sext(VTBits), DL, SVT));
+ else
+ Elts.push_back(DAG.getConstant(C.zext(VTBits), DL, SVT));
+ }
+
+ return DAG.getBuildVector(VT, DL, Elts).getNode();
+}
+
+// ExtendUsesToFormExtLoad - Try to extend uses of a load to enable the
+// "fold ({s|z|a}ext (load x)) -> ({s|z|a}ext (truncate ({s|z|a}extload x)))"
+// transformation. Returns true if the extensions are possible and the
+// above-mentioned transformation is profitable.
+static bool ExtendUsesToFormExtLoad(SDNode *N, SDValue N0,
+ unsigned ExtOpc,
+ SmallVectorImpl<SDNode *> &ExtendNodes,
+ const TargetLowering &TLI) {
+ bool HasCopyToRegUses = false;
+ bool isTruncFree = TLI.isTruncateFree(N->getValueType(0), N0.getValueType());
+ for (SDNode::use_iterator UI = N0.getNode()->use_begin(),
+ UE = N0.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == N)
+ continue;
+ if (UI.getUse().getResNo() != N0.getResNo())
+ continue;
+ // FIXME: Only extend SETCC N, N and SETCC N, c for now.
+ if (ExtOpc != ISD::ANY_EXTEND && User->getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+ if (ExtOpc == ISD::ZERO_EXTEND && ISD::isSignedIntSetCC(CC))
+ // Sign bits will be lost after a zext.
+ return false;
+ bool Add = false;
+ for (unsigned i = 0; i != 2; ++i) {
+ SDValue UseOp = User->getOperand(i);
+ if (UseOp == N0)
+ continue;
+ if (!isa<ConstantSDNode>(UseOp))
+ return false;
+ Add = true;
+ }
+ if (Add)
+ ExtendNodes.push_back(User);
+ continue;
+ }
+ // If truncates aren't free and there are users we can't
+ // extend, it isn't worthwhile.
+ if (!isTruncFree)
+ return false;
+ // Remember if this value is live-out.
+ if (User->getOpcode() == ISD::CopyToReg)
+ HasCopyToRegUses = true;
+ }
+
+ if (HasCopyToRegUses) {
+ bool BothLiveOut = false;
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == 0 && Use.getUser()->getOpcode() == ISD::CopyToReg) {
+ BothLiveOut = true;
+ break;
+ }
+ }
+ if (BothLiveOut)
+ // Both unextended and extended values are live out. There had better be
+ // a good reason for the transformation.
+ return ExtendNodes.size();
+ }
+ return true;
+}
+
+void DAGCombiner::ExtendSetCCUses(const SmallVectorImpl<SDNode *> &SetCCs,
+ SDValue Trunc, SDValue ExtLoad,
+ const SDLoc &DL, ISD::NodeType ExtType) {
+ // Extend SetCC uses if necessary.
+ for (unsigned i = 0, e = SetCCs.size(); i != e; ++i) {
+ SDNode *SetCC = SetCCs[i];
+ SmallVector<SDValue, 4> Ops;
+
+ for (unsigned j = 0; j != 2; ++j) {
+ SDValue SOp = SetCC->getOperand(j);
+ if (SOp == Trunc)
+ Ops.push_back(ExtLoad);
+ else
+ Ops.push_back(DAG.getNode(ExtType, DL, ExtLoad->getValueType(0), SOp));
+ }
+
+ Ops.push_back(SetCC->getOperand(2));
+ CombineTo(SetCC, DAG.getNode(ISD::SETCC, DL, SetCC->getValueType(0), Ops));
+ }
+}
+
+// FIXME: Bring more similar combines here, common to sext/zext (maybe aext?).
+SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ EVT SrcVT = N0.getValueType();
+
+ assert((N->getOpcode() == ISD::SIGN_EXTEND ||
+ N->getOpcode() == ISD::ZERO_EXTEND) &&
+ "Unexpected node type (not an extend)!");
+
+ // fold (sext (load x)) to multiple smaller sextloads; same for zext.
+ // For example, on a target with legal v4i32, but illegal v8i32, turn:
+ // (v8i32 (sext (v8i16 (load x))))
+ // into:
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))
+ // Where uses of the original load, i.e.:
+ // (v8i16 (load x))
+ // are replaced with:
+ // (v8i16 (truncate
+ // (v8i32 (concat_vectors (v4i32 (sextload x)),
+ // (v4i32 (sextload (x + 16)))))))
+ //
+ // This combine is only applicable to illegal, but splittable, vectors.
+ // All legal types, and illegal non-vector types, are handled elsewhere.
+ // This combine is controlled by TargetLowering::isVectorLoadExtDesirable.
+ //
+ if (N0->getOpcode() != ISD::LOAD)
+ return SDValue();
+
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ if (!ISD::isNON_EXTLoad(LN0) || !ISD::isUNINDEXEDLoad(LN0) ||
+ !N0.hasOneUse() || LN0->isVolatile() || !DstVT.isVector() ||
+ !DstVT.isPow2VectorType() || !TLI.isVectorLoadExtDesirable(SDValue(N, 0)))
+ return SDValue();
+
+ SmallVector<SDNode *, 4> SetCCs;
+ if (!ExtendUsesToFormExtLoad(N, N0, N->getOpcode(), SetCCs, TLI))
+ return SDValue();
+
+ ISD::LoadExtType ExtType =
+ N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+
+ // Try to split the vector types to get down to legal types.
+ EVT SplitSrcVT = SrcVT;
+ EVT SplitDstVT = DstVT;
+ while (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT) &&
+ SplitSrcVT.getVectorNumElements() > 1) {
+ SplitDstVT = DAG.GetSplitDestVTs(SplitDstVT).first;
+ SplitSrcVT = DAG.GetSplitDestVTs(SplitSrcVT).first;
+ }
+
+ if (!TLI.isLoadExtLegalOrCustom(ExtType, SplitDstVT, SplitSrcVT))
+ return SDValue();
+
+ SDLoc DL(N);
+ const unsigned NumSplits =
+ DstVT.getVectorNumElements() / SplitDstVT.getVectorNumElements();
+ const unsigned Stride = SplitSrcVT.getStoreSize();
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+
+ SDValue BasePtr = LN0->getBasePtr();
+ for (unsigned Idx = 0; Idx < NumSplits; Idx++) {
+ const unsigned Offset = Idx * Stride;
+ const unsigned Align = MinAlign(LN0->getAlignment(), Offset);
+
+ SDValue SplitLoad = DAG.getExtLoad(
+ ExtType, DL, SplitDstVT, LN0->getChain(), BasePtr,
+ LN0->getPointerInfo().getWithOffset(Offset), SplitSrcVT, Align,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+
+ BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Stride, DL, BasePtr.getValueType()));
+
+ Loads.push_back(SplitLoad.getValue(0));
+ Chains.push_back(SplitLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
+
+ CombineTo(N, NewValue);
+
+ // Replace uses of the original load (before extension)
+ // with a truncate of the concatenated sextloaded vectors.
+ SDValue Trunc =
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(), NewValue);
+ CombineTo(N0.getNode(), Trunc, NewChain);
+ ExtendSetCCUses(SetCCs, Trunc, NewValue, DL,
+ (ISD::NodeType)N->getOpcode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ // fold (sext (sext x)) -> (sext x)
+ // fold (sext (aext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
+ N0.getOperand(0));
+
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ // fold (sext (truncate (load x))) -> (sext (smaller load x))
+ // fold (sext (truncate (srl (load x), c))) -> (sext (smaller load (x+c/n)))
+ if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ SDNode* oye = N0.getNode()->getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // See if the value being truncated is already sign extended. If so, just
+ // eliminate the trunc/sext pair.
+ SDValue Op = N0.getOperand(0);
+ unsigned OpBits = Op.getValueType().getScalarType().getSizeInBits();
+ unsigned MidBits = N0.getValueType().getScalarType().getSizeInBits();
+ unsigned DestBits = VT.getScalarType().getSizeInBits();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op);
+
+ if (OpBits == DestBits) {
+ // Op is i32, Mid is i8, and Dest is i32. If Op has more than 24 sign
+ // bits, it is already sign extended as needed.
+ if (NumSignBits > DestBits-MidBits)
+ return Op;
+ } else if (OpBits < DestBits) {
+ // Op is i32, Mid is i8, and Dest is i64. If Op has more than 24 sign
+ // bits, just sext from i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op);
+ } else {
+ // Op is i64, Mid is i8, and Dest is i32. If Op has more than 56 sign
+ // bits, just truncate to i32.
+ if (NumSignBits > OpBits-MidBits)
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+ }
+
+ // fold (sext (truncate x)) -> (sextinreg x).
+ if (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG,
+ N0.getValueType())) {
+ if (OpBits < DestBits)
+ Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
+ else if (OpBits > DestBits)
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op,
+ DAG.getValueType(N0.getValueType()));
+ }
+ }
+
+ // fold (sext (load x)) -> (sext (truncate (sextload x)))
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()))) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::SIGN_EXTEND, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
+ ISD::SIGN_EXTEND);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (sext (load x)) to multiple smaller sextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
+ // fold (sext (sextload x)) -> (sext (truncate (sextload x)))
+ // fold (sext ( extload x)) -> (sext (truncate (sextload x)))
+ if ((ISD::isSEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ if ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), MemVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (sext (and/or/xor (load x), cst)) ->
+ // (and/or/xor (sextload x), (sext cst))
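+ // e.g. (sext i16->i32 (and (load x), 0xff00))
+ //   -> (and (sextload i16->i32 x), 0xFFFFFF00)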
+ if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+ N0.getOpcode() == ISD::XOR) &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, N0.getValueType()) &&
+ (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+ if (LN0->getExtensionType() != ISD::ZEXTLOAD && LN0->isUnindexed()) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0), ISD::SIGN_EXTEND,
+ SetCCs, TLI);
+ if (DoXform) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(LN0), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ LN0->getMemoryVT(),
+ LN0->getMemOperand());
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask = Mask.sext(VT.getSizeInBits());
+ SDLoc DL(N);
+ SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
+ ExtLoad, DAG.getConstant(Mask, DL, VT));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+ SDLoc(N0.getOperand(0)),
+ N0.getOperand(0).getValueType(), ExtLoad);
+ CombineTo(N, And);
+ CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
+ ISD::SIGN_EXTEND);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ if (N0.getOpcode() == ISD::SETCC) {
+ EVT N0VT = N0.getOperand(0).getValueType();
+ // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
+ // Only do this before legalize for now.
+ if (VT.isVector() && !LegalOperations &&
+ TLI.getBooleanContents(N0VT) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent) {
+ // On some architectures (such as SSE/NEON/etc) the SETCC result type is
+ // of the same size as the compared operands. Only optimize sext(setcc())
+ // if this is the case.
+ EVT SVT = getSetCCResultType(N0VT);
+
+ // We know that the # elements of the results is the same as the
+ // # elements of the compare (and the # elements of the compare result
+ // for that matter). Check to see that they are the same size. If so,
+ // we know that the element size of the sext'd result matches the
+ // element size of the compare operands.
+ if (VT.getSizeInBits() == SVT.getSizeInBits())
+ return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+ // If the desired elements are smaller or larger than the source
+ // elements, we can use a matching integer vector type and then
+ // truncate/sign-extend.
+ EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
+ if (SVT == MatchingVectorType) {
+ SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType,
+ N0.getOperand(0), N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
+ }
+ }
+
+ // sext(setcc x, y, cc) -> (select (setcc x, y, cc), -1, 0)
+ unsigned ElementWidth = VT.getScalarType().getSizeInBits();
+ SDLoc DL(N);
+ SDValue NegOne =
+ DAG.getConstant(APInt::getAllOnesValue(ElementWidth), DL, VT);
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), NegOne,
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
+
+ if (!VT.isVector()) {
+ EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType());
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) {
+ SDLoc DL(N);
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ SDValue SetCC = DAG.getSetCC(DL, SetCCVT,
+ N0.getOperand(0), N0.getOperand(1), CC);
+ return DAG.getSelect(DL, VT, SetCC,
+ NegOne, DAG.getConstant(0, DL, VT));
+ }
+ }
+ }
+
+ // fold (sext x) -> (zext x) if the sign bit is known zero.
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
+ DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+// isTruncateOf - If N is a truncate of some other value, return true, record
+// the value being truncated in Op and which of Op's bits are zero in KnownZero.
+// This function computes KnownZero to avoid a duplicated call to
+// computeKnownBits in the caller.
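+// It also treats (setcc x, 0, ne), where x is known to be either 0 or 1, as a
+// truncate of x to i1.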
+static bool isTruncateOf(SelectionDAG &DAG, SDValue N, SDValue &Op,
+ APInt &KnownZero) {
+ APInt KnownOne;
+ if (N->getOpcode() == ISD::TRUNCATE) {
+ Op = N->getOperand(0);
+ DAG.computeKnownBits(Op, KnownZero, KnownOne);
+ return true;
+ }
+
+ if (N->getOpcode() != ISD::SETCC || N->getValueType(0) != MVT::i1 ||
+ cast<CondCodeSDNode>(N->getOperand(2))->get() != ISD::SETNE)
+ return false;
+
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ assert(Op0.getValueType() == Op1.getValueType());
+
+ if (isNullConstant(Op0))
+ Op = Op1;
+ else if (isNullConstant(Op1))
+ Op = Op0;
+ else
+ return false;
+
+ DAG.computeKnownBits(Op, KnownZero, KnownOne);
+
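+ // Only treat the setcc as a truncate if every bit of Op other than bit 0 is
+ // known to be zero; then (setne Op, 0) is equivalent to truncating Op to i1.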
+ if (!(KnownZero | APInt(Op.getValueSizeInBits(), 1)).isAllOnesValue())
+ return false;
+
+ return true;
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ // fold (zext (zext x)) -> (zext x)
+ // fold (zext (aext x)) -> (zext x)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT,
+ N0.getOperand(0));
+
+ // fold (zext (truncate x)) -> (zext x) or
+ // (zext (truncate x)) -> (truncate x)
+ // This is valid when the truncated bits of x are already zero.
+ // FIXME: We should extend this to work for vectors too.
+ SDValue Op;
+ APInt KnownZero;
+ if (!VT.isVector() && isTruncateOf(DAG, N0, Op, KnownZero)) {
+ APInt TruncatedBits =
+ (Op.getValueSizeInBits() == N0.getValueSizeInBits()) ?
+ APInt(Op.getValueSizeInBits(), 0) :
+ APInt::getBitsSet(Op.getValueSizeInBits(),
+ N0.getValueSizeInBits(),
+ std::min(Op.getValueSizeInBits(),
+ VT.getSizeInBits()));
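+ // TruncatedBits are the bits of Op that the truncate discards but that still
+ // fall within the width of the final result; the fold is only safe when they
+ // are all known to be zero.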
+ if (TruncatedBits == (KnownZero & TruncatedBits)) {
+ if (VT.bitsGT(Op.getValueType()))
+ return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op);
+ if (VT.bitsLT(Op.getValueType()))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+
+ return Op;
+ }
+ }
+
+ // fold (zext (truncate (load x))) -> (zext (smaller load x))
+ // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ SDNode* oye = N0.getNode()->getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (zext (truncate x)) -> (and x, mask)
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ // fold (zext (truncate (load x))) -> (zext (smaller load x))
+ // fold (zext (truncate (srl (load x), c))) -> (zext (smaller load (x+c/n)))
+ if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ SDNode *oye = N0.getNode()->getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ EVT SrcVT = N0.getOperand(0).getValueType();
+ EVT MinVT = N0.getValueType();
+
+ // Try to mask before the extension to avoid having to generate a larger mask,
+ // possibly over several sub-vectors.
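+ // For illustration: with x of type i16, (zext i32 (trunc i8 x)) becomes
+ // (zext i32 (and i16 x, 255)).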
+ if (SrcVT.bitsLT(VT)) {
+ if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
+ TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
+ SDValue Op = N0.getOperand(0);
+ Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ AddToWorklist(Op.getNode());
+ return DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
+ }
+ }
+
+ if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
+ SDValue Op = N0.getOperand(0);
+ if (SrcVT.bitsLT(VT)) {
+ Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op);
+ AddToWorklist(Op.getNode());
+ } else if (SrcVT.bitsGT(VT)) {
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+ AddToWorklist(Op.getNode());
+ }
+ return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ }
+ }
+
+ // Fold (zext (and (trunc x), cst)) -> (and x, cst),
+ // if either of the casts is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ (!TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType()) ||
+ !TLI.isZExtFree(N0.getValueType(), VT))) {
+ SDValue X = N0.getOperand(0).getOperand(0);
+ if (X.getValueType().bitsLT(VT)) {
+ X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X);
+ } else if (X.getValueType().bitsGT(VT)) {
+ X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
+ }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask = Mask.zext(VT.getSizeInBits());
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT,
+ X, DAG.getConstant(Mask, DL, VT));
+ }
+
+ // fold (zext (load x)) -> (zext (truncate (zextload x)))
+ // Only generate vector extloads when 1) they're legal, and 2) they are
+ // deemed desirable by the target.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ ((!LegalOperations && !VT.isVector() &&
+ !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()))) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ZERO_EXTEND, SetCCs, TLI);
+ if (VT.isVector())
+ DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
+ ISD::ZERO_EXTEND);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (zext (load x)) to multiple smaller zextloads.
+ // Only on illegal but splittable vectors.
+ if (SDValue ExtLoad = CombineExtLoad(N))
+ return ExtLoad;
+
+ // fold (zext (and/or/xor (load x), cst)) ->
+ // (and/or/xor (zextload x), (zext cst))
+ // Unless (and (load x) cst) will match as a zextload already and has
+ // additional users.
+ if ((N0.getOpcode() == ISD::AND || N0.getOpcode() == ISD::OR ||
+ N0.getOpcode() == ISD::XOR) &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, N0.getValueType()) &&
+ (!LegalOperations && TLI.isOperationLegal(N0.getOpcode(), VT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0.getOperand(0));
+ if (LN0->getExtensionType() != ISD::SEXTLOAD && LN0->isUnindexed()) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse()) {
+ if (N0.getOpcode() == ISD::AND) {
+ auto *AndC = cast<ConstantSDNode>(N0.getOperand(1));
+ auto NarrowLoad = false;
+ EVT LoadResultTy = AndC->getValueType(0);
+ EVT ExtVT, LoadedVT;
+ if (isAndLoadExtLoad(AndC, LN0, LoadResultTy, ExtVT, LoadedVT,
+ NarrowLoad))
+ DoXform = false;
+ }
+ if (DoXform)
+ DoXform = ExtendUsesToFormExtLoad(N, N0.getOperand(0),
+ ISD::ZERO_EXTEND, SetCCs, TLI);
+ }
+ if (DoXform) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(LN0), VT,
+ LN0->getChain(), LN0->getBasePtr(),
+ LN0->getMemoryVT(),
+ LN0->getMemOperand());
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask = Mask.zext(VT.getSizeInBits());
+ SDLoc DL(N);
+ SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
+ ExtLoad, DAG.getConstant(Mask, DL, VT));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
+ SDLoc(N0.getOperand(0)),
+ N0.getOperand(0).getValueType(), ExtLoad);
+ CombineTo(N, And);
+ CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
+ ISD::ZERO_EXTEND);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // fold (zext (zextload x)) -> (zext (truncate (zextload x)))
+ // fold (zext ( extload x)) -> (zext (truncate (zextload x)))
+ if ((ISD::isZEXTLoad(N0.getNode()) || ISD::isEXTLoad(N0.getNode())) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ EVT MemVT = LN0->getMemoryVT();
+ if ((!LegalOperations && !LN0->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, VT, MemVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), MemVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0), N0.getValueType(),
+ ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ if (N0.getOpcode() == ISD::SETCC) {
+ // Only do this before legalize for now.
+ if (!LegalOperations && VT.isVector() &&
+ N0.getValueType().getVectorElementType() == MVT::i1) {
+ EVT N00VT = N0.getOperand(0).getValueType();
+ if (getSetCCResultType(N00VT) == N0.getValueType())
+ return SDValue();
+
+ // We know that the # elements of the result is the same as the #
+ // elements of the compare (and the # elements of the compare result for
+ // that matter). Check to see that they are the same size. If so, we know
+ // that the element size of the zext'd result matches the element size of
+ // the compare operands.
+ SDLoc DL(N);
+ SDValue VecOnes = DAG.getConstant(1, DL, VT);
+ if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
+ // zext(setcc) -> (and (vsetcc), (1, 1, ...)) for vectors.
+ SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
+ }
+
+ // If the desired elements are smaller or larger than the source
+ // elements we can use a matching integer vector type and then
+ // truncate/sign extend.
+ EVT MatchingElementType = EVT::getIntegerVT(
+ *DAG.getContext(), N00VT.getScalarType().getSizeInBits());
+ EVT MatchingVectorType = EVT::getVectorVT(
+ *DAG.getContext(), MatchingElementType, N00VT.getVectorNumElements());
+ SDValue VsetCC =
+ DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1), N0.getOperand(2));
+ return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
+ VecOnes);
+ }
+
+ // zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+ SDLoc DL(N);
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
+ }
+
+ // (zext (shl (zext x), cst)) -> (shl (zext x), cst)
+ // (zext (srl (zext x), cst)) -> (srl (zext x), cst)
+ if ((N0.getOpcode() == ISD::SHL || N0.getOpcode() == ISD::SRL) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ N0.getOperand(0).getOpcode() == ISD::ZERO_EXTEND &&
+ N0.hasOneUse()) {
+ SDValue ShAmt = N0.getOperand(1);
+ unsigned ShAmtVal = cast<ConstantSDNode>(ShAmt)->getZExtValue();
+ if (N0.getOpcode() == ISD::SHL) {
+ SDValue InnerZExt = N0.getOperand(0);
+ // If the original shl may be shifting out bits, do not perform this
+ // transformation.
+ unsigned KnownZeroBits = InnerZExt.getValueType().getSizeInBits() -
+ InnerZExt.getOperand(0).getValueType().getSizeInBits();
+ if (ShAmtVal > KnownZeroBits)
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+
+ // Ensure that the shift amount is wide enough for the shifted value.
+ if (VT.getSizeInBits() >= 256)
+ ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
+
+ return DAG.getNode(N0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0)),
+ ShAmt);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ // fold (aext (aext x)) -> (aext x)
+ // fold (aext (zext x)) -> (zext x)
+ // fold (aext (sext x)) -> (sext x)
+ if (N0.getOpcode() == ISD::ANY_EXTEND ||
+ N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+
+ // fold (aext (truncate (load x))) -> (aext (smaller load x))
+ // fold (aext (truncate (srl (load x), c))) -> (aext (smaller load (x+c/n)))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ if (SDValue NarrowLoad = ReduceLoadWidth(N0.getNode())) {
+ SDNode* oye = N0.getNode()->getOperand(0).getNode();
+ if (NarrowLoad.getNode() != N0.getNode()) {
+ CombineTo(N0.getNode(), NarrowLoad);
+ // CombineTo deleted the truncate, if needed, but not what's under it.
+ AddToWorklist(oye);
+ }
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (aext (truncate x))
+ if (N0.getOpcode() == ISD::TRUNCATE) {
+ SDValue TruncOp = N0.getOperand(0);
+ if (TruncOp.getValueType() == VT)
+ return TruncOp; // x iff x size == zext size.
+ if (TruncOp.getValueType().bitsGT(VT))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp);
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp);
+ }
+
+ // Fold (aext (and (trunc x), cst)) -> (and x, cst)
+ // if the trunc is not free.
+ if (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(1).getOpcode() == ISD::Constant &&
+ !TLI.isTruncateFree(N0.getOperand(0).getOperand(0).getValueType(),
+ N0.getValueType())) {
+ SDValue X = N0.getOperand(0).getOperand(0);
+ if (X.getValueType().bitsLT(VT)) {
+ X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, X);
+ } else if (X.getValueType().bitsGT(VT)) {
+ X = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, X);
+ }
+ APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ Mask = Mask.zext(VT.getSizeInBits());
+ SDLoc DL(N);
+ return DAG.getNode(ISD::AND, DL, VT,
+ X, DAG.getConstant(Mask, DL, VT));
+ }
+
+ // fold (aext (load x)) -> (aext (truncate (extload x)))
+ // None of the supported targets knows how to perform load and any_ext
+ // on vectors in one instruction. We only perform this transformation on
+ // scalars.
+ if (ISD::isNON_EXTLoad(N0.getNode()) && !VT.isVector() &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ bool DoXform = true;
+ SmallVector<SDNode*, 4> SetCCs;
+ if (!N0.hasOneUse())
+ DoXform = ExtendUsesToFormExtLoad(N, N0, ISD::ANY_EXTEND, SetCCs, TLI);
+ if (DoXform) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad);
+ CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
+ ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
+ ISD::ANY_EXTEND);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // fold (aext (zextload x)) -> (aext (truncate (zextload x)))
+ // fold (aext (sextload x)) -> (aext (truncate (sextload x)))
+ // fold (aext ( extload x)) -> (aext (truncate (extload x)))
+ if (N0.getOpcode() == ISD::LOAD &&
+ !ISD::isNON_EXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ ISD::LoadExtType ExtType = LN0->getExtensionType();
+ EVT MemVT = LN0->getMemoryVT();
+ if (!LegalOperations || TLI.isLoadExtLegal(ExtType, VT, MemVT)) {
+ SDValue ExtLoad = DAG.getExtLoad(ExtType, SDLoc(N),
+ VT, LN0->getChain(), LN0->getBasePtr(),
+ MemVT, LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), ExtLoad),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+
+ if (N0.getOpcode() == ISD::SETCC) {
+ // For vectors:
+ // aext(setcc) -> vsetcc
+ // aext(setcc) -> truncate(vsetcc)
+ // aext(setcc) -> aext(vsetcc)
+ // Only do this before legalize for now.
+ if (VT.isVector() && !LegalOperations) {
+ EVT N0VT = N0.getOperand(0).getValueType();
+ // We know that the # elements of the result is the same as the
+ // # elements of the compare (and the # elements of the compare result
+ // for that matter). Check to see that they are the same size. If so,
+ // we know that the element size of the extended result matches the
+ // element size of the compare operands.
+ if (VT.getSizeInBits() == N0VT.getSizeInBits())
+ return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ // If the desired elements are smaller or larger than the source
+ // elements we can use a matching integer vector type and then
+ // truncate/any extend
+ else {
+ EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
+ SDValue VsetCC =
+ DAG.getSetCC(SDLoc(N), MatchingVectorType, N0.getOperand(0),
+ N0.getOperand(1),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+ return DAG.getAnyExtOrTrunc(VsetCC, SDLoc(N), VT);
+ }
+ }
+
+ // aext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
+ SDLoc DL(N);
+ if (SDValue SCC = SimplifySelectCC(
+ DL, N0.getOperand(0), N0.getOperand(1), DAG.getConstant(1, DL, VT),
+ DAG.getConstant(0, DL, VT),
+ cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+ return SCC;
+ }
+
+ return SDValue();
+}
+
+/// See if the specified operand can be simplified with the knowledge that only
+/// the bits specified by Mask are used. If so, return the simpler operand,
+/// otherwise return a null SDValue.
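+/// For example, given V = (or (shl x, 8), y) and Mask = 0xff, the shifted
+/// operand contributes no demanded bits, so y is returned.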
+SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
+ switch (V.getOpcode()) {
+ default: break;
+ case ISD::Constant: {
+ const ConstantSDNode *CV = cast<ConstantSDNode>(V.getNode());
+ assert(CV && "Const value should be ConstSDNode.");
+ const APInt &CVal = CV->getAPIntValue();
+ APInt NewVal = CVal & Mask;
+ if (NewVal != CVal)
+ return DAG.getConstant(NewVal, SDLoc(V), V.getValueType());
+ break;
+ }
+ case ISD::OR:
+ case ISD::XOR:
+ // If the LHS or RHS don't contribute bits to the or, drop them.
+ if (DAG.MaskedValueIsZero(V.getOperand(0), Mask))
+ return V.getOperand(1);
+ if (DAG.MaskedValueIsZero(V.getOperand(1), Mask))
+ return V.getOperand(0);
+ break;
+ case ISD::SRL:
+ // Only look at single-use SRLs.
+ if (!V.getNode()->hasOneUse())
+ break;
+ if (ConstantSDNode *RHSC = getAsNonOpaqueConstant(V.getOperand(1))) {
+ // See if we can recursively simplify the LHS.
+ unsigned Amt = RHSC->getZExtValue();
+
+ // Watch out for shift count overflow though.
+ if (Amt >= Mask.getBitWidth()) break;
+ APInt NewMask = Mask << Amt;
+ if (SDValue SimplifyLHS = GetDemandedBits(V.getOperand(0), NewMask))
+ return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
+ SimplifyLHS, V.getOperand(1));
+ }
+ }
+ return SDValue();
+}
+
+/// If the result of a wider load is shifted right by N bits and then truncated
+/// to a narrower type, where N is a multiple of the number of bits in the
+/// narrower type, transform it to a narrower load from address + N / (number
+/// of bits in the new type). If the result is to be extended, also fold the
+/// extension to form an extending load.
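+/// For illustration (assuming a little-endian, byte-addressable target):
+///   (i8 (trunc (srl (i32 (load p)), 16))) -> (i8 (load p + 2))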
+SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
+ unsigned Opc = N->getOpcode();
+
+ ISD::LoadExtType ExtType = ISD::NON_EXTLOAD;
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT ExtVT = VT;
+
+ // This transformation isn't valid for vector loads.
+ if (VT.isVector())
+ return SDValue();
+
+ // Special case: SIGN_EXTEND_INREG is basically truncating to ExtVT then
+ // sign-extending back to VT.
+ if (Opc == ISD::SIGN_EXTEND_INREG) {
+ ExtType = ISD::SEXTLOAD;
+ ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ } else if (Opc == ISD::SRL) {
+ // Another special-case: SRL is basically zero-extending a narrower value.
+ ExtType = ISD::ZEXTLOAD;
+ N0 = SDValue(N, 0);
+ ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!N01) return SDValue();
+ ExtVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits() - N01->getZExtValue());
+ }
+ if (LegalOperations && !TLI.isLoadExtLegal(ExtType, VT, ExtVT))
+ return SDValue();
+
+ unsigned EVTBits = ExtVT.getSizeInBits();
+
+ // Do not generate loads of non-round integer types since these can
+ // be expensive (and would be wrong if the type is not byte sized).
+ if (!ExtVT.isRound())
+ return SDValue();
+
+ unsigned ShAmt = 0;
+ if (N0.getOpcode() == ISD::SRL && N0.hasOneUse()) {
+ if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ ShAmt = N01->getZExtValue();
+ // Is the shift amount a multiple of the size of ExtVT?
+ if ((ShAmt & (EVTBits-1)) == 0) {
+ N0 = N0.getOperand(0);
+ // Is the load width a multiple of the size of ExtVT?
+ if ((N0.getValueType().getSizeInBits() & (EVTBits-1)) != 0)
+ return SDValue();
+ }
+
+ // At this point, we must have a load or else we can't do the transform.
+ if (!isa<LoadSDNode>(N0)) return SDValue();
+
+ // Because an SRL must be assumed to *need* to zero-extend the high bits
+ // (as opposed to anyext the high bits), we can't combine the zextload
+ // lowering of SRL and an sextload.
+ if (cast<LoadSDNode>(N0)->getExtensionType() == ISD::SEXTLOAD)
+ return SDValue();
+
+ // If the shift amount is larger than the input type then we're not
+ // accessing any of the loaded bytes. If the load was a zextload/extload
+ // then the result of the shift+trunc is zero/undef (handled elsewhere).
+ if (ShAmt >= cast<LoadSDNode>(N0)->getMemoryVT().getSizeInBits())
+ return SDValue();
+ }
+ }
+
+ // If the load is shifted left (and the result isn't shifted back right),
+ // we can fold the truncate through the shift.
+ unsigned ShLeftAmt = 0;
+ if (ShAmt == 0 && N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+ ExtVT == VT && TLI.isNarrowingProfitable(N0.getValueType(), VT)) {
+ if (ConstantSDNode *N01 = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ ShLeftAmt = N01->getZExtValue();
+ N0 = N0.getOperand(0);
+ }
+ }
+
+ // If we haven't found a load, we can't narrow it. Don't transform one with
+ // multiple uses; this would require adding a new load.
+ if (!isa<LoadSDNode>(N0) || !N0.hasOneUse())
+ return SDValue();
+
+ // Don't change the width of a volatile load.
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ if (LN0->isVolatile())
+ return SDValue();
+
+ // Verify that we are actually reducing a load width here.
+ if (LN0->getMemoryVT().getSizeInBits() < EVTBits)
+ return SDValue();
+
+ // For the transform to be legal, the load must produce only two values
+ // (the value loaded and the chain). Don't transform a pre-increment
+ // load, for example, which produces an extra value. Otherwise the
+ // transformation is not equivalent, and the downstream logic to replace
+ // uses gets things wrong.
+ if (LN0->getNumValues() > 2)
+ return SDValue();
+
+ // If the load that we're shrinking is an extload and we're not just
+ // discarding the extension we can't simply shrink the load. Bail.
+ // TODO: It would be possible to merge the extensions in some cases.
+ if (LN0->getExtensionType() != ISD::NON_EXTLOAD &&
+ LN0->getMemoryVT().getSizeInBits() < ExtVT.getSizeInBits() + ShAmt)
+ return SDValue();
+
+ if (!TLI.shouldReduceLoadWidth(LN0, ExtType, ExtVT))
+ return SDValue();
+
+ EVT PtrType = N0.getOperand(1).getValueType();
+
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ // It's not possible to generate a constant of extended or untyped type.
+ return SDValue();
+
+ // For big endian targets, we need to adjust the offset to the pointer to
+ // load the correct bytes.
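+ // E.g. narrowing a big-endian i32 load to its low i8 must load from byte
+ // offset 3 rather than offset 0.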
+ if (DAG.getDataLayout().isBigEndian()) {
+ unsigned LVTStoreBits = LN0->getMemoryVT().getStoreSizeInBits();
+ unsigned EVTStoreBits = ExtVT.getStoreSizeInBits();
+ ShAmt = LVTStoreBits - EVTStoreBits - ShAmt;
+ }
+
+ uint64_t PtrOff = ShAmt / 8;
+ unsigned NewAlign = MinAlign(LN0->getAlignment(), PtrOff);
+ SDLoc DL(LN0);
+ // The original load itself didn't wrap, so an offset within it doesn't.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ SDValue NewPtr = DAG.getNode(ISD::ADD, DL,
+ PtrType, LN0->getBasePtr(),
+ DAG.getConstant(PtrOff, DL, PtrType),
+ &Flags);
+ AddToWorklist(NewPtr.getNode());
+
+ SDValue Load;
+ if (ExtType == ISD::NON_EXTLOAD)
+ Load = DAG.getLoad(VT, SDLoc(N0), LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ else
+ Load = DAG.getExtLoad(ExtType, SDLoc(N0), VT, LN0->getChain(), NewPtr,
+ LN0->getPointerInfo().getWithOffset(PtrOff), ExtVT,
+ NewAlign, LN0->getMemOperand()->getFlags(),
+ LN0->getAAInfo());
+
+ // Replace the old load's chain with the new load's chain.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+
+ // Shift the result left, if we've swallowed a left shift.
+ SDValue Result = Load;
+ if (ShLeftAmt != 0) {
+ EVT ShImmTy = getShiftAmountTy(Result.getValueType());
+ if (!isUIntN(ShImmTy.getSizeInBits(), ShLeftAmt))
+ ShImmTy = VT;
+ // If the shift amount is as large as the result size (but, presumably,
+ // no larger than the source) then the useful bits of the result are
+ // zero; we can't simply return the shortened shift, because the result
+ // of that operation is undefined.
+ SDLoc DL(N0);
+ if (ShLeftAmt >= VT.getSizeInBits())
+ Result = DAG.getConstant(0, DL, VT);
+ else
+ Result = DAG.getNode(ISD::SHL, DL, VT,
+ Result, DAG.getConstant(ShLeftAmt, DL, ShImmTy));
+ }
+
+ // Return the new loaded value.
+ return Result;
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ EVT EVT = cast<VTSDNode>(N1)->getVT();
+ unsigned VTBits = VT.getScalarType().getSizeInBits();
+ unsigned EVTBits = EVT.getScalarType().getSizeInBits();
+
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ // fold (sext_in_reg c1) -> c1
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
+
+ // If the input is already sign extended, just drop the extension.
+ if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
+ return N0;
+
+ // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
+ if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
+ N0.getOperand(0), N1);
+
+ // fold (sext_in_reg (sext x)) -> (sext x)
+ // fold (sext_in_reg (aext x)) -> (sext x)
+ // if x is small enough.
+ if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getValueType().getScalarType().getSizeInBits() <= EVTBits &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
+ return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
+ }
+
+ // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
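+ // (The in-reg sign bit is bit EVTBits-1; if it is known to be zero, sign-
+ // and zero-extension produce the same value.)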
+ if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+ return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT);
+
+ // fold operands of sext_in_reg based on knowledge that the top bits are not
+ // demanded.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ // fold (sext_in_reg (load x)) -> (smaller sextload x)
+ // fold (sext_in_reg (srl (load x), c)) -> (smaller sextload (x+c/evtbits))
+ if (SDValue NarrowLoad = ReduceLoadWidth(N))
+ return NarrowLoad;
+
+ // fold (sext_in_reg (srl X, 24), i8) -> (sra X, 24)
+ // fold (sext_in_reg (srl X, 23), i8) -> (sra X, 23) iff possible.
+ // We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
+ if (N0.getOpcode() == ISD::SRL) {
+ if (ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
+ if (ShAmt->getZExtValue()+EVTBits <= VTBits) {
+ // We can turn this into an SRA iff the input to the SRL is already sign
+ // extended enough.
+ unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
+ if (VTBits-(ShAmt->getZExtValue()+EVTBits) < InSignBits)
+ return DAG.getNode(ISD::SRA, SDLoc(N), VT,
+ N0.getOperand(0), N0.getOperand(1));
+ }
+ }
+
+ // fold (sext_inreg (extload x)) -> (sextload x)
+ if (ISD::isEXTLoad(N0.getNode()) &&
+ ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), EVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ AddToWorklist(ExtLoad.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
+ if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
+ N0.hasOneUse() &&
+ EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ((!LegalOperations && !cast<LoadSDNode>(N0)->isVolatile()) ||
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), EVT,
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ // Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
+ if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
+ if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
+ N0.getOperand(1), false))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
+ BSwap, N1);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.isUndef())
+ return DAG.getUNDEF(VT);
+
+ if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
+ LegalOperations))
+ return SDValue(Res, 0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+
+ // noop truncate
+ if (N0.getValueType() == N->getValueType(0))
+ return N0;
+ // fold (truncate c1) -> c1
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0);
+ // fold (truncate (truncate x)) -> (truncate x)
+ if (N0.getOpcode() == ISD::TRUNCATE)
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+ // fold (truncate (ext x)) -> (ext x) or (truncate x) or x
+ if (N0.getOpcode() == ISD::ZERO_EXTEND ||
+ N0.getOpcode() == ISD::SIGN_EXTEND ||
+ N0.getOpcode() == ISD::ANY_EXTEND) {
+ // if the source is smaller than the dest, we still need an extend.
+ if (N0.getOperand(0).getValueType().bitsLT(VT))
+ return DAG.getNode(N0.getOpcode(), SDLoc(N), VT, N0.getOperand(0));
+ // if the source is larger than the dest, then we just need the truncate.
+ if (N0.getOperand(0).getValueType().bitsGT(VT))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, N0.getOperand(0));
+ // if the source and dest are the same type, we can drop both the extend
+ // and the truncate.
+ return N0.getOperand(0);
+ }
+
+ // Fold extract-and-trunc into a narrow extract. For example:
+ // i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
+ // i32 y = TRUNCATE(i64 x)
+ // -- becomes --
+ // v16i8 b = BITCAST (v2i64 val)
+ // i8 x = EXTRACT_VECTOR_ELT(v16i8 b, i32 8)
+ //
+ // Note: We only run this optimization after type legalization (which often
+ // creates this pattern) and before operation legalization after which
+ // we need to be more careful about the vector instructions that we generate.
+ if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
+
+ EVT VecTy = N0.getOperand(0).getValueType();
+ EVT ExTy = N0.getValueType();
+ EVT TrTy = N->getValueType(0);
+
+ unsigned NumElem = VecTy.getVectorNumElements();
+ unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, SizeRatio * NumElem);
+ assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
+
+ SDValue EltNo = N0->getOperand(1);
+ if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+ EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
+
+ SDLoc DL(N);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
+ DAG.getBitcast(NVT, N0.getOperand(0)),
+ DAG.getConstant(Index, DL, IndexTy));
+ }
+ }
+
+ // trunc (select c, a, b) -> select c, (trunc a), (trunc b)
+ if (N0.getOpcode() == ISD::SELECT) {
+ EVT SrcVT = N0.getValueType();
+ if ((!LegalOperations || TLI.isOperationLegal(ISD::SELECT, SrcVT)) &&
+ TLI.isTruncateFree(SrcVT, VT)) {
+ SDLoc SL(N0);
+ SDValue Cond = N0.getOperand(0);
+ SDValue TruncOp0 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+ SDValue TruncOp1 = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(2));
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT, Cond, TruncOp0, TruncOp1);
+ }
+ }
+
+ // trunc (shl x, K) -> shl (trunc x), K, when K < VT.getSizeInBits() / 2
+ if (N0.getOpcode() == ISD::SHL && N0.hasOneUse() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::SHL, VT)) &&
+ TLI.isTypeDesirableForOp(ISD::SHL, VT)) {
+ if (const ConstantSDNode *CAmt = isConstOrConstSplat(N0.getOperand(1))) {
+ uint64_t Amt = CAmt->getZExtValue();
+ unsigned Size = VT.getSizeInBits();
+
+ if (Amt < Size / 2) {
+ SDLoc SL(N);
+ EVT AmtVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+ return DAG.getNode(ISD::SHL, SL, VT, Trunc,
+ DAG.getConstant(Amt, SL, AmtVT));
+ }
+ }
+ }
+
+ // Fold a series of buildvector, bitcast, and truncate if possible.
+ // For example fold
+ // (2xi32 trunc (bitcast ((4xi32)buildvector x, x, y, y) 2xi64)) to
+ // (2xi32 (buildvector x, y)).
+ if (Level == AfterLegalizeVectorOps && VT.isVector() &&
+ N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+ N0.getOperand(0).getOpcode() == ISD::BUILD_VECTOR &&
+ N0.getOperand(0).hasOneUse()) {
+
+ SDValue BuildVect = N0.getOperand(0);
+ EVT BuildVectEltTy = BuildVect.getValueType().getVectorElementType();
+ EVT TruncVecEltTy = VT.getVectorElementType();
+
+ // Check that the element types match.
+ if (BuildVectEltTy == TruncVecEltTy) {
+ // Now we only need to compute the offset of the truncated elements.
+ unsigned BuildVecNumElts = BuildVect.getNumOperands();
+ unsigned TruncVecNumElts = VT.getVectorNumElements();
+ unsigned TruncEltOffset = BuildVecNumElts / TruncVecNumElts;
+
+ assert((BuildVecNumElts % TruncVecNumElts) == 0 &&
+ "Invalid number of elements");
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, e = BuildVecNumElts; i != e; i += TruncEltOffset)
+ Opnds.push_back(BuildVect.getOperand(i));
+
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+ }
+ }
+
+ // See if we can simplify the input to this truncate through knowledge that
+ // only the low bits are being used.
+ // For example "trunc (or (shl x, 8), y)" // -> trunc y
+ // Currently we only perform this optimization on scalars because vectors
+ // may have different active low bits.
+ if (!VT.isVector()) {
+ if (SDValue Shorter =
+ GetDemandedBits(N0, APInt::getLowBitsSet(N0.getValueSizeInBits(),
+ VT.getSizeInBits())))
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
+ }
+ // fold (truncate (load x)) -> (smaller load x)
+ // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
+ if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
+ if (SDValue Reduced = ReduceLoadWidth(N))
+ return Reduced;
+
+ // Handle the case where the load remains an extending load even
+ // after truncation.
+ if (N0.hasOneUse() && ISD::isUNINDEXEDLoad(N0.getNode())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ if (!LN0->isVolatile() &&
+ LN0->getMemoryVT().getStoreSizeInBits() < VT.getSizeInBits()) {
+ SDValue NewLoad = DAG.getExtLoad(LN0->getExtensionType(), SDLoc(LN0),
+ VT, LN0->getChain(), LN0->getBasePtr(),
+ LN0->getMemoryVT(),
+ LN0->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLoad.getValue(1));
+ return NewLoad;
+ }
+ }
+ }
+ // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
+ // where ... are all 'undef'.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
+ SmallVector<EVT, 8> VTs;
+ SDValue V;
+ unsigned Idx = 0;
+ unsigned NumDefs = 0;
+
+ for (unsigned i = 0, e = N0.getNumOperands(); i != e; ++i) {
+ SDValue X = N0.getOperand(i);
+ if (!X.isUndef()) {
+ V = X;
+ Idx = i;
+ NumDefs++;
+ }
+ // Stop if more than one member is non-undef.
+ if (NumDefs > 1)
+ break;
+ VTs.push_back(EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ X.getValueType().getVectorNumElements()));
+ }
+
+ if (NumDefs == 0)
+ return DAG.getUNDEF(VT);
+
+ if (NumDefs == 1) {
+ assert(V.getNode() && "The single defined operand is empty!");
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ if (i != Idx) {
+ Opnds.push_back(DAG.getUNDEF(VTs[i]));
+ continue;
+ }
+ SDValue NV = DAG.getNode(ISD::TRUNCATE, SDLoc(V), VTs[i], V);
+ AddToWorklist(NV.getNode());
+ Opnds.push_back(NV);
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Opnds);
+ }
+ }
+
+ // Fold truncate of a bitcast of a vector to an extract of the low vector
+ // element.
+ //
+ // e.g. trunc (i64 (bitcast v2i32:x)) -> extract_vector_elt v2i32:x, 0
+ if (N0.getOpcode() == ISD::BITCAST && !VT.isVector()) {
+ SDValue VecSrc = N0.getOperand(0);
+ EVT SrcVT = VecSrc.getValueType();
+ if (SrcVT.isVector() && SrcVT.getScalarType() == VT &&
+ (!LegalOperations ||
+ TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, SrcVT))) {
+ SDLoc SL(N);
+
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT,
+ VecSrc, DAG.getConstant(0, SL, IdxVT));
+ }
+ }
+
+ // Simplify the operands using demanded-bits information.
+ if (!VT.isVector() &&
+ SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDNode *getBuildPairElt(SDNode *N, unsigned i) {
+ SDValue Elt = N->getOperand(i);
+ if (Elt.getOpcode() != ISD::MERGE_VALUES)
+ return Elt.getNode();
+ return Elt.getOperand(Elt.getResNo()).getNode();
+}
+
+/// build_pair (load, load) -> load
+/// if load locations are consecutive.
+SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
+ assert(N->getOpcode() == ISD::BUILD_PAIR);
+
+ LoadSDNode *LD1 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 0));
+ LoadSDNode *LD2 = dyn_cast<LoadSDNode>(getBuildPairElt(N, 1));
+ if (!LD1 || !LD2 || !ISD::isNON_EXTLoad(LD1) || !LD1->hasOneUse() ||
+ LD1->getAddressSpace() != LD2->getAddressSpace())
+ return SDValue();
+ EVT LD1VT = LD1->getValueType(0);
+ unsigned LD1Bytes = LD1VT.getSizeInBits() / 8;
+ if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
+ DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
+ unsigned Align = LD1->getAlignment();
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ VT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign <= Align &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
+ return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
+ LD1->getPointerInfo(), Align);
+ }
+
+ return SDValue();
+}
+
+static unsigned getPPCf128HiElementSelector(const SelectionDAG &DAG) {
+ // On little-endian machines, bitcasting from ppcf128 to i128 does swap the Hi
+ // and Lo parts; on big-endian machines it doesn't.
+ return DAG.getDataLayout().isBigEndian() ? 1 : 0;
+}
+
+static SDValue foldBitcastedFPLogic(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ // If this is not a bitcast to an FP type or if the target doesn't have
+ // IEEE754-compliant FP logic, we're done.
+ EVT VT = N->getValueType(0);
+ if (!VT.isFloatingPoint() || !TLI.hasBitPreservingFPLogic(VT))
+ return SDValue();
+
+ // TODO: Use splat values for the constant-checking below and remove this
+ // restriction.
+ SDValue N0 = N->getOperand(0);
+ EVT SourceVT = N0.getValueType();
+ if (SourceVT.isVector())
+ return SDValue();
+
+ unsigned FPOpcode;
+ APInt SignMask;
+ switch (N0.getOpcode()) {
+ case ISD::AND:
+ FPOpcode = ISD::FABS;
+ SignMask = ~APInt::getSignBit(SourceVT.getSizeInBits());
+ break;
+ case ISD::XOR:
+ FPOpcode = ISD::FNEG;
+ SignMask = APInt::getSignBit(SourceVT.getSizeInBits());
+ break;
+ // TODO: ISD::OR --> ISD::FNABS?
+ default:
+ return SDValue();
+ }
+
+ // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
+ // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
+ SDValue LogicOp0 = N0.getOperand(0);
+ ConstantSDNode *LogicOp1 = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
+ LogicOp0.getOpcode() == ISD::BITCAST &&
+ LogicOp0->getOperand(0).getValueType() == VT)
+ return DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0->getOperand(0));
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBITCAST(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // If the input is a BUILD_VECTOR with all constant elements, fold this now.
+ // Only do this before legalize, since afterward the target may be depending
+ // on the bitconvert.
+ // First check to see if this is all constant.
+ if (!LegalTypes &&
+ N0.getOpcode() == ISD::BUILD_VECTOR && N0.getNode()->hasOneUse() &&
+ VT.isVector()) {
+ bool isSimple = cast<BuildVectorSDNode>(N0)->isConstant();
+
+ EVT DestEltVT = N->getValueType(0).getVectorElementType();
+ assert(!DestEltVT.isVector() &&
+ "Element type of vector ValueType must not be vector!");
+ if (isSimple)
+ return ConstantFoldBITCASTofBUILD_VECTOR(N0.getNode(), DestEltVT);
+ }
+
+ // If the input is a constant, let getNode fold it.
+ if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0)) {
+ // If we can't allow illegal operations, we need to check that this is just
+ // an fp -> int or int -> fp conversion and that the resulting operation will
+ // be legal.
+ if (!LegalOperations ||
+ (isa<ConstantSDNode>(N0) && VT.isFloatingPoint() && !VT.isVector() &&
+ TLI.isOperationLegal(ISD::ConstantFP, VT)) ||
+ (isa<ConstantFPSDNode>(N0) && VT.isInteger() && !VT.isVector() &&
+ TLI.isOperationLegal(ISD::Constant, VT)))
+ return DAG.getBitcast(VT, N0);
+ }
+
+ // (conv (conv x, t1), t2) -> (conv x, t2)
+ if (N0.getOpcode() == ISD::BITCAST)
+ return DAG.getBitcast(VT, N0.getOperand(0));
+
+ // fold (conv (load x)) -> (load (conv*)x)
+ // If the resultant load doesn't need a higher alignment than the original!
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ // Do not change the width of a volatile load.
+ !cast<LoadSDNode>(N0)->isVolatile() &&
+ // Do not remove the cast if the types differ in endian layout.
+ TLI.hasBigEndianPartOrdering(N0.getValueType(), DAG.getDataLayout()) ==
+ TLI.hasBigEndianPartOrdering(VT, DAG.getDataLayout()) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)) &&
+ TLI.isLoadBitCastBeneficial(N0.getValueType(), VT)) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ unsigned OrigAlign = LN0->getAlignment();
+
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ LN0->getAddressSpace(), OrigAlign, &Fast) &&
+ Fast) {
+ SDValue Load =
+ DAG.getLoad(VT, SDLoc(N), LN0->getChain(), LN0->getBasePtr(),
+ LN0->getPointerInfo(), OrigAlign,
+ LN0->getMemOperand()->getFlags(), LN0->getAAInfo());
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ if (SDValue V = foldBitcastedFPLogic(N, DAG, TLI))
+ return V;
+
+ // fold (bitconvert (fneg x)) -> (xor (bitconvert x), signbit)
+ // fold (bitconvert (fabs x)) -> (and (bitconvert x), (not signbit))
+ //
+ // For ppc_fp128:
+ // fold (bitcast (fneg x)) ->
+ // flipbit = signbit
+ // (xor (bitcast x) (build_pair flipbit, flipbit))
+ //
+ // fold (bitcast (fabs x)) ->
+ // flipbit = (and (extract_element (bitcast x), 0), signbit)
+ // (xor (bitcast x) (build_pair flipbit, flipbit))
+ // This often reduces constant pool loads.
+ if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(N0.getValueType())) ||
+ (N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(N0.getValueType()))) &&
+ N0.getNode()->hasOneUse() && VT.isInteger() &&
+ !VT.isVector() && !N0.getValueType().isVector()) {
+ SDValue NewConv = DAG.getBitcast(VT, N0.getOperand(0));
+ AddToWorklist(NewConv.getNode());
+
+ SDLoc DL(N);
+ if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
+ assert(VT.getSizeInBits() == 128);
+ SDValue SignBit = DAG.getConstant(
+ APInt::getSignBit(VT.getSizeInBits() / 2), SDLoc(N0), MVT::i64);
+ SDValue FlipBit;
+ if (N0.getOpcode() == ISD::FNEG) {
+ FlipBit = SignBit;
+ AddToWorklist(FlipBit.getNode());
+ } else {
+ assert(N0.getOpcode() == ISD::FABS);
+ SDValue Hi =
+ DAG.getNode(ISD::EXTRACT_ELEMENT, SDLoc(NewConv), MVT::i64, NewConv,
+ DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
+ SDLoc(NewConv)));
+ AddToWorklist(Hi.getNode());
+ FlipBit = DAG.getNode(ISD::AND, SDLoc(N0), MVT::i64, Hi, SignBit);
+ AddToWorklist(FlipBit.getNode());
+ }
+ SDValue FlipBits =
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
+ AddToWorklist(FlipBits.getNode());
+ return DAG.getNode(ISD::XOR, DL, VT, NewConv, FlipBits);
+ }
+ APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ if (N0.getOpcode() == ISD::FNEG)
+ return DAG.getNode(ISD::XOR, DL, VT,
+ NewConv, DAG.getConstant(SignBit, DL, VT));
+ assert(N0.getOpcode() == ISD::FABS);
+ return DAG.getNode(ISD::AND, DL, VT,
+ NewConv, DAG.getConstant(~SignBit, DL, VT));
+ }
+
+ // fold (bitconvert (fcopysign cst, x)) ->
+ // (or (and (bitconvert x), sign), (and cst, (not sign)))
+ // Note that we don't handle (copysign x, cst) because this can always be
+ // folded to an fneg or fabs.
+ //
+ // For ppc_fp128:
+ // fold (bitcast (fcopysign cst, x)) ->
+ // flipbit = (and (extract_element
+ // (xor (bitcast cst), (bitcast x)), 0),
+ // signbit)
+ // (xor (bitcast cst) (build_pair flipbit, flipbit))
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse() &&
+ isa<ConstantFPSDNode>(N0.getOperand(0)) &&
+ VT.isInteger() && !VT.isVector()) {
+ unsigned OrigXWidth = N0.getOperand(1).getValueType().getSizeInBits();
+ EVT IntXVT = EVT::getIntegerVT(*DAG.getContext(), OrigXWidth);
+ if (isTypeLegal(IntXVT)) {
+ SDValue X = DAG.getBitcast(IntXVT, N0.getOperand(1));
+ AddToWorklist(X.getNode());
+
+ // If X has a different width than the result/lhs, sext it or truncate it.
+ unsigned VTWidth = VT.getSizeInBits();
+ if (OrigXWidth < VTWidth) {
+ X = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, X);
+ AddToWorklist(X.getNode());
+ } else if (OrigXWidth > VTWidth) {
+ // To get the sign bit in the right place, we have to shift it right
+ // before truncating.
+ SDLoc DL(X);
+ X = DAG.getNode(ISD::SRL, DL,
+ X.getValueType(), X,
+ DAG.getConstant(OrigXWidth-VTWidth, DL,
+ X.getValueType()));
+ AddToWorklist(X.getNode());
+ X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X);
+ AddToWorklist(X.getNode());
+ }
+
+ if (N0.getValueType() == MVT::ppcf128 && !LegalTypes) {
+ APInt SignBit = APInt::getSignBit(VT.getSizeInBits() / 2);
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
+ AddToWorklist(Cst.getNode());
+ SDValue X = DAG.getBitcast(VT, N0.getOperand(1));
+ AddToWorklist(X.getNode());
+ SDValue XorResult = DAG.getNode(ISD::XOR, SDLoc(N0), VT, Cst, X);
+ AddToWorklist(XorResult.getNode());
+ SDValue XorResult64 = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, SDLoc(XorResult), MVT::i64, XorResult,
+ DAG.getIntPtrConstant(getPPCf128HiElementSelector(DAG),
+ SDLoc(XorResult)));
+ AddToWorklist(XorResult64.getNode());
+ SDValue FlipBit =
+ DAG.getNode(ISD::AND, SDLoc(XorResult64), MVT::i64, XorResult64,
+ DAG.getConstant(SignBit, SDLoc(XorResult64), MVT::i64));
+ AddToWorklist(FlipBit.getNode());
+ SDValue FlipBits =
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N0), VT, FlipBit, FlipBit);
+ AddToWorklist(FlipBits.getNode());
+ return DAG.getNode(ISD::XOR, SDLoc(N), VT, Cst, FlipBits);
+ }
+ APInt SignBit = APInt::getSignBit(VT.getSizeInBits());
+ X = DAG.getNode(ISD::AND, SDLoc(X), VT,
+ X, DAG.getConstant(SignBit, SDLoc(X), VT));
+ AddToWorklist(X.getNode());
+
+ SDValue Cst = DAG.getBitcast(VT, N0.getOperand(0));
+ Cst = DAG.getNode(ISD::AND, SDLoc(Cst), VT,
+ Cst, DAG.getConstant(~SignBit, SDLoc(Cst), VT));
+ AddToWorklist(Cst.getNode());
+
+ return DAG.getNode(ISD::OR, SDLoc(N), VT, X, Cst);
+ }
+ }
+
+ // bitconvert(build_pair(ld, ld)) -> ld iff load locations are consecutive.
+ if (N0.getOpcode() == ISD::BUILD_PAIR)
+ if (SDValue CombineLD = CombineConsecutiveLoads(N0.getNode(), VT))
+ return CombineLD;
+
+ // Remove double bitcasts from shuffles - this is often a legacy of
+ // XformToShuffleWithZero being used to combine bitmaskings (of
+ // float vectors bitcast to integer vectors) into shuffles.
+ // bitcast(shuffle(bitcast(s0),bitcast(s1))) -> shuffle(s0,s1)
+ if (Level < AfterLegalizeDAG && TLI.isTypeLegal(VT) && VT.isVector() &&
+ N0->getOpcode() == ISD::VECTOR_SHUFFLE &&
+ VT.getVectorNumElements() >= N0.getValueType().getVectorNumElements() &&
+ !(VT.getVectorNumElements() % N0.getValueType().getVectorNumElements())) {
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N0);
+
+ // If operands are a bitcast, peek through if it casts the original VT.
+ // If operands are a constant, just bitcast back to original VT.
+ auto PeekThroughBitcast = [&](SDValue Op) {
+ if (Op.getOpcode() == ISD::BITCAST &&
+ Op.getOperand(0).getValueType() == VT)
+ return SDValue(Op.getOperand(0));
+ if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) ||
+ ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()))
+ return DAG.getBitcast(VT, Op);
+ return SDValue();
+ };
+
+ SDValue SV0 = PeekThroughBitcast(N0->getOperand(0));
+ SDValue SV1 = PeekThroughBitcast(N0->getOperand(1));
+ if (!(SV0 && SV1))
+ return SDValue();
+
+ int MaskScale =
+ VT.getVectorNumElements() / N0.getValueType().getVectorNumElements();
+ SmallVector<int, 8> NewMask;
+ for (int M : SVN->getMask())
+ for (int i = 0; i != MaskScale; ++i)
+ NewMask.push_back(M < 0 ? -1 : M * MaskScale + i);
+
+ bool LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
+ if (!LegalMask) {
+ std::swap(SV0, SV1);
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ LegalMask = TLI.isShuffleMaskLegal(NewMask, VT);
+ }
+
+ if (LegalMask)
+ return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, NewMask);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ return CombineConsecutiveLoads(N, VT);
+}
+
+/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
+/// operands. DstEltVT indicates the destination element value type.
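+/// For example, bitcasting a constant v2i64 build_vector to v4i32 splits each
+/// 64-bit element into two 32-bit constants, ordered according to the
+/// target's endianness.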
+SDValue DAGCombiner::
+ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
+ EVT SrcEltVT = BV->getValueType(0).getVectorElementType();
+
+ // If this is already the right type, we're done.
+ if (SrcEltVT == DstEltVT) return SDValue(BV, 0);
+
+ unsigned SrcBitSize = SrcEltVT.getSizeInBits();
+ unsigned DstBitSize = DstEltVT.getSizeInBits();
+
+ // If this is a conversion of N elements of one type to N elements of another
+ // type, convert each element. This handles FP<->INT cases.
+ if (SrcBitSize == DstBitSize) {
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+ BV->getValueType(0).getVectorNumElements());
+
+ // Due to the FP element handling below calling this routine recursively,
+ // we can end up with a scalar-to-vector node here.
+ if (BV->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(BV), VT,
+ DAG.getBitcast(DstEltVT, BV->getOperand(0)));
+
+ SmallVector<SDValue, 8> Ops;
+ for (SDValue Op : BV->op_values()) {
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated. Make that explicit here.
+ if (Op.getValueType() != SrcEltVT)
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(BV), SrcEltVT, Op);
+ Ops.push_back(DAG.getBitcast(DstEltVT, Op));
+ AddToWorklist(Ops.back().getNode());
+ }
+ return DAG.getBuildVector(VT, SDLoc(BV), Ops);
+ }
+
+ // Otherwise, we're growing or shrinking the elements. To avoid having to
+ // handle annoying details of growing/shrinking FP values, we convert them to
+ // int first.
+ if (SrcEltVT.isFloatingPoint()) {
+ // Convert the input float vector to an integer vector whose elements are
+ // the same size.
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), SrcEltVT.getSizeInBits());
+ BV = ConstantFoldBITCASTofBUILD_VECTOR(BV, IntVT).getNode();
+ SrcEltVT = IntVT;
+ }
+
+ // Now we know the input is an integer vector. If the output is an FP type,
+ // convert to integer first, then to FP of the right size.
+ if (DstEltVT.isFloatingPoint()) {
+ EVT TmpVT = EVT::getIntegerVT(*DAG.getContext(), DstEltVT.getSizeInBits());
+ SDNode *Tmp = ConstantFoldBITCASTofBUILD_VECTOR(BV, TmpVT).getNode();
+
+ // Next, convert to FP elements of the same size.
+ return ConstantFoldBITCASTofBUILD_VECTOR(Tmp, DstEltVT);
+ }
+
+ SDLoc DL(BV);
+
+ // Okay, we know the src/dst element types are both integers of differing
+ // widths. Handle the growing case first.
+ assert(SrcEltVT.isInteger() && DstEltVT.isInteger());
+ if (SrcBitSize < DstBitSize) {
+ unsigned NumInputsPerOutput = DstBitSize/SrcBitSize;
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0, e = BV->getNumOperands(); i != e;
+ i += NumInputsPerOutput) {
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+ APInt NewBits = APInt(DstBitSize, 0);
+ bool EltIsUndef = true;
+ for (unsigned j = 0; j != NumInputsPerOutput; ++j) {
+ // Shift the previously computed bits over.
+ NewBits <<= SrcBitSize;
+ SDValue Op = BV->getOperand(i+ (isLE ? (NumInputsPerOutput-j-1) : j));
+ if (Op.isUndef()) continue;
+ EltIsUndef = false;
+
+ NewBits |= cast<ConstantSDNode>(Op)->getAPIntValue().
+ zextOrTrunc(SrcBitSize).zext(DstBitSize);
+ }
+
+ if (EltIsUndef)
+ Ops.push_back(DAG.getUNDEF(DstEltVT));
+ else
+ Ops.push_back(DAG.getConstant(NewBits, DL, DstEltVT));
+ }
+
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT, Ops.size());
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
+
+ // Finally, this must be the case where we are shrinking elements: each input
+ // turns into multiple outputs.
+ unsigned NumOutputsPerInput = SrcBitSize/DstBitSize;
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), DstEltVT,
+ NumOutputsPerInput*BV->getNumOperands());
+ SmallVector<SDValue, 8> Ops;
+
+ for (const SDValue &Op : BV->op_values()) {
+ if (Op.isUndef()) {
+ Ops.append(NumOutputsPerInput, DAG.getUNDEF(DstEltVT));
+ continue;
+ }
+
+ APInt OpVal = cast<ConstantSDNode>(Op)->
+ getAPIntValue().zextOrTrunc(SrcBitSize);
+
+ for (unsigned j = 0; j != NumOutputsPerInput; ++j) {
+ APInt ThisVal = OpVal.trunc(DstBitSize);
+ Ops.push_back(DAG.getConstant(ThisVal, DL, DstEltVT));
+ OpVal = OpVal.lshr(DstBitSize);
+ }
+
+ // For big endian targets, swap the order of the pieces of each element.
+ if (DAG.getDataLayout().isBigEndian())
+ std::reverse(Ops.end()-NumOutputsPerInput, Ops.end());
+ }
+
+ return DAG.getBuildVector(VT, DL, Ops);
+}
+
+/// Try to perform FMA combining on a given FADD node.
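+/// For example, (fadd (fmul x, y), z) -> (fma x, y, z) when fused multiply-add
+/// is legal and profitable for the target.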
+SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+ // Floating-point multiply-add with intermediate rounding.
+ bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+ bool LookThroughFPExt = TLI.isFPExtFree(VT);
+
+ // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (Aggressive && N0.getOpcode() == ISD::FMUL &&
+ N1.getOpcode() == ISD::FMUL) {
+ if (N0.getNode()->use_size() > N1.getNode()->use_size())
+ std::swap(N0, N1);
+ }
+
+ // fold (fadd (fmul x, y), z) -> (fma x, y, z)
+ if (N0.getOpcode() == ISD::FMUL &&
+ (Aggressive || N0->hasOneUse())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(0), N0.getOperand(1), N1);
+ }
+
+ // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
+ // Note: Commutes FADD operands.
+ if (N1.getOpcode() == ISD::FMUL &&
+ (Aggressive || N1->hasOneUse())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N1.getOperand(0), N1.getOperand(1), N0);
+ }
+
+ // Look through FP_EXTEND nodes to do more combining.
+ if (AllowFusion && LookThroughFPExt) {
+ // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)), N1);
+ }
+
+ // fold (fadd x, (fpext (fmul y, z))) -> (fma (fpext y), (fpext z), x)
+ // Note: Commutes FADD operands.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(1)), N0);
+ }
+ }
+
+ // More folding opportunities when target permits.
+ if ((AllowFusion || HasFMAD) && Aggressive) {
+ // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y, (fma u, v, z))
+ if (N0.getOpcode() == PreferredFusedOpcode &&
+ N0.getOperand(2).getOpcode() == ISD::FMUL) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(0), N0.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(2).getOperand(0),
+ N0.getOperand(2).getOperand(1),
+ N1));
+ }
+
+ // fold (fadd x, (fma y, z, (fmul u, v))) -> (fma y, z, (fma u, v, x))
+ if (N1->getOpcode() == PreferredFusedOpcode &&
+ N1.getOperand(2).getOpcode() == ISD::FMUL) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N1.getOperand(0), N1.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N1.getOperand(2).getOperand(0),
+ N1.getOperand(2).getOperand(1),
+ N0));
+ }
+
+ if (AllowFusion && LookThroughFPExt) {
+ // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), z))
+ auto FoldFAddFMAFPExtFMul = [&] (
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X, Y,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
+ Z));
+ };
+ if (N0.getOpcode() == PreferredFusedOpcode) {
+ SDValue N02 = N0.getOperand(2);
+ if (N02.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N020 = N02.getOperand(0);
+ if (N020.getOpcode() == ISD::FMUL)
+ return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
+ N020.getOperand(0), N020.getOperand(1),
+ N1);
+ }
+ }
+
+ // fold (fadd (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y), (fma (fpext u), (fpext v), z))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ auto FoldFAddFPExtFMAFMul = [&] (
+ SDValue X, SDValue Y, SDValue U, SDValue V, SDValue Z) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, X),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, Y),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, U),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, V),
+ Z));
+ };
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == PreferredFusedOpcode) {
+ SDValue N002 = N00.getOperand(2);
+ if (N002.getOpcode() == ISD::FMUL)
+ return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
+ N002.getOperand(0), N002.getOperand(1),
+ N1);
+ }
+ }
+
+ // fold (fadd x, (fma y, z, (fpext (fmul u, v))))
+ // -> (fma y, z, (fma (fpext u), (fpext v), x))
+ if (N1.getOpcode() == PreferredFusedOpcode) {
+ SDValue N12 = N1.getOperand(2);
+ if (N12.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N120 = N12.getOperand(0);
+ if (N120.getOpcode() == ISD::FMUL)
+ return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
+ N120.getOperand(0), N120.getOperand(1),
+ N0);
+ }
+ }
+
+ // fold (fadd x, (fpext (fma y, z, (fmul u, v))))
+ // -> (fma (fpext y), (fpext z), (fma (fpext u), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == PreferredFusedOpcode) {
+ SDValue N102 = N10.getOperand(2);
+ if (N102.getOpcode() == ISD::FMUL)
+ return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
+ N102.getOperand(0), N102.getOperand(1),
+ N0);
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
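+
+// The FMAD/FMA distinction above is observable numerically. A minimal scalar
+// sketch in plain C++ (std::fma from <cmath>; assumes contraction is otherwise
+// disabled):
+//   float x = 1.0f + 0x1p-23f, y = 1.0f - 0x1p-23f;
+//   float unfused = x * y - 1.0f;          // 0.0f: the product rounds to 1.0f
+//   float fused   = std::fma(x, y, -1.0f); // -0x1p-46f: single final rounding
+// ISD::FMAD behaves like the first form (intermediate rounding) and ISD::FMA
+// like the second, which is why the combine only fires when the target reports
+// one of them as legal and profitable.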
+
+/// Try to perform FMA combining on a given FSUB node.
+SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+ // Floating-point multiply-add with intermediate rounding.
+ bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
+ if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+ bool LookThroughFPExt = TLI.isFPExtFree(VT);
+
+ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ if (N0.getOpcode() == ISD::FMUL &&
+ (Aggressive || N0->hasOneUse())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(0), N0.getOperand(1),
+ DAG.getNode(ISD::FNEG, SL, VT, N1));
+ }
+
+ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+ // Note: Commutes FSUB operands.
+ if (N1.getOpcode() == ISD::FMUL &&
+ (Aggressive || N1->hasOneUse()))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1.getOperand(0)),
+ N1.getOperand(1), N0);
+
+ // fold (fsub (fneg (fmul x, y)), z) -> (fma (fneg x), y, (fneg z))
+ if (N0.getOpcode() == ISD::FNEG &&
+ N0.getOperand(0).getOpcode() == ISD::FMUL &&
+ (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
+ SDValue N00 = N0.getOperand(0).getOperand(0);
+ SDValue N01 = N0.getOperand(0).getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, N00), N01,
+ DAG.getNode(ISD::FNEG, SL, VT, N1));
+ }
+
+ // Look through FP_EXTEND nodes to do more combining.
+ if (AllowFusion && LookThroughFPExt) {
+ // fold (fsub (fpext (fmul x, y)), z)
+ // -> (fma (fpext x), (fpext y), (fneg z))
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT, N1));
+ }
+
+ // fold (fsub x, (fpext (fmul y, z)))
+ // -> (fma (fneg (fpext y)), (fpext z), x)
+ // Note: Commutes FSUB operands.
+ if (N1.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N10 = N1.getOperand(0);
+ if (N10.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(0))),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N10.getOperand(1)),
+ N0);
+ }
+
+ // fold (fsub (fpext (fneg (fmul x, y))), z)
+ // -> (fneg (fma (fpext x), (fpext y), z))
+ // Note: This could be removed with appropriate canonicalization of the
+ // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+ // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // us from implementing the canonicalization in visitFSUB.
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FNEG) {
+ SDValue N000 = N00.getOperand(0);
+ if (N000.getOpcode() == ISD::FMUL) {
+ return DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(1)),
+ N1));
+ }
+ }
+ }
+
+ // fold (fsub (fneg (fpext (fmul x, y))), z)
+ // -> (fneg (fma (fpext x), (fpext y), z))
+ // Note: This could be removed with appropriate canonicalization of the
+ // input expression into (fneg (fadd (fpext (fmul x, y)), z)). However, the
+ // orthogonal flags -fp-contract=fast and -enable-unsafe-fp-math prevent
+ // us from implementing the canonicalization in visitFSUB.
+ if (N0.getOpcode() == ISD::FNEG) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N000 = N00.getOperand(0);
+ if (N000.getOpcode() == ISD::FMUL) {
+ return DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N000.getOperand(1)),
+ N1));
+ }
+ }
+ }
+ }
+
+ // More folding opportunities when target permits.
+ if ((AllowFusion || HasFMAD) && Aggressive) {
+ // fold (fsub (fma x, y, (fmul u, v)), z)
+ // -> (fma x, y, (fma u, v, (fneg z)))
+ if (N0.getOpcode() == PreferredFusedOpcode &&
+ N0.getOperand(2).getOpcode() == ISD::FMUL) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(0), N0.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(2).getOperand(0),
+ N0.getOperand(2).getOperand(1),
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1)));
+ }
+
+ // fold (fsub x, (fma y, z, (fmul u, v)))
+ // -> (fma (fneg y), z, (fma (fneg u), v, x))
+ if (N1.getOpcode() == PreferredFusedOpcode &&
+ N1.getOperand(2).getOpcode() == ISD::FMUL) {
+ SDValue N20 = N1.getOperand(2).getOperand(0);
+ SDValue N21 = N1.getOperand(2).getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1.getOperand(0)),
+ N1.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, N20),
+ N21, N0));
+ }
+
+ if (AllowFusion && LookThroughFPExt) {
+ // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
+ // -> (fma x, y, (fma (fpext u), (fpext v), (fneg z)))
+ if (N0.getOpcode() == PreferredFusedOpcode) {
+ SDValue N02 = N0.getOperand(2);
+ if (N02.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N020 = N02.getOperand(0);
+ if (N020.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ N0.getOperand(0), N0.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N020.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N020.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1)));
+ }
+ }
+
+ // fold (fsub (fpext (fma x, y, (fmul u, v))), z)
+ // -> (fma (fpext x), (fpext y),
+ // (fma (fpext u), (fpext v), (fneg z)))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N0.getOpcode() == ISD::FP_EXTEND) {
+ SDValue N00 = N0.getOperand(0);
+ if (N00.getOpcode() == PreferredFusedOpcode) {
+ SDValue N002 = N00.getOperand(2);
+ if (N002.getOpcode() == ISD::FMUL)
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N00.getOperand(1)),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N002.getOperand(0)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N002.getOperand(1)),
+ DAG.getNode(ISD::FNEG, SL, VT,
+ N1)));
+ }
+ }
+
+ // fold (fsub x, (fma y, z, (fpext (fmul u, v))))
+ // -> (fma (fneg y), z, (fma (fneg (fpext u)), (fpext v), x))
+ if (N1.getOpcode() == PreferredFusedOpcode &&
+ N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
+ SDValue N120 = N1.getOperand(2).getOperand(0);
+ if (N120.getOpcode() == ISD::FMUL) {
+ SDValue N1200 = N120.getOperand(0);
+ SDValue N1201 = N120.getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, N1.getOperand(0)),
+ N1.getOperand(1),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL,
+ VT, N1200)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N1201),
+ N0));
+ }
+ }
+
+ // fold (fsub x, (fpext (fma y, z, (fmul u, v))))
+ // -> (fma (fneg (fpext y)), (fpext z),
+ // (fma (fneg (fpext u)), (fpext v), x))
+ // FIXME: This turns two single-precision and one double-precision
+ // operation into two double-precision operations, which might not be
+ // interesting for all targets, especially GPUs.
+ if (N1.getOpcode() == ISD::FP_EXTEND &&
+ N1.getOperand(0).getOpcode() == PreferredFusedOpcode) {
+ SDValue N100 = N1.getOperand(0).getOperand(0);
+ SDValue N101 = N1.getOperand(0).getOperand(1);
+ SDValue N102 = N1.getOperand(0).getOperand(2);
+ if (N102.getOpcode() == ISD::FMUL) {
+ SDValue N1020 = N102.getOperand(0);
+ SDValue N1021 = N102.getOperand(1);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N100)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT, N101),
+ DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT,
+ DAG.getNode(ISD::FP_EXTEND, SL,
+ VT, N1020)),
+ DAG.getNode(ISD::FP_EXTEND, SL, VT,
+ N1021),
+ N0));
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// Try to perform FMA combining on a given FMUL node.
+SDValue DAGCombiner::visitFMULForFMACombine(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ SDLoc SL(N);
+
+ assert(N->getOpcode() == ISD::FMUL && "Expected FMUL Operation");
+
+ const TargetOptions &Options = DAG.getTarget().Options;
+ bool AllowFusion =
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
+
+ // Floating-point multiply-add with intermediate rounding.
+ bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+
+ // Floating-point multiply-add without intermediate rounding.
+ bool HasFMA =
+ AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
+
+ // No valid opcode, do not combine.
+ if (!HasFMAD && !HasFMA)
+ return SDValue();
+
+ // Always prefer FMAD to FMA for precision.
+ unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
+ bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+
+ // fold (fmul (fadd x, +1.0), y) -> (fma x, y, y)
+ // fold (fmul (fadd x, -1.0), y) -> (fma x, y, (fneg y))
+ auto FuseFADD = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FADD && (Aggressive || X->hasOneUse())) {
+ auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+ if (XC1 && XC1->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ if (XC1 && XC1->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFADD(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFADD(N1, N0))
+ return FMA;
+
+ // fold (fmul (fsub +1.0, x), y) -> (fma (fneg x), y, y)
+ // fold (fmul (fsub -1.0, x), y) -> (fma (fneg x), y, (fneg y))
+ // fold (fmul (fsub x, +1.0), y) -> (fma x, y, (fneg y))
+ // fold (fmul (fsub x, -1.0), y) -> (fma x, y, y)
+ auto FuseFSUB = [&](SDValue X, SDValue Y) {
+ if (X.getOpcode() == ISD::FSUB && (Aggressive || X->hasOneUse())) {
+ auto XC0 = isConstOrConstSplatFP(X.getOperand(0));
+ if (XC0 && XC0->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ Y);
+ if (XC0 && XC0->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, X.getOperand(1)), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+
+ auto XC1 = isConstOrConstSplatFP(X.getOperand(1));
+ if (XC1 && XC1->isExactlyValue(+1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y,
+ DAG.getNode(ISD::FNEG, SL, VT, Y));
+ if (XC1 && XC1->isExactlyValue(-1.0))
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, X.getOperand(0), Y, Y);
+ }
+ return SDValue();
+ };
+
+ if (SDValue FMA = FuseFSUB(N0, N1))
+ return FMA;
+ if (SDValue FMA = FuseFSUB(N1, N0))
+ return FMA;
+
+ return SDValue();
+}
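+
+// The FuseFADD/FuseFSUB folds above are simple algebra over the fused form,
+// e.g. (x + 1.0) * y == x*y + y == fma(x, y, y) and
+//      (1.0 - x) * y == y - x*y == fma(-x, y, y).
+// In floating point the two sides can round differently, which is why these
+// rewrites live behind the HasFMAD/HasFMA gating above rather than in the
+// generic FMUL folds.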
+
+SDValue DAGCombiner::visitFADD(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ bool N0CFP = isConstantFPBuildVectorOrConstantFP(N0);
+ bool N1CFP = isConstantFPBuildVectorOrConstantFP(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (fadd c1, c2) -> c1 + c2
+ if (N0CFP && N1CFP)
+ return DAG.getNode(ISD::FADD, DL, VT, N0, N1, Flags);
+
+ // canonicalize constant to RHS
+ if (N0CFP && !N1CFP)
+ return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
+
+ // fold (fadd A, (fneg B)) -> (fsub A, B)
+ if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
+ isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
+ return DAG.getNode(ISD::FSUB, DL, VT, N0,
+ GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+
+ // fold (fadd (fneg A), B) -> (fsub B, A)
+ if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
+ isNegatibleForFree(N0, LegalOperations, TLI, &Options) == 2)
+ return DAG.getNode(ISD::FSUB, DL, VT, N1,
+ GetNegatedExpression(N0, DAG, LegalOperations), Flags);
+
+ // If 'unsafe math' is enabled, fold lots of things.
+ if (Options.UnsafeFPMath) {
+ // No FP constant should be created after legalization as the Instruction
+ // Selection pass has a hard time dealing with FP constants.
+ bool AllowNewConst = (Level < AfterLegalizeDAG);
+
+ // fold (fadd A, 0) -> A
+ if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
+ if (N1C->isZero())
+ return N0;
+
+ // fold (fadd (fadd x, c1), c2) -> (fadd x, (fadd c1, c2))
+ if (N1CFP && N0.getOpcode() == ISD::FADD && N0.getNode()->hasOneUse() &&
+ isConstantFPBuildVectorOrConstantFP(N0.getOperand(1)))
+ return DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(0),
+ DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1), N1,
+ Flags),
+ Flags);
+
+ // If allowed, fold (fadd (fneg x), x) -> 0.0
+ if (AllowNewConst && N0.getOpcode() == ISD::FNEG && N0.getOperand(0) == N1)
+ return DAG.getConstantFP(0.0, DL, VT);
+
+ // If allowed, fold (fadd x, (fneg x)) -> 0.0
+ if (AllowNewConst && N1.getOpcode() == ISD::FNEG && N1.getOperand(0) == N0)
+ return DAG.getConstantFP(0.0, DL, VT);
+
+ // We can fold chains of FADD's of the same value into multiplications.
+ // This transform is not safe in general because we are reducing the number
+ // of rounding steps.
+ if (TLI.isOperationLegalOrCustom(ISD::FMUL, VT) && !N0CFP && !N1CFP) {
+ if (N0.getOpcode() == ISD::FMUL) {
+ bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ bool CFP01 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(1));
+
+ // (fadd (fmul x, c), x) -> (fmul x, c+1)
+ if (CFP01 && !CFP00 && N0.getOperand(0) == N1) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT), Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N1, NewCFP, Flags);
+ }
+
+ // (fadd (fmul x, c), (fadd x, x)) -> (fmul x, c+2)
+ if (CFP01 && !CFP00 && N1.getOpcode() == ISD::FADD &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N0.getOperand(1),
+ DAG.getConstantFP(2.0, DL, VT), Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), NewCFP, Flags);
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FMUL) {
+ bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ bool CFP11 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(1));
+
+ // (fadd x, (fmul x, c)) -> (fmul x, c+1)
+ if (CFP11 && !CFP10 && N1.getOperand(0) == N0) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT), Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, NewCFP, Flags);
+ }
+
+ // (fadd (fadd x, x), (fmul x, c)) -> (fmul x, c+2)
+ if (CFP11 && !CFP10 && N0.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N0.getOperand(0)) {
+ SDValue NewCFP = DAG.getNode(ISD::FADD, DL, VT, N1.getOperand(1),
+ DAG.getConstantFP(2.0, DL, VT), Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N1.getOperand(0), NewCFP, Flags);
+ }
+ }
+
+ if (N0.getOpcode() == ISD::FADD && AllowNewConst) {
+ bool CFP00 = isConstantFPBuildVectorOrConstantFP(N0.getOperand(0));
+ // (fadd (fadd x, x), x) -> (fmul x, 3.0)
+ if (!CFP00 && N0.getOperand(0) == N0.getOperand(1) &&
+ (N0.getOperand(0) == N1)) {
+ return DAG.getNode(ISD::FMUL, DL, VT,
+ N1, DAG.getConstantFP(3.0, DL, VT), Flags);
+ }
+ }
+
+ if (N1.getOpcode() == ISD::FADD && AllowNewConst) {
+ bool CFP10 = isConstantFPBuildVectorOrConstantFP(N1.getOperand(0));
+ // (fadd x, (fadd x, x)) -> (fmul x, 3.0)
+ if (!CFP10 && N1.getOperand(0) == N1.getOperand(1) &&
+ N1.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, DL, VT,
+ N0, DAG.getConstantFP(3.0, DL, VT), Flags);
+ }
+ }
+
+ // (fadd (fadd x, x), (fadd x, x)) -> (fmul x, 4.0)
+ if (AllowNewConst &&
+ N0.getOpcode() == ISD::FADD && N1.getOpcode() == ISD::FADD &&
+ N0.getOperand(0) == N0.getOperand(1) &&
+ N1.getOperand(0) == N1.getOperand(1) &&
+ N0.getOperand(0) == N1.getOperand(0)) {
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0),
+ DAG.getConstantFP(4.0, DL, VT), Flags);
+ }
+ }
+ } // enable-unsafe-fp-math
+
+ // FADD -> FMA combines:
+ if (SDValue Fused = visitFADDForFMACombine(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFSUB(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+ ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (fsub c1, c2) -> c1-c2
+ if (N0CFP && N1CFP)
+ return DAG.getNode(ISD::FSUB, dl, VT, N0, N1, Flags);
+
+ // fold (fsub A, (fneg B)) -> (fadd A, B)
+ if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
+ return DAG.getNode(ISD::FADD, dl, VT, N0,
+ GetNegatedExpression(N1, DAG, LegalOperations), Flags);
+
+ // If 'unsafe math' is enabled, fold lots of things.
+ if (Options.UnsafeFPMath) {
+ // (fsub A, 0) -> A
+ if (N1CFP && N1CFP->isZero())
+ return N0;
+
+ // (fsub 0, B) -> -B
+ if (N0CFP && N0CFP->isZero()) {
+ if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
+ return GetNegatedExpression(N1, DAG, LegalOperations);
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, dl, VT, N1);
+ }
+
+ // (fsub x, x) -> 0.0
+ if (N0 == N1)
+ return DAG.getConstantFP(0.0f, dl, VT);
+
+ // (fsub x, (fadd x, y)) -> (fneg y)
+ // (fsub x, (fadd y, x)) -> (fneg y)
+ if (N1.getOpcode() == ISD::FADD) {
+ SDValue N10 = N1->getOperand(0);
+ SDValue N11 = N1->getOperand(1);
+
+ if (N10 == N0 && isNegatibleForFree(N11, LegalOperations, TLI, &Options))
+ return GetNegatedExpression(N11, DAG, LegalOperations);
+
+ if (N11 == N0 && isNegatibleForFree(N10, LegalOperations, TLI, &Options))
+ return GetNegatedExpression(N10, DAG, LegalOperations);
+ }
+ }
+
+ // FSUB -> FMA combines:
+ if (SDValue Fused = visitFSUBForFMACombine(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMUL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+ ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+
+ // fold vector ops
+ if (VT.isVector()) {
+ // This just handles C1 * C2 for vectors. Other vector folds are below.
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+ }
+
+ // fold (fmul c1, c2) -> c1*c2
+ if (N0CFP && N1CFP)
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, N1, Flags);
+
+ // canonicalize constant to RHS
+ if (isConstantFPBuildVectorOrConstantFP(N0) &&
+ !isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(ISD::FMUL, DL, VT, N1, N0, Flags);
+
+ // fold (fmul A, 1.0) -> A
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return N0;
+
+ if (Options.UnsafeFPMath) {
+ // fold (fmul A, 0) -> 0
+ if (N1CFP && N1CFP->isZero())
+ return N1;
+
+ // fold (fmul (fmul x, c1), c2) -> (fmul x, (fmul c1, c2))
+ if (N0.getOpcode() == ISD::FMUL) {
+ // Fold scalars or any vector constants (not just splats).
+ // This fold is done in general by InstCombine, but extra fmul insts
+ // may have been generated during lowering.
+ SDValue N00 = N0.getOperand(0);
+ SDValue N01 = N0.getOperand(1);
+ auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
+ auto *BV00 = dyn_cast<BuildVectorSDNode>(N00);
+ auto *BV01 = dyn_cast<BuildVectorSDNode>(N01);
+
+ // Check 1: Make sure that the first operand of the inner multiply is NOT
+ // a constant. Otherwise, we may induce infinite looping.
+ if (!(isConstOrConstSplatFP(N00) || (BV00 && BV00->isConstant()))) {
+ // Check 2: Make sure that the second operand of the inner multiply and
+ // the second operand of the outer multiply are constants.
+ if ((N1CFP && isConstOrConstSplatFP(N01)) ||
+ (BV1 && BV01 && BV1->isConstant() && BV01->isConstant())) {
+ SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, N01, N1, Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N00, MulConsts, Flags);
+ }
+ }
+ }
+
+ // fold (fmul (fadd x, x), c) -> (fmul x, (fmul 2.0, c))
+ // Undo the fmul 2.0, x -> fadd x, x transformation, since if it occurs
+ // during an early run of DAGCombiner it can prevent folding with fmuls
+ // inserted during lowering.
+ if (N0.getOpcode() == ISD::FADD &&
+ (N0.getOperand(0) == N0.getOperand(1)) &&
+ N0.hasOneUse()) {
+ const SDValue Two = DAG.getConstantFP(2.0, DL, VT);
+ SDValue MulConsts = DAG.getNode(ISD::FMUL, DL, VT, Two, N1, Flags);
+ return DAG.getNode(ISD::FMUL, DL, VT, N0.getOperand(0), MulConsts, Flags);
+ }
+ }
+
+ // fold (fmul X, 2.0) -> (fadd X, X)
+ if (N1CFP && N1CFP->isExactlyValue(+2.0))
+ return DAG.getNode(ISD::FADD, DL, VT, N0, N0, Flags);
+
+ // fold (fmul X, -1.0) -> (fneg X)
+ if (N1CFP && N1CFP->isExactlyValue(-1.0))
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, DL, VT, N0);
+
+ // fold (fmul (fneg X), (fneg Y)) -> (fmul X, Y)
+ if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
+ if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
+ // Both can be negated for free, check to see if at least one is cheaper
+ // negated.
+ if (LHSNeg == 2 || RHSNeg == 2)
+ return DAG.getNode(ISD::FMUL, DL, VT,
+ GetNegatedExpression(N0, DAG, LegalOperations),
+ GetNegatedExpression(N1, DAG, LegalOperations),
+ Flags);
+ }
+ }
+
+ // FMUL -> FMA combines:
+ if (SDValue Fused = visitFMULForFMACombine(N)) {
+ AddToWorklist(Fused.getNode());
+ return Fused;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMA(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ // Constant fold FMA.
+ if (isa<ConstantFPSDNode>(N0) &&
+ isa<ConstantFPSDNode>(N1) &&
+ isa<ConstantFPSDNode>(N2)) {
+ return DAG.getNode(ISD::FMA, dl, VT, N0, N1, N2);
+ }
+
+ if (Options.UnsafeFPMath) {
+ if (N0CFP && N0CFP->isZero())
+ return N2;
+ if (N1CFP && N1CFP->isZero())
+ return N2;
+ }
+ // TODO: The FMA node should have flags that propagate to these nodes.
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, SDLoc(N), VT, N1, N2);
+ if (N1CFP && N1CFP->isExactlyValue(1.0))
+ return DAG.getNode(ISD::FADD, SDLoc(N), VT, N0, N2);
+
+ // Canonicalize (fma c, x, y) -> (fma x, c, y)
+ if (isConstantFPBuildVectorOrConstantFP(N0) &&
+ !isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(ISD::FMA, SDLoc(N), VT, N1, N0, N2);
+
+ // TODO: FMA nodes should have flags that propagate to the created nodes.
+ // For now, create a Flags object for use with all unsafe math transforms.
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
+
+ if (Options.UnsafeFPMath) {
+ // (fma x, c1, (fmul x, c2)) -> (fmul x, c1+c2)
+ if (N2.getOpcode() == ISD::FMUL && N0 == N2.getOperand(0) &&
+ isConstantFPBuildVectorOrConstantFP(N1) &&
+ isConstantFPBuildVectorOrConstantFP(N2.getOperand(1))) {
+ return DAG.getNode(ISD::FMUL, dl, VT, N0,
+ DAG.getNode(ISD::FADD, dl, VT, N1, N2.getOperand(1),
+ &Flags), &Flags);
+ }
+
+ // (fma (fmul x, c1), c2, y) -> (fma x, c1*c2, y)
+ if (N0.getOpcode() == ISD::FMUL &&
+ isConstantFPBuildVectorOrConstantFP(N1) &&
+ isConstantFPBuildVectorOrConstantFP(N0.getOperand(1))) {
+ return DAG.getNode(ISD::FMA, dl, VT,
+ N0.getOperand(0),
+ DAG.getNode(ISD::FMUL, dl, VT, N1, N0.getOperand(1),
+ &Flags),
+ N2);
+ }
+ }
+
+ // (fma x, 1, y) -> (fadd x, y)
+ // (fma x, -1, y) -> (fadd (fneg x), y)
+ if (N1CFP) {
+ if (N1CFP->isExactlyValue(1.0))
+ // TODO: The FMA node should have flags that propagate to this node.
+ return DAG.getNode(ISD::FADD, dl, VT, N0, N2);
+
+ if (N1CFP->isExactlyValue(-1.0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))) {
+ SDValue RHSNeg = DAG.getNode(ISD::FNEG, dl, VT, N0);
+ AddToWorklist(RHSNeg.getNode());
+ // TODO: The FMA node should have flags that propagate to this node.
+ return DAG.getNode(ISD::FADD, dl, VT, N2, RHSNeg);
+ }
+ }
+
+ if (Options.UnsafeFPMath) {
+ // (fma x, c, x) -> (fmul x, (c+1))
+ if (N1CFP && N0 == N2) {
+ return DAG.getNode(ISD::FMUL, dl, VT, N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(1.0, dl, VT),
+ &Flags), &Flags);
+ }
+
+ // (fma x, c, (fneg x)) -> (fmul x, (c-1))
+ if (N1CFP && N2.getOpcode() == ISD::FNEG && N2.getOperand(0) == N0) {
+ return DAG.getNode(ISD::FMUL, dl, VT, N0,
+ DAG.getNode(ISD::FADD, dl, VT,
+ N1, DAG.getConstantFP(-1.0, dl, VT),
+ &Flags), &Flags);
+ }
+ }
+
+ return SDValue();
+}
+
+// Combine multiple FDIVs with the same divisor into multiple FMULs by the
+// reciprocal.
+// E.g., (a / D; b / D;) -> (recip = 1.0 / D; a * recip; b * recip)
+// Notice that this is not always beneficial. One reason is that different
+// targets may have different costs for FDIV and FMUL, so sometimes the cost
+// of two FDIVs may be lower than the cost of one FDIV and two FMULs. Another
+// reason is that the critical path is increased from "one FDIV" to
+// "one FDIV + one FMUL".
+SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
+ bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
+ const SDNodeFlags *Flags = N->getFlags();
+ if (!UnsafeMath && !Flags->hasAllowReciprocal())
+ return SDValue();
+
+ // Skip if current node is a reciprocal.
+ SDValue N0 = N->getOperand(0);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ if (N0CFP && N0CFP->isExactlyValue(1.0))
+ return SDValue();
+
+ // Exit early if the target does not want this transform or if there can't
+ // possibly be enough uses of the divisor to make the transform worthwhile.
+ SDValue N1 = N->getOperand(1);
+ unsigned MinUses = TLI.combineRepeatedFPDivisors();
+ if (!MinUses || N1->use_size() < MinUses)
+ return SDValue();
+
+ // Find all FDIV users of the same divisor.
+ // Use a set because duplicates may be present in the user list.
+ SetVector<SDNode *> Users;
+ for (auto *U : N1->uses()) {
+ if (U->getOpcode() == ISD::FDIV && U->getOperand(1) == N1) {
+ // This division is eligible for optimization only if global unsafe math
+ // is enabled or if this division allows reciprocal formation.
+ if (UnsafeMath || U->getFlags()->hasAllowReciprocal())
+ Users.insert(U);
+ }
+ }
+
+ // Now that we have the actual number of divisor uses, make sure it meets
+ // the minimum threshold specified by the target.
+ if (Users.size() < MinUses)
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+ SDValue Reciprocal = DAG.getNode(ISD::FDIV, DL, VT, FPOne, N1, Flags);
+
+ // Dividend / Divisor -> Dividend * Reciprocal
+ for (auto *U : Users) {
+ SDValue Dividend = U->getOperand(0);
+ if (Dividend != FPOne) {
+ SDValue NewNode = DAG.getNode(ISD::FMUL, SDLoc(U), VT, Dividend,
+ Reciprocal, Flags);
+ CombineTo(U, NewNode);
+ } else if (U != Reciprocal.getNode()) {
+ // In the absence of fast-math-flags, this user node is always the
+ // same node as Reciprocal, but with FMF they may be different nodes.
+ CombineTo(U, Reciprocal);
+ }
+ }
+ return SDValue(N, 0); // N was replaced.
+}
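+
+// A scalar sketch of the transform in plain C++ (a, b and D are hypothetical
+// values; profitability depends on the target's FDIV/FMUL costs):
+//   double recip = 1.0 / D;   // the single remaining division
+//   double r0 = a * recip;    // replaces a / D
+//   double r1 = b * recip;    // replaces b / D
+// With fewer than the target-specified MinUses divisions sharing D, the extra
+// FDIV computing 'recip' would not pay for itself, hence the check above.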
+
+SDValue DAGCombiner::visitFDIV(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+ const TargetOptions &Options = DAG.getTarget().Options;
+ SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(N)->Flags;
+
+ // fold vector ops
+ if (VT.isVector())
+ if (SDValue FoldedVOp = SimplifyVBinOp(N))
+ return FoldedVOp;
+
+ // fold (fdiv c1, c2) -> c1/c2
+ if (N0CFP && N1CFP)
+ return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
+
+ if (Options.UnsafeFPMath) {
+ // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
+ if (N1CFP) {
+ // Compute the reciprocal 1.0 / c2.
+ const APFloat &N1APF = N1CFP->getValueAPF();
+ APFloat Recip(N1APF.getSemantics(), 1); // 1.0
+ APFloat::opStatus st = Recip.divide(N1APF, APFloat::rmNearestTiesToEven);
+ // Only do the transform if the reciprocal is a legal fp immediate that
+ // isn't too nasty (e.g. NaN, denormal, ...).
+ if ((st == APFloat::opOK || st == APFloat::opInexact) && // Not too nasty
+ (!LegalOperations ||
+ // FIXME: custom lowering of ConstantFP might fail (see e.g. ARM
+ // backend)... we should handle this gracefully after Legalize.
+ // TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT) ||
+ TLI.isOperationLegal(llvm::ISD::ConstantFP, VT) ||
+ TLI.isFPImmLegal(Recip, VT)))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0,
+ DAG.getConstantFP(Recip, DL, VT), Flags);
+ }
+
+ // If this FDIV is part of a reciprocal square root, it may be folded
+ // into a target-specific square root estimate instruction.
+ if (N1.getOpcode() == ISD::FSQRT) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0), Flags)) {
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ }
+ } else if (N1.getOpcode() == ISD::FP_EXTEND &&
+ N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ Flags)) {
+ RV = DAG.getNode(ISD::FP_EXTEND, SDLoc(N1), VT, RV);
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ }
+ } else if (N1.getOpcode() == ISD::FP_ROUND &&
+ N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ if (SDValue RV = buildRsqrtEstimate(N1.getOperand(0).getOperand(0),
+ Flags)) {
+ RV = DAG.getNode(ISD::FP_ROUND, SDLoc(N1), VT, RV, N1.getOperand(1));
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ }
+ } else if (N1.getOpcode() == ISD::FMUL) {
+ // Look through an FMUL. Even though this won't remove the FDIV directly,
+ // it's still worthwhile to get rid of the FSQRT if possible.
+ SDValue SqrtOp;
+ SDValue OtherOp;
+ if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
+ SqrtOp = N1.getOperand(0);
+ OtherOp = N1.getOperand(1);
+ } else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
+ SqrtOp = N1.getOperand(1);
+ OtherOp = N1.getOperand(0);
+ }
+ if (SqrtOp.getNode()) {
+ // We found a FSQRT, so try to make this fold:
+ // x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
+ if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
+ RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ }
+ }
+ }
+
+ // Fold into a reciprocal estimate and multiply instead of a real divide.
+ if (SDValue RV = BuildReciprocalEstimate(N1, Flags)) {
+ AddToWorklist(RV.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ }
+ }
+
+ // (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
+ if (char LHSNeg = isNegatibleForFree(N0, LegalOperations, TLI, &Options)) {
+ if (char RHSNeg = isNegatibleForFree(N1, LegalOperations, TLI, &Options)) {
+ // Both can be negated for free, check to see if at least one is cheaper
+ // negated.
+ if (LHSNeg == 2 || RHSNeg == 2)
+ return DAG.getNode(ISD::FDIV, SDLoc(N), VT,
+ GetNegatedExpression(N0, DAG, LegalOperations),
+ GetNegatedExpression(N1, DAG, LegalOperations),
+ Flags);
+ }
+ }
+
+ if (SDValue CombineRepeatedDivisors = combineRepeatedFPDivisors(N))
+ return CombineRepeatedDivisors;
+
+ return SDValue();
+}
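+
+// As a concrete instance of the reciprocal fold above (illustrative only):
+// (fdiv X, 4.0) becomes (fmul X, 0.25), since 1.0/4.0 divides exactly
+// (APFloat::opOK) and 0.25 is a representable, typically legal FP immediate.
+// Dividing by 3.0 yields an inexact reciprocal (opInexact), which is still
+// accepted under UnsafeFPMath, trading a little accuracy for the cheaper
+// multiply.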
+
+SDValue DAGCombiner::visitFREM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+
+ // fold (frem c1, c2) -> fmod(c1,c2)
+ if (N0CFP && N1CFP)
+ return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1,
+ &cast<BinaryWithFlagsSDNode>(N)->Flags);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFSQRT(SDNode *N) {
+ if (!DAG.getTarget().Options.UnsafeFPMath || TLI.isFsqrtCheap())
+ return SDValue();
+
+ // TODO: FSQRT nodes should have flags that propagate to the created nodes.
+ // For now, create a Flags object for use with all unsafe math transforms.
+ SDNodeFlags Flags;
+ Flags.setUnsafeAlgebra(true);
+ return buildSqrtEstimate(N->getOperand(0), &Flags);
+}
+
+/// copysign(x, fp_extend(y)) -> copysign(x, y)
+/// copysign(x, fp_round(y)) -> copysign(x, y)
+static inline bool CanCombineFCOPYSIGN_EXTEND_ROUND(SDNode *N) {
+ SDValue N1 = N->getOperand(1);
+ if ((N1.getOpcode() == ISD::FP_EXTEND ||
+ N1.getOpcode() == ISD::FP_ROUND)) {
+ // Do not optimize out the type conversion of f128 values yet.
+ // For some targets like x86_64, the configuration has been changed to keep
+ // one f128 value in one SSE register, but instruction selection cannot
+ // handle FCOPYSIGN on SSE registers yet.
+ EVT N1VT = N1->getValueType(0);
+ EVT N1Op0VT = N1->getOperand(0)->getValueType(0);
+ return (N1VT == N1Op0VT || N1Op0VT != MVT::f128);
+ }
+ return false;
+}
+
+SDValue DAGCombiner::visitFCOPYSIGN(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ EVT VT = N->getValueType(0);
+
+ if (N0CFP && N1CFP) // Constant fold
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT, N0, N1);
+
+ if (N1CFP) {
+ const APFloat& V = N1CFP->getValueAPF();
+ // copysign(x, c1) -> fabs(x) iff ispos(c1)
+ // copysign(x, c1) -> fneg(fabs(x)) iff isneg(c1)
+ if (!V.isNegative()) {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FABS, VT))
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+ } else {
+ if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
+ return DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+ DAG.getNode(ISD::FABS, SDLoc(N0), VT, N0));
+ }
+ }
+
+ // copysign(fabs(x), y) -> copysign(x, y)
+ // copysign(fneg(x), y) -> copysign(x, y)
+ // copysign(copysign(x,z), y) -> copysign(x, y)
+ if (N0.getOpcode() == ISD::FABS || N0.getOpcode() == ISD::FNEG ||
+ N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
+ N0.getOperand(0), N1);
+
+ // copysign(x, abs(y)) -> abs(x)
+ if (N1.getOpcode() == ISD::FABS)
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+
+ // copysign(x, copysign(y,z)) -> copysign(x, z)
+ if (N1.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
+ N0, N1.getOperand(1));
+
+ // copysign(x, fp_extend(y)) -> copysign(x, y)
+ // copysign(x, fp_round(y)) -> copysign(x, y)
+ if (CanCombineFCOPYSIGN_EXTEND_ROUND(N))
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
+ N0, N1.getOperand(0));
+
+ return SDValue();
+}
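+
+// The copysign folds above mirror identities that also hold for the libm
+// function, e.g. in plain C++:
+//   std::copysign(x, std::fabs(y)) == std::fabs(x)               // sign source is non-negative
+//   std::copysign(std::copysign(x, z), y) == std::copysign(x, y) // outermost sign source wins
+// so rewriting the DAG this way preserves the result bit for bit.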
+
+SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ // fold (sint_to_fp c1) -> c1fp
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ // ...but only if the target supports immediate floating-point values
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
+
+ // If the input is a legal type, and SINT_TO_FP is not legal on this target,
+ // but UINT_TO_FP is legal on this target, try to convert.
+ if (!TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT) &&
+ TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to UINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
+ }
+
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
+ }
+
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select_cc x, y, 1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ SDValue Ops[] =
+ { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
+ N0.getOperand(0).getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N0.getValueType();
+
+ // fold (uint_to_fp c1) -> c1fp
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
+ // ...but only if the target supports immediate floating-point values
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT)))
+ return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0);
+
+ // If the input is a legal type, and UINT_TO_FP is not legal on this target,
+ // but SINT_TO_FP is legal on this target, try to convert.
+ if (!TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, OpVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, OpVT)) {
+ // If the sign bit is known to be zero, we can change this to SINT_TO_FP.
+ if (DAG.SignBitIsZero(N0))
+ return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
+ }
+
+ // The next optimizations are desirable only if SELECT_CC can be lowered.
+ if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, 1.0, 0.0, cc)
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations ||
+ TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ SDValue Ops[] =
+ { N0.getOperand(0), N0.getOperand(1),
+ DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
+ N0.getOperand(2) };
+ return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
+ }
+ }
+
+ return SDValue();
+}
+
+// Fold (fp_to_{s/u}int ({s/u}int_to_fp x)) -> zext x, sext x, trunc x, or x
+static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ if (N0.getOpcode() != ISD::UINT_TO_FP && N0.getOpcode() != ISD::SINT_TO_FP)
+ return SDValue();
+
+ SDValue Src = N0.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ bool IsInputSigned = N0.getOpcode() == ISD::SINT_TO_FP;
+ bool IsOutputSigned = N->getOpcode() == ISD::FP_TO_SINT;
+
+ // We can safely assume the conversion won't overflow the output range,
+ // because (for example) (uint8_t)18293.f is undefined behavior.
+
+ // Since we can assume the conversion won't overflow, our decision as to
+ // whether the input will fit in the float should depend on the minimum
+ // of the input range and output range.
+
+ // This means this is also safe for a signed input and unsigned output, since
+ // a negative input would lead to undefined behavior.
+ unsigned InputSize = (int)SrcVT.getScalarSizeInBits() - IsInputSigned;
+ unsigned OutputSize = (int)VT.getScalarSizeInBits() - IsOutputSigned;
+ unsigned ActualSize = std::min(InputSize, OutputSize);
+ const fltSemantics &sem = DAG.EVTToAPFloatSemantics(N0.getValueType());
+
+ // We can only fold away the float conversion if the input range can be
+ // represented exactly in the float range.
+ if (APFloat::semanticsPrecision(sem) >= ActualSize) {
+ if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) {
+ unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND
+ : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOp, SDLoc(N), VT, Src);
+ }
+ if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src);
+ return DAG.getBitcast(VT, Src);
+ }
+ return SDValue();
+}
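+
+// For instance (IEEE single precision has a 24-bit significand): an
+// i16 -> f32 -> i32 round trip is folded to a plain integer extend because
+// min(15, 31) <= 24, so every input value survives the conversion exactly,
+// whereas i32 -> f32 -> i32 is left alone because 31 > 24 and large inputs
+// would be altered by the intermediate float.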
+
+SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_to_sint c1fp) -> c1
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0);
+
+ return FoldIntToFPToInt(N, DAG);
+}
+
+SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_to_uint c1fp) -> c1
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0);
+
+ return FoldIntToFPToInt(N, DAG);
+}
+
+SDValue DAGCombiner::visitFP_ROUND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fp_round c1fp) -> c1fp
+ if (N0CFP)
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT, N0, N1);
+
+ // fold (fp_round (fp_extend x)) -> x
+ if (N0.getOpcode() == ISD::FP_EXTEND && VT == N0.getOperand(0).getValueType())
+ return N0.getOperand(0);
+
+ // fold (fp_round (fp_round x)) -> (fp_round x)
+ if (N0.getOpcode() == ISD::FP_ROUND) {
+ const bool NIsTrunc = N->getConstantOperandVal(1) == 1;
+ const bool N0IsTrunc = N0.getNode()->getConstantOperandVal(1) == 1;
+
+ // Skip this folding if it results in an fp_round from f80 to f16.
+ //
+ // f80 to f16 always generates an expensive (and as yet, unimplemented)
+ // libcall to __truncxfhf2 instead of selecting native f16 conversion
+ // instructions from f32 or f64. Moreover, the first (value-preserving)
+ // fp_round from f80 to either f32 or f64 may become a NOP on platforms like
+ // x86.
+ if (N0.getOperand(0).getValueType() == MVT::f80 && VT == MVT::f16)
+ return SDValue();
+
+ // If the first fp_round isn't a value preserving truncation, it might
+ // introduce a tie in the second fp_round that wouldn't occur in the
+ // single-step fp_round we want to fold to.
+ // In other words, double rounding isn't the same as rounding.
+ // Also, this is a value preserving truncation iff both fp_round's are.
+ if (DAG.getTarget().Options.UnsafeFPMath || N0IsTrunc) {
+ SDLoc DL(N);
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, N0.getOperand(0),
+ DAG.getIntPtrConstant(NIsTrunc && N0IsTrunc, DL));
+ }
+ }
+
+ // fold (fp_round (copysign X, Y)) -> (copysign (fp_round X), Y)
+ if (N0.getOpcode() == ISD::FCOPYSIGN && N0.getNode()->hasOneUse()) {
+ SDValue Tmp = DAG.getNode(ISD::FP_ROUND, SDLoc(N0), VT,
+ N0.getOperand(0), N1);
+ AddToWorklist(Tmp.getNode());
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N), VT,
+ Tmp, N0.getOperand(1));
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_ROUND_INREG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
+
+ // fold (fp_round_inreg c1fp) -> c1fp
+ if (N0CFP && isTypeLegal(EVT)) {
+ SDLoc DL(N);
+ SDValue Round = DAG.getConstantFP(*N0CFP->getConstantFPValue(), DL, EVT);
+ return DAG.getNode(ISD::FP_EXTEND, DL, VT, Round);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_EXTEND(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // If this is fp_round(fpextend), don't fold it; allow ourselves to be folded.
+ if (N->hasOneUse() &&
+ N->use_begin()->getOpcode() == ISD::FP_ROUND)
+ return SDValue();
+
+ // fold (fp_extend c1fp) -> c1fp
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, N0);
+
+ // fold (fp_extend (fp16_to_fp op)) -> (fp16_to_fp op)
+ if (N0.getOpcode() == ISD::FP16_TO_FP &&
+ TLI.getOperationAction(ISD::FP16_TO_FP, VT) == TargetLowering::Legal)
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), VT, N0.getOperand(0));
+
+ // Turn fp_extend(fp_round(X, 1)) -> X since the fp_round doesn't affect the
+ // value of X.
+ if (N0.getOpcode() == ISD::FP_ROUND
+ && N0.getNode()->getConstantOperandVal(1) == 1) {
+ SDValue In = N0.getOperand(0);
+ if (In.getValueType() == VT) return In;
+ if (VT.bitsLT(In.getValueType()))
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N), VT,
+ In, N0.getOperand(1));
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, In);
+ }
+
+ // fold (fpext (load x)) -> (fpext (fptrunc (extload x)))
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, VT, N0.getValueType())) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+ SDValue ExtLoad = DAG.getExtLoad(ISD::EXTLOAD, SDLoc(N), VT,
+ LN0->getChain(),
+ LN0->getBasePtr(), N0.getValueType(),
+ LN0->getMemOperand());
+ CombineTo(N, ExtLoad);
+ CombineTo(N0.getNode(),
+ DAG.getNode(ISD::FP_ROUND, SDLoc(N0),
+ N0.getValueType(), ExtLoad,
+ DAG.getIntPtrConstant(1, SDLoc(N0))),
+ ExtLoad.getValue(1));
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFCEIL(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fceil c1) -> fceil(c1)
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFTRUNC(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ftrunc c1) -> ftrunc(c1)
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFFLOOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (ffloor c1) -> ffloor(c1)
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0);
+
+ return SDValue();
+}
+
+// FIXME: FNEG and FABS have a lot in common; refactor.
+SDValue DAGCombiner::visitFNEG(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Constant fold FNEG.
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
+
+ if (isNegatibleForFree(N0, LegalOperations, DAG.getTargetLoweringInfo(),
+ &DAG.getTarget().Options))
+ return GetNegatedExpression(N0, DAG, LegalOperations);
+
+ // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading
+ // constant pool values.
+ if (!TLI.isFNegFree(VT) &&
+ N0.getOpcode() == ISD::BITCAST &&
+ N0.getNode()->hasOneUse()) {
+ SDValue Int = N0.getOperand(0);
+ EVT IntVT = Int.getValueType();
+ if (IntVT.isInteger() && !IntVT.isVector()) {
+ APInt SignMask;
+ if (N0.getValueType().isVector()) {
+ // For a vector, get a mask such as 0x80... per scalar element
+ // and splat it.
+ SignMask = APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
+ SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+ } else {
+ // For a scalar, just generate 0x80...
+ SignMask = APInt::getSignBit(IntVT.getSizeInBits());
+ }
+ SDLoc DL0(N0);
+ Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int,
+ DAG.getConstant(SignMask, DL0, IntVT));
+ AddToWorklist(Int.getNode());
+ return DAG.getBitcast(VT, Int);
+ }
+ }
+
+ // (fneg (fmul c, x)) -> (fmul -c, x)
+ if (N0.getOpcode() == ISD::FMUL &&
+ (N0.getNode()->hasOneUse() || !TLI.isFNegFree(VT))) {
+ ConstantFPSDNode *CFP1 = dyn_cast<ConstantFPSDNode>(N0.getOperand(1));
+ if (CFP1) {
+ APFloat CVal = CFP1->getValueAPF();
+ CVal.changeSign();
+ if (Level >= AfterLegalizeDAG &&
+ (TLI.isFPImmLegal(CVal, VT) ||
+ TLI.isOperationLegal(ISD::ConstantFP, VT)))
+ return DAG.getNode(ISD::FMUL, SDLoc(N), VT, N0.getOperand(0),
+ DAG.getNode(ISD::FNEG, SDLoc(N), VT,
+ N0.getOperand(1)),
+ &cast<BinaryWithFlagsSDNode>(N0)->Flags);
+ }
+ }
+
+ return SDValue();
+}
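+
+// The sign-mask trick above has a direct scalar analogue in plain C++
+// (hypothetical value; needs <cstring> and <cstdint>):
+//   float f = 1.5f;
+//   std::uint32_t bits;
+//   std::memcpy(&bits, &f, sizeof bits);
+//   bits ^= 0x80000000u;                // flip the IEEE-754 sign bit
+//   std::memcpy(&f, &bits, sizeof f);   // f is now -1.5f
+// visitFABS below performs the matching AND with the complemented mask to
+// clear the sign bit instead of flipping it.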
+
+SDValue DAGCombiner::visitFMINNUM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+ const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+
+ if (N0CFP && N1CFP) {
+ const APFloat &C0 = N0CFP->getValueAPF();
+ const APFloat &C1 = N1CFP->getValueAPF();
+ return DAG.getConstantFP(minnum(C0, C1), SDLoc(N), VT);
+ }
+
+ // Canonicalize to constant on RHS.
+ if (isConstantFPBuildVectorOrConstantFP(N0) &&
+ !isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(ISD::FMINNUM, SDLoc(N), VT, N1, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFMAXNUM(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+ const ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
+ const ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
+
+ if (N0CFP && N1CFP) {
+ const APFloat &C0 = N0CFP->getValueAPF();
+ const APFloat &C1 = N1CFP->getValueAPF();
+ return DAG.getConstantFP(maxnum(C0, C1), SDLoc(N), VT);
+ }
+
+ // Canonicalize to constant on RHS.
+ if (isConstantFPBuildVectorOrConstantFP(N0) &&
+ !isConstantFPBuildVectorOrConstantFP(N1))
+ return DAG.getNode(ISD::FMAXNUM, SDLoc(N), VT, N1, N0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFABS(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (fabs c1) -> fabs(c1)
+ if (isConstantFPBuildVectorOrConstantFP(N0))
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0);
+
+ // fold (fabs (fabs x)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FABS)
+ return N->getOperand(0);
+
+ // fold (fabs (fneg x)) -> (fabs x)
+ // fold (fabs (fcopysign x, y)) -> (fabs x)
+ if (N0.getOpcode() == ISD::FNEG || N0.getOpcode() == ISD::FCOPYSIGN)
+ return DAG.getNode(ISD::FABS, SDLoc(N), VT, N0.getOperand(0));
+
+ // Transform fabs(bitconvert(x)) -> bitconvert(x & ~sign) to avoid loading
+ // constant pool values.
+ if (!TLI.isFAbsFree(VT) &&
+ N0.getOpcode() == ISD::BITCAST &&
+ N0.getNode()->hasOneUse()) {
+ SDValue Int = N0.getOperand(0);
+ EVT IntVT = Int.getValueType();
+ if (IntVT.isInteger() && !IntVT.isVector()) {
+ APInt SignMask;
+ if (N0.getValueType().isVector()) {
+ // For a vector, get a mask such as 0x7f... per scalar element
+ // and splat it.
+ SignMask = ~APInt::getSignBit(N0.getValueType().getScalarSizeInBits());
+ SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask);
+ } else {
+ // For a scalar, just generate 0x7f...
+ SignMask = ~APInt::getSignBit(IntVT.getSizeInBits());
+ }
+ SDLoc DL(N0);
+ Int = DAG.getNode(ISD::AND, DL, IntVT, Int,
+ DAG.getConstant(SignMask, DL, IntVT));
+ AddToWorklist(Int.getNode());
+ return DAG.getBitcast(N->getValueType(0), Int);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitBRCOND(SDNode *N) {
+ SDValue Chain = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+
+ // If N is a constant we could fold this into a fallthrough or unconditional
+ // branch. However, that doesn't happen very often in normal code, because
+ // Instcombine/SimplifyCFG should have handled the available opportunities.
+ // If we did this folding here, it would be necessary to update the
+ // MachineBasicBlock CFG, which is awkward.
+
+ // fold a brcond with a setcc condition into a BR_CC node if BR_CC is legal
+ // on the target.
+ if (N1.getOpcode() == ISD::SETCC &&
+ TLI.isOperationLegalOrCustom(ISD::BR_CC,
+ N1.getOperand(0).getValueType())) {
+ return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
+ Chain, N1.getOperand(2),
+ N1.getOperand(0), N1.getOperand(1), N2);
+ }
+
+ if ((N1.hasOneUse() && N1.getOpcode() == ISD::SRL) ||
+ ((N1.getOpcode() == ISD::TRUNCATE && N1.hasOneUse()) &&
+ (N1.getOperand(0).hasOneUse() &&
+ N1.getOperand(0).getOpcode() == ISD::SRL))) {
+ SDNode *Trunc = nullptr;
+ if (N1.getOpcode() == ISD::TRUNCATE) {
+ // Look past the truncate.
+ Trunc = N1.getNode();
+ N1 = N1.getOperand(0);
+ }
+
+ // Match this pattern so that we can generate simpler code:
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = srl i32 %b, 1
+ // brcond i32 %c ...
+ //
+ // into
+ //
+ // %a = ...
+ // %b = and i32 %a, 2
+ // %c = setcc eq %b, 0
+ // brcond %c ...
+ //
+ // This applies only when the AND constant value has one bit set and the
+ // SRL constant is equal to the log2 of the AND constant. The back-end is
+ // smart enough to convert the result into a TEST/JMP sequence.
+ SDValue Op0 = N1.getOperand(0);
+ SDValue Op1 = N1.getOperand(1);
+
+ if (Op0.getOpcode() == ISD::AND &&
+ Op1.getOpcode() == ISD::Constant) {
+ SDValue AndOp1 = Op0.getOperand(1);
+
+ if (AndOp1.getOpcode() == ISD::Constant) {
+ const APInt &AndConst = cast<ConstantSDNode>(AndOp1)->getAPIntValue();
+
+ if (AndConst.isPowerOf2() &&
+ cast<ConstantSDNode>(Op1)->getAPIntValue()==AndConst.logBase2()) {
+ SDLoc DL(N);
+ SDValue SetCC =
+ DAG.getSetCC(DL,
+ getSetCCResultType(Op0.getValueType()),
+ Op0, DAG.getConstant(0, DL, Op0.getValueType()),
+ ISD::SETNE);
+
+ SDValue NewBRCond = DAG.getNode(ISD::BRCOND, DL,
+ MVT::Other, Chain, SetCC, N2);
+ // Don't add the new BRCond into the worklist or else SimplifySelectCC
+ // will convert it back to (X & C1) >> C2.
+ CombineTo(N, NewBRCond, false);
+ // Truncate is dead.
+ if (Trunc)
+ deleteAndRecombine(Trunc);
+ // Replace the uses of SRL with SETCC
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
+ deleteAndRecombine(N1.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ if (Trunc)
+ // Restore N1 if the above transformation doesn't match.
+ N1 = N->getOperand(1);
+ }
+
+ // Transform br(xor(x, y)) -> br(x != y)
+ // Transform br(xor(xor(x,y), 1)) -> br (x == y)
+ if (N1.hasOneUse() && N1.getOpcode() == ISD::XOR) {
+ SDNode *TheXor = N1.getNode();
+ SDValue Op0 = TheXor->getOperand(0);
+ SDValue Op1 = TheXor->getOperand(1);
+ if (Op0.getOpcode() == Op1.getOpcode()) {
+ // Avoid missing important xor optimizations.
+ if (SDValue Tmp = visitXOR(TheXor)) {
+ if (Tmp.getNode() != TheXor) {
+ DEBUG(dbgs() << "\nReplacing.8 ";
+ TheXor->dump(&DAG);
+ dbgs() << "\nWith: ";
+ Tmp.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1, Tmp);
+ deleteAndRecombine(TheXor);
+ return DAG.getNode(ISD::BRCOND, SDLoc(N),
+ MVT::Other, Chain, Tmp, N2);
+ }
+
+ // visitXOR has changed XOR's operands or replaced the XOR completely,
+ // bail out.
+ return SDValue(N, 0);
+ }
+ }
+
+ if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
+ bool Equal = false;
+ if (isOneConstant(Op0) && Op0.hasOneUse() &&
+ Op0.getOpcode() == ISD::XOR) {
+ TheXor = Op0.getNode();
+ Equal = true;
+ }
+
+ EVT SetCCVT = N1.getValueType();
+ if (LegalTypes)
+ SetCCVT = getSetCCResultType(SetCCVT);
+ SDValue SetCC = DAG.getSetCC(SDLoc(TheXor),
+ SetCCVT,
+ Op0, Op1,
+ Equal ? ISD::SETEQ : ISD::SETNE);
+ // Replace the uses of XOR with SETCC
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1, SetCC);
+ deleteAndRecombine(N1.getNode());
+ return DAG.getNode(ISD::BRCOND, SDLoc(N),
+ MVT::Other, Chain, SetCC, N2);
+ }
+ }
+
+ return SDValue();
+}
+
+// Operand List for BR_CC: Chain, CondCC, CondLHS, CondRHS, DestBB.
+//
+SDValue DAGCombiner::visitBR_CC(SDNode *N) {
+ CondCodeSDNode *CC = cast<CondCodeSDNode>(N->getOperand(1));
+ SDValue CondLHS = N->getOperand(2), CondRHS = N->getOperand(3);
+
+ // If N is a constant we could fold this into a fallthrough or unconditional
+ // branch. However, that doesn't happen very often in normal code, because
+ // Instcombine/SimplifyCFG should have handled the available opportunities.
+ // If we did this folding here, it would be necessary to update the
+ // MachineBasicBlock CFG, which is awkward.
+
+ // Use SimplifySetCC to simplify SETCC's.
+ SDValue Simp = SimplifySetCC(getSetCCResultType(CondLHS.getValueType()),
+ CondLHS, CondRHS, CC->get(), SDLoc(N),
+ false);
+ if (Simp.getNode()) AddToWorklist(Simp.getNode());
+
+ // fold to a simpler setcc
+ if (Simp.getNode() && Simp.getOpcode() == ISD::SETCC)
+ return DAG.getNode(ISD::BR_CC, SDLoc(N), MVT::Other,
+ N->getOperand(0), Simp.getOperand(2),
+ Simp.getOperand(0), Simp.getOperand(1),
+ N->getOperand(4));
+
+ return SDValue();
+}
+
+/// Return true if 'Use' is a load or a store that uses N as its base pointer
+/// and that N may be folded in the load / store addressing mode.
+static bool canFoldInAddressingMode(SDNode *N, SDNode *Use,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ EVT VT;
+ unsigned AS;
+
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Use)) {
+ if (LD->isIndexed() || LD->getBasePtr().getNode() != N)
+ return false;
+ VT = LD->getMemoryVT();
+ AS = LD->getAddressSpace();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(Use)) {
+ if (ST->isIndexed() || ST->getBasePtr().getNode() != N)
+ return false;
+ VT = ST->getMemoryVT();
+ AS = ST->getAddressSpace();
+ } else
+ return false;
+
+ TargetLowering::AddrMode AM;
+ if (N->getOpcode() == ISD::ADD) {
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (Offset)
+ // [reg +/- imm]
+ AM.BaseOffs = Offset->getSExtValue();
+ else
+ // [reg +/- reg]
+ AM.Scale = 1;
+ } else if (N->getOpcode() == ISD::SUB) {
+ ConstantSDNode *Offset = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (Offset)
+ // [reg +/- imm]
+ AM.BaseOffs = -Offset->getSExtValue();
+ else
+ // [reg +/- reg]
+ AM.Scale = 1;
+ } else
+ return false;
+
+ return TLI.isLegalAddressingMode(DAG.getDataLayout(), AM,
+ VT.getTypeForEVT(*DAG.getContext()), AS);
+}
+
+/// Try turning a load/store into a pre-indexed load/store when the base
+/// pointer is an add or subtract and it has other uses besides the load/store.
+/// After the transformation, the new indexed load/store has effectively folded
+/// the add/subtract in and all of its other uses are redirected to the
+/// new load/store.
+bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ bool isLoad = true;
+ SDValue Ptr;
+ EVT VT;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(ISD::PRE_INC, VT) &&
+ !TLI.isIndexedLoadLegal(ISD::PRE_DEC, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(ISD::PRE_INC, VT) &&
+ !TLI.isIndexedStoreLegal(ISD::PRE_DEC, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ isLoad = false;
+ } else {
+ return false;
+ }
+
+ // If the pointer is not an add/sub, or if it doesn't have multiple uses, bail
+ // out. There is no reason to make this a preinc/predec.
+ if ((Ptr.getOpcode() != ISD::ADD && Ptr.getOpcode() != ISD::SUB) ||
+ Ptr.getNode()->hasOneUse())
+ return false;
+
+ // Ask the target to do addressing mode selection.
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ if (!TLI.getPreIndexedAddressParts(N, BasePtr, Offset, AM, DAG))
+ return false;
+
+ // Backends without true r+i pre-indexed forms may need to pass a
+ // constant base with a variable offset so that constant coercion
+ // will work with the patterns in canonical form.
+ bool Swapped = false;
+ if (isa<ConstantSDNode>(BasePtr)) {
+ std::swap(BasePtr, Offset);
+ Swapped = true;
+ }
+
+ // Don't create an indexed load / store with zero offset.
+ if (isNullConstant(Offset))
+ return false;
+
+ // Try turning it into a pre-indexed load / store except when:
+ // 1) The new base ptr is a frame index.
+ // 2) If N is a store and the new base ptr is either the same as or is a
+ // predecessor of the value being stored.
+ // 3) Another use of old base ptr is a predecessor of N. If ptr is folded
+ // that would create a cycle.
+ // 4) All uses are load / store ops that use it as old base ptr.
+
+ // Check #1. Preinc'ing a frame index would require copying the stack pointer
+ // (plus the implicit offset) to a register to preinc anyway.
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ return false;
+
+ // Check #2.
+ if (!isLoad) {
+ SDValue Val = cast<StoreSDNode>(N)->getValue();
+ if (Val == BasePtr || BasePtr.getNode()->isPredecessorOf(Val.getNode()))
+ return false;
+ }
+
+ // Caches for hasPredecessorHelper.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(N);
+
+ // If the offset is a constant, there may be other adds of constants that
+ // can be folded with this one. We should do this to avoid having to keep
+ // a copy of the original base pointer.
+ SmallVector<SDNode *, 16> OtherUses;
+ if (isa<ConstantSDNode>(Offset))
+ for (SDNode::use_iterator UI = BasePtr.getNode()->use_begin(),
+ UE = BasePtr.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDUse &Use = UI.getUse();
+ // Skip the use that is Ptr and uses of other results from BasePtr's
+ // node (important for nodes that return multiple results).
+ if (Use.getUser() == Ptr.getNode() || Use != BasePtr)
+ continue;
+
+ if (SDNode::hasPredecessorHelper(Use.getUser(), Visited, Worklist))
+ continue;
+
+ if (Use.getUser()->getOpcode() != ISD::ADD &&
+ Use.getUser()->getOpcode() != ISD::SUB) {
+ OtherUses.clear();
+ break;
+ }
+
+ SDValue Op1 = Use.getUser()->getOperand((UI.getOperandNo() + 1) & 1);
+ if (!isa<ConstantSDNode>(Op1)) {
+ OtherUses.clear();
+ break;
+ }
+
+ // FIXME: In some cases, we can be smarter about this.
+ if (Op1.getValueType() != Offset.getValueType()) {
+ OtherUses.clear();
+ break;
+ }
+
+ OtherUses.push_back(Use.getUser());
+ }
+
+ if (Swapped)
+ std::swap(BasePtr, Offset);
+
+ // Now check for #3 and #4.
+ bool RealUse = false;
+
+ for (SDNode *Use : Ptr.getNode()->uses()) {
+ if (Use == N)
+ continue;
+ if (SDNode::hasPredecessorHelper(Use, Visited, Worklist))
+ return false;
+
+ // If Ptr may be folded in the addressing mode of another use, then it's
+ // not profitable to do this transformation.
+ if (!canFoldInAddressingMode(Ptr.getNode(), Use, DAG, TLI))
+ RealUse = true;
+ }
+
+ if (!RealUse)
+ return false;
+
+ SDValue Result;
+ if (isLoad)
+ Result = DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
+ BasePtr, Offset, AM);
+ else
+ Result = DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
+ BasePtr, Offset, AM);
+ ++PreIndexedNodes;
+ ++NodesCombined;
+ DEBUG(dbgs() << "\nReplacing.4 ";
+ N->dump(&DAG);
+ dbgs() << "\nWith: ";
+ Result.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ if (isLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ deleteAndRecombine(N);
+
+ if (Swapped)
+ std::swap(BasePtr, Offset);
+
+ // Replace other uses of BasePtr that can be updated to use Ptr
+ for (unsigned i = 0, e = OtherUses.size(); i != e; ++i) {
+ unsigned OffsetIdx = 1;
+ if (OtherUses[i]->getOperand(OffsetIdx).getNode() == BasePtr.getNode())
+ OffsetIdx = 0;
+ assert(OtherUses[i]->getOperand(!OffsetIdx).getNode() ==
+ BasePtr.getNode() && "Expected BasePtr operand");
+
+ // We need to replace ptr0 in the following expression:
+ // x0 * offset0 + y0 * ptr0 = t0
+ // knowing that
+ // x1 * offset1 + y1 * ptr0 = t1 (the indexed load/store)
+ //
+ // where x0, x1, y0 and y1 in {-1, 1} are given by the types of the
+ // indexed load/store and the expression that needs to be rewritten.
+ //
+ // Therefore, we have:
+ // t0 = (x0 * offset0 - x1 * y0 * y1 *offset1) + (y0 * y1) * t1
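+ //
+ // For instance (illustrative values only): if the indexed access computes
+ // t1 = ptr0 + 4 (x1 = y1 = 1) and another use computes t0 = ptr0 + 8
+ // (x0 = y0 = 1), the formula yields t0 = (8 - 4) + t1 = t1 + 4, so that
+ // use no longer needs the old base pointer.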
+
+ ConstantSDNode *CN =
+ cast<ConstantSDNode>(OtherUses[i]->getOperand(OffsetIdx));
+ int X0, X1, Y0, Y1;
+ const APInt &Offset0 = CN->getAPIntValue();
+ APInt Offset1 = cast<ConstantSDNode>(Offset)->getAPIntValue();
+
+ X0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 1) ? -1 : 1;
+ Y0 = (OtherUses[i]->getOpcode() == ISD::SUB && OffsetIdx == 0) ? -1 : 1;
+ X1 = (AM == ISD::PRE_DEC && !Swapped) ? -1 : 1;
+ Y1 = (AM == ISD::PRE_DEC && Swapped) ? -1 : 1;
+
+ unsigned Opcode = (Y0 * Y1 < 0) ? ISD::SUB : ISD::ADD;
+
+ APInt CNV = Offset0;
+ if (X0 < 0) CNV = -CNV;
+ if (X1 * Y0 * Y1 < 0) CNV = CNV + Offset1;
+ else CNV = CNV - Offset1;
+
+ SDLoc DL(OtherUses[i]);
+
+ // We can now generate the new expression.
+ SDValue NewOp1 = DAG.getConstant(CNV, DL, CN->getValueType(0));
+ SDValue NewOp2 = Result.getValue(isLoad ? 1 : 0);
+
+ SDValue NewUse = DAG.getNode(Opcode,
+ DL,
+ OtherUses[i]->getValueType(0), NewOp1, NewOp2);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(OtherUses[i], 0), NewUse);
+ deleteAndRecombine(OtherUses[i]);
+ }
+
+ // Replace the uses of Ptr with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(Ptr, Result.getValue(isLoad ? 1 : 0));
+ deleteAndRecombine(Ptr.getNode());
+
+ return true;
+}
+
+/// Try to combine a load/store with an add/sub of the base pointer node into a
+/// post-indexed load/store. After the transformation, the add/subtract is
+/// effectively folded into the new indexed load/store and all of its other
+/// uses are redirected to the new load/store.
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ bool isLoad = true;
+ SDValue Ptr;
+ EVT VT;
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+ if (LD->isIndexed())
+ return false;
+ VT = LD->getMemoryVT();
+ if (!TLI.isIndexedLoadLegal(ISD::POST_INC, VT) &&
+ !TLI.isIndexedLoadLegal(ISD::POST_DEC, VT))
+ return false;
+ Ptr = LD->getBasePtr();
+ } else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
+ if (ST->isIndexed())
+ return false;
+ VT = ST->getMemoryVT();
+ if (!TLI.isIndexedStoreLegal(ISD::POST_INC, VT) &&
+ !TLI.isIndexedStoreLegal(ISD::POST_DEC, VT))
+ return false;
+ Ptr = ST->getBasePtr();
+ isLoad = false;
+ } else {
+ return false;
+ }
+
+ if (Ptr.getNode()->hasOneUse())
+ return false;
+
+ for (SDNode *Op : Ptr.getNode()->uses()) {
+ if (Op == N ||
+ (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
+ continue;
+
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
+ // Don't create an indexed load / store with zero offset.
+ if (isNullConstant(Offset))
+ continue;
+
+ // Try turning it into a post-indexed load / store except when
+ // 1) All uses are load / store ops that use it as base ptr (and
+ // it may be folded into the addressing mode).
+ // 2) Op must be independent of N, i.e. Op is neither a predecessor
+ // nor a successor of N. Otherwise, if Op is folded that would
+ // create a cycle.
+
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ continue;
+
+ // Check for #1.
+ bool TryNext = false;
+ for (SDNode *Use : BasePtr.getNode()->uses()) {
+ if (Use == Ptr.getNode())
+ continue;
+
+ // If all the uses are load / store addresses, then don't do the
+ // transformation.
+ if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB){
+ bool RealUse = false;
+ for (SDNode *UseUse : Use->uses()) {
+ if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
+ RealUse = true;
+ }
+
+ if (!RealUse) {
+ TryNext = true;
+ break;
+ }
+ }
+ }
+
+ if (TryNext)
+ continue;
+
+ // Check for #2
+ if (!Op->isPredecessorOf(N) && !N->isPredecessorOf(Op)) {
+ SDValue Result = isLoad
+ ? DAG.getIndexedLoad(SDValue(N,0), SDLoc(N),
+ BasePtr, Offset, AM)
+ : DAG.getIndexedStore(SDValue(N,0), SDLoc(N),
+ BasePtr, Offset, AM);
+ ++PostIndexedNodes;
+ ++NodesCombined;
+ DEBUG(dbgs() << "\nReplacing.5 ";
+ N->dump(&DAG);
+ dbgs() << "\nWith: ";
+ Result.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ if (isLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
+ }
+
+ // Finally, since the node is now dead, remove it from the graph.
+ deleteAndRecombine(N);
+
+ // Replace the uses of Use with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+ Result.getValue(isLoad ? 1 : 0));
+ deleteAndRecombine(Op);
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+/// \brief Return the base-pointer arithmetic from an indexed \p LD.
+SDValue DAGCombiner::SplitIndexingFromLoad(LoadSDNode *LD) {
+ ISD::MemIndexedMode AM = LD->getAddressingMode();
+ assert(AM != ISD::UNINDEXED);
+ SDValue BP = LD->getOperand(1);
+ SDValue Inc = LD->getOperand(2);
+
+ // Some backends use TargetConstants for load offsets, but don't expect
+ // TargetConstants in general ADD nodes. We can convert these constants into
+ // regular Constants (if the constant is not opaque).
+ assert((Inc.getOpcode() != ISD::TargetConstant ||
+ !cast<ConstantSDNode>(Inc)->isOpaque()) &&
+ "Cannot split out indexing using opaque target constants");
+ if (Inc.getOpcode() == ISD::TargetConstant) {
+ ConstantSDNode *ConstInc = cast<ConstantSDNode>(Inc);
+ Inc = DAG.getConstant(*ConstInc->getConstantIntValue(), SDLoc(Inc),
+ ConstInc->getValueType(0));
+ }
+
+ unsigned Opc =
+ (AM == ISD::PRE_INC || AM == ISD::POST_INC ? ISD::ADD : ISD::SUB);
+ return DAG.getNode(Opc, SDLoc(LD), BP.getSimpleValueType(), BP, Inc);
+}
+
+SDValue DAGCombiner::visitLOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+
+ // If load is not volatile and there are no uses of the loaded value (and
+ // the updated indexed value in case of indexed loads), change uses of the
+ // chain value into uses of the chain input (i.e. delete the dead load).
+ if (!LD->isVolatile()) {
+ if (N->getValueType(1) == MVT::Other) {
+ // Unindexed loads.
+ if (!N->hasAnyUseOfValue(0)) {
+ // It's not safe to use the two value CombineTo variant here. e.g.
+ // v1, chain2 = load chain1, loc
+ // v2, chain3 = load chain2, loc
+ // v3 = add v2, c
+ // Now we replace use of chain2 with chain1. This makes the second load
+ // isomorphic to the one we are deleting, and thus makes this load live.
+ DEBUG(dbgs() << "\nReplacing.6 ";
+ N->dump(&DAG);
+ dbgs() << "\nWith chain: ";
+ Chain.getNode()->dump(&DAG);
+ dbgs() << "\n");
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+
+ if (N->use_empty())
+ deleteAndRecombine(N);
+
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ } else {
+ // Indexed loads.
+ assert(N->getValueType(2) == MVT::Other && "Malformed indexed loads?");
+
+ // If this load has an opaque TargetConstant offset, then we cannot split
+ // the indexing into an add/sub directly (that TargetConstant may not be
+ // valid for a different type of node, and we cannot convert an opaque
+ // target constant into a regular constant).
+ bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
+ cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();
+
+ if (!N->hasAnyUseOfValue(0) &&
+ ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
+ SDValue Undef = DAG.getUNDEF(N->getValueType(0));
+ SDValue Index;
+ if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
+ Index = SplitIndexingFromLoad(LD);
+ // Try to fold the base pointer arithmetic into subsequent loads and
+ // stores.
+ AddUsersToWorklist(N);
+ } else
+ Index = DAG.getUNDEF(N->getValueType(1));
+ DEBUG(dbgs() << "\nReplacing.7 ";
+ N->dump(&DAG);
+ dbgs() << "\nWith: ";
+ Undef.getNode()->dump(&DAG);
+ dbgs() << " and 2 other values\n");
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Undef);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Index);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 2), Chain);
+ deleteAndRecombine(N);
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // If this load is directly stored, replace the load value with the stored
+ // value.
+ // TODO: Handle store large -> read small portion.
+ // TODO: Handle TRUNCSTORE/LOADEXT
+ if (ISD::isNormalLoad(N) && !LD->isVolatile()) {
+ if (ISD::isNON_TRUNCStore(Chain.getNode())) {
+ StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
+ if (PrevST->getBasePtr() == Ptr &&
+ PrevST->getValue().getValueType() == N->getValueType(0))
+ return CombineTo(N, Chain.getOperand(1), Chain);
+ }
+ }
+
+ // Try to infer better alignment information than the load already has.
+ if (OptLevel != CodeGenOpt::None && LD->isUnindexed()) {
+ if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+ if (Align > LD->getMemOperand()->getBaseAlignment()) {
+ SDValue NewLoad = DAG.getExtLoad(
+ LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), LD->getMemoryVT(), Align,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ if (NewLoad.getNode() != N)
+ return CombineTo(N, NewLoad, SDValue(NewLoad.getNode(), 1), true);
+ }
+ }
+ }
+
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+#ifndef NDEBUG
+ if (CombinerAAOnlyFunc.getNumOccurrences() &&
+ CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
+ UseAA = false;
+#endif
+ if (UseAA && LD->isUnindexed()) {
+ // Walk up chain skipping non-aliasing memory nodes.
+ SDValue BetterChain = FindBetterChain(N, Chain);
+
+ // If there is a better chain.
+ if (Chain != BetterChain) {
+ SDValue ReplLoad;
+
+ // Replace the chain to avoid a dependency on the old chain.
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD) {
+ ReplLoad = DAG.getLoad(N->getValueType(0), SDLoc(LD),
+ BetterChain, Ptr, LD->getMemOperand());
+ } else {
+ ReplLoad = DAG.getExtLoad(LD->getExtensionType(), SDLoc(LD),
+ LD->getValueType(0),
+ BetterChain, Ptr, LD->getMemoryVT(),
+ LD->getMemOperand());
+ }
+
+ // Create token factor to keep old chain connected.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, SDLoc(N),
+ MVT::Other, Chain, ReplLoad.getValue(1));
+
+ // Make sure the new and old chains are cleaned up.
+ AddToWorklist(Token.getNode());
+
+ // Replace uses with load result and token factor. Don't add users
+ // to work list.
+ return CombineTo(N, ReplLoad.getValue(0), Token, false);
+ }
+ }
+
+ // Try transforming N to an indexed load.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ // Try to slice up N to more direct loads if the slices are mapped to
+ // different register banks or pairing can take place.
+ if (SliceUpLoad(N))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+namespace {
+/// \brief Helper structure used to slice a load in smaller loads.
+/// Basically a slice is obtained from the following sequence:
+/// Origin = load Ty1, Base
+/// Shift = srl Ty1 Origin, CstTy Amount
+/// Inst = trunc Shift to Ty2
+///
+/// Then, it will be rewritten into:
+/// Slice = load SliceTy, Base + SliceOffset
+/// [Inst = zext Slice to Ty2], only if SliceTy <> Ty2
+///
+/// SliceTy is deduced from the number of bits that are actually used to
+/// build Inst.
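+///
+/// For instance (an illustrative case, not the only one handled): an i64
+/// Origin used as (trunc Origin to i32) and (trunc (srl Origin, 32) to i32)
+/// yields two i32 slices, loaded at Base and Base + 4 on a little-endian
+/// target (the offsets swap on big-endian).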
+struct LoadedSlice {
+ /// \brief Helper structure used to compute the cost of a slice.
+ struct Cost {
+ /// Are we optimizing for code size.
+ bool ForCodeSize;
+ /// Various cost.
+ unsigned Loads;
+ unsigned Truncates;
+ unsigned CrossRegisterBanksCopies;
+ unsigned ZExts;
+ unsigned Shift;
+
+ Cost(bool ForCodeSize = false)
+ : ForCodeSize(ForCodeSize), Loads(0), Truncates(0),
+ CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {}
+
+ /// \brief Get the cost of one isolated slice.
+ Cost(const LoadedSlice &LS, bool ForCodeSize = false)
+ : ForCodeSize(ForCodeSize), Loads(1), Truncates(0),
+ CrossRegisterBanksCopies(0), ZExts(0), Shift(0) {
+ EVT TruncType = LS.Inst->getValueType(0);
+ EVT LoadedType = LS.getLoadedType();
+ if (TruncType != LoadedType &&
+ !LS.DAG->getTargetLoweringInfo().isZExtFree(LoadedType, TruncType))
+ ZExts = 1;
+ }
+
+ /// \brief Account for slicing gain in the current cost.
+ /// Slicing provides a few gains, like removing a shift or a
+ /// truncate. This method allows the cost of the original load to be
+ /// grown by the gain from this slice.
+ void addSliceGain(const LoadedSlice &LS) {
+ // Each slice saves a truncate.
+ const TargetLowering &TLI = LS.DAG->getTargetLoweringInfo();
+ if (!TLI.isTruncateFree(LS.Inst->getOperand(0).getValueType(),
+ LS.Inst->getValueType(0)))
+ ++Truncates;
+ // If there is a shift amount, this slice gets rid of it.
+ if (LS.Shift)
+ ++Shift;
+ // If this slice can merge a cross register bank copy, account for it.
+ if (LS.canMergeExpensiveCrossRegisterBankCopy())
+ ++CrossRegisterBanksCopies;
+ }
+
+ Cost &operator+=(const Cost &RHS) {
+ Loads += RHS.Loads;
+ Truncates += RHS.Truncates;
+ CrossRegisterBanksCopies += RHS.CrossRegisterBanksCopies;
+ ZExts += RHS.ZExts;
+ Shift += RHS.Shift;
+ return *this;
+ }
+
+ bool operator==(const Cost &RHS) const {
+ return Loads == RHS.Loads && Truncates == RHS.Truncates &&
+ CrossRegisterBanksCopies == RHS.CrossRegisterBanksCopies &&
+ ZExts == RHS.ZExts && Shift == RHS.Shift;
+ }
+
+ bool operator!=(const Cost &RHS) const { return !(*this == RHS); }
+
+ bool operator<(const Cost &RHS) const {
+ // Assume cross register banks copies are as expensive as loads.
+ // FIXME: Do we want some more target hooks?
+ unsigned ExpensiveOpsLHS = Loads + CrossRegisterBanksCopies;
+ unsigned ExpensiveOpsRHS = RHS.Loads + RHS.CrossRegisterBanksCopies;
+ // Unless we are optimizing for code size, consider the
+ // expensive operation first.
+ if (!ForCodeSize && ExpensiveOpsLHS != ExpensiveOpsRHS)
+ return ExpensiveOpsLHS < ExpensiveOpsRHS;
+ return (Truncates + ZExts + Shift + ExpensiveOpsLHS) <
+ (RHS.Truncates + RHS.ZExts + RHS.Shift + ExpensiveOpsRHS);
+ }
+
+ bool operator>(const Cost &RHS) const { return RHS < *this; }
+
+ bool operator<=(const Cost &RHS) const { return !(RHS < *this); }
+
+ bool operator>=(const Cost &RHS) const { return !(*this < RHS); }
+ };
+ // The last instruction that represents the slice. This should be a
+ // truncate instruction.
+ SDNode *Inst;
+ // The original load instruction.
+ LoadSDNode *Origin;
+ // The right shift amount in bits from the original load.
+ unsigned Shift;
+ // The DAG from which Origin comes.
+ // This is used to get some contextual information about legal types, etc.
+ SelectionDAG *DAG;
+
+ LoadedSlice(SDNode *Inst = nullptr, LoadSDNode *Origin = nullptr,
+ unsigned Shift = 0, SelectionDAG *DAG = nullptr)
+ : Inst(Inst), Origin(Origin), Shift(Shift), DAG(DAG) {}
+
+ /// \brief Get the bits used in a chunk of bits \p BitWidth large.
+ /// \return Result is \p BitWidth bits wide, with used bits set to 1 and
+ /// unused bits set to 0.
+ APInt getUsedBits() const {
+ // Reproduce the trunc(lshr) sequence:
+ // - Start from the truncated value.
+ // - Zero extend to the desired bit width.
+ // - Shift left.
+ assert(Origin && "No original load to compare against.");
+ unsigned BitWidth = Origin->getValueSizeInBits(0);
+ assert(Inst && "This slice is not bound to an instruction");
+ assert(Inst->getValueSizeInBits(0) <= BitWidth &&
+ "Extracted slice is bigger than the whole type!");
+ APInt UsedBits(Inst->getValueSizeInBits(0), 0);
+ UsedBits.setAllBits();
+ UsedBits = UsedBits.zext(BitWidth);
+ UsedBits <<= Shift;
+ return UsedBits;
+ }
+
+ /// \brief Get the size of the slice to be loaded in bytes.
+ unsigned getLoadedSize() const {
+ unsigned SliceSize = getUsedBits().countPopulation();
+ assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte.");
+ return SliceSize / 8;
+ }
+
+ /// \brief Get the type that will be loaded for this slice.
+ /// Note: This may not be the final type for the slice.
+ EVT getLoadedType() const {
+ assert(DAG && "Missing context");
+ LLVMContext &Ctxt = *DAG->getContext();
+ return EVT::getIntegerVT(Ctxt, getLoadedSize() * 8);
+ }
+
+ /// \brief Get the alignment of the load used for this slice.
+ unsigned getAlignment() const {
+ unsigned Alignment = Origin->getAlignment();
+ unsigned Offset = getOffsetFromBase();
+ if (Offset != 0)
+ Alignment = MinAlign(Alignment, Alignment + Offset);
+ return Alignment;
+ }
+
+ /// \brief Check if this slice can be rewritten with legal operations.
+ bool isLegal() const {
+ // An invalid slice is not legal.
+ if (!Origin || !Inst || !DAG)
+ return false;
+
+ // Offsets are for indexed loads only; we do not handle that.
+ if (!Origin->getOffset().isUndef())
+ return false;
+
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+
+ // Check that the type is legal.
+ EVT SliceType = getLoadedType();
+ if (!TLI.isTypeLegal(SliceType))
+ return false;
+
+ // Check that the load is legal for this type.
+ if (!TLI.isOperationLegal(ISD::LOAD, SliceType))
+ return false;
+
+ // Check that the offset can be computed.
+ // 1. Check its type.
+ EVT PtrType = Origin->getBasePtr().getValueType();
+ if (PtrType == MVT::Untyped || PtrType.isExtended())
+ return false;
+
+ // 2. Check that it fits in the immediate.
+ if (!TLI.isLegalAddImmediate(getOffsetFromBase()))
+ return false;
+
+ // 3. Check that the computation is legal.
+ if (!TLI.isOperationLegal(ISD::ADD, PtrType))
+ return false;
+
+ // Check that the zext is legal if it needs one.
+ EVT TruncateType = Inst->getValueType(0);
+ if (TruncateType != SliceType &&
+ !TLI.isOperationLegal(ISD::ZERO_EXTEND, TruncateType))
+ return false;
+
+ return true;
+ }
+
+ /// \brief Get the offset in bytes of this slice in the original chunk of
+ /// bits.
+ /// \pre DAG != nullptr.
+ uint64_t getOffsetFromBase() const {
+ assert(DAG && "Missing context.");
+ bool IsBigEndian = DAG->getDataLayout().isBigEndian();
+ assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not supported.");
+ uint64_t Offset = Shift / 8;
+ unsigned TySizeInBytes = Origin->getValueSizeInBits(0) / 8;
+ assert(!(Origin->getValueSizeInBits(0) & 0x7) &&
+ "The size of the original loaded type is not a multiple of a"
+ " byte.");
+ // If Offset is bigger than TySizeInBytes, it means we are loading all
+ // zeros. This should have been optimized before in the process.
+ assert(TySizeInBytes > Offset &&
+ "Invalid shift amount for given loaded size");
+ if (IsBigEndian)
+ Offset = TySizeInBytes - Offset - getLoadedSize();
+ return Offset;
+ }
+
+ /// \brief Generate the sequence of instructions to load the slice
+ /// represented by this object and redirect the uses of this slice to
+ /// this new sequence of instructions.
+ /// \pre this->Inst && this->Origin are valid Instructions and this
+ /// object passed the legal check: LoadedSlice::isLegal returned true.
+ /// \return The last instruction of the sequence used to load the slice.
+ SDValue loadSlice() const {
+ assert(Inst && Origin && "Unable to replace a non-existing slice.");
+ const SDValue &OldBaseAddr = Origin->getBasePtr();
+ SDValue BaseAddr = OldBaseAddr;
+ // Get the offset in that chunk of bytes w.r.t. the endianness.
+ int64_t Offset = static_cast<int64_t>(getOffsetFromBase());
+ assert(Offset >= 0 && "Offset too big to fit in int64_t!");
+ if (Offset) {
+ // BaseAddr = BaseAddr + Offset.
+ EVT ArithType = BaseAddr.getValueType();
+ SDLoc DL(Origin);
+ BaseAddr = DAG->getNode(ISD::ADD, DL, ArithType, BaseAddr,
+ DAG->getConstant(Offset, DL, ArithType));
+ }
+
+ // Create the type of the loaded slice according to its size.
+ EVT SliceType = getLoadedType();
+
+ // Create the load for the slice.
+ SDValue LastInst =
+ DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
+ Origin->getPointerInfo().getWithOffset(Offset),
+ getAlignment(), Origin->getMemOperand()->getFlags());
+ // If the final type is not the same as the loaded type, this means that
+ // we have to pad with zero. Create a zero extend for that.
+ EVT FinalType = Inst->getValueType(0);
+ if (SliceType != FinalType)
+ LastInst =
+ DAG->getNode(ISD::ZERO_EXTEND, SDLoc(LastInst), FinalType, LastInst);
+ return LastInst;
+ }
+
+ /// \brief Check if this slice can be merged with an expensive cross register
+ /// bank copy. E.g.,
+ /// i = load i32
+ /// f = bitcast i32 i to float
+ bool canMergeExpensiveCrossRegisterBankCopy() const {
+ if (!Inst || !Inst->hasOneUse())
+ return false;
+ SDNode *Use = *Inst->use_begin();
+ if (Use->getOpcode() != ISD::BITCAST)
+ return false;
+ assert(DAG && "Missing context");
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ EVT ResVT = Use->getValueType(0);
+ const TargetRegisterClass *ResRC = TLI.getRegClassFor(ResVT.getSimpleVT());
+ const TargetRegisterClass *ArgRC =
+ TLI.getRegClassFor(Use->getOperand(0).getValueType().getSimpleVT());
+ if (ArgRC == ResRC || !TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // At this point, we know that we perform a cross-register-bank copy.
+ // Check if it is expensive.
+ const TargetRegisterInfo *TRI = DAG->getSubtarget().getRegisterInfo();
+ // Assume bitcasts are cheap, unless both register classes do not
+ // explicitly share a common sub class.
+ if (!TRI || TRI->getCommonSubClass(ArgRC, ResRC))
+ return false;
+
+ // Check if it will be merged with the load.
+ // 1. Check the alignment constraint.
+ unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
+ ResVT.getTypeForEVT(*DAG->getContext()));
+
+ if (RequiredAlignment > getAlignment())
+ return false;
+
+ // 2. Check that the load is a legal operation for that type.
+ if (!TLI.isOperationLegal(ISD::LOAD, ResVT))
+ return false;
+
+ // 3. Check that we do not have a zext in the way.
+ if (Inst->getValueType(0) != getLoadedType())
+ return false;
+
+ return true;
+ }
+};
+}
+
+/// \brief Check that all bits set in \p UsedBits form a dense region, i.e.,
+/// \p UsedBits looks like 0..0 1..1 0..0.
+static bool areUsedBitsDense(const APInt &UsedBits) {
+ // If all the bits are one, this is dense!
+ if (UsedBits.isAllOnesValue())
+ return true;
+
+ // Get rid of the unused bits on the right.
+ APInt NarrowedUsedBits = UsedBits.lshr(UsedBits.countTrailingZeros());
+ // Get rid of the unused bits on the left.
+ if (NarrowedUsedBits.countLeadingZeros())
+ NarrowedUsedBits = NarrowedUsedBits.trunc(NarrowedUsedBits.getActiveBits());
+ // Check that the chunk of bits is completely used.
+ return NarrowedUsedBits.isAllOnesValue();
+}
+
+/// \brief Check whether or not \p First and \p Second are next to each other
+/// in memory. This means that there is no hole between the bits loaded
+/// by \p First and the bits loaded by \p Second.
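+///
+/// For example (illustrative): slices of an i32 load with used bits
+/// 0x0000FFFF and 0xFFFF0000 combine to 0xFFFFFFFF, which is dense, so the
+/// slices are next to each other.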
+static bool areSlicesNextToEachOther(const LoadedSlice &First,
+ const LoadedSlice &Second) {
+ assert(First.Origin == Second.Origin && First.Origin &&
+ "Unable to match different memory origins.");
+ APInt UsedBits = First.getUsedBits();
+ assert((UsedBits & Second.getUsedBits()) == 0 &&
+ "Slices are not supposed to overlap.");
+ UsedBits |= Second.getUsedBits();
+ return areUsedBitsDense(UsedBits);
+}
+
+/// \brief Adjust the \p GlobalLSCost according to the target
+/// pairing capabilities and the layout of the slices.
+/// \pre \p GlobalLSCost should account for at least as many loads as
+/// there is in the slices in \p LoadedSlices.
+static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ LoadedSlice::Cost &GlobalLSCost) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+ // If there are fewer than 2 elements, no pairing is possible.
+ if (NumberOfSlices < 2)
+ return;
+
+ // Sort the slices so that elements that are likely to be next to each
+ // other in memory are next to each other in the list.
+ std::sort(LoadedSlices.begin(), LoadedSlices.end(),
+ [](const LoadedSlice &LHS, const LoadedSlice &RHS) {
+ assert(LHS.Origin == RHS.Origin && "Different bases not implemented.");
+ return LHS.getOffsetFromBase() < RHS.getOffsetFromBase();
+ });
+ const TargetLowering &TLI = LoadedSlices[0].DAG->getTargetLoweringInfo();
+ // First (resp. Second) is the first (resp. second) potential candidate
+ // to be placed in a paired load.
+ const LoadedSlice *First = nullptr;
+ const LoadedSlice *Second = nullptr;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice,
+ // Set the beginning of the pair.
+ First = Second) {
+
+ Second = &LoadedSlices[CurrSlice];
+
+ // If First is NULL, it means we start a new pair.
+ // Get to the next slice.
+ if (!First)
+ continue;
+
+ EVT LoadedType = First->getLoadedType();
+
+ // If the types of the slices are different, we cannot pair them.
+ if (LoadedType != Second->getLoadedType())
+ continue;
+
+ // Check if the target supplies paired loads for this type.
+ unsigned RequiredAlignment = 0;
+ if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
+ // Move to the next pair; this type is hopeless.
+ Second = nullptr;
+ continue;
+ }
+ // Check if we meet the alignment requirement.
+ if (RequiredAlignment > First->getAlignment())
+ continue;
+
+ // Check that both loads are next to each other in memory.
+ if (!areSlicesNextToEachOther(*First, *Second))
+ continue;
+
+ assert(GlobalLSCost.Loads > 0 && "We save more loads than we created!");
+ --GlobalLSCost.Loads;
+ // Move to the next pair.
+ Second = nullptr;
+ }
+}
+
+/// \brief Check the profitability of all involved LoadedSlices.
+/// Currently, it is considered profitable if there are exactly two
+/// involved slices (1) which are (2) next to each other in memory, and
+/// whose cost (\see LoadedSlice::Cost) is smaller than the original load (3).
+///
+/// Note: The order of the elements in \p LoadedSlices may be modified, but not
+/// the elements themselves.
+///
+/// FIXME: When the cost model will be mature enough, we can relax
+/// constraints (1) and (2).
+static bool isSlicingProfitable(SmallVectorImpl<LoadedSlice> &LoadedSlices,
+ const APInt &UsedBits, bool ForCodeSize) {
+ unsigned NumberOfSlices = LoadedSlices.size();
+ if (StressLoadSlicing)
+ return NumberOfSlices > 1;
+
+ // Check (1).
+ if (NumberOfSlices != 2)
+ return false;
+
+ // Check (2).
+ if (!areUsedBitsDense(UsedBits))
+ return false;
+
+ // Check (3).
+ LoadedSlice::Cost OrigCost(ForCodeSize), GlobalSlicingCost(ForCodeSize);
+ // The original code has one big load.
+ OrigCost.Loads = 1;
+ for (unsigned CurrSlice = 0; CurrSlice < NumberOfSlices; ++CurrSlice) {
+ const LoadedSlice &LS = LoadedSlices[CurrSlice];
+ // Accumulate the cost of all the slices.
+ LoadedSlice::Cost SliceCost(LS, ForCodeSize);
+ GlobalSlicingCost += SliceCost;
+
+ // Account as cost in the original configuration the gain obtained
+ // with the current slices.
+ OrigCost.addSliceGain(LS);
+ }
+
+ // If the target supports paired load, adjust the cost accordingly.
+ adjustCostForPairing(LoadedSlices, GlobalSlicingCost);
+ return OrigCost > GlobalSlicingCost;
+}
+
+/// \brief If the given load, \p N, is used only by trunc or trunc(lshr)
+/// operations, split it in the various pieces being extracted.
+///
+/// This sort of thing is introduced by SROA.
+/// This slicing takes care not to insert overlapping loads.
+/// \pre LI is a simple load (i.e., not an atomic or volatile load).
+bool DAGCombiner::SliceUpLoad(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ if (LD->isVolatile() || !ISD::isNormalLoad(LD) ||
+ !LD->getValueType(0).isInteger())
+ return false;
+
+ // Keep track of already used bits to detect overlapping values.
+ // In that case, we will just abort the transformation.
+ APInt UsedBits(LD->getValueSizeInBits(0), 0);
+
+ SmallVector<LoadedSlice, 4> LoadedSlices;
+
+ // Check if this load is used as several smaller chunks of bits.
+ // Basically, look for uses in trunc or trunc(lshr) and record a new chain
+ // of computation for each trunc.
+ for (SDNode::use_iterator UI = LD->use_begin(), UIEnd = LD->use_end();
+ UI != UIEnd; ++UI) {
+ // Skip the uses of the chain.
+ if (UI.getUse().getResNo() != 0)
+ continue;
+
+ SDNode *User = *UI;
+ unsigned Shift = 0;
+
+ // Check if this is a trunc(lshr).
+ if (User->getOpcode() == ISD::SRL && User->hasOneUse() &&
+ isa<ConstantSDNode>(User->getOperand(1))) {
+ Shift = cast<ConstantSDNode>(User->getOperand(1))->getZExtValue();
+ User = *User->use_begin();
+ }
+
+ // At this point, User is a TRUNCATE iff we encountered trunc or
+ // trunc(lshr).
+ if (User->getOpcode() != ISD::TRUNCATE)
+ return false;
+
+ // The width of the type must be a power of 2 and greater than 8-bits.
+ // Otherwise the load cannot be represented in LLVM IR.
+ // Moreover, if we shifted with a non-8-bits multiple, the slice
+ // will be across several bytes. We do not support that.
+ unsigned Width = User->getValueSizeInBits(0);
+ if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7))
+ return false;
+
+ // Build the slice for this chain of computations.
+ LoadedSlice LS(User, LD, Shift, &DAG);
+ APInt CurrentUsedBits = LS.getUsedBits();
+
+ // Check if this slice overlaps with another.
+ if ((CurrentUsedBits & UsedBits) != 0)
+ return false;
+ // Update the bits used globally.
+ UsedBits |= CurrentUsedBits;
+
+ // Check if the new slice would be legal.
+ if (!LS.isLegal())
+ return false;
+
+ // Record the slice.
+ LoadedSlices.push_back(LS);
+ }
+
+ // Abort slicing if it does not seem to be profitable.
+ if (!isSlicingProfitable(LoadedSlices, UsedBits, ForCodeSize))
+ return false;
+
+ ++SlicedLoads;
+
+ // Rewrite each chain to use an independent load.
+ // By construction, each chain can be represented by a unique load.
+
+ // Prepare the argument for the new token factor for all the slices.
+ SmallVector<SDValue, 8> ArgChains;
+ for (SmallVectorImpl<LoadedSlice>::const_iterator
+ LSIt = LoadedSlices.begin(),
+ LSItEnd = LoadedSlices.end();
+ LSIt != LSItEnd; ++LSIt) {
+ SDValue SliceInst = LSIt->loadSlice();
+ CombineTo(LSIt->Inst, SliceInst, true);
+ if (SliceInst.getNode()->getOpcode() != ISD::LOAD)
+ SliceInst = SliceInst.getOperand(0);
+ assert(SliceInst->getOpcode() == ISD::LOAD &&
+ "It takes more than a zext to get to the loaded slice!!");
+ ArgChains.push_back(SliceInst.getValue(1));
+ }
+
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
+ ArgChains);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+ return true;
+}
+
+/// Check to see if V is (and (load ptr), imm), where the AND clears out
+/// specific bytes of the loaded value. If so, return the number of bytes
+/// being masked out and the byte shift amount.
+static std::pair<unsigned, unsigned>
+CheckForMaskedLoad(SDValue V, SDValue Ptr, SDValue Chain) {
+ std::pair<unsigned, unsigned> Result(0, 0);
+
+ // Check for the structure we're looking for.
+ if (V->getOpcode() != ISD::AND ||
+ !isa<ConstantSDNode>(V->getOperand(1)) ||
+ !ISD::isNormalLoad(V->getOperand(0).getNode()))
+ return Result;
+
+ // Check the chain and pointer.
+ LoadSDNode *LD = cast<LoadSDNode>(V->getOperand(0));
+ if (LD->getBasePtr() != Ptr) return Result; // Not from same pointer.
+
+ // The store should be chained directly to the load or be an operand of a
+ // tokenfactor.
+ if (LD == Chain.getNode())
+ ; // ok.
+ else if (Chain->getOpcode() != ISD::TokenFactor)
+ return Result; // Fail.
+ else {
+ bool isOk = false;
+ for (const SDValue &ChainOp : Chain->op_values())
+ if (ChainOp.getNode() == LD) {
+ isOk = true;
+ break;
+ }
+ if (!isOk) return Result;
+ }
+
+ // This only handles simple types.
+ if (V.getValueType() != MVT::i16 &&
+ V.getValueType() != MVT::i32 &&
+ V.getValueType() != MVT::i64)
+ return Result;
+
+ // Check the constant mask. Invert it so that the bits being masked out are
+ // 0 and the bits being kept are 1. Use getSExtValue so that leading bits
+ // follow the sign bit for uniformity.
+ uint64_t NotMask = ~cast<ConstantSDNode>(V->getOperand(1))->getSExtValue();
+ unsigned NotMaskLZ = countLeadingZeros(NotMask);
+ if (NotMaskLZ & 7) return Result; // Must be multiple of a byte.
+ unsigned NotMaskTZ = countTrailingZeros(NotMask);
+ if (NotMaskTZ & 7) return Result; // Must be multiple of a byte.
+ if (NotMaskLZ == 64) return Result; // All zero mask.
+
+ // See if we have a continuous run of bits. If so, we have 0*1+0*
+ if (countTrailingOnes(NotMask >> NotMaskTZ) + NotMaskTZ + NotMaskLZ != 64)
+ return Result;
+
+ // Adjust NotMaskLZ down to be from the actual size of the int instead of i64.
+ if (V.getValueType() != MVT::i64 && NotMaskLZ)
+ NotMaskLZ -= 64-V.getValueSizeInBits();
+
+ unsigned MaskedBytes = (V.getValueSizeInBits()-NotMaskLZ-NotMaskTZ)/8;
+ switch (MaskedBytes) {
+ case 1:
+ case 2:
+ case 4: break;
+ default: return Result; // All one mask, or 5-byte mask.
+ }
+
+ // Verify that the first bit starts at a multiple of mask so that the access
+ // is aligned the same as the access width.
+ if (NotMaskTZ && NotMaskTZ/8 % MaskedBytes) return Result;
+
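+ // For example (illustrative): an i32 value masked with 0xFFFF00FF gives
+ // NotMask == 0xFF00, so exactly one byte is cleared at a byte shift of 1,
+ // and we return (1, 1).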
+ Result.first = MaskedBytes;
+ Result.second = NotMaskTZ/8;
+ return Result;
+}
+
+
+/// Check to see if IVal is something that provides a value as specified by
+/// MaskInfo. If so, replace the specified store with a narrower store of
+/// truncated IVal.
+static SDNode *
+ShrinkLoadReplaceStoreWithStore(const std::pair<unsigned, unsigned> &MaskInfo,
+ SDValue IVal, StoreSDNode *St,
+ DAGCombiner *DC) {
+ unsigned NumBytes = MaskInfo.first;
+ unsigned ByteShift = MaskInfo.second;
+ SelectionDAG &DAG = DC->getDAG();
+
+ // Check to see if IVal is all zeros in the part being masked in by the 'or'
+ // that uses this. If not, this is not a replacement.
+ APInt Mask = ~APInt::getBitsSet(IVal.getValueSizeInBits(),
+ ByteShift*8, (ByteShift+NumBytes)*8);
+ if (!DAG.MaskedValueIsZero(IVal, Mask)) return nullptr;
+
+ // Check that it is legal on the target to do this. It is legal if the new
+ // VT we're shrinking to (i8/i16/i32) is legal or we're still before type
+ // legalization.
+ MVT VT = MVT::getIntegerVT(NumBytes*8);
+ if (!DC->isTypeLegal(VT))
+ return nullptr;
+
+ // Okay, we can do this! Replace the 'St' store with a store of IVal that is
+ // shifted by ByteShift and truncated down to NumBytes.
+ if (ByteShift) {
+ SDLoc DL(IVal);
+ IVal = DAG.getNode(ISD::SRL, DL, IVal.getValueType(), IVal,
+ DAG.getConstant(ByteShift*8, DL,
+ DC->getShiftAmountTy(IVal.getValueType())));
+ }
+
+ // Figure out the offset for the store and the alignment of the access.
+ unsigned StOffset;
+ unsigned NewAlign = St->getAlignment();
+
+ if (DAG.getDataLayout().isLittleEndian())
+ StOffset = ByteShift;
+ else
+ StOffset = IVal.getValueType().getStoreSize() - ByteShift - NumBytes;
+
+ SDValue Ptr = St->getBasePtr();
+ if (StOffset) {
+ SDLoc DL(IVal);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(),
+ Ptr, DAG.getConstant(StOffset, DL, Ptr.getValueType()));
+ NewAlign = MinAlign(NewAlign, StOffset);
+ }
+
+ // Truncate down to the new size.
+ IVal = DAG.getNode(ISD::TRUNCATE, SDLoc(IVal), VT, IVal);
+
+ ++OpsNarrowed;
+ return DAG
+ .getStore(St->getChain(), SDLoc(St), IVal, Ptr,
+ St->getPointerInfo().getWithOffset(StOffset), NewAlign)
+ .getNode();
+}
+
+
+/// Look for sequence of load / op / store where op is one of 'or', 'xor', and
+/// 'and' of immediates. If 'op' is only touching some of the loaded bits, try
+/// narrowing the load and store if it would end up being a win for performance
+/// or code size.
+SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ if (ST->isVolatile())
+ return SDValue();
+
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+ EVT VT = Value.getValueType();
+
+ if (ST->isTruncatingStore() || VT.isVector() || !Value.hasOneUse())
+ return SDValue();
+
+ unsigned Opc = Value.getOpcode();
+
+ // If this is "store (or X, Y), P" and X is "(and (load P), cst)", where cst
+ // is a byte mask indicating a consecutive number of bytes, check to see if
+ // Y is known to provide just those bytes. If so, we try to replace the
+ // load + replace + store sequence with a single (narrower) store, which makes
+ // the load dead.
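+ //
+ // For example (little-endian, illustrative only):
+ // store (or (and (load p), 0xFFFF00FF), (and Y, 0x0000FF00)), p
+ // only changes byte 1 of the loaded value, so it can be narrowed to an
+ // i8 store of that byte at address p + 1, making the load dead.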
+ if (Opc == ISD::OR) {
+ std::pair<unsigned, unsigned> MaskedLoad;
+ MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
+ if (MaskedLoad.first)
+ if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ Value.getOperand(1), ST,this))
+ return SDValue(NewST, 0);
+
+ // Or is commutative, so try swapping X and Y.
+ MaskedLoad = CheckForMaskedLoad(Value.getOperand(1), Ptr, Chain);
+ if (MaskedLoad.first)
+ if (SDNode *NewST = ShrinkLoadReplaceStoreWithStore(MaskedLoad,
+ Value.getOperand(0), ST,this))
+ return SDValue(NewST, 0);
+ }
+
+ if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
+ Value.getOperand(1).getOpcode() != ISD::Constant)
+ return SDValue();
+
+ SDValue N0 = Value.getOperand(0);
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse() &&
+ Chain == SDValue(N0.getNode(), 1)) {
+ LoadSDNode *LD = cast<LoadSDNode>(N0);
+ if (LD->getBasePtr() != Ptr ||
+ LD->getPointerInfo().getAddrSpace() !=
+ ST->getPointerInfo().getAddrSpace())
+ return SDValue();
+
+ // Find the type to narrow it the load / op / store to.
+ SDValue N1 = Value.getOperand(1);
+ unsigned BitWidth = N1.getValueSizeInBits();
+ APInt Imm = cast<ConstantSDNode>(N1)->getAPIntValue();
+ if (Opc == ISD::AND)
+ Imm ^= APInt::getAllOnesValue(BitWidth);
+ if (Imm == 0 || Imm.isAllOnesValue())
+ return SDValue();
+ unsigned ShAmt = Imm.countTrailingZeros();
+ unsigned MSB = BitWidth - Imm.countLeadingZeros() - 1;
+ unsigned NewBW = NextPowerOf2(MSB - ShAmt);
+ EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+ // The narrowing should be profitable, the load/store operation should be
+ // legal (or custom) and the store size should be equal to the NewVT width.
+ while (NewBW < BitWidth &&
+ (NewVT.getStoreSizeInBits() != NewBW ||
+ !TLI.isOperationLegalOrCustom(Opc, NewVT) ||
+ !TLI.isNarrowingProfitable(VT, NewVT))) {
+ NewBW = NextPowerOf2(NewBW);
+ NewVT = EVT::getIntegerVT(*DAG.getContext(), NewBW);
+ }
+ if (NewBW >= BitWidth)
+ return SDValue();
+
+ // If the lsb that changed does not start at a type-bitwidth boundary,
+ // start at the previous one.
+ if (ShAmt % NewBW)
+ ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW;
+ APInt Mask = APInt::getBitsSet(BitWidth, ShAmt,
+ std::min(BitWidth, ShAmt + NewBW));
+ if ((Imm & Mask) == Imm) {
+ APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW);
+ if (Opc == ISD::AND)
+ NewImm ^= APInt::getAllOnesValue(NewBW);
+ uint64_t PtrOff = ShAmt / 8;
+ // For big endian targets, we need to adjust the offset to the pointer to
+ // load the correct bytes.
+ if (DAG.getDataLayout().isBigEndian())
+ PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
+
+ unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+ Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
+ if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
+ return SDValue();
+
+ SDValue NewPtr = DAG.getNode(ISD::ADD, SDLoc(LD),
+ Ptr.getValueType(), Ptr,
+ DAG.getConstant(PtrOff, SDLoc(LD),
+ Ptr.getValueType()));
+ SDValue NewLD =
+ DAG.getLoad(NewVT, SDLoc(N0), LD->getChain(), NewPtr,
+ LD->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ SDValue NewVal = DAG.getNode(Opc, SDLoc(Value), NewVT, NewLD,
+ DAG.getConstant(NewImm, SDLoc(Value),
+ NewVT));
+ SDValue NewST =
+ DAG.getStore(Chain, SDLoc(N), NewVal, NewPtr,
+ ST->getPointerInfo().getWithOffset(PtrOff), NewAlign);
+
+ AddToWorklist(NewPtr.getNode());
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewVal.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), NewLD.getValue(1));
+ ++OpsNarrowed;
+ return NewST;
+ }
+ }
+
+ return SDValue();
+}
+
+/// For a given floating point load / store pair, if the load value isn't used
+/// by any other operations, then consider transforming the pair to integer
+/// load / store operations if the target deems the transformation profitable.
+SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ if (ISD::isNormalStore(ST) && ISD::isNormalLoad(Value.getNode()) &&
+ Value.hasOneUse() &&
+ Chain == SDValue(Value.getNode(), 1)) {
+ LoadSDNode *LD = cast<LoadSDNode>(Value);
+ EVT VT = LD->getMemoryVT();
+ if (!VT.isFloatingPoint() ||
+ VT != ST->getMemoryVT() ||
+ LD->isNonTemporal() ||
+ ST->isNonTemporal() ||
+ LD->getPointerInfo().getAddrSpace() != 0 ||
+ ST->getPointerInfo().getAddrSpace() != 0)
+ return SDValue();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
+ !TLI.isOperationLegal(ISD::STORE, IntVT) ||
+ !TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
+ !TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
+ return SDValue();
+
+ unsigned LDAlign = LD->getAlignment();
+ unsigned STAlign = ST->getAlignment();
+ Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
+ if (LDAlign < ABIAlign || STAlign < ABIAlign)
+ return SDValue();
+
+ SDValue NewLD =
+ DAG.getLoad(IntVT, SDLoc(Value), LD->getChain(), LD->getBasePtr(),
+ LD->getPointerInfo(), LDAlign);
+
+ SDValue NewST =
+ DAG.getStore(NewLD.getValue(1), SDLoc(N), NewLD, ST->getBasePtr(),
+ ST->getPointerInfo(), STAlign);
+
+ AddToWorklist(NewLD.getNode());
+ AddToWorklist(NewST.getNode());
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(Value.getValue(1), NewLD.getValue(1));
+ ++LdStFP2Int;
+ return NewST;
+ }
+
+ return SDValue();
+}
+
+namespace {
+/// Helper struct to parse and store a memory address as base + index + offset.
+/// We ignore sign extensions when it is safe to do so.
+/// The following two expressions are not equivalent. To differentiate we need
+/// to store whether there was a sign extension involved in the index
+/// computation.
+/// (load (i64 add (i64 copyfromreg %c)
+/// (i64 signextend (add (i8 load %index)
+/// (i8 1))))
+/// vs
+///
+/// (load (i64 add (i64 copyfromreg %c)
+/// (i64 signextend (i32 add (i32 signextend (i8 load %index))
+/// (i32 1)))))
+struct BaseIndexOffset {
+ SDValue Base;
+ SDValue Index;
+ int64_t Offset;
+ bool IsIndexSignExt;
+
+ BaseIndexOffset() : Offset(0), IsIndexSignExt(false) {}
+
+ BaseIndexOffset(SDValue Base, SDValue Index, int64_t Offset,
+ bool IsIndexSignExt) :
+ Base(Base), Index(Index), Offset(Offset), IsIndexSignExt(IsIndexSignExt) {}
+
+ bool equalBaseIndex(const BaseIndexOffset &Other) {
+ return Other.Base == Base && Other.Index == Index &&
+ Other.IsIndexSignExt == IsIndexSignExt;
+ }
+
+ /// Parses tree in Ptr for base, index, offset addresses.
+ static BaseIndexOffset match(SDValue Ptr, SelectionDAG &DAG) {
+ bool IsIndexSignExt = false;
+
+ // Split up a folded GlobalAddress+Offset into its component parts.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Ptr))
+ if (GA->getOpcode() == ISD::GlobalAddress && GA->getOffset() != 0) {
+ return BaseIndexOffset(DAG.getGlobalAddress(GA->getGlobal(),
+ SDLoc(GA),
+ GA->getValueType(0),
+ /*Offset=*/0,
+ /*isTargetGA=*/false,
+ GA->getTargetFlags()),
+ SDValue(),
+ GA->getOffset(),
+ IsIndexSignExt);
+ }
+
+ // We can only pattern match BASE + INDEX + OFFSET. If Ptr is not an ADD
+ // instruction, then it could be just the BASE or something else we don't
+ // know how to handle. Just use Ptr as BASE and give up.
+ if (Ptr->getOpcode() != ISD::ADD)
+ return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
+ // We know that we have at least an ADD instruction. Try to pattern match
+ // the simple case of BASE + OFFSET.
+ if (isa<ConstantSDNode>(Ptr->getOperand(1))) {
+ int64_t Offset = cast<ConstantSDNode>(Ptr->getOperand(1))->getSExtValue();
+ return BaseIndexOffset(Ptr->getOperand(0), SDValue(), Offset,
+ IsIndexSignExt);
+ }
+
+ // Inside a loop the current BASE pointer is calculated using an ADD and a
+ // MUL instruction. In this case Ptr is the actual BASE pointer.
+ // (i64 add (i64 %array_ptr)
+ // (i64 mul (i64 %induction_var)
+ // (i64 %element_size)))
+ if (Ptr->getOperand(1)->getOpcode() == ISD::MUL)
+ return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
+ // Look at Base + Index + Offset cases.
+ SDValue Base = Ptr->getOperand(0);
+ SDValue IndexOffset = Ptr->getOperand(1);
+
+ // Skip signextends.
+ if (IndexOffset->getOpcode() == ISD::SIGN_EXTEND) {
+ IndexOffset = IndexOffset->getOperand(0);
+ IsIndexSignExt = true;
+ }
+
+ // Either the case of Base + Index (no offset) or something else.
+ if (IndexOffset->getOpcode() != ISD::ADD)
+ return BaseIndexOffset(Base, IndexOffset, 0, IsIndexSignExt);
+
+ // Now we have the case of Base + Index + offset.
+ SDValue Index = IndexOffset->getOperand(0);
+ SDValue Offset = IndexOffset->getOperand(1);
+
+ if (!isa<ConstantSDNode>(Offset))
+ return BaseIndexOffset(Ptr, SDValue(), 0, IsIndexSignExt);
+
+ // Ignore signextends.
+ if (Index->getOpcode() == ISD::SIGN_EXTEND) {
+ Index = Index->getOperand(0);
+ IsIndexSignExt = true;
+ } else IsIndexSignExt = false;
+
+ int64_t Off = cast<ConstantSDNode>(Offset)->getSExtValue();
+ return BaseIndexOffset(Base, Index, Off, IsIndexSignExt);
+ }
+};
+} // namespace
+
+// This is a helper function for visitMUL to check the profitability
+// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
+// MulNode is the original multiply, AddNode is (add x, c1),
+// and ConstNode is c2.
+//
+// If the (add x, c1) has multiple uses, we could increase
+// the number of adds if we make this transformation.
+// It would only be worth doing this if we can remove a
+// multiply in the process. Check for that here.
+// To illustrate:
+// (A + c1) * c3
+// (A + c2) * c3
+// We're checking for cases where we have common "c3 * A" expressions.
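+// For example, with c1 = 1, c2 = 2 and c3 = 4, both
+//   (A + 1) * 4  ->  (A * 4) + 4
+//   (A + 2) * 4  ->  (A * 4) + 8
+// end up sharing the single multiply (A * 4).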
+bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
+ SDValue &AddNode,
+ SDValue &ConstNode) {
+ APInt Val;
+
+ // If the add only has one use, this would be OK to do.
+ if (AddNode.getNode()->hasOneUse())
+ return true;
+
+ // Walk all the users of the constant with which we're multiplying.
+ for (SDNode *Use : ConstNode->uses()) {
+
+ if (Use == MulNode) // This use is the one we're on right now. Skip it.
+ continue;
+
+ if (Use->getOpcode() == ISD::MUL) { // We have another multiply use.
+ SDNode *OtherOp;
+ SDNode *MulVar = AddNode.getOperand(0).getNode();
+
+ // OtherOp is what we're multiplying against the constant.
+ if (Use->getOperand(0) == ConstNode)
+ OtherOp = Use->getOperand(1).getNode();
+ else
+ OtherOp = Use->getOperand(0).getNode();
+
+ // Check to see if multiply is with the same operand of our "add".
+ //
+ // ConstNode = CONST
+ // Use = ConstNode * A <-- visiting Use. OtherOp is A.
+ // ...
+ // AddNode = (A + c1) <-- MulVar is A.
+ // = AddNode * ConstNode <-- current visiting instruction.
+ //
+ // If we make this transformation, we will have a common
+ // multiply (ConstNode * A) that we can save.
+ if (OtherOp == MulVar)
+ return true;
+
+ // Now check to see if a future expansion will give us a common
+ // multiply.
+ //
+ // ConstNode = CONST
+ // AddNode = (A + c1)
+ // ... = AddNode * ConstNode <-- current visiting instruction.
+ // ...
+ // OtherOp = (A + c2)
+ // Use = OtherOp * ConstNode <-- visiting Use.
+ //
+ // If we make this transformation, we will have a common
+ // multiply (CONST * A) after we also do the same transformation
+ // to the "Use" instruction.
+ if (OtherOp->getOpcode() == ISD::ADD &&
+ DAG.isConstantIntBuildVectorOrConstantInt(OtherOp->getOperand(1)) &&
+ OtherOp->getOperand(0).getNode() == MulVar)
+ return true;
+ }
+ }
+
+ // Didn't find a case where this would be profitable.
+ return false;
+}
+
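+// Build a BUILD_VECTOR of type Ty from the values of the first
+// Ty.getVectorNumElements() stores in Stores, collecting their chains into
+// Chains along the way.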
+SDValue DAGCombiner::getMergedConstantVectorStore(
+ SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
+ SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
+ SmallVector<SDValue, 8> BuildVector;
+
+ for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
+ StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode);
+ Chains.push_back(St->getChain());
+ BuildVector.push_back(St->getValue());
+ }
+
+ return DAG.getBuildVector(Ty, SL, BuildVector);
+}
+
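+// Merge the first NumStores stores in StoreNodes into a single wide store
+// whose value is either one integer constant, a BUILD_VECTOR of the
+// constants, or a vector assembled from the extracted elements, depending on
+// IsConstantSrc and UseVector.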
+bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
+ SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT,
+ unsigned NumStores, bool IsConstantSrc, bool UseVector) {
+ // Make sure we have something to merge.
+ if (NumStores < 2)
+ return false;
+
+ int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned LatestNodeUsed = 0;
+
+ for (unsigned i=0; i < NumStores; ++i) {
+ // Find a chain for the new wide-store operand. Notice that some
+ // of the store nodes that we found may not be selected for inclusion
+ // in the wide store. The chain we use needs to be the chain of the
+ // latest store node which is *used* and replaced by the wide store.
+ if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
+ LatestNodeUsed = i;
+ }
+
+ SmallVector<SDValue, 8> Chains;
+
+ // The latest Node in the DAG.
+ LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
+ SDLoc DL(StoreNodes[0].MemNode);
+
+ SDValue StoredVal;
+ if (UseVector) {
+ bool IsVec = MemVT.isVector();
+ unsigned Elts = NumStores;
+ if (IsVec) {
+ // When merging vector stores, get the total number of elements.
+ Elts *= MemVT.getVectorNumElements();
+ }
+ // Get the type for the merged vector store.
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
+
+ if (IsConstantSrc) {
+ StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty);
+ } else {
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < NumStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue Val = St->getValue();
+ // All operands of BUILD_VECTOR / CONCAT_VECTOR must have the same type.
+ if (Val.getValueType() != MemVT)
+ return false;
+ Ops.push_back(Val);
+ Chains.push_back(St->getChain());
+ }
+
+ // Build the extracted vector elements back into a vector.
+ StoredVal = DAG.getNode(IsVec ? ISD::CONCAT_VECTORS : ISD::BUILD_VECTOR,
+ DL, Ty, Ops);
+ }
+ } else {
+ // We should always use a vector store when merging extracted vector
+ // elements, so this path implies a store of constants.
+ assert(IsConstantSrc && "Merged vector elements should use vector store");
+
+ unsigned SizeInBits = NumStores * ElementSizeBytes * 8;
+ APInt StoreInt(SizeInBits, 0);
+
+ // Construct a single integer constant which is made of the smaller
+ // constant inputs.
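+ // For example, on a little-endian target, merging i16 stores of 0x1111
+ // (offset 0) and 0x2222 (offset 2) yields the single i32 constant
+ // 0x22221111.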
+ bool IsLE = DAG.getDataLayout().isLittleEndian();
+ for (unsigned i = 0; i < NumStores; ++i) {
+ unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
+ Chains.push_back(St->getChain());
+
+ SDValue Val = St->getValue();
+ StoreInt <<= ElementSizeBytes * 8;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Val)) {
+ StoreInt |= C->getAPIntValue().zext(SizeInBits);
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Val)) {
+ StoreInt |= C->getValueAPF().bitcastToAPInt().zext(SizeInBits);
+ } else {
+ llvm_unreachable("Invalid constant element type");
+ }
+ }
+
+ // Create the new Load and Store operations.
+ EVT StoreTy = EVT::getIntegerVT(*DAG.getContext(), SizeInBits);
+ StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
+ }
+
+ assert(!Chains.empty());
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(),
+ FirstInChain->getAlignment());
+
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+ if (UseAA) {
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumStores; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ } else {
+ // Replace the last store with the new store.
+ CombineTo(LatestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumStores; ++i) {
+ if (StoreNodes[i].MemNode == LatestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ // ReplaceAllUsesWith will replace all uses that existed when it was
+ // called, but graph optimizations may cause new ones to appear. For
+ // example, the case in pr14333 looks like
+ //
+ // St's chain -> St -> another store -> X
+ //
+ // And the only difference from St to the other store is the chain.
+ // When we change its chain to be St's chain they become identical,
+ // get CSEed and the net result is that X is now a use of St.
+ // Since we know that St is redundant, just iterate.
+ while (!St->use_empty())
+ DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
+ deleteAndRecombine(St);
+ }
+ }
+
+ return true;
+}
+
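+// Collect into StoreNodes the stores that share St's base pointer and index
+// and are therefore candidates for merging. In the chain-walking (non-AA)
+// mode, loads seen along the way are saved in AliasLoadNodes so the caller
+// can check that they do not interfere.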
+void DAGCombiner::getStoreMergeAndAliasCandidates(
+ StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
+ SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
+ // This holds the base pointer, index, and the offset in bytes from the base
+ // pointer.
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+
+ // We must have a base and an offset.
+ if (!BasePtr.Base.getNode())
+ return;
+
+ // Do not handle stores to undef base pointers.
+ if (BasePtr.Base.isUndef())
+ return;
+
+ // Walk up the chain and look for nodes with offsets from the same
+ // base pointer. Stop when reaching an instruction with a different kind
+ // or instruction which has a different base pointer.
+ EVT MemVT = St->getMemoryVT();
+ unsigned Seq = 0;
+ StoreSDNode *Index = St;
+
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+
+ if (UseAA) {
+ // Look at other users of the same chain. Stores on the same chain do not
+ // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized
+ // to be on the same chain, so don't bother looking at adjacent chains.
+
+ SDValue Chain = St->getChain();
+ for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) {
+ if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
+ if (I.getOperandNo() != 0)
+ continue;
+
+ if (OtherST->isVolatile() || OtherST->isIndexed())
+ continue;
+
+ if (OtherST->getMemoryVT() != MemVT)
+ continue;
+
+ BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
+
+ if (Ptr.equalBaseIndex(BasePtr))
+ StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
+ }
+ }
+
+ return;
+ }
+
+ while (Index) {
+ // If the chain has more than one use, then we can't reorder the mem ops.
+ if (Index != St && !SDValue(Index, 0)->hasOneUse())
+ break;
+
+ // Find the base pointer and offset for this memory node.
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
+
+ // Check that the base pointer is the same as the original one.
+ if (!Ptr.equalBaseIndex(BasePtr))
+ break;
+
+ // The memory operands must not be volatile.
+ if (Index->isVolatile() || Index->isIndexed())
+ break;
+
+ // No truncation.
+ if (Index->isTruncatingStore())
+ break;
+
+ // The stored memory type must be the same.
+ if (Index->getMemoryVT() != MemVT)
+ break;
+
+ // We do not allow under-aligned stores in order to prevent
+ // overwriting stores. NOTE: this is a bad hack. Alignment SHOULD
+ // be irrelevant here; what MATTERS is that we not move memory
+ // operations that potentially overlap past each other.
+ if (Index->getAlignment() < MemVT.getStoreSize())
+ break;
+
+ // We found a potential memory operand to merge.
+ StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
+
+ // Find the next memory operand in the chain. If the next operand in the
+ // chain is a store then move up and continue the scan with the next
+ // memory operand. If the next operand is a load save it and use alias
+ // information to check if it interferes with anything.
+ SDNode *NextInChain = Index->getChain().getNode();
+ while (1) {
+ if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
+ // We found a store node. Use it for the next iteration.
+ Index = STn;
+ break;
+ } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+ if (Ldn->isVolatile()) {
+ Index = nullptr;
+ break;
+ }
+
+ // Save the load node for later. Continue the scan.
+ AliasLoadNodes.push_back(Ldn);
+ NextInChain = Ldn->getChain().getNode();
+ continue;
+ } else {
+ Index = nullptr;
+ break;
+ }
+ }
+ }
+}
+
+// We need to check that merging these stores does not cause a loop
+// in the DAG. Any store candidate may depend on another candidate
+// indirectly through its operand (we already consider dependencies
+// through the chain). Check in parallel by searching up from
+// non-chain operands of candidates.
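+// For example (schematically):
+//   t0: store A, p
+//   t1: load q (chain t0)
+//   t2: store (add t1, 1), p+4
+// Merging t0 and t2 would make the merged store a user of t1 while t1
+// already depends on t0 through its chain, creating a cycle.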
+bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
+ SmallVectorImpl<MemOpLink> &StoreNodes) {
+ SmallPtrSet<const SDNode *, 16> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+ // Search the operands of the store candidates.
+ for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ SDNode *n = StoreNodes[i].MemNode;
+ // Potential loops may happen only through non-chain operands
+ for (unsigned j = 1; j < n->getNumOperands(); ++j)
+ Worklist.push_back(n->getOperand(j).getNode());
+ }
+ // Search through the DAG. We can stop early if we find a store node.
+ for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+ if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))
+ return false;
+ }
+ return true;
+}
+
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode* St) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ EVT MemVT = St->getMemoryVT();
+ int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+ bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
+ Attribute::NoImplicitFloat);
+
+ // This function cannot currently deal with non-byte-sized memory sizes.
+ if (ElementSizeBytes * 8 != MemVT.getSizeInBits())
+ return false;
+
+ if (!MemVT.isSimple())
+ return false;
+
+ // Perform an early exit check. Do not bother looking at stored values that
+ // are not constants, loads, or extracted vector elements.
+ SDValue StoredVal = St->getValue();
+ bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
+ bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
+ isa<ConstantFPSDNode>(StoredVal);
+ bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
+
+ if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
+ return false;
+
+ // Don't merge vectors into wider vectors if the source data comes from loads.
+ // TODO: This restriction can be lifted by using logic similar to the
+ // ExtractVecSrc case.
+ if (MemVT.isVector() && IsLoadSrc)
+ return false;
+
+ // Only look at ends of store sequences.
+ SDValue Chain = SDValue(St, 0);
+ if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
+ return false;
+
+ // Save the LoadSDNodes that we find in the chain.
+ // We need to make sure that these nodes do not interfere with
+ // any of the store nodes.
+ SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
+
+ // Save the StoreSDNodes that we find in the chain.
+ SmallVector<MemOpLink, 8> StoreNodes;
+
+ getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
+
+ // Check if there is anything to merge.
+ if (StoreNodes.size() < 2)
+ return false;
+
+ // Only do the dependence check in the AA case.
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+ if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
+ return false;
+
+ // Sort the memory operands according to their distance from the
+ // base pointer. As a secondary criterion: make sure stores coming
+ // later in the code come first in the list. This is important for
+ // the non-UseAA case, because we're merging stores into the FINAL
+ // store along a chain which potentially contains aliasing stores.
+ // Thus, if there are multiple stores to the same address, the last
+ // one can be considered for merging but not the others.
+ std::sort(StoreNodes.begin(), StoreNodes.end(),
+ [](MemOpLink LHS, MemOpLink RHS) {
+ return LHS.OffsetFromBase < RHS.OffsetFromBase ||
+ (LHS.OffsetFromBase == RHS.OffsetFromBase &&
+ LHS.SequenceNum < RHS.SequenceNum);
+ });
+
+ // Scan the memory operations on the chain and find the first non-consecutive
+ // store memory address.
+ unsigned LastConsecutiveStore = 0;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
+ for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
+
+ // Check that the addresses are consecutive starting from the second
+ // element in the list of stores.
+ if (i > 0) {
+ int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ }
+
+ // Check if this store interferes with any of the loads that we found.
+ // If we find a load that aliases with this store, stop the sequence.
+ if (std::any_of(AliasLoadNodes.begin(), AliasLoadNodes.end(),
+ [&](LSBaseSDNode* Ldn) {
+ return isAlias(Ldn, StoreNodes[i].MemNode);
+ }))
+ break;
+
+ // Mark this node as useful.
+ LastConsecutiveStore = i;
+ }
+
+ // The node with the lowest store address.
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+
+ // Store the constants into memory as one consecutive store.
+ if (IsConstantSrc) {
+ unsigned LastLegalType = 0;
+ unsigned LastLegalVectorType = 0;
+ bool NonZero = false;
+ for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = St->getValue();
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+ NonZero |= !C->isNullValue();
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
+ NonZero |= !C->getConstantFPValue()->isNullValue();
+ } else {
+ // Non-constant.
+ break;
+ }
+
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ bool IsFast;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFast) && IsFast) {
+ LastLegalType = i+1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFast) &&
+ IsFast) {
+ LastLegalType = i + 1;
+ }
+ }
+
+ // We only use vectors if the constant is known to be zero or the target
+ // allows it and the function is not marked with the noimplicitfloat
+ // attribute.
+ if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1,
+ FirstStoreAS)) &&
+ !NoVectors) {
+ // Find a legal type for the vector store.
+ EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) && IsFast)
+ LastLegalVectorType = i + 1;
+ }
+ }
+
+ // Check if we found a legal integer type to store.
+ if (LastLegalType == 0 && LastLegalVectorType == 0)
+ return false;
+
+ bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+ unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
+
+ return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+ true, UseVector);
+ }
+
+ // When extracting multiple vector elements, try to store them
+ // in one vector store rather than a sequence of scalar stores.
+ if (IsExtractVecSrc) {
+ unsigned NumStoresToMerge = 0;
+ bool IsVec = MemVT.isVector();
+ for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ unsigned StoreValOpcode = St->getValue().getOpcode();
+ // This restriction could be loosened.
+ // Bail out if any stored values are not elements extracted from a vector.
+ // It should be possible to handle mixed sources, but load sources need
+ // more careful handling (see the block of code below that handles
+ // consecutive loads).
+ if (StoreValOpcode != ISD::EXTRACT_VECTOR_ELT &&
+ StoreValOpcode != ISD::EXTRACT_SUBVECTOR)
+ return false;
+
+ // Find a legal type for the vector store.
+ unsigned Elts = i + 1;
+ if (IsVec) {
+ // When merging vector stores, get the total number of elements.
+ Elts *= MemVT.getVectorNumElements();
+ }
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ bool IsFast;
+ if (TLI.isTypeLegal(Ty) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+ FirstStoreAlign, &IsFast) && IsFast)
+ NumStoresToMerge = i + 1;
+ }
+
+ return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumStoresToMerge,
+ false, true);
+ }
+
+ // Below we handle the case of multiple consecutive stores that
+ // come from multiple consecutive loads. We merge them into a single
+ // wide load and a single wide store.
+
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
+
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be extending loads, volatile, or indexed, and they must be
+ // consecutive.
+ BaseIndexOffset LdBasePtr;
+ for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
+ if (!Ld) break;
+
+ // Loads must only have one use.
+ if (!Ld->hasNUsesOfValue(1, 0))
+ break;
+
+ // The memory operands must not be volatile.
+ if (Ld->isVolatile() || Ld->isIndexed())
+ break;
+
+ // We do not accept ext loads.
+ if (Ld->getExtensionType() != ISD::NON_EXTLOAD)
+ break;
+
+ // The stored memory type must be the same.
+ if (Ld->getMemoryVT() != MemVT)
+ break;
+
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld->getBasePtr(), DAG);
+ // If this is not the first ptr that we check.
+ if (LdBasePtr.Base.getNode()) {
+ // The base ptr must be the same.
+ if (!LdPtr.equalBaseIndex(LdBasePtr))
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr;
+ }
+
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
+ }
+
+ if (LoadNodes.size() < 2)
+ return false;
+
+ // If we have load/store pair instructions and we only have two values,
+ // don't bother.
+ unsigned RequiredAlignment;
+ if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ St->getAlignment() >= RequiredAlignment)
+ return false;
+
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
+ unsigned FirstLoadAS = FirstLoad->getAddressSpace();
+ unsigned FirstLoadAlign = FirstLoad->getAlignment();
+
+ // Scan the memory operations on the chain and find the first non-consecutive
+ // load memory address. These variables hold the index in the store node
+ // array.
+ unsigned LastConsecutiveLoad = 0;
+ // These variables refer to sizes, not indices into the array.
+ unsigned LastLegalVectorType = 0;
+ unsigned LastLegalIntegerType = 0;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue FirstChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != FirstChain)
+ break;
+
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
+ // Find a legal type for the vector store.
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT, i+1);
+ bool IsFastSt, IsFastLd;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) && IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) && IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
+
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+ FirstStoreAlign, &IsFastSt) && IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstLoadAS,
+ FirstLoadAlign, &IsFastLd) && IsFastLd)
+ LastLegalIntegerType = i + 1;
+ // Or check whether a truncstore and extload is legal.
+ else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValueTy =
+ TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValueTy, StoreTy) &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstStoreAS, FirstStoreAlign, &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+ FirstLoadAS, FirstLoadAlign, &IsFastLd) &&
+ IsFastLd)
+ LastLegalIntegerType = i+1;
+ }
+ }
+
+ // Only use vector types if the vector type is larger than the integer type.
+ // If they are the same, use integers.
+ bool UseVectorTy = LastLegalVectorType > LastLegalIntegerType && !NoVectors;
+ unsigned LastLegalType = std::max(LastLegalVectorType, LastLegalIntegerType);
+
+ // We add +1 here because the LastXXX variables refer to an array index
+ // while NumElem refers to the number of elements to merge.
+ unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+ NumElem = std::min(LastLegalType, NumElem);
+
+ if (NumElem < 2)
+ return false;
+
+ // Collect the chains from all merged stores.
+ SmallVector<SDValue, 8> MergeStoreChains;
+ MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
+
+ // The latest Node in the DAG.
+ unsigned LatestNodeUsed = 0;
+ for (unsigned i=1; i<NumElem; ++i) {
+ // Find a chain for the new wide-store operand. Notice that some
+ // of the store nodes that we found may not be selected for inclusion
+ // in the wide store. The chain we use needs to be the chain of the
+ // latest store node which is *used* and replaced by the wide store.
+ if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
+ LatestNodeUsed = i;
+
+ MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
+ }
+
+ LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
+
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT, NumElem);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
+
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+ SDValue NewLoad = DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
+ FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign);
+
+ SDValue NewStoreChain =
+ DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+
+ SDValue NewStore =
+ DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign);
+
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
+
+ if (UseAA) {
+ // Replace all merged stores with the new store.
+ for (unsigned i = 0; i < NumElem; ++i)
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ } else {
+ // Replace the last store with the new store.
+ CombineTo(LatestOp, NewStore);
+ // Erase all other stores.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ // Remove all Store nodes.
+ if (StoreNodes[i].MemNode == LatestOp)
+ continue;
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
+ deleteAndRecombine(St);
+ }
+ }
+
+ return true;
+}
+
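+// Rebuild ST on top of BetterChain and tie the old and new stores together
+// with a TokenFactor so both chains stay reachable until dead nodes are
+// cleaned up.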
+SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
+ SDLoc SL(ST);
+ SDValue ReplStore;
+
+ // Replace the chain to avoid dependency.
+ if (ST->isTruncatingStore()) {
+ ReplStore = DAG.getTruncStore(BetterChain, SL, ST->getValue(),
+ ST->getBasePtr(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ } else {
+ ReplStore = DAG.getStore(BetterChain, SL, ST->getValue(), ST->getBasePtr(),
+ ST->getMemOperand());
+ }
+
+ // Create token to keep both nodes around.
+ SDValue Token = DAG.getNode(ISD::TokenFactor, SL,
+ MVT::Other, ST->getChain(), ReplStore);
+
+ // Make sure the new and old chains are cleaned up.
+ AddToWorklist(Token.getNode());
+
+ // Don't add users to work list.
+ return CombineTo(ST, Token, false);
+}
+
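+// Replace a store of an f32/f64 constant with a store of its integer bit
+// pattern, or, for f64 when only i32 stores are practical, with two i32
+// stores. The transform is skipped when the required integer store is not
+// legal or when it would increase the operation count for a volatile store.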
+SDValue DAGCombiner::replaceStoreOfFPConstant(StoreSDNode *ST) {
+ SDValue Value = ST->getValue();
+ if (Value.getOpcode() == ISD::TargetConstantFP)
+ return SDValue();
+
+ SDLoc DL(ST);
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+
+ const ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Value);
+
+ // NOTE: If the original store is volatile, this transform must not increase
+ // the number of stores. For example, on x86-32 an f64 can be stored in one
+ // processor operation but an i64 (which is not legal) requires two. So the
+ // transform should not be done in this case.
+
+ SDValue Tmp;
+ switch (CFP->getSimpleValueType(0).SimpleTy) {
+ default:
+ llvm_unreachable("Unknown FP type");
+ case MVT::f16: // We don't do this for these yet.
+ case MVT::f80:
+ case MVT::f128:
+ case MVT::ppcf128:
+ return SDValue();
+ case MVT::f32:
+ if ((isTypeLegal(MVT::i32) && !LegalOperations && !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ Tmp = DAG.getConstant((uint32_t)CFP->getValueAPF().
+ bitcastToAPInt().getZExtValue(), SDLoc(CFP),
+ MVT::i32);
+ return DAG.getStore(Chain, DL, Tmp, Ptr, ST->getMemOperand());
+ }
+
+ return SDValue();
+ case MVT::f64:
+ if ((TLI.isTypeLegal(MVT::i64) && !LegalOperations &&
+ !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i64)) {
+ Tmp = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ getZExtValue(), SDLoc(CFP), MVT::i64);
+ return DAG.getStore(Chain, DL, Tmp,
+ Ptr, ST->getMemOperand());
+ }
+
+ if (!ST->isVolatile() &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::i32)) {
+ // Many FP stores are not made apparent until after legalize, e.g. for
+ // argument passing. Since this is so common, custom legalize the
+ // 64-bit integer store into two 32-bit stores.
+ uint64_t Val = CFP->getValueAPF().bitcastToAPInt().getZExtValue();
+ SDValue Lo = DAG.getConstant(Val & 0xFFFFFFFF, SDLoc(CFP), MVT::i32);
+ SDValue Hi = DAG.getConstant(Val >> 32, SDLoc(CFP), MVT::i32);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ unsigned Alignment = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ SDValue St0 = DAG.getStore(Chain, DL, Lo, Ptr, ST->getPointerInfo(),
+ ST->getAlignment(), MMOFlags, AAInfo);
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(4, DL, Ptr.getValueType()));
+ Alignment = MinAlign(Alignment, 4U);
+ SDValue St1 = DAG.getStore(Chain, DL, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(4),
+ Alignment, MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ St0, St1);
+ }
+
+ return SDValue();
+ }
+}
+
+SDValue DAGCombiner::visitSTORE(SDNode *N) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Chain = ST->getChain();
+ SDValue Value = ST->getValue();
+ SDValue Ptr = ST->getBasePtr();
+
+ // If this is a store of a bit convert, store the input value if the
+ // resultant store does not need a higher alignment than the original.
+ if (Value.getOpcode() == ISD::BITCAST && !ST->isTruncatingStore() &&
+ ST->isUnindexed()) {
+ EVT SVT = Value.getOperand(0).getValueType();
+ if (((!LegalOperations && !ST->isVolatile()) ||
+ TLI.isOperationLegalOrCustom(ISD::STORE, SVT)) &&
+ TLI.isStoreBitCastBeneficial(Value.getValueType(), SVT)) {
+ unsigned OrigAlign = ST->getAlignment();
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), SVT,
+ ST->getAddressSpace(), OrigAlign, &Fast) &&
+ Fast) {
+ return DAG.getStore(Chain, SDLoc(N), Value.getOperand(0), Ptr,
+ ST->getPointerInfo(), OrigAlign,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ }
+ }
+ }
+
+ // Turn 'store undef, Ptr' -> nothing.
+ if (Value.isUndef() && ST->isUnindexed())
+ return Chain;
+
+ // Try to infer better alignment information than the store already has.
+ if (OptLevel != CodeGenOpt::None && ST->isUnindexed()) {
+ if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
+ if (Align > ST->getAlignment()) {
+ SDValue NewStore =
+ DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
+ ST->getMemoryVT(), Align,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ if (NewStore.getNode() != N)
+ return CombineTo(ST, NewStore, true);
+ }
+ }
+ }
+
+ // Try transforming a pair of floating point load / store ops into integer
+ // load / store ops.
+ if (SDValue NewST = TransformFPLoadStorePair(N))
+ return NewST;
+
+ bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
+ : DAG.getSubtarget().useAA();
+#ifndef NDEBUG
+ if (CombinerAAOnlyFunc.getNumOccurrences() &&
+ CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
+ UseAA = false;
+#endif
+ if (UseAA && ST->isUnindexed()) {
+ // FIXME: We should do this even without AA enabled. AA will just allow
+ // FindBetterChain to work in more situations. The problem with this is that
+ // any combine that expects memory operations to be on consecutive chains
+ // first needs to be updated to look for users of the same chain.
+
+ // Walk up chain skipping non-aliasing memory nodes, on this store and any
+ // adjacent stores.
+ if (findBetterNeighborChains(ST)) {
+ // replaceStoreChain uses CombineTo, which handled all of the worklist
+ // manipulation. Return the original node to not do anything else.
+ return SDValue(ST, 0);
+ }
+ Chain = ST->getChain();
+ }
+
+ // Try transforming N to an indexed store.
+ if (CombineToPreIndexedLoadStore(N) || CombineToPostIndexedLoadStore(N))
+ return SDValue(N, 0);
+
+ // FIXME: is there such a thing as a truncating indexed store?
+ if (ST->isTruncatingStore() && ST->isUnindexed() &&
+ Value.getValueType().isInteger()) {
+ // See if we can simplify the input to this truncstore with knowledge that
+ // only the low bits are being used. For example:
+ // "truncstore (or (shl x, 8), y), i8" -> "truncstore y, i8"
+ SDValue Shorter =
+ GetDemandedBits(Value,
+ APInt::getLowBitsSet(
+ Value.getValueType().getScalarType().getSizeInBits(),
+ ST->getMemoryVT().getScalarType().getSizeInBits()));
+ AddToWorklist(Value.getNode());
+ if (Shorter.getNode())
+ return DAG.getTruncStore(Chain, SDLoc(N), Shorter,
+ Ptr, ST->getMemoryVT(), ST->getMemOperand());
+
+ // Otherwise, see if we can simplify the operation with
+ // SimplifyDemandedBits, which only works if the value has a single use.
+ if (SimplifyDemandedBits(Value,
+ APInt::getLowBitsSet(
+ Value.getValueType().getScalarType().getSizeInBits(),
+ ST->getMemoryVT().getScalarType().getSizeInBits())))
+ return SDValue(N, 0);
+ }
+
+ // If this is a load followed by a store to the same location, then the store
+ // is dead/noop.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Value)) {
+ if (Ld->getBasePtr() == Ptr && ST->getMemoryVT() == Ld->getMemoryVT() &&
+ ST->isUnindexed() && !ST->isVolatile() &&
+ // There can't be any side effects between the load and store, such as
+ // a call or store.
+ Chain.reachesChainWithoutSideEffects(SDValue(Ld, 1))) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+ }
+
+ // If this is a store followed by a store with the same value to the same
+ // location, then the store is dead/noop.
+ if (StoreSDNode *ST1 = dyn_cast<StoreSDNode>(Chain)) {
+ if (ST1->getBasePtr() == Ptr && ST->getMemoryVT() == ST1->getMemoryVT() &&
+ ST1->getValue() == Value && ST->isUnindexed() && !ST->isVolatile() &&
+ ST1->isUnindexed() && !ST1->isVolatile()) {
+ // The store is dead, remove it.
+ return Chain;
+ }
+ }
+
+ // If this is an FP_ROUND or TRUNC followed by a store, fold this into a
+ // truncating store. We can do this even if this is already a truncstore.
+ if ((Value.getOpcode() == ISD::FP_ROUND || Value.getOpcode() == ISD::TRUNCATE)
+ && Value.getNode()->hasOneUse() && ST->isUnindexed() &&
+ TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
+ ST->getMemoryVT())) {
+ return DAG.getTruncStore(Chain, SDLoc(N), Value.getOperand(0),
+ Ptr, ST->getMemoryVT(), ST->getMemOperand());
+ }
+
+ // Only perform this optimization before the types are legal, because we
+ // don't want to perform this optimization on every DAGCombine invocation.
+ if (!LegalTypes) {
+ bool EverChanged = false;
+
+ do {
+ // There can be multiple store sequences on the same chain.
+ // Keep trying to merge store sequences until we are unable to do so
+ // or until we merge the last store on the chain.
+ bool Changed = MergeConsecutiveStores(ST);
+ EverChanged |= Changed;
+ if (!Changed) break;
+ } while (ST->getOpcode() != ISD::DELETED_NODE);
+
+ if (EverChanged)
+ return SDValue(N, 0);
+ }
+
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x3F800000, Ptr'
+ //
+ // Make sure to do this only after attempting to merge stores in order to
+ // avoid changing the types of some subset of stores due to visit order,
+ // preventing their merging.
+ if (isa<ConstantFPSDNode>(Value)) {
+ if (SDValue NewSt = replaceStoreOfFPConstant(ST))
+ return NewSt;
+ }
+
+ return ReduceLoadOpStoreWidth(N);
+}
+
+SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InVec = N->getOperand(0);
+ SDValue InVal = N->getOperand(1);
+ SDValue EltNo = N->getOperand(2);
+ SDLoc dl(N);
+
+ // If the inserted element is an UNDEF, just use the input vector.
+ if (InVal.isUndef())
+ return InVec;
+
+ EVT VT = InVec.getValueType();
+
+ // If we can't generate a legal BUILD_VECTOR, exit
+ if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+ return SDValue();
+
+ // Check that we know which element is being inserted
+ if (!isa<ConstantSDNode>(EltNo))
+ return SDValue();
+ unsigned Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+ // Canonicalize insert_vector_elt dag nodes.
+ // Example:
+ // (insert_vector_elt (insert_vector_elt A, Idx0), Idx1)
+ // -> (insert_vector_elt (insert_vector_elt A, Idx1), Idx0)
+ //
+ // Do this only if the child insert_vector node has one use; also
+ // do this only if indices are both constants and Idx1 < Idx0.
+ if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT && InVec.hasOneUse()
+ && isa<ConstantSDNode>(InVec.getOperand(2))) {
+ unsigned OtherElt =
+ cast<ConstantSDNode>(InVec.getOperand(2))->getZExtValue();
+ if (Elt < OtherElt) {
+ // Swap nodes.
+ SDValue NewOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N), VT,
+ InVec.getOperand(0), InVal, EltNo);
+ AddToWorklist(NewOp.getNode());
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(InVec.getNode()),
+ VT, NewOp, InVec.getOperand(1), InVec.getOperand(2));
+ }
+ }
+
+ // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
+ // be converted to a BUILD_VECTOR). Fill in the Ops vector with the
+ // vector elements.
+ SmallVector<SDValue, 8> Ops;
+ // Do not combine these two vectors if the output vector will not replace
+ // the input vector.
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR && InVec.hasOneUse()) {
+ Ops.append(InVec.getNode()->op_begin(),
+ InVec.getNode()->op_end());
+ } else if (InVec.isUndef()) {
+ unsigned NElts = VT.getVectorNumElements();
+ Ops.append(NElts, DAG.getUNDEF(InVal.getValueType()));
+ } else {
+ return SDValue();
+ }
+
+ // Insert the element
+ if (Elt < Ops.size()) {
+ // All the operands of BUILD_VECTOR must have the same type;
+ // we enforce that here.
+ EVT OpVT = Ops[0].getValueType();
+ if (InVal.getValueType() != OpVT)
+ InVal = OpVT.bitsGT(InVal.getValueType()) ?
+ DAG.getNode(ISD::ANY_EXTEND, dl, OpVT, InVal) :
+ DAG.getNode(ISD::TRUNCATE, dl, OpVT, InVal);
+ Ops[Elt] = InVal;
+ }
+
+ // Return the new vector
+ return DAG.getBuildVector(VT, dl, Ops);
+}
+
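+// Replace (extract_vector_elt (load VecVT), EltNo) with a narrow scalar load
+// of just the requested element. Gives up if the element load would need more
+// alignment than the original load provides or if scalar loads of the element
+// type are not legal or custom.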
+SDValue DAGCombiner::ReplaceExtractVectorEltOfLoadWithNarrowedLoad(
+ SDNode *EVE, EVT InVecVT, SDValue EltNo, LoadSDNode *OriginalLoad) {
+ assert(!OriginalLoad->isVolatile());
+
+ EVT ResultVT = EVE->getValueType(0);
+ EVT VecEltVT = InVecVT.getVectorElementType();
+ unsigned Align = OriginalLoad->getAlignment();
+ unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ VecEltVT.getTypeForEVT(*DAG.getContext()));
+
+ if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
+ return SDValue();
+
+ Align = NewAlign;
+
+ SDValue NewPtr = OriginalLoad->getBasePtr();
+ SDValue Offset;
+ EVT PtrType = NewPtr.getValueType();
+ MachinePointerInfo MPI;
+ SDLoc DL(EVE);
+ if (auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo)) {
+ int Elt = ConstEltNo->getZExtValue();
+ unsigned PtrOff = VecEltVT.getSizeInBits() * Elt / 8;
+ Offset = DAG.getConstant(PtrOff, DL, PtrType);
+ MPI = OriginalLoad->getPointerInfo().getWithOffset(PtrOff);
+ } else {
+ Offset = DAG.getZExtOrTrunc(EltNo, DL, PtrType);
+ Offset = DAG.getNode(
+ ISD::MUL, DL, PtrType, Offset,
+ DAG.getConstant(VecEltVT.getStoreSize(), DL, PtrType));
+ MPI = OriginalLoad->getPointerInfo();
+ }
+ NewPtr = DAG.getNode(ISD::ADD, DL, PtrType, NewPtr, Offset);
+
+ // The replacement we need to do here is a little tricky: we need to
+ // replace an extractelement of a load with a load.
+ // Use ReplaceAllUsesOfValuesWith to do the replacement.
+ // Note that this replacement assumes that the extractelement is the only
+ // use of the load; that's okay because we don't want to perform this
+ // transformation in other cases anyway.
+ SDValue Load;
+ SDValue Chain;
+ if (ResultVT.bitsGT(VecEltVT)) {
+ // If the result type of vextract is wider than the load, then issue an
+ // extending load instead.
+ ISD::LoadExtType ExtType = TLI.isLoadExtLegal(ISD::ZEXTLOAD, ResultVT,
+ VecEltVT)
+ ? ISD::ZEXTLOAD
+ : ISD::EXTLOAD;
+ Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
+ OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
+ Align, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ Chain = Load.getValue(1);
+ } else {
+ Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
+ MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
+ OriginalLoad->getAAInfo());
+ Chain = Load.getValue(1);
+ if (ResultVT.bitsLT(VecEltVT))
+ Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
+ else
+ Load = DAG.getBitcast(ResultVT, Load);
+ }
+ WorklistRemover DeadNodes(*this);
+ SDValue From[] = { SDValue(EVE, 0), SDValue(OriginalLoad, 1) };
+ SDValue To[] = { Load, Chain };
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ // Since we're explicitly calling ReplaceAllUses, add the new node to the
+ // worklist explicitly as well.
+ AddToWorklist(Load.getNode());
+ AddUsersToWorklist(Load.getNode()); // Add users too
+ // Make sure to revisit this node to clean it up; it will usually be dead.
+ AddToWorklist(EVE);
+ ++OpsNarrowed;
+ return SDValue(EVE, 0);
+}
+
+SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
+ // (vextract (scalar_to_vector val, 0) -> val
+ SDValue InVec = N->getOperand(0);
+ EVT VT = InVec.getValueType();
+ EVT NVT = N->getValueType(0);
+
+ if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ // Check if the result type doesn't match the inserted element type. A
+ // SCALAR_TO_VECTOR may truncate the inserted element and the
+ // EXTRACT_VECTOR_ELT may widen the extracted vector.
+ SDValue InOp = InVec.getOperand(0);
+ if (InOp.getValueType() != NVT) {
+ assert(InOp.getValueType().isInteger() && NVT.isInteger());
+ return DAG.getSExtOrTrunc(InOp, SDLoc(InVec), NVT);
+ }
+ return InOp;
+ }
+
+ SDValue EltNo = N->getOperand(1);
+ ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+
+ // extract_vector_elt (build_vector x, y), 1 -> y
+ if (ConstEltNo &&
+ InVec.getOpcode() == ISD::BUILD_VECTOR &&
+ TLI.isTypeLegal(VT) &&
+ (InVec.hasOneUse() ||
+ TLI.aggressivelyPreferBuildVectorSources(VT))) {
+ SDValue Elt = InVec.getOperand(ConstEltNo->getZExtValue());
+ EVT InEltVT = Elt.getValueType();
+
+ // Sometimes build_vector's scalar input types do not match result type.
+ if (NVT == InEltVT)
+ return Elt;
+
+ // TODO: It may be useful to truncate if the truncation is free and the
+ // build_vector implicitly converts.
+ }
+
+ // extract_vector_elt (v2i32 (bitcast i64:x)), 0 -> i32 (trunc i64:x)
+ if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && InVec.hasOneUse() &&
+ ConstEltNo->isNullValue() && VT.isInteger()) {
+ SDValue BCSrc = InVec.getOperand(0);
+ if (BCSrc.getValueType().isScalarInteger())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+ }
+
+ // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
+ //
+ // This only really matters if the index is non-constant since other combines
+ // on the constant elements already work.
+ if (InVec.getOpcode() == ISD::INSERT_VECTOR_ELT &&
+ EltNo == InVec.getOperand(2)) {
+ SDValue Elt = InVec.getOperand(1);
+ return VT.isInteger() ? DAG.getAnyExtOrTrunc(Elt, SDLoc(N), NVT) : Elt;
+ }
+
+ // Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
+ // We only perform this optimization before the op legalization phase because
+ // we may introduce new vector instructions which are not backed by TD
+ // patterns. For example, on AVX there is no pattern for extracting elements
+ // from a wide vector without using extract_subvector. However, if we can
+ // find an underlying
+ // scalar value, then we can always use that.
+ if (ConstEltNo && InVec.getOpcode() == ISD::VECTOR_SHUFFLE) {
+ int NumElem = VT.getVectorNumElements();
+ ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(InVec);
+ // Find the new index to extract from.
+ int OrigElt = SVOp->getMaskElt(ConstEltNo->getZExtValue());
+
+ // Extracting an undef index is undef.
+ if (OrigElt == -1)
+ return DAG.getUNDEF(NVT);
+
+ // Select the right vector half to extract from.
+ SDValue SVInVec;
+ if (OrigElt < NumElem) {
+ SVInVec = InVec->getOperand(0);
+ } else {
+ SVInVec = InVec->getOperand(1);
+ OrigElt -= NumElem;
+ }
+
+ if (SVInVec.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue InOp = SVInVec.getOperand(OrigElt);
+ if (InOp.getValueType() != NVT) {
+ assert(InOp.getValueType().isInteger() && NVT.isInteger());
+ InOp = DAG.getSExtOrTrunc(InOp, SDLoc(SVInVec), NVT);
+ }
+
+ return InOp;
+ }
+
+ // FIXME: We should handle recursing on other vector shuffles and
+ // scalar_to_vector here as well.
+
+ if (!LegalOperations) {
+ EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), NVT, SVInVec,
+ DAG.getConstant(OrigElt, SDLoc(SVOp), IndexTy));
+ }
+ }
+
+ bool BCNumEltsChanged = false;
+ EVT ExtVT = VT.getVectorElementType();
+ EVT LVT = ExtVT;
+
+ // If the result of the load has to be truncated, then it's not necessarily
+ // profitable.
+ if (NVT.bitsLT(LVT) && !TLI.isTruncateFree(LVT, NVT))
+ return SDValue();
+
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ EVT BCVT = InVec.getOperand(0).getValueType();
+ if (!BCVT.isVector() || ExtVT.bitsGT(BCVT.getVectorElementType()))
+ return SDValue();
+ if (VT.getVectorNumElements() != BCVT.getVectorNumElements())
+ BCNumEltsChanged = true;
+ InVec = InVec.getOperand(0);
+ ExtVT = BCVT.getVectorElementType();
+ }
+
+ // (vextract (vN[if]M load $addr), i) -> ([if]M load $addr + i * size)
+ if (!LegalOperations && !ConstEltNo && InVec.hasOneUse() &&
+ ISD::isNormalLoad(InVec.getNode()) &&
+ !N->getOperand(1)->hasPredecessor(InVec.getNode())) {
+ SDValue Index = N->getOperand(1);
+ if (LoadSDNode *OrigLoad = dyn_cast<LoadSDNode>(InVec)) {
+ if (!OrigLoad->isVolatile()) {
+ return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, Index,
+ OrigLoad);
+ }
+ }
+ }
+
+ // Perform only after legalization to ensure build_vector / vector_shuffle
+ // optimizations have already been done.
+ if (!LegalOperations) return SDValue();
+
+ // (vextract (v4f32 load $addr), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 s2v (f32 load $addr)), c) -> (f32 load $addr+c*size)
+ // (vextract (v4f32 shuffle (load $addr), <1,u,u,u>), 0) -> (f32 load $addr)
+
+ if (ConstEltNo) {
+ int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
+
+ LoadSDNode *LN0 = nullptr;
+ const ShuffleVectorSDNode *SVN = nullptr;
+ if (ISD::isNormalLoad(InVec.getNode())) {
+ LN0 = cast<LoadSDNode>(InVec);
+ } else if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ InVec.getOperand(0).getValueType() == ExtVT &&
+ ISD::isNormalLoad(InVec.getOperand(0).getNode())) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ LN0 = cast<LoadSDNode>(InVec.getOperand(0));
+ } else if ((SVN = dyn_cast<ShuffleVectorSDNode>(InVec))) {
+ // (vextract (vector_shuffle (load $addr), v2, <1, u, u, u>), 1)
+ // =>
+ // (load $addr+1*size)
+
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ // If the bit convert changed the number of elements, it is unsafe
+ // to examine the mask.
+ if (BCNumEltsChanged)
+ return SDValue();
+
+ // Select the input vector, guarding against an out-of-range extract index.
+ unsigned NumElems = VT.getVectorNumElements();
+ int Idx = (Elt > (int)NumElems) ? -1 : SVN->getMaskElt(Elt);
+ InVec = (Idx < (int)NumElems) ? InVec.getOperand(0) : InVec.getOperand(1);
+
+ if (InVec.getOpcode() == ISD::BITCAST) {
+ // Don't duplicate a load with other uses.
+ if (!InVec.hasOneUse())
+ return SDValue();
+
+ InVec = InVec.getOperand(0);
+ }
+ if (ISD::isNormalLoad(InVec.getNode())) {
+ LN0 = cast<LoadSDNode>(InVec);
+ Elt = (Idx < (int)NumElems) ? Idx : Idx - (int)NumElems;
+ EltNo = DAG.getConstant(Elt, SDLoc(EltNo), EltNo.getValueType());
+ }
+ }
+
+ // Make sure we found a non-volatile load and the extractelement is
+ // the only use.
+ if (!LN0 || !LN0->hasNUsesOfValue(1,0) || LN0->isVolatile())
+ return SDValue();
+
+ // If Idx was -1 above, Elt is going to be -1, so just return undef.
+ if (Elt == -1)
+ return DAG.getUNDEF(LVT);
+
+ return ReplaceExtractVectorEltOfLoadWithNarrowedLoad(N, VT, EltNo, LN0);
+ }
+
+ return SDValue();
+}
+
+// Simplify (build_vec (ext x)) to (bitcast (build_vec x))
+SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
+ // We perform this optimization post type-legalization because
+ // the type-legalizer often scalarizes integer-promoted vectors.
+ // Performing this optimization before may create bit-casts which
+ // will be type-legalized to complex code sequences.
+ // We perform this optimization only before the operation legalizer because we
+ // may introduce illegal operations.
+ if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes)
+ return SDValue();
+
+ unsigned NumInScalars = N->getNumOperands();
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // Check to see if this is a BUILD_VECTOR of a bunch of values
+ // which come from any_extend or zero_extend nodes. If so, we can create
+ // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR
+ // optimizations. We do not handle sign-extend because we can't fill the sign
+ // using shuffles.
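+ // For example, on a little-endian target
+ //   (v4i32 build_vector (zext i16 a), (zext i16 b), ...)
+ // can be rebuilt as
+ //   (v4i32 bitcast (v8i16 build_vector a, 0, b, 0, ...))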
+ EVT SourceType = MVT::Other;
+ bool AllAnyExt = true;
+
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+ // Ignore undef inputs.
+ if (In.isUndef()) continue;
+
+ bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND;
+ bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND;
+
+ // Abort if the element is not an extension.
+ if (!ZeroExt && !AnyExt) {
+ SourceType = MVT::Other;
+ break;
+ }
+
+ // The input is a ZeroExt or AnyExt. Check the original type.
+ EVT InTy = In.getOperand(0).getValueType();
+
+ // Check that all of the widened source types are the same.
+ if (SourceType == MVT::Other)
+ // First time.
+ SourceType = InTy;
+ else if (InTy != SourceType) {
+ // Multiple input types. Abort.
+ SourceType = MVT::Other;
+ break;
+ }
+
+ // Check if all of the extends are ANY_EXTENDs.
+ AllAnyExt &= AnyExt;
+ }
+
+ // In order to have valid types, all of the inputs must be extended from the
+ // same source type and all of the inputs must be any or zero extend.
+ // Scalar sizes must be a power of two.
+ EVT OutScalarTy = VT.getScalarType();
+ bool ValidTypes = SourceType != MVT::Other &&
+ isPowerOf2_32(OutScalarTy.getSizeInBits()) &&
+ isPowerOf2_32(SourceType.getSizeInBits());
+
+ // Create a new simpler BUILD_VECTOR sequence which other optimizations can
+ // turn into a single shuffle instruction.
+ if (!ValidTypes)
+ return SDValue();
+
+ bool isLE = DAG.getDataLayout().isLittleEndian();
+ unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
+ assert(ElemRatio > 1 && "Invalid element size ratio");
+ SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType):
+ DAG.getConstant(0, SDLoc(N), SourceType);
+
+ unsigned NewBVElems = ElemRatio * VT.getVectorNumElements();
+ SmallVector<SDValue, 8> Ops(NewBVElems, Filler);
+
+ // Populate the new build_vector
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue Cast = N->getOperand(i);
+ assert((Cast.getOpcode() == ISD::ANY_EXTEND ||
+ Cast.getOpcode() == ISD::ZERO_EXTEND ||
+ Cast.isUndef()) && "Invalid cast opcode");
+ SDValue In;
+ if (Cast.isUndef())
+ In = DAG.getUNDEF(SourceType);
+ else
+ In = Cast->getOperand(0);
+ unsigned Index = isLE ? (i * ElemRatio) :
+ (i * ElemRatio + (ElemRatio - 1));
+
+ assert(Index < Ops.size() && "Invalid index");
+ Ops[Index] = In;
+ }
+
+ // The type of the new BUILD_VECTOR node.
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems);
+ assert(VecVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Invalid vector size");
+ // Check if the new vector type is legal.
+ if (!isTypeLegal(VecVT)) return SDValue();
+
+ // Make the new BUILD_VECTOR.
+ SDValue BV = DAG.getBuildVector(VecVT, dl, Ops);
+
+ // The new BUILD_VECTOR node has the potential to be further optimized.
+ AddToWorklist(BV.getNode());
+ // Bitcast to the desired type.
+ return DAG.getBitcast(VT, BV);
+}
+
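+// Simplify (build_vec (sint_to_fp x), (sint_to_fp y), ...) to
+// (sint_to_fp (build_vec x, y, ...)), and likewise for uint_to_fp, when all
+// defined operands use the same conversion from the same source integer type.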
+SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) {
+ EVT VT = N->getValueType(0);
+
+ unsigned NumInScalars = N->getNumOperands();
+ SDLoc dl(N);
+
+ EVT SrcVT = MVT::Other;
+ unsigned Opcode = ISD::DELETED_NODE;
+ unsigned NumDefs = 0;
+
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+ unsigned Opc = In.getOpcode();
+
+ if (Opc == ISD::UNDEF)
+ continue;
+
+ // If all scalar values are floats and converted from integers.
+ if (Opcode == ISD::DELETED_NODE &&
+ (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) {
+ Opcode = Opc;
+ }
+
+ if (Opc != Opcode)
+ return SDValue();
+
+ EVT InVT = In.getOperand(0).getValueType();
+
+ // If the scalar source values do not all have the same type, bail out. This
+ // restriction is chosen to simplify handling BUILD_VECTORs of integer types.
+ if (SrcVT == MVT::Other)
+ SrcVT = InVT;
+ if (SrcVT != InVT)
+ return SDValue();
+ NumDefs++;
+ }
+
+ // If the vector has just one element defined, it's not worth folding it
+ // into a vectorized one.
+ if (NumDefs < 2)
+ return SDValue();
+
+ assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP)
+ && "Should only handle conversion from integer to float.");
+ assert(SrcVT != MVT::Other && "Cannot determine source type!");
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars);
+
+ if (!TLI.isOperationLegalOrCustom(Opcode, NVT))
+ return SDValue();
+
+ // Just because the floating-point vector type is legal does not necessarily
+ // mean that the corresponding integer vector type is.
+ if (!isTypeLegal(NVT))
+ return SDValue();
+
+ SmallVector<SDValue, 8> Opnds;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = N->getOperand(i);
+
+ if (In.isUndef())
+ Opnds.push_back(DAG.getUNDEF(SrcVT));
+ else
+ Opnds.push_back(In.getOperand(0));
+ }
+ SDValue BV = DAG.getBuildVector(NVT, dl, Opnds);
+ AddToWorklist(BV.getNode());
+
+ return DAG.getNode(Opcode, dl, VT, BV);
+}
+
+SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
+ unsigned NumInScalars = N->getNumOperands();
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ // A vector built entirely of undefs is undef.
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
+ if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
+ return V;
+
+ if (SDValue V = reduceBuildVecConvertToConvertBuildVec(N))
+ return V;
+
+ // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT
+ // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from
+ // at most two distinct vectors, turn this into a shuffle node.
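+  // For instance (illustrative, assuming the resulting shuffle is legal):
+  //   (v4i32 build_vector (extract_vector_elt %A, 0),
+  //                       (extract_vector_elt %A, 2),
+  //                       (extract_vector_elt %B, 1),
+  //                       (extract_vector_elt %B, 3))
+  //   --> (vector_shuffle %A, %B, <0, 2, 5, 7>)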
+
+ // Only type-legal BUILD_VECTOR nodes are converted to shuffle nodes.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ // May only combine to shuffle after legalize if shuffle is legal.
+ if (LegalOperations && !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, VT))
+ return SDValue();
+
+ SDValue VecIn1, VecIn2;
+ bool UsesZeroVector = false;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue Op = N->getOperand(i);
+ // Ignore undef inputs.
+ if (Op.isUndef()) continue;
+
+ // See if we can combine this build_vector into a blend with a zero vector.
+ if (!VecIn2.getNode() && (isNullConstant(Op) || isNullFPConstant(Op))) {
+ UsesZeroVector = true;
+ continue;
+ }
+
+    // If this input is something other than an EXTRACT_VECTOR_ELT with a
+ // constant index, bail out.
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ !isa<ConstantSDNode>(Op.getOperand(1))) {
+ VecIn1 = VecIn2 = SDValue(nullptr, 0);
+ break;
+ }
+
+ // We allow up to two distinct input vectors.
+ SDValue ExtractedFromVec = Op.getOperand(0);
+ if (ExtractedFromVec == VecIn1 || ExtractedFromVec == VecIn2)
+ continue;
+
+ if (!VecIn1.getNode()) {
+ VecIn1 = ExtractedFromVec;
+ } else if (!VecIn2.getNode() && !UsesZeroVector) {
+ VecIn2 = ExtractedFromVec;
+ } else {
+ // Too many inputs.
+ VecIn1 = VecIn2 = SDValue(nullptr, 0);
+ break;
+ }
+ }
+
+ // If everything is good, we can make a shuffle operation.
+ if (VecIn1.getNode()) {
+ unsigned InNumElements = VecIn1.getValueType().getVectorNumElements();
+ SmallVector<int, 8> Mask;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ unsigned Opcode = N->getOperand(i).getOpcode();
+ if (Opcode == ISD::UNDEF) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // Operands can also be zero.
+ if (Opcode != ISD::EXTRACT_VECTOR_ELT) {
+ assert(UsesZeroVector &&
+ (Opcode == ISD::Constant || Opcode == ISD::ConstantFP) &&
+ "Unexpected node found!");
+ Mask.push_back(NumInScalars+i);
+ continue;
+ }
+
+ // If extracting from the first vector, just use the index directly.
+ SDValue Extract = N->getOperand(i);
+ SDValue ExtVal = Extract.getOperand(1);
+ unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
+ if (Extract.getOperand(0) == VecIn1) {
+ Mask.push_back(ExtIndex);
+ continue;
+ }
+
+      // Otherwise, offset the index by the number of elements in VecIn1.
+ Mask.push_back(InNumElements + ExtIndex);
+ }
+
+ // Avoid introducing illegal shuffles with zero.
+ if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
+ return SDValue();
+
+ // We can't generate a shuffle node with mismatched input and output types.
+ // Attempt to transform a single input vector to the correct type.
+ if ((VT != VecIn1.getValueType())) {
+      // If the input vector type has a different element type than the output
+      // vector type, bail out.
+ EVT VTElemType = VT.getVectorElementType();
+ if ((VecIn1.getValueType().getVectorElementType() != VTElemType) ||
+ (VecIn2.getNode() &&
+ (VecIn2.getValueType().getVectorElementType() != VTElemType)))
+ return SDValue();
+
+ // If the input vector is too small, widen it.
+ // We only support widening of vectors which are half the size of the
+ // output registers. For example XMM->YMM widening on X86 with AVX.
+ EVT VecInT = VecIn1.getValueType();
+ if (VecInT.getSizeInBits() * 2 == VT.getSizeInBits()) {
+ // If we only have one small input, widen it by adding undef values.
+ if (!VecIn2.getNode())
+ VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1,
+ DAG.getUNDEF(VecIn1.getValueType()));
+ else if (VecIn1.getValueType() == VecIn2.getValueType()) {
+ // If we have two small inputs of the same type, try to concat them.
+ VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, VecIn2);
+ VecIn2 = SDValue(nullptr, 0);
+ } else
+ return SDValue();
+ } else if (VecInT.getSizeInBits() == VT.getSizeInBits() * 2) {
+ // If the input vector is too large, try to split it.
+ // We don't support having two input vectors that are too large.
+        // If the zero vector was used, we cannot split the vector,
+ // since we'd need 3 inputs.
+ if (UsesZeroVector || VecIn2.getNode())
+ return SDValue();
+
+ if (!TLI.isExtractSubvectorCheap(VT, VT.getVectorNumElements()))
+ return SDValue();
+
+        // Try to replace VecIn1 with two extract_subvectors.
+        // No need to update the mask; it is still correct.
+ VecIn2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+ DAG.getConstant(VT.getVectorNumElements(), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ VecIn1 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ } else
+ return SDValue();
+ }
+
+ if (UsesZeroVector)
+ VecIn2 = VT.isInteger() ? DAG.getConstant(0, dl, VT) :
+ DAG.getConstantFP(0.0, dl, VT);
+ else
+ // If VecIn2 is unused then change it to undef.
+ VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
+
+ // Check that we were able to transform all incoming values to the same
+ // type.
+ if (VecIn2.getValueType() != VecIn1.getValueType() ||
+ VecIn1.getValueType() != VT)
+ return SDValue();
+
+ // Return the new VECTOR_SHUFFLE node.
+ SDValue Ops[2];
+ Ops[0] = VecIn1;
+ Ops[1] = VecIn2;
+ return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], Mask);
+ }
+
+ return SDValue();
+}
+
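+// Rough illustration of the helper below, assuming v2i32 is not a legal type
+// on the target but v2f64 is:
+//   (v4i32 concat_vectors (v2i32 bitcast f64 %x), (v2i32 bitcast f64 %y))
+//   --> (v4i32 bitcast (v2f64 build_vector %x, %y))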
+static SDValue combineConcatVectorOfScalars(SDNode *N, SelectionDAG &DAG) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT OpVT = N->getOperand(0).getValueType();
+
+ // If the operands are legal vectors, leave them alone.
+ if (TLI.isTypeLegal(OpVT))
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SmallVector<SDValue, 8> Ops;
+
+ EVT SVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+ SDValue ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
+
+ // Keep track of what we encounter.
+ bool AnyInteger = false;
+ bool AnyFP = false;
+ for (const SDValue &Op : N->ops()) {
+ if (ISD::BITCAST == Op.getOpcode() &&
+ !Op.getOperand(0).getValueType().isVector())
+ Ops.push_back(Op.getOperand(0));
+ else if (ISD::UNDEF == Op.getOpcode())
+ Ops.push_back(ScalarUndef);
+ else
+ return SDValue();
+
+ // Note whether we encounter an integer or floating point scalar.
+    // If it's neither, bail out; it could be something weird like x86mmx.
+ EVT LastOpVT = Ops.back().getValueType();
+ if (LastOpVT.isFloatingPoint())
+ AnyFP = true;
+ else if (LastOpVT.isInteger())
+ AnyInteger = true;
+ else
+ return SDValue();
+ }
+
+ // If any of the operands is a floating point scalar bitcast to a vector,
+ // use floating point types throughout, and bitcast everything.
+ // Replace UNDEFs by another scalar UNDEF node, of the final desired type.
+ if (AnyFP) {
+ SVT = EVT::getFloatingPointVT(OpVT.getSizeInBits());
+ ScalarUndef = DAG.getNode(ISD::UNDEF, DL, SVT);
+ if (AnyInteger) {
+ for (SDValue &Op : Ops) {
+ if (Op.getValueType() == SVT)
+ continue;
+ if (Op.isUndef())
+ Op = ScalarUndef;
+ else
+ Op = DAG.getBitcast(SVT, Op);
+ }
+ }
+ }
+
+ EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SVT,
+ VT.getSizeInBits() / SVT.getSizeInBits());
+ return DAG.getBitcast(VT, DAG.getBuildVector(VecVT, DL, Ops));
+}
+
+// Check to see if this is a CONCAT_VECTORS of a bunch of EXTRACT_SUBVECTOR
+// operations. If so, and if the EXTRACT_SUBVECTOR vector inputs come from at
+// most two distinct vectors the same size as the result, attempt to turn this
+// into a legal shuffle.
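+// For example (illustrative, assuming the resulting mask is legal), with
+// v2f32 operands extracted from v4f32 inputs:
+//   (v4f32 concat_vectors (extract_subvector %A, 0),
+//                         (extract_subvector %B, 2))
+//   --> (vector_shuffle %A, %B, <0, 1, 6, 7>)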
+static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ EVT OpVT = N->getOperand(0).getValueType();
+ int NumElts = VT.getVectorNumElements();
+ int NumOpElts = OpVT.getVectorNumElements();
+
+ SDValue SV0 = DAG.getUNDEF(VT), SV1 = DAG.getUNDEF(VT);
+ SmallVector<int, 8> Mask;
+
+ for (SDValue Op : N->ops()) {
+ // Peek through any bitcast.
+ while (Op.getOpcode() == ISD::BITCAST)
+ Op = Op.getOperand(0);
+
+ // UNDEF nodes convert to UNDEF shuffle mask values.
+ if (Op.isUndef()) {
+ Mask.append((unsigned)NumOpElts, -1);
+ continue;
+ }
+
+ if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+
+ // What vector are we extracting the subvector from and at what index?
+ SDValue ExtVec = Op.getOperand(0);
+
+ // We want the EVT of the original extraction to correctly scale the
+ // extraction index.
+ EVT ExtVT = ExtVec.getValueType();
+
+ // Peek through any bitcast.
+ while (ExtVec.getOpcode() == ISD::BITCAST)
+ ExtVec = ExtVec.getOperand(0);
+
+ // UNDEF nodes convert to UNDEF shuffle mask values.
+ if (ExtVec.isUndef()) {
+ Mask.append((unsigned)NumOpElts, -1);
+ continue;
+ }
+
+ if (!isa<ConstantSDNode>(Op.getOperand(1)))
+ return SDValue();
+ int ExtIdx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+ // Ensure that we are extracting a subvector from a vector the same
+ // size as the result.
+ if (ExtVT.getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ // Scale the subvector index to account for any bitcast.
+ int NumExtElts = ExtVT.getVectorNumElements();
+ if (0 == (NumExtElts % NumElts))
+ ExtIdx /= (NumExtElts / NumElts);
+ else if (0 == (NumElts % NumExtElts))
+ ExtIdx *= (NumElts / NumExtElts);
+ else
+ return SDValue();
+
+ // At most we can reference 2 inputs in the final shuffle.
+ if (SV0.isUndef() || SV0 == ExtVec) {
+ SV0 = ExtVec;
+ for (int i = 0; i != NumOpElts; ++i)
+ Mask.push_back(i + ExtIdx);
+ } else if (SV1.isUndef() || SV1 == ExtVec) {
+ SV1 = ExtVec;
+ for (int i = 0; i != NumOpElts; ++i)
+ Mask.push_back(i + ExtIdx + NumElts);
+ } else {
+ return SDValue();
+ }
+ }
+
+ if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(Mask, VT))
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, SDLoc(N), DAG.getBitcast(VT, SV0),
+ DAG.getBitcast(VT, SV1), Mask);
+}
+
+SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
+ // If we only have one input vector, we don't need to do any concatenation.
+ if (N->getNumOperands() == 1)
+ return N->getOperand(0);
+
+ // Check if all of the operands are undefs.
+ EVT VT = N->getValueType(0);
+ if (ISD::allOperandsUndef(N))
+ return DAG.getUNDEF(VT);
+
+ // Optimize concat_vectors where all but the first of the vectors are undef.
+ if (std::all_of(std::next(N->op_begin()), N->op_end(), [](const SDValue &Op) {
+ return Op.isUndef();
+ })) {
+ SDValue In = N->getOperand(0);
+ assert(In.getValueType().isVector() && "Must concat vectors");
+
+    // Transform: concat_vectors(scalar, undef) -> scalar_to_vector(scalar).
+ if (In->getOpcode() == ISD::BITCAST &&
+ !In->getOperand(0)->getValueType(0).isVector()) {
+ SDValue Scalar = In->getOperand(0);
+
+ // If the bitcast type isn't legal, it might be a trunc of a legal type;
+ // look through the trunc so we can still do the transform:
+ // concat_vectors(trunc(scalar), undef) -> scalar_to_vector(scalar)
+ if (Scalar->getOpcode() == ISD::TRUNCATE &&
+ !TLI.isTypeLegal(Scalar.getValueType()) &&
+ TLI.isTypeLegal(Scalar->getOperand(0).getValueType()))
+ Scalar = Scalar->getOperand(0);
+
+ EVT SclTy = Scalar->getValueType(0);
+
+ if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
+ return SDValue();
+
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
+ VT.getSizeInBits() / SclTy.getSizeInBits());
+ if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
+ return SDValue();
+
+ SDLoc dl = SDLoc(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NVT, Scalar);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+
+ // Fold any combination of BUILD_VECTOR or UNDEF nodes into one BUILD_VECTOR.
+ // We have already tested above for an UNDEF only concatenation.
+ // fold (concat_vectors (BUILD_VECTOR A, B, ...), (BUILD_VECTOR C, D, ...))
+ // -> (BUILD_VECTOR A, B, ..., C, D, ...)
+ auto IsBuildVectorOrUndef = [](const SDValue &Op) {
+ return ISD::UNDEF == Op.getOpcode() || ISD::BUILD_VECTOR == Op.getOpcode();
+ };
+ if (llvm::all_of(N->ops(), IsBuildVectorOrUndef)) {
+ SmallVector<SDValue, 8> Opnds;
+ EVT SVT = VT.getScalarType();
+
+ EVT MinVT = SVT;
+ if (!SVT.isFloatingPoint()) {
+      // If the BUILD_VECTOR nodes are built from integers, they may have
+      // different operand types. Get the smallest type and truncate all
+      // operands to it.
+ bool FoundMinVT = false;
+ for (const SDValue &Op : N->ops())
+ if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+ EVT OpSVT = Op.getOperand(0)->getValueType(0);
+ MinVT = (!FoundMinVT || OpSVT.bitsLE(MinVT)) ? OpSVT : MinVT;
+ FoundMinVT = true;
+ }
+ assert(FoundMinVT && "Concat vector type mismatch");
+ }
+
+ for (const SDValue &Op : N->ops()) {
+ EVT OpVT = Op.getValueType();
+ unsigned NumElts = OpVT.getVectorNumElements();
+
+ if (ISD::UNDEF == Op.getOpcode())
+ Opnds.append(NumElts, DAG.getUNDEF(MinVT));
+
+ if (ISD::BUILD_VECTOR == Op.getOpcode()) {
+ if (SVT.isFloatingPoint()) {
+ assert(SVT == OpVT.getScalarType() && "Concat vector type mismatch");
+ Opnds.append(Op->op_begin(), Op->op_begin() + NumElts);
+ } else {
+ for (unsigned i = 0; i != NumElts; ++i)
+ Opnds.push_back(
+ DAG.getNode(ISD::TRUNCATE, SDLoc(N), MinVT, Op.getOperand(i)));
+ }
+ }
+ }
+
+ assert(VT.getVectorNumElements() == Opnds.size() &&
+ "Concat vector type mismatch");
+ return DAG.getBuildVector(VT, SDLoc(N), Opnds);
+ }
+
+ // Fold CONCAT_VECTORS of only bitcast scalars (or undef) to BUILD_VECTOR.
+ if (SDValue V = combineConcatVectorOfScalars(N, DAG))
+ return V;
+
+ // Fold CONCAT_VECTORS of EXTRACT_SUBVECTOR (or undef) to VECTOR_SHUFFLE.
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT))
+ if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
+ return V;
+
+  // Type legalization of vectors and DAG canonicalization of VECTOR_SHUFFLE
+  // nodes often generate nop CONCAT_VECTORS nodes. Scan the CONCAT_VECTORS
+  // operands and look for a concatenation that simply re-assembles a single
+  // incoming vector in its original element order.
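+  // e.g. (v8i32 concat_vectors (extract_subvector %V, 0),
+  //                            (extract_subvector %V, 4))
+  // merely reassembles a v8i32 %V and can be replaced by %V itself.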
+ SDValue SingleSource = SDValue();
+ unsigned PartNumElem = N->getOperand(0).getValueType().getVectorNumElements();
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue Op = N->getOperand(i);
+
+ if (Op.isUndef())
+ continue;
+
+ // Check if this is the identity extract:
+ if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR)
+ return SDValue();
+
+ // Find the single incoming vector for the extract_subvector.
+ if (SingleSource.getNode()) {
+ if (Op.getOperand(0) != SingleSource)
+ return SDValue();
+ } else {
+ SingleSource = Op.getOperand(0);
+
+      // Check that the source type is the same as the type of the result.
+      // If not, this concat may extend the vector, so we cannot
+      // optimize it away.
+ if (SingleSource.getValueType() != N->getValueType(0))
+ return SDValue();
+ }
+
+ unsigned IdentityIndex = i * PartNumElem;
+ ConstantSDNode *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ // The extract index must be constant.
+ if (!CS)
+ return SDValue();
+
+ // Check that we are reading from the identity index.
+ if (CS->getZExtValue() != IdentityIndex)
+ return SDValue();
+ }
+
+ if (SingleSource.getNode())
+ return SingleSource;
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
+ EVT NVT = N->getValueType(0);
+ SDValue V = N->getOperand(0);
+
+ if (V->getOpcode() == ISD::CONCAT_VECTORS) {
+ // Combine:
+ // (extract_subvec (concat V1, V2, ...), i)
+ // Into:
+ // Vi if possible
+    // Only operand 0 is checked, as CONCAT_VECTORS requires all inputs to
+    // have the same type.
+ if (V->getOperand(0).getValueType() != NVT)
+ return SDValue();
+ unsigned Idx = N->getConstantOperandVal(1);
+ unsigned NumElems = NVT.getVectorNumElements();
+ assert((Idx % NumElems) == 0 &&
+ "IDX in concat is not a multiple of the result vector length.");
+ return V->getOperand(Idx / NumElems);
+ }
+
+ // Skip bitcasting
+ if (V->getOpcode() == ISD::BITCAST)
+ V = V.getOperand(0);
+
+ if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDLoc dl(N);
+    // Handle only the simple case where the vector being inserted and the
+    // vector being extracted have the same type and are half the size of the
+    // larger vector.
+ EVT BigVT = V->getOperand(0).getValueType();
+ EVT SmallVT = V->getOperand(1).getValueType();
+ if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+ return SDValue();
+
+ // Only handle cases where both indexes are constants with the same type.
+ ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
+
+ if (InsIdx && ExtIdx &&
+ InsIdx->getValueType(0).getSizeInBits() <= 64 &&
+ ExtIdx->getValueType(0).getSizeInBits() <= 64) {
+ // Combine:
+ // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ // indices are equal or bit offsets are equal => V1
+ // otherwise => (extract_subvec V1, ExtIdx)
+ if (InsIdx->getZExtValue() * SmallVT.getScalarType().getSizeInBits() ==
+ ExtIdx->getZExtValue() * NVT.getScalarType().getSizeInBits())
+ return DAG.getBitcast(NVT, V->getOperand(1));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V->getOperand(0)),
+ N->getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
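+// Rough sketch of the helper below: given the set of elements of V that a
+// shuffle actually uses, walk CONCAT_VECTORS / INSERT_SUBVECTOR chains and
+// replace pieces whose elements are never referenced with UNDEF. For example,
+// if only elements 0-3 of (v8i32 concat_vectors %A, %B) are used, the node
+// is rebuilt as (concat_vectors %A, undef).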
+static SDValue simplifyShuffleOperandRecursively(SmallBitVector &UsedElements,
+ SDValue V, SelectionDAG &DAG) {
+ SDLoc DL(V);
+ EVT VT = V.getValueType();
+
+ switch (V.getOpcode()) {
+ default:
+ return V;
+
+ case ISD::CONCAT_VECTORS: {
+ EVT OpVT = V->getOperand(0).getValueType();
+ int OpSize = OpVT.getVectorNumElements();
+ SmallBitVector OpUsedElements(OpSize, false);
+ bool FoundSimplification = false;
+ SmallVector<SDValue, 4> NewOps;
+ NewOps.reserve(V->getNumOperands());
+ for (int i = 0, NumOps = V->getNumOperands(); i < NumOps; ++i) {
+ SDValue Op = V->getOperand(i);
+ bool OpUsed = false;
+ for (int j = 0; j < OpSize; ++j)
+ if (UsedElements[i * OpSize + j]) {
+ OpUsedElements[j] = true;
+ OpUsed = true;
+ }
+ NewOps.push_back(
+ OpUsed ? simplifyShuffleOperandRecursively(OpUsedElements, Op, DAG)
+ : DAG.getUNDEF(OpVT));
+ FoundSimplification |= Op == NewOps.back();
+ OpUsedElements.reset();
+ }
+ if (FoundSimplification)
+ V = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, NewOps);
+ return V;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue BaseV = V->getOperand(0);
+ SDValue SubV = V->getOperand(1);
+ auto *IdxN = dyn_cast<ConstantSDNode>(V->getOperand(2));
+ if (!IdxN)
+ return V;
+
+ int SubSize = SubV.getValueType().getVectorNumElements();
+ int Idx = IdxN->getZExtValue();
+ bool SubVectorUsed = false;
+ SmallBitVector SubUsedElements(SubSize, false);
+ for (int i = 0; i < SubSize; ++i)
+ if (UsedElements[i + Idx]) {
+ SubVectorUsed = true;
+ SubUsedElements[i] = true;
+ UsedElements[i + Idx] = false;
+ }
+
+ // Now recurse on both the base and sub vectors.
+ SDValue SimplifiedSubV =
+ SubVectorUsed
+ ? simplifyShuffleOperandRecursively(SubUsedElements, SubV, DAG)
+ : DAG.getUNDEF(SubV.getValueType());
+ SDValue SimplifiedBaseV = simplifyShuffleOperandRecursively(UsedElements, BaseV, DAG);
+ if (SimplifiedSubV != SubV || SimplifiedBaseV != BaseV)
+ V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ SimplifiedBaseV, SimplifiedSubV, V->getOperand(2));
+ return V;
+ }
+ }
+}
+
+static SDValue simplifyShuffleOperands(ShuffleVectorSDNode *SVN, SDValue N0,
+ SDValue N1, SelectionDAG &DAG) {
+ EVT VT = SVN->getValueType(0);
+ int NumElts = VT.getVectorNumElements();
+ SmallBitVector N0UsedElements(NumElts, false), N1UsedElements(NumElts, false);
+ for (int M : SVN->getMask())
+ if (M >= 0 && M < NumElts)
+ N0UsedElements[M] = true;
+ else if (M >= NumElts)
+ N1UsedElements[M - NumElts] = true;
+
+ SDValue S0 = simplifyShuffleOperandRecursively(N0UsedElements, N0, DAG);
+ SDValue S1 = simplifyShuffleOperandRecursively(N1UsedElements, N1, DAG);
+ if (S0 == N0 && S1 == N1)
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, SDLoc(SVN), S0, S1, SVN->getMask());
+}
+
+// Tries to turn a shuffle of two CONCAT_VECTORS into a single concat,
+// or turn a shuffle of a single concat into a simpler shuffle then a concat.
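+// For example (illustrative), with v2i32 subvectors:
+//   (vector_shuffle (concat_vectors %A, %B), (concat_vectors %C, %D),
+//                   <2, 3, 4, 5>)
+//   --> (concat_vectors %B, %C)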
+static SDValue partitionShuffleOfConcats(SDNode *N, SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ SmallVector<SDValue, 4> Ops;
+ EVT ConcatVT = N0.getOperand(0).getValueType();
+ unsigned NumElemsPerConcat = ConcatVT.getVectorNumElements();
+ unsigned NumConcats = NumElts / NumElemsPerConcat;
+
+  // Special case: shuffle(concat(A,B)) can be more efficiently represented
+  // as concat(shuffle(A,B),UNDEF) if the shuffle leaves all of the high-half
+  // result elements undef.
+ if (NumElemsPerConcat * 2 == NumElts && N1.isUndef() &&
+ std::all_of(SVN->getMask().begin() + NumElemsPerConcat,
+ SVN->getMask().end(), [](int i) { return i == -1; })) {
+ N0 = DAG.getVectorShuffle(ConcatVT, SDLoc(N), N0.getOperand(0), N0.getOperand(1),
+ makeArrayRef(SVN->getMask().begin(), NumElemsPerConcat));
+ N1 = DAG.getUNDEF(ConcatVT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0, N1);
+ }
+
+ // Look at every vector that's inserted. We're looking for exact
+ // subvector-sized copies from a concatenated vector
+ for (unsigned I = 0; I != NumConcats; ++I) {
+ // Make sure we're dealing with a copy.
+ unsigned Begin = I * NumElemsPerConcat;
+ bool AllUndef = true, NoUndef = true;
+ for (unsigned J = Begin; J != Begin + NumElemsPerConcat; ++J) {
+ if (SVN->getMaskElt(J) >= 0)
+ AllUndef = false;
+ else
+ NoUndef = false;
+ }
+
+ if (NoUndef) {
+ if (SVN->getMaskElt(Begin) % NumElemsPerConcat != 0)
+ return SDValue();
+
+ for (unsigned J = 1; J != NumElemsPerConcat; ++J)
+ if (SVN->getMaskElt(Begin + J - 1) + 1 != SVN->getMaskElt(Begin + J))
+ return SDValue();
+
+ unsigned FirstElt = SVN->getMaskElt(Begin) / NumElemsPerConcat;
+ if (FirstElt < N0.getNumOperands())
+ Ops.push_back(N0.getOperand(FirstElt));
+ else
+ Ops.push_back(N1.getOperand(FirstElt - N0.getNumOperands()));
+
+ } else if (AllUndef) {
+ Ops.push_back(DAG.getUNDEF(N0.getOperand(0).getValueType()));
+ } else { // Mixed with general masks and undefs, can't do optimization.
+ return SDValue();
+ }
+ }
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
+}
+
+SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ unsigned NumElts = VT.getVectorNumElements();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ assert(N0.getValueType() == VT && "Vector shuffle must be normalized in DAG");
+
+ // Canonicalize shuffle undef, undef -> undef
+ if (N0.isUndef() && N1.isUndef())
+ return DAG.getUNDEF(VT);
+
+ ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+
+ // Canonicalize shuffle v, v -> v, undef
+ if (N0 == N1) {
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx >= (int)NumElts) Idx -= NumElts;
+ NewMask.push_back(Idx);
+ }
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, DAG.getUNDEF(VT), NewMask);
+ }
+
+ // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
+ if (N0.isUndef())
+ return DAG.getCommutedVectorShuffle(*SVN);
+
+ // Remove references to rhs if it is undef
+ if (N1.isUndef()) {
+ bool Changed = false;
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx >= (int)NumElts) {
+ Idx = -1;
+ Changed = true;
+ }
+ NewMask.push_back(Idx);
+ }
+ if (Changed)
+ return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask);
+ }
+
+ // If it is a splat, check if the argument vector is another splat or a
+ // build_vector.
+ if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
+ SDNode *V = N0.getNode();
+
+ // If this is a bit convert that changes the element type of the vector but
+ // not the number of vector elements, look through it. Be careful not to
+    // look through conversions that change things like v4f32 to v2f64.
+ if (V->getOpcode() == ISD::BITCAST) {
+ SDValue ConvInput = V->getOperand(0);
+ if (ConvInput.getValueType().isVector() &&
+ ConvInput.getValueType().getVectorNumElements() == NumElts)
+ V = ConvInput.getNode();
+ }
+
+ if (V->getOpcode() == ISD::BUILD_VECTOR) {
+ assert(V->getNumOperands() == NumElts &&
+ "BUILD_VECTOR has wrong number of operands");
+ SDValue Base;
+ bool AllSame = true;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (!V->getOperand(i).isUndef()) {
+ Base = V->getOperand(i);
+ break;
+ }
+ }
+ // Splat of <u, u, u, u>, return <u, u, u, u>
+ if (!Base.getNode())
+ return N0;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ if (V->getOperand(i) != Base) {
+ AllSame = false;
+ break;
+ }
+ }
+ // Splat of <x, x, x, x>, return <x, x, x, x>
+ if (AllSame)
+ return N0;
+
+ // Canonicalize any other splat as a build_vector.
+ const SDValue &Splatted = V->getOperand(SVN->getSplatIndex());
+ SmallVector<SDValue, 8> Ops(NumElts, Splatted);
+ SDValue NewBV = DAG.getBuildVector(V->getValueType(0), SDLoc(N), Ops);
+
+ // We may have jumped through bitcasts, so the type of the
+ // BUILD_VECTOR may not match the type of the shuffle.
+ if (V->getValueType(0) != VT)
+ NewBV = DAG.getBitcast(VT, NewBV);
+ return NewBV;
+ }
+ }
+
+ // There are various patterns used to build up a vector from smaller vectors,
+ // subvectors, or elements. Scan chains of these and replace unused insertions
+ // or components with undef.
+ if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
+ return S;
+
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
+ Level < AfterLegalizeVectorOps &&
+ (N1.isUndef() ||
+ (N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N0.getOperand(0).getValueType() == N1.getOperand(0).getValueType()))) {
+ if (SDValue V = partitionShuffleOfConcats(N, DAG))
+ return V;
+ }
+
+ // Attempt to combine a shuffle of 2 inputs of 'scalar sources' -
+ // BUILD_VECTOR or SCALAR_TO_VECTOR into a single BUILD_VECTOR.
+ if (Level < AfterLegalizeVectorOps && TLI.isTypeLegal(VT)) {
+ SmallVector<SDValue, 8> Ops;
+ for (int M : SVN->getMask()) {
+ SDValue Op = DAG.getUNDEF(VT.getScalarType());
+ if (M >= 0) {
+ int Idx = M % NumElts;
+ SDValue &S = (M < (int)NumElts ? N0 : N1);
+ if (S.getOpcode() == ISD::BUILD_VECTOR && S.hasOneUse()) {
+ Op = S.getOperand(Idx);
+ } else if (S.getOpcode() == ISD::SCALAR_TO_VECTOR && S.hasOneUse()) {
+ if (Idx == 0)
+ Op = S.getOperand(0);
+ } else {
+ // Operand can't be combined - bail out.
+ break;
+ }
+ }
+ Ops.push_back(Op);
+ }
+ if (Ops.size() == VT.getVectorNumElements()) {
+ // BUILD_VECTOR requires all inputs to be of the same type, find the
+ // maximum type and extend them all.
+ EVT SVT = VT.getScalarType();
+ if (SVT.isInteger())
+ for (SDValue &Op : Ops)
+ SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+ if (SVT != VT.getScalarType())
+ for (SDValue &Op : Ops)
+ Op = TLI.isZExtFree(Op.getValueType(), SVT)
+ ? DAG.getZExtOrTrunc(Op, SDLoc(N), SVT)
+ : DAG.getSExtOrTrunc(Op, SDLoc(N), SVT);
+ return DAG.getBuildVector(VT, SDLoc(N), Ops);
+ }
+ }
+
+ // If this shuffle only has a single input that is a bitcasted shuffle,
+ // attempt to merge the 2 shuffles and suitably bitcast the inputs/output
+ // back to their original types.
+ if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
+ N1.isUndef() && Level < AfterLegalizeVectorOps &&
+ TLI.isTypeLegal(VT)) {
+
+ // Peek through the bitcast only if there is one user.
+ SDValue BC0 = N0;
+ while (BC0.getOpcode() == ISD::BITCAST) {
+ if (!BC0.hasOneUse())
+ break;
+ BC0 = BC0.getOperand(0);
+ }
+
+ auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
+ if (Scale == 1)
+ return SmallVector<int, 8>(Mask.begin(), Mask.end());
+
+ SmallVector<int, 8> NewMask;
+ for (int M : Mask)
+ for (int s = 0; s != Scale; ++s)
+ NewMask.push_back(M < 0 ? -1 : Scale * M + s);
+ return NewMask;
+ };
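+    // ScaleShuffleMask rewrites a mask in terms of a narrower scalar type;
+    // e.g. scaling <1, -1> by 2 yields <2, 3, -1, -1>, since each wide
+    // element covers Scale consecutive narrow elements.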
+
+ if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
+ EVT SVT = VT.getScalarType();
+ EVT InnerVT = BC0->getValueType(0);
+ EVT InnerSVT = InnerVT.getScalarType();
+
+ // Determine which shuffle works with the smaller scalar type.
+ EVT ScaleVT = SVT.bitsLT(InnerSVT) ? VT : InnerVT;
+ EVT ScaleSVT = ScaleVT.getScalarType();
+
+ if (TLI.isTypeLegal(ScaleVT) &&
+ 0 == (InnerSVT.getSizeInBits() % ScaleSVT.getSizeInBits()) &&
+ 0 == (SVT.getSizeInBits() % ScaleSVT.getSizeInBits())) {
+
+ int InnerScale = InnerSVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+ int OuterScale = SVT.getSizeInBits() / ScaleSVT.getSizeInBits();
+
+ // Scale the shuffle masks to the smaller scalar type.
+ ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
+ SmallVector<int, 8> InnerMask =
+ ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
+ SmallVector<int, 8> OuterMask =
+ ScaleShuffleMask(SVN->getMask(), OuterScale);
+
+ // Merge the shuffle masks.
+ SmallVector<int, 8> NewMask;
+ for (int M : OuterMask)
+ NewMask.push_back(M < 0 ? -1 : InnerMask[M]);
+
+ // Test for shuffle mask legality over both commutations.
+ SDValue SV0 = BC0->getOperand(0);
+ SDValue SV1 = BC0->getOperand(1);
+ bool LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+ if (!LegalMask) {
+ std::swap(SV0, SV1);
+ ShuffleVectorSDNode::commuteMask(NewMask);
+ LegalMask = TLI.isShuffleMaskLegal(NewMask, ScaleVT);
+ }
+
+ if (LegalMask) {
+ SV0 = DAG.getBitcast(ScaleVT, SV0);
+ SV1 = DAG.getBitcast(ScaleVT, SV1);
+ return DAG.getBitcast(
+ VT, DAG.getVectorShuffle(ScaleVT, SDLoc(N), SV0, SV1, NewMask));
+ }
+ }
+ }
+ }
+
+ // Canonicalize shuffles according to rules:
+ // shuffle(A, shuffle(A, B)) -> shuffle(shuffle(A,B), A)
+ // shuffle(B, shuffle(A, B)) -> shuffle(shuffle(A,B), B)
+ // shuffle(B, shuffle(A, Undef)) -> shuffle(shuffle(A, Undef), B)
+ if (N1.getOpcode() == ISD::VECTOR_SHUFFLE &&
+ N0.getOpcode() != ISD::VECTOR_SHUFFLE && Level < AfterLegalizeDAG &&
+ TLI.isTypeLegal(VT)) {
+ // The incoming shuffle must be of the same type as the result of the
+ // current shuffle.
+ assert(N1->getOperand(0).getValueType() == VT &&
+ "Shuffle types don't match");
+
+ SDValue SV0 = N1->getOperand(0);
+ SDValue SV1 = N1->getOperand(1);
+ bool HasSameOp0 = N0 == SV0;
+ bool IsSV1Undef = SV1.isUndef();
+ if (HasSameOp0 || IsSV1Undef || N0 == SV1)
+      // Commute the operands of this shuffle so that the next rule
+ // will trigger.
+ return DAG.getCommutedVectorShuffle(*SVN);
+ }
+
+ // Try to fold according to rules:
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+ // Don't try to fold shuffles with illegal type.
+ // Only fold if this shuffle is the only user of the other shuffle.
+ if (N0.getOpcode() == ISD::VECTOR_SHUFFLE && N->isOnlyUserOf(N0.getNode()) &&
+ Level < AfterLegalizeDAG && TLI.isTypeLegal(VT)) {
+ ShuffleVectorSDNode *OtherSV = cast<ShuffleVectorSDNode>(N0);
+
+ // The incoming shuffle must be of the same type as the result of the
+ // current shuffle.
+ assert(OtherSV->getOperand(0).getValueType() == VT &&
+ "Shuffle types don't match");
+
+ SDValue SV0, SV1;
+ SmallVector<int, 4> Mask;
+ // Compute the combined shuffle mask for a shuffle with SV0 as the first
+ // operand, and SV1 as the second operand.
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (Idx < 0) {
+ // Propagate Undef.
+ Mask.push_back(Idx);
+ continue;
+ }
+
+ SDValue CurrentVec;
+ if (Idx < (int)NumElts) {
+        // This shuffle index refers to the inner shuffle N0. Look up the inner
+ // shuffle mask to identify which vector is actually referenced.
+ Idx = OtherSV->getMaskElt(Idx);
+ if (Idx < 0) {
+ // Propagate Undef.
+ Mask.push_back(Idx);
+ continue;
+ }
+
+ CurrentVec = (Idx < (int) NumElts) ? OtherSV->getOperand(0)
+ : OtherSV->getOperand(1);
+ } else {
+ // This shuffle index references an element within N1.
+ CurrentVec = N1;
+ }
+
+ // Simple case where 'CurrentVec' is UNDEF.
+ if (CurrentVec.isUndef()) {
+ Mask.push_back(-1);
+ continue;
+ }
+
+ // Canonicalize the shuffle index. We don't know yet if CurrentVec
+ // will be the first or second operand of the combined shuffle.
+ Idx = Idx % NumElts;
+ if (!SV0.getNode() || SV0 == CurrentVec) {
+ // Ok. CurrentVec is the left hand side.
+ // Update the mask accordingly.
+ SV0 = CurrentVec;
+ Mask.push_back(Idx);
+ continue;
+ }
+
+ // Bail out if we cannot convert the shuffle pair into a single shuffle.
+ if (SV1.getNode() && SV1 != CurrentVec)
+ return SDValue();
+
+ // Ok. CurrentVec is the right hand side.
+ // Update the mask accordingly.
+ SV1 = CurrentVec;
+ Mask.push_back(Idx + NumElts);
+ }
+
+    // Check if all indices in Mask are Undef. If so, propagate Undef.
+ bool isUndefMask = true;
+ for (unsigned i = 0; i != NumElts && isUndefMask; ++i)
+ isUndefMask &= Mask[i] < 0;
+
+ if (isUndefMask)
+ return DAG.getUNDEF(VT);
+
+ if (!SV0.getNode())
+ SV0 = DAG.getUNDEF(VT);
+ if (!SV1.getNode())
+ SV1 = DAG.getUNDEF(VT);
+
+ // Avoid introducing shuffles with illegal mask.
+ if (!TLI.isShuffleMaskLegal(Mask, VT)) {
+ ShuffleVectorSDNode::commuteMask(Mask);
+
+ if (!TLI.isShuffleMaskLegal(Mask, VT))
+ return SDValue();
+
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, A, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(C, B, M2)
+ std::swap(SV0, SV1);
+ }
+
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, B, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(A, C, M2)
+ // shuffle(shuffle(A, B, M0), C, M1) -> shuffle(B, C, M2)
+ return DAG.getVectorShuffle(VT, SDLoc(N), SV0, SV1, Mask);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
+ SDValue InVal = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
+ // with a VECTOR_SHUFFLE.
+ if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ SDValue InVec = InVal->getOperand(0);
+ SDValue EltNo = InVal->getOperand(1);
+
+ // FIXME: We could support implicit truncation if the shuffle can be
+ // scaled to a smaller vector scalar type.
+ ConstantSDNode *C0 = dyn_cast<ConstantSDNode>(EltNo);
+ if (C0 && VT == InVec.getValueType() &&
+ VT.getScalarType() == InVal.getValueType()) {
+ SmallVector<int, 8> NewMask(VT.getVectorNumElements(), -1);
+ int Elt = C0->getZExtValue();
+ NewMask[0] = Elt;
+
+ if (TLI.isShuffleMaskLegal(NewMask, VT))
+ return DAG.getVectorShuffle(VT, SDLoc(N), InVec, DAG.getUNDEF(VT),
+ NewMask);
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+
+ if (N0.getValueType() != N1.getValueType())
+ return SDValue();
+
+ // If the input vector is a concatenation, and the insert replaces
+ // one of the halves, we can optimize into a single concat_vectors.
+ if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
+ N2.getOpcode() == ISD::Constant) {
+ APInt InsIdx = cast<ConstantSDNode>(N2)->getAPIntValue();
+ EVT VT = N->getValueType(0);
+
+ // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
+ // (concat_vectors Z, Y)
+ if (InsIdx == 0)
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
+ N0.getOperand(1));
+
+ // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
+ // (concat_vectors X, Z)
+ if (InsIdx == VT.getVectorNumElements() / 2)
+ return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0),
+ N1);
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP_TO_FP16(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold (fp_to_fp16 (fp16_to_fp op)) -> op
+ if (N0->getOpcode() == ISD::FP16_TO_FP)
+ return N0->getOperand(0);
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::visitFP16_TO_FP(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // fold fp16_to_fp(op & 0xffff) -> fp16_to_fp(op)
+ if (N0->getOpcode() == ISD::AND) {
+ ConstantSDNode *AndConst = getAsNonOpaqueConstant(N0.getOperand(1));
+ if (AndConst && AndConst->getAPIntValue() == 0xffff) {
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0),
+ N0.getOperand(0));
+ }
+ }
+
+ return SDValue();
+}
+
+/// Returns a vector_shuffle if it is able to transform an AND into a
+/// vector_shuffle with the destination vector and a zero vector.
+/// e.g. AND V, <0xffffffff, 0, 0xffffffff, 0>. ==>
+/// vector_shuffle V, Zero, <0, 4, 2, 4>
+SDValue DAGCombiner::XformToShuffleWithZero(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc dl(N);
+
+ // Make sure we're not running after operation legalization where it
+ // may have custom lowered the vector shuffles.
+ if (LegalOperations)
+ return SDValue();
+
+ if (N->getOpcode() != ISD::AND)
+ return SDValue();
+
+ if (RHS.getOpcode() == ISD::BITCAST)
+ RHS = RHS.getOperand(0);
+
+ if (RHS.getOpcode() != ISD::BUILD_VECTOR)
+ return SDValue();
+
+ EVT RVT = RHS.getValueType();
+ unsigned NumElts = RHS.getNumOperands();
+
+ // Attempt to create a valid clear mask, splitting the mask into
+ // sub elements and checking to see if each is
+ // all zeros or all ones - suitable for shuffle masking.
+ auto BuildClearMask = [&](int Split) {
+ int NumSubElts = NumElts * Split;
+ int NumSubBits = RVT.getScalarSizeInBits() / Split;
+
+ SmallVector<int, 8> Indices;
+ for (int i = 0; i != NumSubElts; ++i) {
+ int EltIdx = i / Split;
+ int SubIdx = i % Split;
+ SDValue Elt = RHS.getOperand(EltIdx);
+ if (Elt.isUndef()) {
+ Indices.push_back(-1);
+ continue;
+ }
+
+ APInt Bits;
+ if (isa<ConstantSDNode>(Elt))
+ Bits = cast<ConstantSDNode>(Elt)->getAPIntValue();
+ else if (isa<ConstantFPSDNode>(Elt))
+ Bits = cast<ConstantFPSDNode>(Elt)->getValueAPF().bitcastToAPInt();
+ else
+ return SDValue();
+
+ // Extract the sub element from the constant bit mask.
+ if (DAG.getDataLayout().isBigEndian()) {
+ Bits = Bits.lshr((Split - SubIdx - 1) * NumSubBits);
+ } else {
+ Bits = Bits.lshr(SubIdx * NumSubBits);
+ }
+
+ if (Split > 1)
+ Bits = Bits.trunc(NumSubBits);
+
+ if (Bits.isAllOnesValue())
+ Indices.push_back(i);
+ else if (Bits == 0)
+ Indices.push_back(i + NumSubElts);
+ else
+ return SDValue();
+ }
+
+ // Let's see if the target supports this vector_shuffle.
+ EVT ClearSVT = EVT::getIntegerVT(*DAG.getContext(), NumSubBits);
+ EVT ClearVT = EVT::getVectorVT(*DAG.getContext(), ClearSVT, NumSubElts);
+ if (!TLI.isVectorClearMaskLegal(Indices, ClearVT))
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, dl, ClearVT);
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(ClearVT, dl,
+ DAG.getBitcast(ClearVT, LHS),
+ Zero, Indices));
+ };
+
+ // Determine maximum split level (byte level masking).
+ int MaxSplit = 1;
+ if (RVT.getScalarSizeInBits() % 8 == 0)
+ MaxSplit = RVT.getScalarSizeInBits() / 8;
+
+ for (int Split = 1; Split <= MaxSplit; ++Split)
+ if (RVT.getScalarSizeInBits() % Split == 0)
+ if (SDValue S = BuildClearMask(Split))
+ return S;
+
+ return SDValue();
+}
+
+/// Visit a binary vector operation, like ADD.
+SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ "SimplifyVBinOp only works on vectors!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Ops[] = {LHS, RHS};
+
+ // See if we can constant fold the vector operation.
+ if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
+ N->getOpcode(), SDLoc(LHS), LHS.getValueType(), Ops, N->getFlags()))
+ return Fold;
+
+ // Try to convert a constant mask AND into a shuffle clear mask.
+ if (SDValue Shuffle = XformToShuffleWithZero(N))
+ return Shuffle;
+
+ // Type legalization might introduce new shuffles in the DAG.
+ // Fold (VBinOp (shuffle (A, Undef, Mask)), (shuffle (B, Undef, Mask)))
+ // -> (shuffle (VBinOp (A, B)), Undef, Mask).
+ if (LegalTypes && isa<ShuffleVectorSDNode>(LHS) &&
+ isa<ShuffleVectorSDNode>(RHS) && LHS.hasOneUse() && RHS.hasOneUse() &&
+ LHS.getOperand(1).isUndef() &&
+ RHS.getOperand(1).isUndef()) {
+ ShuffleVectorSDNode *SVN0 = cast<ShuffleVectorSDNode>(LHS);
+ ShuffleVectorSDNode *SVN1 = cast<ShuffleVectorSDNode>(RHS);
+
+ if (SVN0->getMask().equals(SVN1->getMask())) {
+ EVT VT = N->getValueType(0);
+ SDValue UndefVector = LHS.getOperand(1);
+ SDValue NewBinOp = DAG.getNode(N->getOpcode(), SDLoc(N), VT,
+ LHS.getOperand(0), RHS.getOperand(0),
+ N->getFlags());
+ AddUsersToWorklist(N);
+ return DAG.getVectorShuffle(VT, SDLoc(N), NewBinOp, UndefVector,
+ SVN0->getMask());
+ }
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2) {
+  assert(N0.getOpcode() == ISD::SETCC &&
+         "First argument must be a SetCC node!");
+
+ SDValue SCC = SimplifySelectCC(DL, N0.getOperand(0), N0.getOperand(1), N1, N2,
+ cast<CondCodeSDNode>(N0.getOperand(2))->get());
+
+ // If we got a simplified select_cc node back from SimplifySelectCC, then
+ // break it down into a new SETCC node, and a new SELECT node, and then return
+ // the SELECT node, since we were called with a SELECT node.
+ if (SCC.getNode()) {
+ // Check to see if we got a select_cc back (to turn into setcc/select).
+ // Otherwise, just return whatever node we got back, like fabs.
+ if (SCC.getOpcode() == ISD::SELECT_CC) {
+ SDValue SETCC = DAG.getNode(ISD::SETCC, SDLoc(N0),
+ N0.getValueType(),
+ SCC.getOperand(0), SCC.getOperand(1),
+ SCC.getOperand(4));
+ AddToWorklist(SETCC.getNode());
+ return DAG.getSelect(SDLoc(SCC), SCC.getValueType(), SETCC,
+ SCC.getOperand(2), SCC.getOperand(3));
+ }
+
+ return SCC;
+ }
+ return SDValue();
+}
+
+/// Given a SELECT or a SELECT_CC node, where LHS and RHS are the two values
+/// being selected between, see if we can simplify the select. Callers of this
+/// should assume that TheSelect is deleted if this returns true. As such, they
+/// should return the appropriate thing (e.g. the node) back to the top-level of
+/// the DAG combiner loop to avoid it being looked at.
+bool DAGCombiner::SimplifySelectOps(SDNode *TheSelect, SDValue LHS,
+ SDValue RHS) {
+
+ // fold (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
+ // The select + setcc is redundant, because fsqrt returns NaN for X < 0.
+ if (const ConstantFPSDNode *NaN = isConstOrConstSplatFP(LHS)) {
+ if (NaN->isNaN() && RHS.getOpcode() == ISD::FSQRT) {
+ // We have: (select (setcc ?, ?, ?), NaN, (fsqrt ?))
+ SDValue Sqrt = RHS;
+ ISD::CondCode CC;
+ SDValue CmpLHS;
+ const ConstantFPSDNode *Zero = nullptr;
+
+ if (TheSelect->getOpcode() == ISD::SELECT_CC) {
+ CC = dyn_cast<CondCodeSDNode>(TheSelect->getOperand(4))->get();
+ CmpLHS = TheSelect->getOperand(0);
+ Zero = isConstOrConstSplatFP(TheSelect->getOperand(1));
+ } else {
+ // SELECT or VSELECT
+ SDValue Cmp = TheSelect->getOperand(0);
+ if (Cmp.getOpcode() == ISD::SETCC) {
+ CC = dyn_cast<CondCodeSDNode>(Cmp.getOperand(2))->get();
+ CmpLHS = Cmp.getOperand(0);
+ Zero = isConstOrConstSplatFP(Cmp.getOperand(1));
+ }
+ }
+ if (Zero && Zero->isZero() &&
+ Sqrt.getOperand(0) == CmpLHS && (CC == ISD::SETOLT ||
+ CC == ISD::SETULT || CC == ISD::SETLT)) {
+ // We have: (select (setcc x, [+-]0.0, *lt), NaN, (fsqrt x))
+ CombineTo(TheSelect, Sqrt);
+ return true;
+ }
+ }
+ }
+ // Cannot simplify select with vector condition
+ if (TheSelect->getOperand(0).getValueType().isVector()) return false;
+
+ // If this is a select from two identical things, try to pull the operation
+ // through the select.
+ if (LHS.getOpcode() != RHS.getOpcode() ||
+ !LHS.hasOneUse() || !RHS.hasOneUse())
+ return false;
+
+ // If this is a load and the token chain is identical, replace the select
+ // of two loads with a load through a select of the address to load from.
+ // This triggers in things like "select bool X, 10.0, 123.0" after the FP
+ // constants have been dropped into the constant pool.
+ if (LHS.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LLD = cast<LoadSDNode>(LHS);
+ LoadSDNode *RLD = cast<LoadSDNode>(RHS);
+
+ // Token chains must be identical.
+ if (LHS.getOperand(0) != RHS.getOperand(0) ||
+ // Do not let this transformation reduce the number of volatile loads.
+ LLD->isVolatile() || RLD->isVolatile() ||
+ // FIXME: If either is a pre/post inc/dec load,
+ // we'd need to split out the address adjustment.
+ LLD->isIndexed() || RLD->isIndexed() ||
+ // If this is an EXTLOAD, the VT's must match.
+ LLD->getMemoryVT() != RLD->getMemoryVT() ||
+ // If this is an EXTLOAD, the kind of extension must match.
+ (LLD->getExtensionType() != RLD->getExtensionType() &&
+ // The only exception is if one of the extensions is anyext.
+ LLD->getExtensionType() != ISD::EXTLOAD &&
+ RLD->getExtensionType() != ISD::EXTLOAD) ||
+ // FIXME: this discards src value information. This is
+ // over-conservative. It would be beneficial to be able to remember
+ // both potential memory locations. Since we are discarding
+ // src value info, don't do the transformation if the memory
+ // locations are not in the default address space.
+ LLD->getPointerInfo().getAddrSpace() != 0 ||
+ RLD->getPointerInfo().getAddrSpace() != 0 ||
+ !TLI.isOperationLegalOrCustom(TheSelect->getOpcode(),
+ LLD->getBasePtr().getValueType()))
+ return false;
+
+ // Check that the select condition doesn't reach either load. If so,
+ // folding this will induce a cycle into the DAG. If not, this is safe to
+ // xform, so create a select of the addresses.
+ SDValue Addr;
+ if (TheSelect->getOpcode() == ISD::SELECT) {
+ SDNode *CondNode = TheSelect->getOperand(0).getNode();
+ if ((LLD->hasAnyUseOfValue(1) && LLD->isPredecessorOf(CondNode)) ||
+ (RLD->hasAnyUseOfValue(1) && RLD->isPredecessorOf(CondNode)))
+ return false;
+ // The loads must not depend on one another.
+ if (LLD->isPredecessorOf(RLD) ||
+ RLD->isPredecessorOf(LLD))
+ return false;
+ Addr = DAG.getSelect(SDLoc(TheSelect),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0), LLD->getBasePtr(),
+ RLD->getBasePtr());
+ } else { // Otherwise SELECT_CC
+ SDNode *CondLHS = TheSelect->getOperand(0).getNode();
+ SDNode *CondRHS = TheSelect->getOperand(1).getNode();
+
+ if ((LLD->hasAnyUseOfValue(1) &&
+ (LLD->isPredecessorOf(CondLHS) || LLD->isPredecessorOf(CondRHS))) ||
+ (RLD->hasAnyUseOfValue(1) &&
+ (RLD->isPredecessorOf(CondLHS) || RLD->isPredecessorOf(CondRHS))))
+ return false;
+
+ Addr = DAG.getNode(ISD::SELECT_CC, SDLoc(TheSelect),
+ LLD->getBasePtr().getValueType(),
+ TheSelect->getOperand(0),
+ TheSelect->getOperand(1),
+ LLD->getBasePtr(), RLD->getBasePtr(),
+ TheSelect->getOperand(4));
+ }
+
+ SDValue Load;
+    // It is safe to replace the two loads if they have different alignments,
+    // but the new load must use the minimum (most restrictive) alignment of
+    // the two inputs.
+ unsigned Alignment = std::min(LLD->getAlignment(), RLD->getAlignment());
+ MachineMemOperand::Flags MMOFlags = LLD->getMemOperand()->getFlags();
+ if (!RLD->isInvariant())
+ MMOFlags &= ~MachineMemOperand::MOInvariant;
+ if (LLD->getExtensionType() == ISD::NON_EXTLOAD) {
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getLoad(TheSelect->getValueType(0), SDLoc(TheSelect),
+ LLD->getChain(), Addr, MachinePointerInfo(), Alignment,
+ MMOFlags);
+ } else {
+ // FIXME: Discards pointer and AA info.
+ Load = DAG.getExtLoad(
+ LLD->getExtensionType() == ISD::EXTLOAD ? RLD->getExtensionType()
+ : LLD->getExtensionType(),
+ SDLoc(TheSelect), TheSelect->getValueType(0), LLD->getChain(), Addr,
+ MachinePointerInfo(), LLD->getMemoryVT(), Alignment, MMOFlags);
+ }
+
+ // Users of the select now use the result of the load.
+ CombineTo(TheSelect, Load);
+
+ // Users of the old loads now use the new load's chain. We know the
+ // old-load value is dead now.
+ CombineTo(LHS.getNode(), Load.getValue(0), Load.getValue(1));
+ CombineTo(RHS.getNode(), Load.getValue(0), Load.getValue(1));
+ return true;
+ }
+
+ return false;
+}
+
+/// Simplify an expression of the form (N0 cond N1) ? N2 : N3
+/// where 'cond' is the comparison specified by CC.
+SDValue DAGCombiner::SimplifySelectCC(const SDLoc &DL, SDValue N0, SDValue N1,
+ SDValue N2, SDValue N3, ISD::CondCode CC,
+ bool NotExtCompare) {
+ // (x ? y : y) -> y.
+ if (N2 == N3) return N2;
+
+ EVT VT = N2.getValueType();
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1.getNode());
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2.getNode());
+
+ // Determine if the condition we're dealing with is constant
+ SDValue SCC = SimplifySetCC(getSetCCResultType(N0.getValueType()),
+ N0, N1, CC, DL, false);
+ if (SCC.getNode()) AddToWorklist(SCC.getNode());
+
+ if (ConstantSDNode *SCCC = dyn_cast_or_null<ConstantSDNode>(SCC.getNode())) {
+ // fold select_cc true, x, y -> x
+ // fold select_cc false, x, y -> y
+ return !SCCC->isNullValue() ? N2 : N3;
+ }
+
+ // Check to see if we can simplify the select into an fabs node
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N1)) {
+ // Allow either -0.0 or 0.0
+ if (CFP->isZero()) {
+ // select (setg[te] X, +/-0.0), X, fneg(X) -> fabs
+ if ((CC == ISD::SETGE || CC == ISD::SETGT) &&
+ N0 == N2 && N3.getOpcode() == ISD::FNEG &&
+ N2 == N3.getOperand(0))
+ return DAG.getNode(ISD::FABS, DL, VT, N0);
+
+ // select (setl[te] X, +/-0.0), fneg(X), X -> fabs
+ if ((CC == ISD::SETLT || CC == ISD::SETLE) &&
+ N0 == N3 && N2.getOpcode() == ISD::FNEG &&
+ N2.getOperand(0) == N3)
+ return DAG.getNode(ISD::FABS, DL, VT, N3);
+ }
+ }
+
+  // Turn "(a cond b) ? 1.0f : 2.0f" into "load (tmp + ((a cond b) ? 0 : 4))"
+  // where "tmp" is a constant pool entry containing an array with 1.0 and 2.0
+  // in it. This is a win when the constant is not otherwise available because
+  // it replaces two constant pool loads with one. We only do this if the FP
+  // type is known to be legal, because if it isn't, then we are before
+  // legalize types and we want the other legalization to happen first (e.g.
+  // to avoid messing with soft float), and only if the ConstantFP is not
+  // legal, because if it is legal, we may not need to store the FP constant
+  // in a constant pool at all.
+ if (ConstantFPSDNode *TV = dyn_cast<ConstantFPSDNode>(N2))
+ if (ConstantFPSDNode *FV = dyn_cast<ConstantFPSDNode>(N3)) {
+ if (TLI.isTypeLegal(N2.getValueType()) &&
+ (TLI.getOperationAction(ISD::ConstantFP, N2.getValueType()) !=
+ TargetLowering::Legal &&
+ !TLI.isFPImmLegal(TV->getValueAPF(), TV->getValueType(0)) &&
+ !TLI.isFPImmLegal(FV->getValueAPF(), FV->getValueType(0))) &&
+ // If both constants have multiple uses, then we won't need to do an
+ // extra load, they are likely around in registers for other users.
+ (TV->hasOneUse() || FV->hasOneUse())) {
+ Constant *Elts[] = {
+ const_cast<ConstantFP*>(FV->getConstantFPValue()),
+ const_cast<ConstantFP*>(TV->getConstantFPValue())
+ };
+ Type *FPTy = Elts[0]->getType();
+ const DataLayout &TD = DAG.getDataLayout();
+
+ // Create a ConstantArray of the two constants.
+ Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
+ SDValue CPIdx =
+ DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
+ TD.getPrefTypeAlignment(FPTy));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+
+ // Get the offsets to the 0 and 1 element of the array so that we can
+ // select between them.
+ SDValue Zero = DAG.getIntPtrConstant(0, DL);
+ unsigned EltSize = (unsigned)TD.getTypeAllocSize(Elts[0]->getType());
+ SDValue One = DAG.getIntPtrConstant(EltSize, SDLoc(FV));
+
+ SDValue Cond = DAG.getSetCC(DL,
+ getSetCCResultType(N0.getValueType()),
+ N0, N1, CC);
+ AddToWorklist(Cond.getNode());
+ SDValue CstOffset = DAG.getSelect(DL, Zero.getValueType(),
+ Cond, One, Zero);
+ AddToWorklist(CstOffset.getNode());
+ CPIdx = DAG.getNode(ISD::ADD, DL, CPIdx.getValueType(), CPIdx,
+ CstOffset);
+ AddToWorklist(CPIdx.getNode());
+ return DAG.getLoad(
+ TV->getValueType(0), DL, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ }
+ }
+
+  // Check to see if we can perform the "gzip trick", transforming
+  // (select_cc setlt X, 0, A, 0) -> (and (sra X, size(X)-1), A)
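+  // e.g. for i32 values the generic form is (and (sra X, 31), A): the
+  // arithmetic shift is all-ones exactly when X is negative, so the AND
+  // yields A for negative X and 0 otherwise, with no branch or select.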
+ if (isNullConstant(N3) && CC == ISD::SETLT &&
+ (isNullConstant(N1) || // (a < 0) ? b : 0
+ (isOneConstant(N1) && N0 == N2))) { // (a < 1) ? a : 0
+ EVT XType = N0.getValueType();
+ EVT AType = N2.getValueType();
+ if (XType.bitsGE(AType)) {
+      // (and (sra X, size(X)-1), A) -> (and (srl X, C2), A) iff A is a
+      // single-bit constant.
+ if (N2C && ((N2C->getAPIntValue() & (N2C->getAPIntValue() - 1)) == 0)) {
+ unsigned ShCtV = N2C->getAPIntValue().logBase2();
+ ShCtV = XType.getSizeInBits() - ShCtV - 1;
+ SDValue ShCt = DAG.getConstant(ShCtV, SDLoc(N0),
+ getShiftAmountTy(N0.getValueType()));
+ SDValue Shift = DAG.getNode(ISD::SRL, SDLoc(N0),
+ XType, N0, ShCt);
+ AddToWorklist(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorklist(Shift.getNode());
+ }
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+ }
+
+ SDValue Shift = DAG.getNode(ISD::SRA, SDLoc(N0),
+ XType, N0,
+ DAG.getConstant(XType.getSizeInBits() - 1,
+ SDLoc(N0),
+ getShiftAmountTy(N0.getValueType())));
+ AddToWorklist(Shift.getNode());
+
+ if (XType.bitsGT(AType)) {
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, AType, Shift);
+ AddToWorklist(Shift.getNode());
+ }
+
+ return DAG.getNode(ISD::AND, DL, AType, Shift, N2);
+ }
+ }
+
+  // fold (select_cc seteq (and x, y), 0, 0, A) -> (and (sra (shl x)), A)
+  // where y has a single bit set.
+  // A plaintext description would be: we can turn the SELECT_CC into an AND
+  // when the condition can be materialized as an all-ones register. Any
+  // single bit-test can be materialized as an all-ones register with
+  // shift-left and shift-right-arith.
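+  // As an illustration, for i32 with y == 0x10:
+  //   (select_cc seteq (and x, 0x10), 0, 0, A)
+  //   --> (and (sra (shl x, 27), 31), A)
+  // since bit 4 of x is moved to the sign bit and then replicated across the
+  // whole register.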
+ if (CC == ISD::SETEQ && N0->getOpcode() == ISD::AND &&
+ N0->getValueType(0) == VT && isNullConstant(N1) && isNullConstant(N2)) {
+ SDValue AndLHS = N0->getOperand(0);
+ ConstantSDNode *ConstAndRHS = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (ConstAndRHS && ConstAndRHS->getAPIntValue().countPopulation() == 1) {
+ // Shift the tested bit over the sign bit.
+ const APInt &AndMask = ConstAndRHS->getAPIntValue();
+ SDValue ShlAmt =
+ DAG.getConstant(AndMask.countLeadingZeros(), SDLoc(AndLHS),
+ getShiftAmountTy(AndLHS.getValueType()));
+ SDValue Shl = DAG.getNode(ISD::SHL, SDLoc(N0), VT, AndLHS, ShlAmt);
+
+ // Now arithmetic right shift it all the way over, so the result is either
+ // all-ones, or zero.
+ SDValue ShrAmt =
+ DAG.getConstant(AndMask.getBitWidth() - 1, SDLoc(Shl),
+ getShiftAmountTy(Shl.getValueType()));
+ SDValue Shr = DAG.getNode(ISD::SRA, SDLoc(N0), VT, Shl, ShrAmt);
+
+ return DAG.getNode(ISD::AND, DL, VT, Shr, N3);
+ }
+ }
+
+ // fold select C, 16, 0 -> shl C, 4
+ if (N2C && isNullConstant(N3) && N2C->getAPIntValue().isPowerOf2() &&
+ TLI.getBooleanContents(N0.getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent) {
+
+ // If the caller doesn't want us to simplify this into a zext of a compare,
+ // don't do it.
+ if (NotExtCompare && N2C->isOne())
+ return SDValue();
+
+ // Get a SetCC of the condition
+ // NOTE: Don't create a SETCC if it's not legal on this target.
+ if (!LegalOperations ||
+ TLI.isOperationLegal(ISD::SETCC, N0.getValueType())) {
+ SDValue Temp, SCC;
+ // cast from setcc result type to select result type
+ if (LegalTypes) {
+ SCC = DAG.getSetCC(DL, getSetCCResultType(N0.getValueType()),
+ N0, N1, CC);
+ if (N2.getValueType().bitsLT(SCC.getValueType()))
+ Temp = DAG.getZeroExtendInReg(SCC, SDLoc(N2),
+ N2.getValueType());
+ else
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
+ N2.getValueType(), SCC);
+ } else {
+ SCC = DAG.getSetCC(SDLoc(N0), MVT::i1, N0, N1, CC);
+ Temp = DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N2),
+ N2.getValueType(), SCC);
+ }
+
+ AddToWorklist(SCC.getNode());
+ AddToWorklist(Temp.getNode());
+
+ if (N2C->isOne())
+ return Temp;
+
+ // shl setcc result by log2 n2c
+ return DAG.getNode(
+ ISD::SHL, DL, N2.getValueType(), Temp,
+ DAG.getConstant(N2C->getAPIntValue().logBase2(), SDLoc(Temp),
+ getShiftAmountTy(Temp.getValueType())));
+ }
+ }
+
+ // Check to see if this is an integer abs.
+ // select_cc setg[te] X, 0, X, -X ->
+ // select_cc setgt X, -1, X, -X ->
+ // select_cc setl[te] X, 0, -X, X ->
+ // select_cc setlt X, 1, -X, X ->
+ // Y = sra (X, size(X)-1); xor (add (X, Y), Y)
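+  // Illustrative example (assuming i8): for X = -5, Y = sra(X, 7) = -1,
+  // add(X, Y) = -6, and xor(-6, -1) = 5 = |X|; for non-negative X, Y = 0 and
+  // the add/xor leave X unchanged.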
+ if (N1C) {
+ ConstantSDNode *SubC = nullptr;
+ if (((N1C->isNullValue() && (CC == ISD::SETGT || CC == ISD::SETGE)) ||
+ (N1C->isAllOnesValue() && CC == ISD::SETGT)) &&
+ N0 == N2 && N3.getOpcode() == ISD::SUB && N0 == N3.getOperand(1))
+ SubC = dyn_cast<ConstantSDNode>(N3.getOperand(0));
+ else if (((N1C->isNullValue() && (CC == ISD::SETLT || CC == ISD::SETLE)) ||
+ (N1C->isOne() && CC == ISD::SETLT)) &&
+ N0 == N3 && N2.getOpcode() == ISD::SUB && N0 == N2.getOperand(1))
+ SubC = dyn_cast<ConstantSDNode>(N2.getOperand(0));
+
+ EVT XType = N0.getValueType();
+ if (SubC && SubC->isNullValue() && XType.isInteger()) {
+ SDLoc DL(N0);
+ SDValue Shift = DAG.getNode(ISD::SRA, DL, XType,
+ N0,
+ DAG.getConstant(XType.getSizeInBits() - 1, DL,
+ getShiftAmountTy(N0.getValueType())));
+ SDValue Add = DAG.getNode(ISD::ADD, DL,
+ XType, N0, Shift);
+ AddToWorklist(Shift.getNode());
+ AddToWorklist(Add.getNode());
+ return DAG.getNode(ISD::XOR, DL, XType, Add, Shift);
+ }
+ }
+
+ // select_cc seteq X, 0, sizeof(X), ctlz(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), ctlz_zero_undef(X) -> ctlz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz(X) -> cttz(X)
+ // select_cc seteq X, 0, sizeof(X), cttz_zero_undef(X) -> cttz(X)
+ // select_cc setne X, 0, ctlz(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, ctlz_zero_undef(X), sizeof(X) -> ctlz(X)
+ // select_cc setne X, 0, cttz(X), sizeof(X) -> cttz(X)
+ // select_cc setne X, 0, cttz_zero_undef(X), sizeof(X) -> cttz(X)
+ if (N1C && N1C->isNullValue() && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ SDValue ValueOnZero = N2;
+ SDValue Count = N3;
+    // If the condition is NE instead of EQ, swap the operands.
+ if (CC == ISD::SETNE)
+ std::swap(ValueOnZero, Count);
+    // Check if the value on zero is a constant equal to the type's bit width.
+ if (auto *ValueOnZeroC = dyn_cast<ConstantSDNode>(ValueOnZero)) {
+ if (ValueOnZeroC->getAPIntValue() == VT.getSizeInBits()) {
+ // If the other operand is cttz/cttz_zero_undef of N0, and cttz is
+ // legal, combine to just cttz.
+ if ((Count.getOpcode() == ISD::CTTZ ||
+ Count.getOpcode() == ISD::CTTZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTTZ, VT)))
+ return DAG.getNode(ISD::CTTZ, DL, VT, N0);
+ // If the other operand is ctlz/ctlz_zero_undef of N0, and ctlz is
+ // legal, combine to just ctlz.
+ if ((Count.getOpcode() == ISD::CTLZ ||
+ Count.getOpcode() == ISD::CTLZ_ZERO_UNDEF) &&
+ N0 == Count.getOperand(0) &&
+ (!LegalOperations || TLI.isOperationLegal(ISD::CTLZ, VT)))
+ return DAG.getNode(ISD::CTLZ, DL, VT, N0);
+ }
+ }
+ }
+
+ return SDValue();
+}
+
+/// This is a stub for TargetLowering::SimplifySetCC.
+SDValue DAGCombiner::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, const SDLoc &DL,
+ bool foldBooleans) {
+ TargetLowering::DAGCombinerInfo
+ DagCombineInfo(DAG, Level, false, this);
+ return TLI.SimplifySetCC(VT, N0, N1, Cond, foldBooleans, DagCombineInfo, DL);
+}
+
+/// Given an ISD::SDIV node expressing a divide by constant, return
+/// a DAG expression that will generate the same value by multiplying
+/// by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
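+/// For example (illustrative, assuming i32): a signed divide by 3 becomes a
+/// MULHS by 0x55555556 followed by adding the result's own logical shift
+/// right by 31 to fix up negative quotients.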
+SDValue DAGCombiner::BuildSDIV(SDNode *N) {
+  // When optimizing for minimum size, we don't want to expand a div to a mul
+  // and a shift.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ // Avoid division by zero.
+ if (C->isNullValue())
+ return SDValue();
+
+ std::vector<SDNode*> Built;
+ SDValue S =
+ TLI.BuildSDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
+
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+}
+
+/// Given an ISD::SDIV node expressing a divide by constant power of 2, return a
+/// DAG expression that will generate the same value by right shifting.
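+/// For example (illustrative, assuming i32): x sdiv 4 becomes
+/// (x + ((x >> 31) >>u 30)) >> 2, i.e. add 3 before the arithmetic shift
+/// when x is negative so the result rounds toward zero.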
+SDValue DAGCombiner::BuildSDIVPow2(SDNode *N) {
+ ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ // Avoid division by zero.
+ if (C->isNullValue())
+ return SDValue();
+
+ std::vector<SDNode *> Built;
+ SDValue S = TLI.BuildSDIVPow2(N, C->getAPIntValue(), DAG, &Built);
+
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+}
+
+/// Given an ISD::UDIV node expressing a divide by constant, return a DAG
+/// expression that will generate the same value by multiplying by a magic
+/// number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
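+/// For example (illustrative, assuming i32): an unsigned divide by 3 becomes
+/// a MULHU by 0xAAAAAAAB followed by a logical shift right by 1.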
+SDValue DAGCombiner::BuildUDIV(SDNode *N) {
+  // When optimizing for minimum size, we don't want to expand a div to a mul
+  // and a shift.
+ if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ return SDValue();
+
+ ConstantSDNode *C = isConstOrConstSplat(N->getOperand(1));
+ if (!C)
+ return SDValue();
+
+ // Avoid division by zero.
+ if (C->isNullValue())
+ return SDValue();
+
+ std::vector<SDNode*> Built;
+ SDValue S =
+ TLI.BuildUDIV(N, C->getAPIntValue(), DAG, LegalOperations, &Built);
+
+ for (SDNode *N : Built)
+ AddToWorklist(N);
+ return S;
+}
+
+SDValue DAGCombiner::BuildReciprocalEstimate(SDValue Op, SDNodeFlags *Flags) {
+ if (Level >= AfterLegalizeDAG)
+ return SDValue();
+
+ // Expose the DAG combiner to the target combiner implementations.
+ TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
+
+ unsigned Iterations = 0;
+ if (SDValue Est = TLI.getRecipEstimate(Op, DCI, Iterations)) {
+ if (Iterations) {
+      // Newton's iteration for a function F(X): X_{i+1} = X_i - F(X_i)/F'(X_i)
+ // For the reciprocal, we need to find the zero of the function:
+ // F(X) = A X - 1 [which has a zero at X = 1/A]
+ // =>
+ // X_{i+1} = X_i (2 - A X_i) = X_i + X_i (1 - A X_i) [this second form
+ // does not require additional intermediate precision]
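+      // Illustrative example: for A = 3 with an initial estimate of 0.3, one
+      // step gives 0.3 * (2 - 3 * 0.3) = 0.33 and the next gives 0.3333,
+      // converging quadratically towards 1/3.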
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ SDValue FPOne = DAG.getConstantFP(1.0, DL, VT);
+
+ AddToWorklist(Est.getNode());
+
+ // Newton iterations: Est = Est + Est (1 - Arg * Est)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Op, Est, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT, FPOne, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ Est = DAG.getNode(ISD::FADD, DL, VT, Est, NewEst, Flags);
+ AddToWorklist(Est.getNode());
+ }
+ }
+ return Est;
+ }
+
+ return SDValue();
+}
+
+/// Newton's iteration for a function F(X): X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = X_i (1.5 - A X_i^2 / 2)
+/// As a result, we precompute A/2 prior to the iteration loop.
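+/// For example (illustrative): with A = 4 and an initial estimate of 0.4, one
+/// step gives 0.4 * (1.5 - 2 * 0.16) = 0.472 and the next roughly 0.4977,
+/// approaching 1/sqrt(4) = 0.5.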
+SDValue DAGCombiner::buildSqrtNROneConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue ThreeHalves = DAG.getConstantFP(1.5, DL, VT);
+
+ // We now need 0.5 * Arg which we can write as (1.5 * Arg - Arg) so that
+ // this entire sequence requires only one FP constant.
+ SDValue HalfArg = DAG.getNode(ISD::FMUL, DL, VT, ThreeHalves, Arg, Flags);
+ AddToWorklist(HalfArg.getNode());
+
+ HalfArg = DAG.getNode(ISD::FSUB, DL, VT, HalfArg, Arg, Flags);
+ AddToWorklist(HalfArg.getNode());
+
+ // Newton iterations: Est = Est * (1.5 - HalfArg * Est * Est)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue NewEst = DAG.getNode(ISD::FMUL, DL, VT, Est, Est, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FMUL, DL, VT, HalfArg, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ NewEst = DAG.getNode(ISD::FSUB, DL, VT, ThreeHalves, NewEst, Flags);
+ AddToWorklist(NewEst.getNode());
+
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, NewEst, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
+ // If non-reciprocal square root is requested, multiply the result by Arg.
+ if (!Reciprocal) {
+ Est = DAG.getNode(ISD::FMUL, DL, VT, Est, Arg, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
+ return Est;
+}
+
+/// Newton's iteration for a function F(X): X_{i+1} = X_i - F(X_i)/F'(X_i)
+/// For the reciprocal sqrt, we need to find the zero of the function:
+/// F(X) = 1/X^2 - A [which has a zero at X = 1/sqrt(A)]
+/// =>
+/// X_{i+1} = (-0.5 * X_i) * (A * X_i * X_i + (-3.0))
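+/// Note that this is algebraically the same update as the one-constant form:
+/// (-0.5 * X) * (A * X * X - 3.0) = X * (1.5 - A * X * X / 2); only the
+/// constants and the association of the operations differ.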
+SDValue DAGCombiner::buildSqrtNRTwoConst(SDValue Arg, SDValue Est,
+ unsigned Iterations,
+ SDNodeFlags *Flags, bool Reciprocal) {
+ EVT VT = Arg.getValueType();
+ SDLoc DL(Arg);
+ SDValue MinusThree = DAG.getConstantFP(-3.0, DL, VT);
+ SDValue MinusHalf = DAG.getConstantFP(-0.5, DL, VT);
+
+ // This routine must enter the loop below to work correctly
+ // when (Reciprocal == false).
+ assert(Iterations > 0);
+
+ // Newton iterations for reciprocal square root:
+ // E = (E * -0.5) * ((A * E) * E + -3.0)
+ for (unsigned i = 0; i < Iterations; ++i) {
+ SDValue AE = DAG.getNode(ISD::FMUL, DL, VT, Arg, Est, Flags);
+ AddToWorklist(AE.getNode());
+
+ SDValue AEE = DAG.getNode(ISD::FMUL, DL, VT, AE, Est, Flags);
+ AddToWorklist(AEE.getNode());
+
+ SDValue RHS = DAG.getNode(ISD::FADD, DL, VT, AEE, MinusThree, Flags);
+ AddToWorklist(RHS.getNode());
+
+ // When calculating a square root at the last iteration build:
+ // S = ((A * E) * -0.5) * ((A * E) * E + -3.0)
+ // (notice a common subexpression)
+ SDValue LHS;
+ if (Reciprocal || (i + 1) < Iterations) {
+ // RSQRT: LHS = (E * -0.5)
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, Est, MinusHalf, Flags);
+ } else {
+ // SQRT: LHS = (A * E) * -0.5
+ LHS = DAG.getNode(ISD::FMUL, DL, VT, AE, MinusHalf, Flags);
+ }
+ AddToWorklist(LHS.getNode());
+
+ Est = DAG.getNode(ISD::FMUL, DL, VT, LHS, RHS, Flags);
+ AddToWorklist(Est.getNode());
+ }
+
+ return Est;
+}
+
+/// Build code to calculate either rsqrt(Op) or sqrt(Op). In the latter case
+/// Op*rsqrt(Op) is actually computed, so additional postprocessing is needed if
+/// Op can be zero.
+SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags *Flags,
+ bool Reciprocal) {
+ if (Level >= AfterLegalizeDAG)
+ return SDValue();
+
+ // Expose the DAG combiner to the target combiner implementations.
+ TargetLowering::DAGCombinerInfo DCI(DAG, Level, false, this);
+ unsigned Iterations = 0;
+ bool UseOneConstNR = false;
+ if (SDValue Est = TLI.getRsqrtEstimate(Op, DCI, Iterations, UseOneConstNR)) {
+ AddToWorklist(Est.getNode());
+ if (Iterations) {
+ Est = UseOneConstNR
+ ? buildSqrtNROneConst(Op, Est, Iterations, Flags, Reciprocal)
+ : buildSqrtNRTwoConst(Op, Est, Iterations, Flags, Reciprocal);
+ }
+ return Est;
+ }
+
+ return SDValue();
+}
+
+SDValue DAGCombiner::buildRsqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ return buildSqrtEstimateImpl(Op, Flags, true);
+}
+
+SDValue DAGCombiner::buildSqrtEstimate(SDValue Op, SDNodeFlags *Flags) {
+ SDValue Est = buildSqrtEstimateImpl(Op, Flags, false);
+ if (!Est)
+ return SDValue();
+
+ // Unfortunately, Est is now NaN if the input was exactly 0.
+ // Select out this case and force the answer to 0.
+ EVT VT = Est.getValueType();
+ SDLoc DL(Op);
+ SDValue Zero = DAG.getConstantFP(0.0, DL, VT);
+ EVT CCVT = getSetCCResultType(VT);
+ SDValue ZeroCmp = DAG.getSetCC(DL, CCVT, Op, Zero, ISD::SETEQ);
+ AddToWorklist(ZeroCmp.getNode());
+
+ Est = DAG.getNode(VT.isVector() ? ISD::VSELECT : ISD::SELECT, DL, VT, ZeroCmp,
+ Zero, Est);
+ AddToWorklist(Est.getNode());
+ return Est;
+}
+
+/// Return true if base is a frame index, which is known not to alias with
+/// anything but itself. Provides base object and offset as results.
+static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
+ const GlobalValue *&GV, const void *&CV) {
+ // Assume it is a primitive operation.
+ Base = Ptr; Offset = 0; GV = nullptr; CV = nullptr;
+
+  // If it's adding a simple constant, then integrate the offset.
+ if (Base.getOpcode() == ISD::ADD) {
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
+ Base = Base.getOperand(0);
+ Offset += C->getZExtValue();
+ }
+ }
+
+ // Return the underlying GlobalValue, and update the Offset. Return false
+ // for GlobalAddressSDNode since the same GlobalAddress may be represented
+ // by multiple nodes with different offsets.
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Base)) {
+ GV = G->getGlobal();
+ Offset += G->getOffset();
+ return false;
+ }
+
+ // Return the underlying Constant value, and update the Offset. Return false
+ // for ConstantSDNodes since the same constant pool entry may be represented
+ // by multiple nodes with different offsets.
+ if (ConstantPoolSDNode *C = dyn_cast<ConstantPoolSDNode>(Base)) {
+ CV = C->isMachineConstantPoolEntry() ? (const void *)C->getMachineCPVal()
+ : (const void *)C->getConstVal();
+ Offset += C->getOffset();
+ return false;
+ }
+  // Otherwise, only a frame index can't alias with anything but itself.
+ return isa<FrameIndexSDNode>(Base);
+}
+
+/// Return true if there is any possibility that the two addresses overlap.
+bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const {
+ // If they are the same then they must be aliases.
+ if (Op0->getBasePtr() == Op1->getBasePtr()) return true;
+
+ // If they are both volatile then they cannot be reordered.
+ if (Op0->isVolatile() && Op1->isVolatile()) return true;
+
+  // If one operation reads from invariant memory and the other may store, they
+  // cannot alias. This should really check the equivalent of mayWrite, but
+  // that only matters for memory nodes other than load/store.
+ if (Op0->isInvariant() && Op1->writeMem())
+ return false;
+
+ if (Op1->isInvariant() && Op0->writeMem())
+ return false;
+
+ // Gather base node and offset information.
+ SDValue Base1, Base2;
+ int64_t Offset1, Offset2;
+ const GlobalValue *GV1, *GV2;
+ const void *CV1, *CV2;
+ bool isFrameIndex1 = FindBaseOffset(Op0->getBasePtr(),
+ Base1, Offset1, GV1, CV1);
+ bool isFrameIndex2 = FindBaseOffset(Op1->getBasePtr(),
+ Base2, Offset2, GV2, CV2);
+
+  // If they have the same base address, then check to see if they overlap.
+ if (Base1 == Base2 || (GV1 && (GV1 == GV2)) || (CV1 && (CV1 == CV2)))
+ return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
+ (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+
+  // It is possible for different frame indices to alias each other, mostly
+  // when tail call optimization reuses return address slots for arguments.
+  // To catch this case, look up the actual frame offsets of the frame indices
+  // to compute the real alias relationship.
+ if (isFrameIndex1 && isFrameIndex2) {
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ Offset1 += MFI->getObjectOffset(cast<FrameIndexSDNode>(Base1)->getIndex());
+ Offset2 += MFI->getObjectOffset(cast<FrameIndexSDNode>(Base2)->getIndex());
+ return !((Offset1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= Offset2 ||
+ (Offset2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= Offset1);
+ }
+
+ // Otherwise, if we know what the bases are, and they aren't identical, then
+ // we know they cannot alias.
+ if ((isFrameIndex1 || CV1 || GV1) && (isFrameIndex2 || CV2 || GV2))
+ return false;
+
+  // If we know that both SrcValue1 and SrcValue2 have a relatively large
+  // alignment compared to the size and offset of the accesses, we may be able
+  // to prove they do not alias. This check is conservative for now to catch
+  // cases created by splitting vector types.
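+  // Illustrative example: two 4-byte accesses that are both 8-byte aligned,
+  // at source offsets 0 and 4, cannot overlap since 0 + 4 <= 4, so no alias
+  // is reported.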
+ if ((Op0->getOriginalAlignment() == Op1->getOriginalAlignment()) &&
+ (Op0->getSrcValueOffset() != Op1->getSrcValueOffset()) &&
+ (Op0->getMemoryVT().getSizeInBits() >> 3 ==
+ Op1->getMemoryVT().getSizeInBits() >> 3) &&
+ (Op0->getOriginalAlignment() > (Op0->getMemoryVT().getSizeInBits() >> 3))) {
+ int64_t OffAlign1 = Op0->getSrcValueOffset() % Op0->getOriginalAlignment();
+ int64_t OffAlign2 = Op1->getSrcValueOffset() % Op1->getOriginalAlignment();
+
+ // There is no overlap between these relatively aligned accesses of similar
+ // size, return no alias.
+ if ((OffAlign1 + (Op0->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign2 ||
+ (OffAlign2 + (Op1->getMemoryVT().getSizeInBits() >> 3)) <= OffAlign1)
+ return false;
+ }
+
+ bool UseAA = CombinerGlobalAA.getNumOccurrences() > 0
+ ? CombinerGlobalAA
+ : DAG.getSubtarget().useAA();
+#ifndef NDEBUG
+ if (CombinerAAOnlyFunc.getNumOccurrences() &&
+ CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
+ UseAA = false;
+#endif
+ if (UseAA &&
+ Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) {
+ // Use alias analysis information.
+ int64_t MinOffset = std::min(Op0->getSrcValueOffset(),
+ Op1->getSrcValueOffset());
+ int64_t Overlap1 = (Op0->getMemoryVT().getSizeInBits() >> 3) +
+ Op0->getSrcValueOffset() - MinOffset;
+ int64_t Overlap2 = (Op1->getMemoryVT().getSizeInBits() >> 3) +
+ Op1->getSrcValueOffset() - MinOffset;
+ AliasResult AAResult =
+ AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap1,
+ UseTBAA ? Op0->getAAInfo() : AAMDNodes()),
+ MemoryLocation(Op1->getMemOperand()->getValue(), Overlap2,
+ UseTBAA ? Op1->getAAInfo() : AAMDNodes()));
+ if (AAResult == NoAlias)
+ return false;
+ }
+
+ // Otherwise we have to assume they alias.
+ return true;
+}
+
+/// Walk up chain skipping non-aliasing memory nodes,
+/// looking for aliasing nodes and adding them to the Aliases vector.
+void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
+ SmallVectorImpl<SDValue> &Aliases) {
+ SmallVector<SDValue, 8> Chains; // List of chains to visit.
+ SmallPtrSet<SDNode *, 16> Visited; // Visited node set.
+
+ // Get alias information for node.
+ bool IsLoad = isa<LoadSDNode>(N) && !cast<LSBaseSDNode>(N)->isVolatile();
+
+ // Starting off.
+ Chains.push_back(OriginalChain);
+ unsigned Depth = 0;
+
+ // Look at each chain and determine if it is an alias. If so, add it to the
+ // aliases list. If not, then continue up the chain looking for the next
+ // candidate.
+ while (!Chains.empty()) {
+ SDValue Chain = Chains.pop_back_val();
+
+ // For TokenFactor nodes, look at each operand and only continue up the
+ // chain until we reach the depth limit.
+ //
+ // FIXME: The depth check could be made to return the last non-aliasing
+ // chain we found before we hit a tokenfactor rather than the original
+ // chain.
+ if (Depth > TLI.getGatherAllAliasesMaxDepth()) {
+ Aliases.clear();
+ Aliases.push_back(OriginalChain);
+ return;
+ }
+
+    // Don't bother if we've been here before.
+ if (!Visited.insert(Chain.getNode()).second)
+ continue;
+
+ switch (Chain.getOpcode()) {
+ case ISD::EntryToken:
+      // The entry token is the ideal chain operand, but it is handled in
+      // FindBetterChain.
+ break;
+
+ case ISD::LOAD:
+ case ISD::STORE: {
+ // Get alias information for Chain.
+ bool IsOpLoad = isa<LoadSDNode>(Chain.getNode()) &&
+ !cast<LSBaseSDNode>(Chain.getNode())->isVolatile();
+
+      // If the chain is an alias, then stop here.
+ if (!(IsLoad && IsOpLoad) &&
+ isAlias(cast<LSBaseSDNode>(N), cast<LSBaseSDNode>(Chain.getNode()))) {
+ Aliases.push_back(Chain);
+ } else {
+ // Look further up the chain.
+ Chains.push_back(Chain.getOperand(0));
+ ++Depth;
+ }
+ break;
+ }
+
+ case ISD::TokenFactor:
+ // We have to check each of the operands of the token factor for "small"
+ // token factors, so we queue them up. Adding the operands to the queue
+ // (stack) in reverse order maintains the original order and increases the
+      // likelihood that getNode will find a matching token factor (CSE).
+ if (Chain.getNumOperands() > 16) {
+ Aliases.push_back(Chain);
+ break;
+ }
+ for (unsigned n = Chain.getNumOperands(); n;)
+ Chains.push_back(Chain.getOperand(--n));
+ ++Depth;
+ break;
+
+ default:
+ // For all other instructions we will just have to take what we can get.
+ Aliases.push_back(Chain);
+ break;
+ }
+ }
+}
+
+/// Walk up chain skipping non-aliasing memory nodes, looking for a better chain
+/// (aliasing node.)
+SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
+ SmallVector<SDValue, 8> Aliases; // Ops for replacing token factor.
+
+ // Accumulate all the aliases to this node.
+ GatherAllAliases(N, OldChain, Aliases);
+
+ // If no operands then chain to entry token.
+ if (Aliases.size() == 0)
+ return DAG.getEntryNode();
+
+ // If a single operand then chain to it. We don't need to revisit it.
+ if (Aliases.size() == 1)
+ return Aliases[0];
+
+ // Construct a custom tailored token factor.
+ return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
+}
+
+bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
+ // This holds the base pointer, index, and the offset in bytes from the base
+ // pointer.
+ BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+
+ // We must have a base and an offset.
+ if (!BasePtr.Base.getNode())
+ return false;
+
+ // Do not handle stores to undef base pointers.
+ if (BasePtr.Base.isUndef())
+ return false;
+
+ SmallVector<StoreSDNode *, 8> ChainedStores;
+ ChainedStores.push_back(St);
+
+  // Walk up the chain and look for nodes with offsets from the same
+  // base pointer. Stop when reaching an instruction of a different kind or an
+  // instruction with a different base pointer.
+ StoreSDNode *Index = St;
+ while (Index) {
+ // If the chain has more than one use, then we can't reorder the mem ops.
+ if (Index != St && !SDValue(Index, 0)->hasOneUse())
+ break;
+
+ if (Index->isVolatile() || Index->isIndexed())
+ break;
+
+ // Find the base pointer and offset for this memory node.
+ BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
+
+ // Check that the base pointer is the same as the original one.
+ if (!Ptr.equalBaseIndex(BasePtr))
+ break;
+
+ // Find the next memory operand in the chain. If the next operand in the
+ // chain is a store then move up and continue the scan with the next
+ // memory operand. If the next operand is a load save it and use alias
+ // information to check if it interferes with anything.
+ SDNode *NextInChain = Index->getChain().getNode();
+ while (true) {
+ if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
+ // We found a store node. Use it for the next iteration.
+ if (STn->isVolatile() || STn->isIndexed()) {
+ Index = nullptr;
+ break;
+ }
+ ChainedStores.push_back(STn);
+ Index = STn;
+ break;
+ } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
+ NextInChain = Ldn->getChain().getNode();
+ continue;
+ } else {
+ Index = nullptr;
+ break;
+ }
+ }
+ }
+
+ bool MadeChangeToSt = false;
+ SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
+
+ for (StoreSDNode *ChainedStore : ChainedStores) {
+ SDValue Chain = ChainedStore->getChain();
+ SDValue BetterChain = FindBetterChain(ChainedStore, Chain);
+
+ if (Chain != BetterChain) {
+ if (ChainedStore == St)
+ MadeChangeToSt = true;
+ BetterChains.push_back(std::make_pair(ChainedStore, BetterChain));
+ }
+ }
+
+ // Do all replacements after finding the replacements to make to avoid making
+ // the chains more complicated by introducing new TokenFactors.
+ for (auto Replacement : BetterChains)
+ replaceStoreChain(Replacement.first, Replacement.second);
+
+ return MadeChangeToSt;
+}
+
+/// This is the entry point for the file.
+void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA,
+ CodeGenOpt::Level OptLevel) {
+ /// This is the main entry point to this class.
+ DAGCombiner(*this, AA, OptLevel).Run(Level);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
new file mode 100644
index 000000000000..b10da002fcfe
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -0,0 +1,2243 @@
+//===-- FastISel.cpp - Implementation of the FastISel class ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the FastISel class.
+//
+// "Fast" instruction selection is designed to emit very poor code quickly.
+// Also, it is not designed to be able to do much lowering, so most illegal
+// types (e.g. i64 on 32-bit targets) and operations are not supported. It is
+// also not intended to be able to do much optimization, except in a few cases
+// where doing optimizations reduces overall compile time. For example, folding
+// constants into immediate fields is often done, because it's cheap and it
+// reduces the number of instructions later phases have to examine.
+//
+// "Fast" instruction selection is able to fail gracefully and transfer
+// control to the SelectionDAG selector for operations that it doesn't
+// support. In many cases, this allows us to avoid duplicating a lot of
+// the complicated lowering logic that SelectionDAG currently has.
+//
+// The intended use for "fast" instruction selection is "-O0" mode
+// compilation, where the quality of the generated code is irrelevant when
+// weighed against the speed at which the code can be generated. Also,
+// at -O0, the LLVM optimizers are not running, and this makes the
+// compile time of codegen a much higher portion of the overall compile
+// time. Despite its limitations, "fast" instruction selection is able to
+// handle enough code on its own to provide noticeable overall speedups
+// in -O0 compiles.
+//
+// Basic operations are supported in a target-independent way, by reading
+// the same instruction descriptions that the SelectionDAG selector reads,
+// and identifying simple arithmetic operations that can be directly selected
+// from simple operators. More complicated operations currently require
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "isel"
+
+STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by "
+ "target-independent selector");
+STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
+ "target-specific selector");
+STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
+
+void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+ unsigned AttrIdx) {
+ IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+ IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+ IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+ IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+ IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+ IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+ IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
+ IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+ IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
+ IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
+ Alignment = CS->getParamAlignment(AttrIdx);
+}
+
+/// Set the current block to which generated machine instructions will be
+/// appended, and clear the local CSE map.
+void FastISel::startNewBlock() {
+ LocalValueMap.clear();
+
+ // Instructions are appended to FuncInfo.MBB. If the basic block already
+ // contains labels or copies, use the last instruction as the last local
+ // value.
+ EmitStartPt = nullptr;
+ if (!FuncInfo.MBB->empty())
+ EmitStartPt = &FuncInfo.MBB->back();
+ LastLocalValue = EmitStartPt;
+}
+
+bool FastISel::lowerArguments() {
+ if (!FuncInfo.CanLowerReturn)
+    // Fall back to the SDISel argument lowering code to deal with the sret
+    // pointer parameter.
+ return false;
+
+ if (!fastLowerArguments())
+ return false;
+
+ // Enter arguments into ValueMap for uses in non-entry BBs.
+ for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
+ E = FuncInfo.Fn->arg_end();
+ I != E; ++I) {
+ DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(&*I);
+ assert(VI != LocalValueMap.end() && "Missed an argument?");
+ FuncInfo.ValueMap[&*I] = VI->second;
+ }
+ return true;
+}
+
+void FastISel::flushLocalValueMap() {
+ LocalValueMap.clear();
+ LastLocalValue = EmitStartPt;
+ recomputeInsertPt();
+ SavedInsertPt = FuncInfo.InsertPt;
+}
+
+bool FastISel::hasTrivialKill(const Value *V) {
+ // Don't consider constants or arguments to have trivial kills.
+ const Instruction *I = dyn_cast<Instruction>(V);
+ if (!I)
+ return false;
+
+ // No-op casts are trivially coalesced by fast-isel.
+ if (const auto *Cast = dyn_cast<CastInst>(I))
+ if (Cast->isNoopCast(DL.getIntPtrType(Cast->getContext())) &&
+ !hasTrivialKill(Cast->getOperand(0)))
+ return false;
+
+  // Even if the value has only one use in the LLVM IR, it is possible that
+  // FastISel has folded that use into another instruction, so that there is
+  // now more than one use at the Machine Instruction level.
+ unsigned Reg = lookUpRegForValue(V);
+ if (Reg && !MRI.use_empty(Reg))
+ return false;
+
+ // GEPs with all zero indices are trivially coalesced by fast-isel.
+ if (const auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0)))
+ return false;
+
+ // Only instructions with a single use in the same basic block are considered
+ // to have trivial kills.
+ return I->hasOneUse() &&
+ !(I->getOpcode() == Instruction::BitCast ||
+ I->getOpcode() == Instruction::PtrToInt ||
+ I->getOpcode() == Instruction::IntToPtr) &&
+ cast<Instruction>(*I->user_begin())->getParent() == I->getParent();
+}
+
+unsigned FastISel::getRegForValue(const Value *V) {
+ EVT RealVT = TLI.getValueType(DL, V->getType(), /*AllowUnknown=*/true);
+ // Don't handle non-simple values in FastISel.
+ if (!RealVT.isSimple())
+ return 0;
+
+ // Ignore illegal types. We must do this before looking up the value
+ // in ValueMap because Arguments are given virtual registers regardless
+ // of whether FastISel can handle them.
+ MVT VT = RealVT.getSimpleVT();
+ if (!TLI.isTypeLegal(VT)) {
+ // Handle integer promotions, though, because they're common and easy.
+ if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
+ VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
+ else
+ return 0;
+ }
+
+ // Look up the value to see if we already have a register for it.
+ unsigned Reg = lookUpRegForValue(V);
+ if (Reg)
+ return Reg;
+
+ // In bottom-up mode, just create the virtual register which will be used
+ // to hold the value. It will be materialized later.
+ if (isa<Instruction>(V) &&
+ (!isa<AllocaInst>(V) ||
+ !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(V))))
+ return FuncInfo.InitializeRegForValue(V);
+
+ SavePoint SaveInsertPt = enterLocalValueArea();
+
+ // Materialize the value in a register. Emit any instructions in the
+ // local value area.
+ Reg = materializeRegForValue(V, VT);
+
+ leaveLocalValueArea(SaveInsertPt);
+
+ return Reg;
+}
+
+unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
+ unsigned Reg = 0;
+ if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getValue().getActiveBits() <= 64)
+ Reg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
+ } else if (isa<AllocaInst>(V))
+ Reg = fastMaterializeAlloca(cast<AllocaInst>(V));
+ else if (isa<ConstantPointerNull>(V))
+ // Translate this as an integer zero so that it can be
+ // local-CSE'd with actual integer zeros.
+ Reg = getRegForValue(
+ Constant::getNullValue(DL.getIntPtrType(V->getContext())));
+ else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
+ if (CF->isNullValue())
+ Reg = fastMaterializeFloatZero(CF);
+ else
+ // Try to emit the constant directly.
+ Reg = fastEmit_f(VT, VT, ISD::ConstantFP, CF);
+
+ if (!Reg) {
+ // Try to emit the constant by using an integer constant with a cast.
+ const APFloat &Flt = CF->getValueAPF();
+ EVT IntVT = TLI.getPointerTy(DL);
+
+ uint64_t x[2];
+ uint32_t IntBitWidth = IntVT.getSizeInBits();
+ bool isExact;
+ (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
+ APFloat::rmTowardZero, &isExact);
+ if (isExact) {
+ APInt IntVal(IntBitWidth, x);
+
+ unsigned IntegerReg =
+ getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+ if (IntegerReg != 0)
+ Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
+ /*Kill=*/false);
+ }
+ }
+ } else if (const auto *Op = dyn_cast<Operator>(V)) {
+ if (!selectOperator(Op, Op->getOpcode()))
+ if (!isa<Instruction>(Op) ||
+ !fastSelectInstruction(cast<Instruction>(Op)))
+ return 0;
+ Reg = lookUpRegForValue(Op);
+ } else if (isa<UndefValue>(V)) {
+ Reg = createResultReg(TLI.getRegClassFor(VT));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
+ }
+ return Reg;
+}
+
+/// Helper for getRegForValue. This function is called when the value isn't
+/// already available in a register and must be materialized with new
+/// instructions.
+unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
+ unsigned Reg = 0;
+ // Give the target-specific code a try first.
+ if (isa<Constant>(V))
+ Reg = fastMaterializeConstant(cast<Constant>(V));
+
+ // If target-specific code couldn't or didn't want to handle the value, then
+ // give target-independent code a try.
+ if (!Reg)
+ Reg = materializeConstant(V, VT);
+
+ // Don't cache constant materializations in the general ValueMap.
+ // To do so would require tracking what uses they dominate.
+ if (Reg) {
+ LocalValueMap[V] = Reg;
+ LastLocalValue = MRI.getVRegDef(Reg);
+ }
+ return Reg;
+}
+
+unsigned FastISel::lookUpRegForValue(const Value *V) {
+ // Look up the value to see if we already have a register for it. We
+ // cache values defined by Instructions across blocks, and other values
+ // only locally. This is because Instructions already have the SSA
+ // def-dominates-use requirement enforced.
+ DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V);
+ if (I != FuncInfo.ValueMap.end())
+ return I->second;
+ return LocalValueMap[V];
+}
+
+void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
+ if (!isa<Instruction>(I)) {
+ LocalValueMap[I] = Reg;
+ return;
+ }
+
+ unsigned &AssignedReg = FuncInfo.ValueMap[I];
+ if (AssignedReg == 0)
+ // Use the new register.
+ AssignedReg = Reg;
+ else if (Reg != AssignedReg) {
+ // Arrange for uses of AssignedReg to be replaced by uses of Reg.
+ for (unsigned i = 0; i < NumRegs; i++)
+ FuncInfo.RegFixups[AssignedReg + i] = Reg + i;
+
+ AssignedReg = Reg;
+ }
+}
+
+std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
+ unsigned IdxN = getRegForValue(Idx);
+ if (IdxN == 0)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return std::pair<unsigned, bool>(0, false);
+
+ bool IdxNIsKill = hasTrivialKill(Idx);
+
+ // If the index is smaller or larger than intptr_t, truncate or extend it.
+ MVT PtrVT = TLI.getPointerTy(DL);
+ EVT IdxVT = EVT::getEVT(Idx->getType(), /*HandleUnknown=*/false);
+ if (IdxVT.bitsLT(PtrVT)) {
+ IdxN = fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::SIGN_EXTEND, IdxN,
+ IdxNIsKill);
+ IdxNIsKill = true;
+ } else if (IdxVT.bitsGT(PtrVT)) {
+ IdxN =
+ fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, IdxN, IdxNIsKill);
+ IdxNIsKill = true;
+ }
+ return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+}
+
+void FastISel::recomputeInsertPt() {
+ if (getLastLocalValue()) {
+ FuncInfo.InsertPt = getLastLocalValue();
+ FuncInfo.MBB = FuncInfo.InsertPt->getParent();
+ ++FuncInfo.InsertPt;
+ } else
+ FuncInfo.InsertPt = FuncInfo.MBB->getFirstNonPHI();
+
+ // Now skip past any EH_LABELs, which must remain at the beginning.
+ while (FuncInfo.InsertPt != FuncInfo.MBB->end() &&
+ FuncInfo.InsertPt->getOpcode() == TargetOpcode::EH_LABEL)
+ ++FuncInfo.InsertPt;
+}
+
+void FastISel::removeDeadCode(MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator E) {
+ assert(static_cast<MachineInstr *>(I) && static_cast<MachineInstr *>(E) &&
+ std::distance(I, E) > 0 && "Invalid iterator!");
+ while (I != E) {
+ MachineInstr *Dead = &*I;
+ ++I;
+ Dead->eraseFromParent();
+ ++NumFastIselDead;
+ }
+ recomputeInsertPt();
+}
+
+FastISel::SavePoint FastISel::enterLocalValueArea() {
+ MachineBasicBlock::iterator OldInsertPt = FuncInfo.InsertPt;
+ DebugLoc OldDL = DbgLoc;
+ recomputeInsertPt();
+ DbgLoc = DebugLoc();
+ SavePoint SP = {OldInsertPt, OldDL};
+ return SP;
+}
+
+void FastISel::leaveLocalValueArea(SavePoint OldInsertPt) {
+ if (FuncInfo.InsertPt != FuncInfo.MBB->begin())
+ LastLocalValue = &*std::prev(FuncInfo.InsertPt);
+
+ // Restore the previous insert position.
+ FuncInfo.InsertPt = OldInsertPt.InsertPt;
+ DbgLoc = OldInsertPt.DL;
+}
+
+bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
+ EVT VT = EVT::getEVT(I->getType(), /*HandleUnknown=*/true);
+ if (VT == MVT::Other || !VT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // We only handle legal types. For example, on x86-32 the instruction
+ // selector contains all of the 64-bit instructions from x86-64,
+ // under the assumption that i64 won't be used if the target doesn't
+ // support it.
+ if (!TLI.isTypeLegal(VT)) {
+ // MVT::i1 is special. Allow AND, OR, or XOR because they
+ // don't require additional zeroing, which makes them easy.
+ if (VT == MVT::i1 && (ISDOpcode == ISD::AND || ISDOpcode == ISD::OR ||
+ ISDOpcode == ISD::XOR))
+ VT = TLI.getTypeToTransformTo(I->getContext(), VT);
+ else
+ return false;
+ }
+
+ // Check if the first operand is a constant, and handle it as "ri". At -O0,
+ // we don't have anything that canonicalizes operand order.
+ if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
+ if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
+ unsigned Op1 = getRegForValue(I->getOperand(1));
+ if (!Op1)
+ return false;
+ bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+ unsigned ResultReg =
+ fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, Op1IsKill,
+ CI->getZExtValue(), VT.getSimpleVT());
+ if (!ResultReg)
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ unsigned Op0 = getRegForValue(I->getOperand(0));
+ if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+ // Check if the second operand is a constant and handle it appropriately.
+ if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
+ uint64_t Imm = CI->getSExtValue();
+
+ // Transform "sdiv exact X, 8" -> "sra X, 3".
+ if (ISDOpcode == ISD::SDIV && isa<BinaryOperator>(I) &&
+ cast<BinaryOperator>(I)->isExact() && isPowerOf2_64(Imm)) {
+ Imm = Log2_64(Imm);
+ ISDOpcode = ISD::SRA;
+ }
+
+ // Transform "urem x, pow2" -> "and x, pow2-1".
+ if (ISDOpcode == ISD::UREM && isa<BinaryOperator>(I) &&
+ isPowerOf2_64(Imm)) {
+ --Imm;
+ ISDOpcode = ISD::AND;
+ }
+
+ unsigned ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
+ Op0IsKill, Imm, VT.getSimpleVT());
+ if (!ResultReg)
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // Check if the second operand is a constant float.
+ if (const auto *CF = dyn_cast<ConstantFP>(I->getOperand(1))) {
+ unsigned ResultReg = fastEmit_rf(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, Op0IsKill, CF);
+ if (ResultReg) {
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+ }
+
+ unsigned Op1 = getRegForValue(I->getOperand(1));
+ if (!Op1) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ bool Op1IsKill = hasTrivialKill(I->getOperand(1));
+
+ // Now we have both operands in registers. Emit the instruction.
+ unsigned ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+ ISDOpcode, Op0, Op0IsKill, Op1, Op1IsKill);
+ if (!ResultReg)
+ // Target-specific code wasn't able to find a machine opcode for
+ // the given ISD opcode and type. Halt "fast" selection and bail.
+ return false;
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectGetElementPtr(const User *I) {
+ unsigned N = getRegForValue(I->getOperand(0));
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ bool NIsKill = hasTrivialKill(I->getOperand(0));
+
+ // Keep a running tab of the total offset to coalesce multiple N = N + Offset
+ // into a single N = N + TotalOffset.
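+  // Illustrative example: a GEP stepping through two struct fields at byte
+  // offsets 8 and 16 is emitted as a single N = N + 24 rather than two
+  // separate additions.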
+ uint64_t TotalOffs = 0;
+ // FIXME: What's a good SWAG number for MaxOffs?
+ uint64_t MaxOffs = 2048;
+ MVT VT = TLI.getPointerTy(DL);
+ for (gep_type_iterator GTI = gep_type_begin(I), E = gep_type_end(I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (auto *StTy = dyn_cast<StructType>(*GTI)) {
+ uint64_t Field = cast<ConstantInt>(Idx)->getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ TotalOffs += DL.getStructLayout(StTy)->getElementOffset(Field);
+ if (TotalOffs >= MaxOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+ }
+ } else {
+ Type *Ty = GTI.getIndexedType();
+
+ // If this is a constant subscript, handle it quickly.
+ if (const auto *CI = dyn_cast<ConstantInt>(Idx)) {
+ if (CI->isZero())
+ continue;
+ // N = N + Offset
+ uint64_t IdxN = CI->getValue().sextOrTrunc(64).getSExtValue();
+ TotalOffs += DL.getTypeAllocSize(Ty) * IdxN;
+ if (TotalOffs >= MaxOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+ continue;
+ }
+ if (TotalOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ NIsKill = true;
+ TotalOffs = 0;
+ }
+
+ // N = N + Idx * ElementSize;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
+ unsigned IdxN = Pair.first;
+ bool IdxNIsKill = Pair.second;
+ if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ if (ElementSize != 1) {
+ IdxN = fastEmit_ri_(VT, ISD::MUL, IdxN, IdxNIsKill, ElementSize, VT);
+ if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ IdxNIsKill = true;
+ }
+ N = fastEmit_rr(VT, VT, ISD::ADD, N, NIsKill, IdxN, IdxNIsKill);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+ }
+ if (TotalOffs) {
+ N = fastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT);
+ if (!N) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ }
+
+ // We successfully emitted code for the given LLVM Instruction.
+ updateValueMap(I, N);
+ return true;
+}
+
+bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
+ const CallInst *CI, unsigned StartIdx) {
+ for (unsigned i = StartIdx, e = CI->getNumArgOperands(); i != e; ++i) {
+ Value *Val = CI->getArgOperand(i);
+ // Check for constants and encode them with a StackMaps::ConstantOp prefix.
+ if (const auto *C = dyn_cast<ConstantInt>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(C->getSExtValue()));
+ } else if (isa<ConstantPointerNull>(Val)) {
+ Ops.push_back(MachineOperand::CreateImm(StackMaps::ConstantOp));
+ Ops.push_back(MachineOperand::CreateImm(0));
+ } else if (auto *AI = dyn_cast<AllocaInst>(Val)) {
+      // Values coming from a stack location also require a special encoding,
+      // but that is added later on by the target-specific frame index
+      // elimination implementation.
+ auto SI = FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ Ops.push_back(MachineOperand::CreateFI(SI->second));
+ else
+ return false;
+ } else {
+ unsigned Reg = getRegForValue(Val);
+ if (!Reg)
+ return false;
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+ }
+ }
+ return true;
+}
+
+bool FastISel::selectStackmap(const CallInst *I) {
+ // void @llvm.experimental.stackmap(i64 <id>, i32 <numShadowBytes>,
+ // [live variables...])
+ assert(I->getCalledFunction()->getReturnType()->isVoidTy() &&
+ "Stackmap cannot return a value.");
+
+ // The stackmap intrinsic only records the live variables (the arguments
+ // passed to it) and emits NOPS (if requested). Unlike the patchpoint
+ // intrinsic, this won't be lowered to a function call. This means we don't
+ // have to worry about calling conventions and target-specific lowering code.
+ // Instead we perform the call lowering right here.
+ //
+ // CALLSEQ_START(0...)
+ // STACKMAP(id, nbytes, ...)
+ // CALLSEQ_END(0, 0)
+ //
+ SmallVector<MachineOperand, 32> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+ "Expected a constant integer.");
+ const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+ "Expected a constant integer.");
+ const auto *NumBytes =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+ // Push live variables for the stack map (skipping the first two arguments
+ // <id> and <numBytes>).
+ if (!addStackMapLiveVars(Ops, I, 2))
+ return false;
+
+ // We are not adding any register mask info here, because the stackmap doesn't
+ // clobber anything.
+
+ // Add scratch registers as implicit def and early clobber.
+ CallingConv::ID CC = I->getCallingConv();
+ const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ Ops.push_back(MachineOperand::CreateReg(
+ ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
+ /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
+
+ // Issue CALLSEQ_START
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ auto Builder =
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown));
+ const MCInstrDesc &MCID = Builder.getInstr()->getDesc();
+ for (unsigned I = 0, E = MCID.getNumOperands(); I < E; ++I)
+ Builder.addImm(0);
+
+ // Issue STACKMAP.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::STACKMAP));
+ for (auto const &MO : Ops)
+ MIB.addOperand(MO);
+
+ // Issue CALLSEQ_END
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
+ .addImm(0)
+ .addImm(0);
+
+ // Inform the Frame Information that we have a stackmap in this function.
+ FuncInfo.MF->getFrameInfo()->setHasStackMap();
+
+ return true;
+}
+
+/// \brief Lower an argument list according to the target calling convention.
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
+ unsigned NumArgs, const Value *Callee,
+ bool ForceRetVoidTy, CallLoweringInfo &CLI) {
+ ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ ImmutableCallSite CS(CI);
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+ ArgI != ArgE; ++ArgI) {
+ Value *V = CI->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ ArgListEntry Entry;
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+ Entry.setAttributes(&CS, AttrI);
+ Args.push_back(Entry);
+ }
+
+ Type *RetTy = ForceRetVoidTy ? Type::getVoidTy(CI->getType()->getContext())
+ : CI->getType();
+ CLI.setCallee(CI->getCallingConv(), RetTy, Callee, std::move(Args), NumArgs);
+
+ return lowerCallTo(CLI);
+}
+
+FastISel::CallLoweringInfo &FastISel::CallLoweringInfo::setCallee(
+ const DataLayout &DL, MCContext &Ctx, CallingConv::ID CC, Type *ResultTy,
+ const char *Target, ArgListTy &&ArgsList, unsigned FixedArgs) {
+ SmallString<32> MangledName;
+ Mangler::getNameWithPrefix(MangledName, Target, DL);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(MangledName);
+ return setCallee(CC, ResultTy, Sym, std::move(ArgsList), FixedArgs);
+}
+
+bool FastISel::selectPatchpoint(const CallInst *I) {
+ // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
+ // i32 <numBytes>,
+ // i8* <target>,
+ // i32 <numArgs>,
+ // [Args...],
+ // [live variables...])
+ CallingConv::ID CC = I->getCallingConv();
+ bool IsAnyRegCC = CC == CallingConv::AnyReg;
+ bool HasDef = !I->getType()->isVoidTy();
+ Value *Callee = I->getOperand(PatchPointOpers::TargetPos)->stripPointerCasts();
+
+ // Get the real number of arguments participating in the call <numArgs>
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos)) &&
+ "Expected a constant integer.");
+ const auto *NumArgsVal =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NArgPos));
+ unsigned NumArgs = NumArgsVal->getZExtValue();
+
+ // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+ // This includes all meta-operands up to but not including CC.
+ unsigned NumMetaOpers = PatchPointOpers::CCPos;
+ assert(I->getNumArgOperands() >= NumMetaOpers + NumArgs &&
+ "Not enough arguments provided to the patchpoint intrinsic");
+
+ // For AnyRegCC the arguments are lowered later on manually.
+ unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
+ CallLoweringInfo CLI;
+ CLI.setIsPatchPoint();
+ if (!lowerCallOperands(I, NumMetaOpers, NumCallArgs, Callee, IsAnyRegCC, CLI))
+ return false;
+
+ assert(CLI.Call && "No call instruction specified.");
+
+ SmallVector<MachineOperand, 32> Ops;
+
+ // Add an explicit result reg if we use the anyreg calling convention.
+ if (IsAnyRegCC && HasDef) {
+ assert(CLI.NumResultRegs == 0 && "Unexpected result register.");
+ CLI.ResultReg = createResultReg(TLI.getRegClassFor(MVT::i64));
+ CLI.NumResultRegs = 1;
+ Ops.push_back(MachineOperand::CreateReg(CLI.ResultReg, /*IsDef=*/true));
+ }
+
+ // Add the <id> and <numBytes> constants.
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::IDPos)) &&
+ "Expected a constant integer.");
+ const auto *ID = cast<ConstantInt>(I->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(MachineOperand::CreateImm(ID->getZExtValue()));
+
+ assert(isa<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos)) &&
+ "Expected a constant integer.");
+ const auto *NumBytes =
+ cast<ConstantInt>(I->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(MachineOperand::CreateImm(NumBytes->getZExtValue()));
+
+ // Add the call target.
+ if (const auto *C = dyn_cast<IntToPtrInst>(Callee)) {
+ uint64_t CalleeConstAddr =
+ cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+ Ops.push_back(MachineOperand::CreateImm(CalleeConstAddr));
+ } else if (const auto *C = dyn_cast<ConstantExpr>(Callee)) {
+ if (C->getOpcode() == Instruction::IntToPtr) {
+ uint64_t CalleeConstAddr =
+ cast<ConstantInt>(C->getOperand(0))->getZExtValue();
+ Ops.push_back(MachineOperand::CreateImm(CalleeConstAddr));
+ } else
+ llvm_unreachable("Unsupported ConstantExpr.");
+ } else if (const auto *GV = dyn_cast<GlobalValue>(Callee)) {
+ Ops.push_back(MachineOperand::CreateGA(GV, 0));
+ } else if (isa<ConstantPointerNull>(Callee))
+ Ops.push_back(MachineOperand::CreateImm(0));
+ else
+ llvm_unreachable("Unsupported callee address.");
+
+ // Adjust <numArgs> to account for any arguments that have been passed on
+ // the stack instead.
+ unsigned NumCallRegArgs = IsAnyRegCC ? NumArgs : CLI.OutRegs.size();
+ Ops.push_back(MachineOperand::CreateImm(NumCallRegArgs));
+
+ // Add the calling convention
+ Ops.push_back(MachineOperand::CreateImm((unsigned)CC));
+
+ // Add the arguments we omitted previously. The register allocator should
+ // place these in any free register.
+ if (IsAnyRegCC) {
+ for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) {
+ unsigned Reg = getRegForValue(I->getArgOperand(i));
+ if (!Reg)
+ return false;
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+ }
+ }
+
+ // Push the arguments from the call instruction.
+ for (auto Reg : CLI.OutRegs)
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/false));
+
+ // Push live variables for the stack map.
+ if (!addStackMapLiveVars(Ops, I, NumMetaOpers + NumArgs))
+ return false;
+
+ // Push the register mask info.
+ Ops.push_back(MachineOperand::CreateRegMask(
+ TRI.getCallPreservedMask(*FuncInfo.MF, CC)));
+
+ // Add scratch registers as implicit def and early clobber.
+ const MCPhysReg *ScratchRegs = TLI.getScratchRegisters(CC);
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ Ops.push_back(MachineOperand::CreateReg(
+ ScratchRegs[i], /*IsDef=*/true, /*IsImp=*/true, /*IsKill=*/false,
+ /*IsDead=*/false, /*IsUndef=*/false, /*IsEarlyClobber=*/true));
+
+ // Add implicit defs (return values).
+ for (auto Reg : CLI.InRegs)
+ Ops.push_back(MachineOperand::CreateReg(Reg, /*IsDef=*/true,
+ /*IsImpl=*/true));
+
+ // Insert the patchpoint instruction before the call generated by the target.
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, CLI.Call, DbgLoc,
+ TII.get(TargetOpcode::PATCHPOINT));
+
+ for (auto &MO : Ops)
+ MIB.addOperand(MO);
+
+ MIB->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+ // Delete the original call instruction.
+ CLI.Call->eraseFromParent();
+
+ // Inform the Frame Information that we have a patchpoint in this function.
+ FuncInfo.MF->getFrameInfo()->setHasPatchPoint();
+
+ if (CLI.NumResultRegs)
+ updateValueMap(I, CLI.ResultReg, CLI.NumResultRegs);
+ return true;
+}
+
+/// Returns an AttributeSet representing the attributes applied to the return
+/// value of the given call.
+static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+ SmallVector<Attribute::AttrKind, 2> Attrs;
+ if (CLI.RetSExt)
+ Attrs.push_back(Attribute::SExt);
+ if (CLI.RetZExt)
+ Attrs.push_back(Attribute::ZExt);
+ if (CLI.IsInReg)
+ Attrs.push_back(Attribute::InReg);
+
+ return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
+ Attrs);
+}
+
+bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
+ unsigned NumArgs) {
+ MCContext &Ctx = MF->getContext();
+ SmallString<32> MangledName;
+ Mangler::getNameWithPrefix(MangledName, SymName, DL);
+ MCSymbol *Sym = Ctx.getOrCreateSymbol(MangledName);
+ return lowerCallTo(CI, Sym, NumArgs);
+}
+
+bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
+ unsigned NumArgs) {
+ ImmutableCallSite CS(CI);
+
+ FunctionType *FTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
+
+ ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ for (unsigned ArgI = 0; ArgI != NumArgs; ++ArgI) {
+ Value *V = CI->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ ArgListEntry Entry;
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+ Entry.setAttributes(&CS, ArgI + 1);
+ Args.push_back(Entry);
+ }
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs);
+
+ return lowerCallTo(CLI);
+}
+
+bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
+ // Handle the incoming return values from the call.
+ CLI.clearIns();
+ SmallVector<EVT, 4> RetTys;
+ ComputeValueVTs(TLI, DL, CLI.RetTy, RetTys);
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, TLI, DL);
+
+ bool CanLowerReturn = TLI.CanLowerReturn(
+ CLI.CallConv, *FuncInfo.MF, CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
+ // FIXME: sret demotion isn't supported yet - bail out.
+ if (!CanLowerReturn)
+ return false;
+
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = TLI.getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(CLI.RetTy->getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetSExt)
+ MyFlags.Flags.setSExt();
+ if (CLI.RetZExt)
+ MyFlags.Flags.setZExt();
+ if (CLI.IsInReg)
+ MyFlags.Flags.setInReg();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+
+ // Handle all of the outgoing arguments.
+ CLI.clearOuts();
+ for (auto &Arg : CLI.getArgs()) {
+ Type *FinalType = Arg.Ty;
+ if (Arg.IsByVal)
+ FinalType = cast<PointerType>(Arg.Ty)->getElementType();
+ bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+ FinalType, CLI.CallConv, CLI.IsVarArg);
+
+ ISD::ArgFlagsTy Flags;
+ if (Arg.IsZExt)
+ Flags.setZExt();
+ if (Arg.IsSExt)
+ Flags.setSExt();
+ if (Arg.IsInReg)
+ Flags.setInReg();
+ if (Arg.IsSRet)
+ Flags.setSRet();
+ if (Arg.IsSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Arg.IsSwiftError)
+ Flags.setSwiftError();
+ if (Arg.IsByVal)
+ Flags.setByVal();
+ if (Arg.IsInAlloca) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling in
+ // the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Arg.IsByVal || Arg.IsInAlloca) {
+ PointerType *Ty = cast<PointerType>(Arg.Ty);
+ Type *ElementTy = Ty->getElementType();
+ unsigned FrameSize = DL.getTypeAllocSize(ElementTy);
+ // For ByVal, the alignment should come from the frontend. The backend will
+ // guess if this info is not there, but there are cases it cannot get right.
+ unsigned FrameAlign = Arg.Alignment;
+ if (!FrameAlign)
+ FrameAlign = TLI.getByValTypeAlignment(ElementTy, DL);
+ Flags.setByValSize(FrameSize);
+ Flags.setByValAlign(FrameAlign);
+ }
+ if (Arg.IsNest)
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+ unsigned OriginalAlignment = DL.getABITypeAlignment(Arg.Ty);
+ Flags.setOrigAlign(OriginalAlignment);
+
+ CLI.OutVals.push_back(Arg.Val);
+ CLI.OutFlags.push_back(Flags);
+ }
+
+ if (!fastLowerCall(CLI))
+ return false;
+
+ // Set all unused physreg defs as dead.
+ assert(CLI.Call && "No call instruction specified.");
+ CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI);
+
+ if (CLI.NumResultRegs && CLI.CS)
+ updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs);
+
+ return true;
+}
+
+bool FastISel::lowerCall(const CallInst *CI) {
+ ImmutableCallSite CS(CI);
+
+ FunctionType *FuncTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+ Args.reserve(CS.arg_size());
+
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ Value *V = *i;
+
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ continue;
+
+ Entry.Val = V;
+ Entry.Ty = V->getType();
+
+ // Skip the first return-type Attribute to get to params.
+ Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+ Args.push_back(Entry);
+ }
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within fastLowerCall.
+ bool IsTailCall = CI->isTailCall();
+ if (IsTailCall && !isInTailCallPosition(CS, TM))
+ IsTailCall = false;
+
+ CallLoweringInfo CLI;
+ CLI.setCallee(RetTy, FuncTy, CI->getCalledValue(), std::move(Args), CS)
+ .setTailCall(IsTailCall);
+
+ return lowerCallTo(CLI);
+}
+
+bool FastISel::selectCall(const User *I) {
+ const CallInst *Call = cast<CallInst>(I);
+
+ // Handle simple inline asms.
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledValue())) {
+ // If the inline asm has side effects, then make sure that no local value
+ // lives across it by flushing the local value map.
+ if (IA->hasSideEffects())
+ flushLocalValueMap();
+
+ // Don't attempt to handle constraints.
+ if (!IA->getConstraintString().empty())
+ return false;
+
+ unsigned ExtraInfo = 0;
+ if (IA->hasSideEffects())
+ ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::INLINEASM))
+ .addExternalSymbol(IA->getAsmString().c_str())
+ .addImm(ExtraInfo);
+ return true;
+ }
+
+ MachineModuleInfo &MMI = FuncInfo.MF->getMMI();
+ ComputeUsesVAFloatArgument(*Call, &MMI);
+
+ // Handle intrinsic function calls.
+ if (const auto *II = dyn_cast<IntrinsicInst>(Call))
+ return selectIntrinsicCall(II);
+
+ // Usually it does not make sense to initialize a value, make an unrelated
+ // function call, and then use the value, because it tends to be spilled on
+ // the stack. So we move the pointer to the last local value to the
+ // beginning of the block, so that all the values which have already been
+ // materialized appear after the call. It also makes sense to skip
+ // intrinsics since they tend to be inlined.
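+ // (For example, a constant materialized before this call and reused only
+ // afterwards would otherwise stay live, and likely be spilled, across the
+ // call; flushing the map avoids that by forcing it to be recomputed.)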
+ flushLocalValueMap();
+
+ return lowerCall(Call);
+}
+
+bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
+ switch (II->getIntrinsicID()) {
+ default:
+ break;
+ // At -O0 we don't care about the lifetime intrinsics.
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ // The donothing intrinsic does, well, nothing.
+ case Intrinsic::donothing:
+ return true;
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
+ assert(DI->getVariable() && "Missing variable");
+ if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
+ DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ return true;
+ }
+
+ const Value *Address = DI->getAddress();
+ if (!Address || isa<UndefValue>(Address)) {
+ DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ return true;
+ }
+
+ unsigned Offset = 0;
+ Optional<MachineOperand> Op;
+ if (const auto *Arg = dyn_cast<Argument>(Address))
+ // Some arguments' frame index is recorded during argument lowering.
+ Offset = FuncInfo.getArgumentFrameIndex(Arg);
+ if (Offset)
+ Op = MachineOperand::CreateFI(Offset);
+ if (!Op)
+ if (unsigned Reg = lookUpRegForValue(Address))
+ Op = MachineOperand::CreateReg(Reg, false);
+
+ // If we have a VLA that has a "use" in a metadata node that's then used
+ // here but it has no other uses, then we have a problem. E.g.,
+ //
+ // int foo (const int *x) {
+ // char a[*x];
+ // return 0;
+ // }
+ //
+ // If we assign 'a' a vreg and fast isel later on has to use the selection
+ // DAG isel, it will want to copy the value to the vreg. However, there are
+ // no uses, which runs counter to what selection DAG isel expects.
+ if (!Op && !Address->use_empty() && isa<Instruction>(Address) &&
+ (!isa<AllocaInst>(Address) ||
+ !FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(Address))))
+ Op = MachineOperand::CreateReg(FuncInfo.InitializeRegForValue(Address),
+ false);
+
+ if (Op) {
+ assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+ "Expected inlined-at fields to agree");
+ if (Op->isReg()) {
+ Op->setIsDebug(true);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::DBG_VALUE), false, Op->getReg(), 0,
+ DI->getVariable(), DI->getExpression());
+ } else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::DBG_VALUE))
+ .addOperand(*Op)
+ .addImm(0)
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ } else {
+ // We can't yet handle anything else here because it would require
+ // generating code, thus altering codegen because of debug info.
+ DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ }
+ return true;
+ }
+ case Intrinsic::dbg_value: {
+ // This form of DBG_VALUE is target-independent.
+ const DbgValueInst *DI = cast<DbgValueInst>(II);
+ const MCInstrDesc &II = TII.get(TargetOpcode::DBG_VALUE);
+ const Value *V = DI->getValue();
+ assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
+ "Expected inlined-at fields to agree");
+ if (!V) {
+ // Currently the optimizer can produce this; insert an undef to
+ // help debugging. Probably the optimizer should not do this.
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(0U)
+ .addImm(DI->getOffset())
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ } else if (const auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getBitWidth() > 64)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addCImm(CI)
+ .addImm(DI->getOffset())
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ else
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addImm(CI->getZExtValue())
+ .addImm(DI->getOffset())
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ } else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addFPImm(CF)
+ .addImm(DI->getOffset())
+ .addMetadata(DI->getVariable())
+ .addMetadata(DI->getExpression());
+ } else if (unsigned Reg = lookUpRegForValue(V)) {
+ // FIXME: This does not handle register-indirect values at offset 0.
+ bool IsIndirect = DI->getOffset() != 0;
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect, Reg,
+ DI->getOffset(), DI->getVariable(), DI->getExpression());
+ } else {
+ // We can't yet handle anything else here because it would require
+ // generating code, thus altering codegen because of debug info.
+ DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ }
+ return true;
+ }
+ case Intrinsic::objectsize: {
+ ConstantInt *CI = cast<ConstantInt>(II->getArgOperand(1));
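+ // The second argument is the 'min' flag: when it is false we conservatively
+ // return -1 (no known upper bound), otherwise 0 (no known lower bound).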
+ unsigned long long Res = CI->isZero() ? -1ULL : 0;
+ Constant *ResCI = ConstantInt::get(II->getType(), Res);
+ unsigned ResultReg = getRegForValue(ResCI);
+ if (!ResultReg)
+ return false;
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::expect: {
+ unsigned ResultReg = getRegForValue(II->getArgOperand(0));
+ if (!ResultReg)
+ return false;
+ updateValueMap(II, ResultReg);
+ return true;
+ }
+ case Intrinsic::experimental_stackmap:
+ return selectStackmap(II);
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ return selectPatchpoint(II);
+ }
+
+ return fastLowerIntrinsicCall(II);
+}
+
+bool FastISel::selectCast(const User *I, unsigned Opcode) {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+
+ if (SrcVT == MVT::Other || !SrcVT.isSimple() || DstVT == MVT::Other ||
+ !DstVT.isSimple())
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ // Check if the destination type is legal.
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ // Check if the source operand is legal.
+ if (!TLI.isTypeLegal(SrcVT))
+ return false;
+
+ unsigned InputReg = getRegForValue(I->getOperand(0));
+ if (!InputReg)
+ // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+
+ bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
+
+ unsigned ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+ Opcode, InputReg, InputRegIsKill);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectBitCast(const User *I) {
+ // If the bitcast doesn't change the type, just use the operand value.
+ if (I->getType() == I->getOperand(0)->getType()) {
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ // Bitcasts of other values become reg-reg copies or BITCAST operators.
+ EVT SrcEVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstEVT = TLI.getValueType(DL, I->getType());
+ if (SrcEVT == MVT::Other || DstEVT == MVT::Other ||
+ !TLI.isTypeLegal(SrcEVT) || !TLI.isTypeLegal(DstEVT))
+ // Unhandled type. Halt "fast" selection and bail.
+ return false;
+
+ MVT SrcVT = SrcEVT.getSimpleVT();
+ MVT DstVT = DstEVT.getSimpleVT();
+ unsigned Op0 = getRegForValue(I->getOperand(0));
+ if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
+ return false;
+ bool Op0IsKill = hasTrivialKill(I->getOperand(0));
+
+ // First, try to perform the bitcast by inserting a reg-reg copy.
+ unsigned ResultReg = 0;
+ if (SrcVT == DstVT) {
+ const TargetRegisterClass *SrcClass = TLI.getRegClassFor(SrcVT);
+ const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
+ // Don't attempt a cross-class copy. It will likely fail.
+ if (SrcClass == DstClass) {
+ ResultReg = createResultReg(DstClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Op0);
+ }
+ }
+
+ // If the reg-reg copy failed, select a BITCAST opcode.
+ if (!ResultReg)
+ ResultReg = fastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill);
+
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+// Return true if we should copy from swift error to the final vreg as specified
+// by SwiftErrorWorklist.
+static bool shouldCopySwiftErrorsToFinalVRegs(const TargetLowering &TLI,
+ FunctionLoweringInfo &FuncInfo) {
+ if (!TLI.supportSwiftError())
+ return false;
+ return FuncInfo.SwiftErrorWorklist.count(FuncInfo.MBB);
+}
+
+// Remove local value instructions starting from the instruction after
+// SavedLastLocalValue to the current function insert point.
+void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue) {
+ MachineInstr *CurLastLocalValue = getLastLocalValue();
+ if (CurLastLocalValue != SavedLastLocalValue) {
+ // Find the first local value instruction to be deleted.
+ // This is the instruction after SavedLastLocalValue if it is non-NULL.
+ // Otherwise it's the first instruction in the block.
+ MachineBasicBlock::iterator FirstDeadInst(SavedLastLocalValue);
+ if (SavedLastLocalValue)
+ ++FirstDeadInst;
+ else
+ FirstDeadInst = FuncInfo.MBB->getFirstNonPHI();
+ setLastLocalValue(SavedLastLocalValue);
+ removeDeadCode(FirstDeadInst, FuncInfo.InsertPt);
+ }
+}
+
+bool FastISel::selectInstruction(const Instruction *I) {
+ MachineInstr *SavedLastLocalValue = getLastLocalValue();
+ // Just before the terminator instruction, insert instructions to
+ // feed PHI nodes in successor blocks.
+ if (isa<TerminatorInst>(I)) {
+ // If we need to materialize any vreg from worklist, we bail out of
+ // FastISel.
+ if (shouldCopySwiftErrorsToFinalVRegs(TLI, FuncInfo))
+ return false;
+ if (!handlePHINodesInSuccessorBlocks(I->getParent())) {
+ // PHI node handling may have generated local value instructions,
+ // even though it failed to handle all PHI nodes.
+ // We remove these instructions because SelectionDAGISel will generate
+ // them again.
+ removeDeadLocalValueCode(SavedLastLocalValue);
+ return false;
+ }
+ }
+
+ // FastISel does not handle any operand bundles except OB_funclet.
+ if (ImmutableCallSite CS = ImmutableCallSite(I))
+ for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i)
+ if (CS.getOperandBundleAt(i).getTagID() != LLVMContext::OB_funclet)
+ return false;
+
+ DbgLoc = I->getDebugLoc();
+
+ SavedInsertPt = FuncInfo.InsertPt;
+
+ if (const auto *Call = dyn_cast<CallInst>(I)) {
+ const Function *F = Call->getCalledFunction();
+ LibFunc::Func Func;
+
+ // As a special case, don't handle calls to builtin library functions that
+ // may be translated directly to target instructions.
+ if (F && !F->hasLocalLinkage() && F->hasName() &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func))
+ return false;
+
+ // Don't handle Intrinsic::trap if a trap function is specified.
+ if (F && F->getIntrinsicID() == Intrinsic::trap &&
+ Call->hasFnAttr("trap-func-name"))
+ return false;
+ }
+
+ // First, try doing target-independent selection.
+ if (!SkipTargetIndependentISel) {
+ if (selectOperator(I, I->getOpcode())) {
+ ++NumFastIselSuccessIndependent;
+ DbgLoc = DebugLoc();
+ return true;
+ }
+ // Remove dead code.
+ recomputeInsertPt();
+ if (SavedInsertPt != FuncInfo.InsertPt)
+ removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+ SavedInsertPt = FuncInfo.InsertPt;
+ }
+ // Next, try calling the target to attempt to handle the instruction.
+ if (fastSelectInstruction(I)) {
+ ++NumFastIselSuccessTarget;
+ DbgLoc = DebugLoc();
+ return true;
+ }
+ // Remove dead code.
+ recomputeInsertPt();
+ if (SavedInsertPt != FuncInfo.InsertPt)
+ removeDeadCode(FuncInfo.InsertPt, SavedInsertPt);
+
+ DbgLoc = DebugLoc();
+ // Undo phi node updates, because they will be added again by SelectionDAG.
+ if (isa<TerminatorInst>(I)) {
+ // PHI node handling may have generated local value instructions.
+ // We remove them because SelectionDAGISel will generate them again.
+ removeDeadLocalValueCode(SavedLastLocalValue);
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ }
+ return false;
+}
+
+/// Emit an unconditional branch to the given block, unless it is the immediate
+/// (fall-through) successor, and update the CFG.
+void FastISel::fastEmitBranch(MachineBasicBlock *MSucc,
+ const DebugLoc &DbgLoc) {
+ if (FuncInfo.MBB->getBasicBlock()->size() > 1 &&
+ FuncInfo.MBB->isLayoutSuccessor(MSucc)) {
+ // If this were the only instruction in the block, we would emit the branch
+ // anyway for more accurate line information; otherwise this is the
+ // unconditional fall-through case, which needs no instructions.
+ } else {
+ // The unconditional branch case.
+ TII.InsertBranch(*FuncInfo.MBB, MSucc, nullptr,
+ SmallVector<MachineOperand, 0>(), DbgLoc);
+ }
+ if (FuncInfo.BPI) {
+ auto BranchProbability = FuncInfo.BPI->getEdgeProbability(
+ FuncInfo.MBB->getBasicBlock(), MSucc->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(MSucc, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(MSucc);
+}
+
+void FastISel::finishCondBranch(const BasicBlock *BranchBB,
+ MachineBasicBlock *TrueMBB,
+ MachineBasicBlock *FalseMBB) {
+ // Add TrueMBB as a successor unless it is equal to FalseMBB: this can happen
+ // in degenerate IR, and MachineIR forbids having a block appear twice in the
+ // successor/predecessor lists.
+ if (TrueMBB != FalseMBB) {
+ if (FuncInfo.BPI) {
+ auto BranchProbability =
+ FuncInfo.BPI->getEdgeProbability(BranchBB, TrueMBB->getBasicBlock());
+ FuncInfo.MBB->addSuccessor(TrueMBB, BranchProbability);
+ } else
+ FuncInfo.MBB->addSuccessorWithoutProb(TrueMBB);
+ }
+
+ fastEmitBranch(FalseMBB, DbgLoc);
+}
+
+/// Emit an FNeg operation.
+bool FastISel::selectFNeg(const User *I) {
+ unsigned OpReg = getRegForValue(BinaryOperator::getFNegArgument(I));
+ if (!OpReg)
+ return false;
+ bool OpRegIsKill = hasTrivialKill(I);
+
+ // If the target has ISD::FNEG, use it.
+ EVT VT = TLI.getValueType(DL, I->getType());
+ unsigned ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
+ OpReg, OpRegIsKill);
+ if (ResultReg) {
+ updateValueMap(I, ResultReg);
+ return true;
+ }
+
+ // Bitcast the value to integer, twiddle the sign bit with xor,
+ // and then bitcast it back to floating-point.
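+ // For example, for an f64 the xor constant below is 1 << 63, i.e.
+ // 0x8000000000000000, which flips only the sign bit.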
+ if (VT.getSizeInBits() > 64)
+ return false;
+ EVT IntVT = EVT::getIntegerVT(I->getContext(), VT.getSizeInBits());
+ if (!TLI.isTypeLegal(IntVT))
+ return false;
+
+ unsigned IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
+ ISD::BITCAST, OpReg, OpRegIsKill);
+ if (!IntReg)
+ return false;
+
+ unsigned IntResultReg = fastEmit_ri_(
+ IntVT.getSimpleVT(), ISD::XOR, IntReg, /*IsKill=*/true,
+ UINT64_C(1) << (VT.getSizeInBits() - 1), IntVT.getSimpleVT());
+ if (!IntResultReg)
+ return false;
+
+ ResultReg = fastEmit_r(IntVT.getSimpleVT(), VT.getSimpleVT(), ISD::BITCAST,
+ IntResultReg, /*IsKill=*/true);
+ if (!ResultReg)
+ return false;
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
+bool FastISel::selectExtractValue(const User *U) {
+ const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(U);
+ if (!EVI)
+ return false;
+
+ // Make sure we only try to handle extracts with a legal result. But also
+ // allow i1 because it's easy.
+ EVT RealVT = TLI.getValueType(DL, EVI->getType(), /*AllowUnknown=*/true);
+ if (!RealVT.isSimple())
+ return false;
+ MVT VT = RealVT.getSimpleVT();
+ if (!TLI.isTypeLegal(VT) && VT != MVT::i1)
+ return false;
+
+ const Value *Op0 = EVI->getOperand(0);
+ Type *AggTy = Op0->getType();
+
+ // Get the base result register.
+ unsigned ResultReg;
+ DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0);
+ if (I != FuncInfo.ValueMap.end())
+ ResultReg = I->second;
+ else if (isa<Instruction>(Op0))
+ ResultReg = FuncInfo.InitializeRegForValue(Op0);
+ else
+ return false; // fast-isel can't handle aggregate constants at the moment
+
+ // Get the actual result register, which is an offset from the base register.
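+ // For example (illustrative), extracting index {1, 1} from {i32, {i64, float}}
+ // yields linear index 2, so ResultReg is advanced past the registers used for
+ // the i32 and the i64 members.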
+ unsigned VTIndex = ComputeLinearIndex(AggTy, EVI->getIndices());
+
+ SmallVector<EVT, 4> AggValueVTs;
+ ComputeValueVTs(TLI, DL, AggTy, AggValueVTs);
+
+ for (unsigned i = 0; i < VTIndex; i++)
+ ResultReg += TLI.getNumRegisters(FuncInfo.Fn->getContext(), AggValueVTs[i]);
+
+ updateValueMap(EVI, ResultReg);
+ return true;
+}
+
+bool FastISel::selectOperator(const User *I, unsigned Opcode) {
+ switch (Opcode) {
+ case Instruction::Add:
+ return selectBinaryOp(I, ISD::ADD);
+ case Instruction::FAdd:
+ return selectBinaryOp(I, ISD::FADD);
+ case Instruction::Sub:
+ return selectBinaryOp(I, ISD::SUB);
+ case Instruction::FSub:
+ // FNeg is currently represented in LLVM IR as a special case of FSub.
+ if (BinaryOperator::isFNeg(I))
+ return selectFNeg(I);
+ return selectBinaryOp(I, ISD::FSUB);
+ case Instruction::Mul:
+ return selectBinaryOp(I, ISD::MUL);
+ case Instruction::FMul:
+ return selectBinaryOp(I, ISD::FMUL);
+ case Instruction::SDiv:
+ return selectBinaryOp(I, ISD::SDIV);
+ case Instruction::UDiv:
+ return selectBinaryOp(I, ISD::UDIV);
+ case Instruction::FDiv:
+ return selectBinaryOp(I, ISD::FDIV);
+ case Instruction::SRem:
+ return selectBinaryOp(I, ISD::SREM);
+ case Instruction::URem:
+ return selectBinaryOp(I, ISD::UREM);
+ case Instruction::FRem:
+ return selectBinaryOp(I, ISD::FREM);
+ case Instruction::Shl:
+ return selectBinaryOp(I, ISD::SHL);
+ case Instruction::LShr:
+ return selectBinaryOp(I, ISD::SRL);
+ case Instruction::AShr:
+ return selectBinaryOp(I, ISD::SRA);
+ case Instruction::And:
+ return selectBinaryOp(I, ISD::AND);
+ case Instruction::Or:
+ return selectBinaryOp(I, ISD::OR);
+ case Instruction::Xor:
+ return selectBinaryOp(I, ISD::XOR);
+
+ case Instruction::GetElementPtr:
+ return selectGetElementPtr(I);
+
+ case Instruction::Br: {
+ const BranchInst *BI = cast<BranchInst>(I);
+
+ if (BI->isUnconditional()) {
+ const BasicBlock *LLVMSucc = BI->getSuccessor(0);
+ MachineBasicBlock *MSucc = FuncInfo.MBBMap[LLVMSucc];
+ fastEmitBranch(MSucc, BI->getDebugLoc());
+ return true;
+ }
+
+ // Conditional branches are not handled yet.
+ // Halt "fast" selection and bail.
+ return false;
+ }
+
+ case Instruction::Unreachable:
+ if (TM.Options.TrapUnreachable)
+ return fastEmit_(MVT::Other, MVT::Other, ISD::TRAP) != 0;
+ else
+ return true;
+
+ case Instruction::Alloca:
+ // FunctionLowering has the static-sized case covered.
+ if (FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(I)))
+ return true;
+
+ // Dynamic-sized alloca is not handled yet.
+ return false;
+
+ case Instruction::Call:
+ return selectCall(I);
+
+ case Instruction::BitCast:
+ return selectBitCast(I);
+
+ case Instruction::FPToSI:
+ return selectCast(I, ISD::FP_TO_SINT);
+ case Instruction::ZExt:
+ return selectCast(I, ISD::ZERO_EXTEND);
+ case Instruction::SExt:
+ return selectCast(I, ISD::SIGN_EXTEND);
+ case Instruction::Trunc:
+ return selectCast(I, ISD::TRUNCATE);
+ case Instruction::SIToFP:
+ return selectCast(I, ISD::SINT_TO_FP);
+
+ case Instruction::IntToPtr: // Deliberate fall-through.
+ case Instruction::PtrToInt: {
+ EVT SrcVT = TLI.getValueType(DL, I->getOperand(0)->getType());
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (DstVT.bitsGT(SrcVT))
+ return selectCast(I, ISD::ZERO_EXTEND);
+ if (DstVT.bitsLT(SrcVT))
+ return selectCast(I, ISD::TRUNCATE);
+ unsigned Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ return false;
+ updateValueMap(I, Reg);
+ return true;
+ }
+
+ case Instruction::ExtractValue:
+ return selectExtractValue(I);
+
+ case Instruction::PHI:
+ llvm_unreachable("FastISel shouldn't visit PHI nodes!");
+
+ default:
+ // Unhandled instruction. Halt "fast" selection and bail.
+ return false;
+ }
+}
+
+FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
+ const TargetLibraryInfo *LibInfo,
+ bool SkipTargetIndependentISel)
+ : FuncInfo(FuncInfo), MF(FuncInfo.MF), MRI(FuncInfo.MF->getRegInfo()),
+ MFI(*FuncInfo.MF->getFrameInfo()), MCP(*FuncInfo.MF->getConstantPool()),
+ TM(FuncInfo.MF->getTarget()), DL(MF->getDataLayout()),
+ TII(*MF->getSubtarget().getInstrInfo()),
+ TLI(*MF->getSubtarget().getTargetLowering()),
+ TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
+ SkipTargetIndependentISel(SkipTargetIndependentISel) {}
+
+FastISel::~FastISel() {}
+
+bool FastISel::fastLowerArguments() { return false; }
+
+bool FastISel::fastLowerCall(CallLoweringInfo & /*CLI*/) { return false; }
+
+bool FastISel::fastLowerIntrinsicCall(const IntrinsicInst * /*II*/) {
+ return false;
+}
+
+unsigned FastISel::fastEmit_(MVT, MVT, unsigned) { return 0; }
+
+unsigned FastISel::fastEmit_r(MVT, MVT, unsigned, unsigned /*Op0*/,
+ bool /*Op0IsKill*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_rr(MVT, MVT, unsigned, unsigned /*Op0*/,
+ bool /*Op0IsKill*/, unsigned /*Op1*/,
+ bool /*Op1IsKill*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_i(MVT, MVT, unsigned, uint64_t /*Imm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_f(MVT, MVT, unsigned,
+ const ConstantFP * /*FPImm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_ri(MVT, MVT, unsigned, unsigned /*Op0*/,
+ bool /*Op0IsKill*/, uint64_t /*Imm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_rf(MVT, MVT, unsigned, unsigned /*Op0*/,
+ bool /*Op0IsKill*/,
+ const ConstantFP * /*FPImm*/) {
+ return 0;
+}
+
+unsigned FastISel::fastEmit_rri(MVT, MVT, unsigned, unsigned /*Op0*/,
+ bool /*Op0IsKill*/, unsigned /*Op1*/,
+ bool /*Op1IsKill*/, uint64_t /*Imm*/) {
+ return 0;
+}
+
+/// This method is a wrapper of fastEmit_ri. It first tries to emit an
+/// instruction with an immediate operand using fastEmit_ri.
+/// If that fails, it materializes the immediate into a register and tries
+/// fastEmit_rr instead.
+unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
+ bool Op0IsKill, uint64_t Imm, MVT ImmType) {
+ // If this is a multiply by a power of two, emit this as a shift left.
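+ // mul x, 16 -> shl x, 4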
+ if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
+ Opcode = ISD::SHL;
+ Imm = Log2_64(Imm);
+ } else if (Opcode == ISD::UDIV && isPowerOf2_64(Imm)) {
+ // div x, 8 -> srl x, 3
+ Opcode = ISD::SRL;
+ Imm = Log2_64(Imm);
+ }
+
+ // Horrible hack (to be removed): check that shift amounts are in range.
+ if ((Opcode == ISD::SHL || Opcode == ISD::SRA || Opcode == ISD::SRL) &&
+ Imm >= VT.getSizeInBits())
+ return 0;
+
+ // First check if immediate type is legal. If not, we can't use the ri form.
+ unsigned ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
+ if (ResultReg)
+ return ResultReg;
+ unsigned MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+ bool IsImmKill = true;
+ if (!MaterialReg) {
+ // This is a bit ugly/slow, but failing here means falling out of
+ // fast-isel, which would be very slow.
+ IntegerType *ITy =
+ IntegerType::get(FuncInfo.Fn->getContext(), VT.getSizeInBits());
+ MaterialReg = getRegForValue(ConstantInt::get(ITy, Imm));
+ if (!MaterialReg)
+ return 0;
+ // FIXME: If the materialized register here has no uses yet then this
+ // will be the first use and we should be able to mark it as killed.
+ // However, the local value area for materialising constant expressions
+ // grows down, not up, which means that any constant expressions we generate
+ // later which also use 'Imm' could be after this instruction and therefore
+ // after this kill.
+ IsImmKill = false;
+ }
+ return fastEmit_rr(VT, VT, Opcode, Op0, Op0IsKill, MaterialReg, IsImmKill);
+}
+
+unsigned FastISel::createResultReg(const TargetRegisterClass *RC) {
+ return MRI.createVirtualRegister(RC);
+}
+
+unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
+ unsigned OpNum) {
+ if (TargetRegisterInfo::isVirtualRegister(Op)) {
+ const TargetRegisterClass *RegClass =
+ TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
+ if (!MRI.constrainRegClass(Op, RegClass)) {
+ // If it's not legal to COPY between the register classes, something
+ // has gone very wrong before we got here.
+ unsigned NewOp = createResultReg(RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), NewOp).addReg(Op);
+ return NewOp;
+ }
+ }
+ return Op;
+}
+
+unsigned FastISel::fastEmitInst_(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC) {
+ unsigned ResultReg = createResultReg(RC);
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg);
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, unsigned Op1,
+ bool Op1IsKill) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill));
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, unsigned Op1,
+ bool Op1IsKill, unsigned Op2,
+ bool Op2IsKill) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+ Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill));
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addReg(Op2, getKillRegState(Op2IsKill));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, uint64_t Imm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, uint64_t Imm1,
+ uint64_t Imm2) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(Imm1)
+ .addImm(Imm2);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addImm(Imm1)
+ .addImm(Imm2);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC,
+ const ConstantFP *FPImm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addFPImm(FPImm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addFPImm(FPImm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, unsigned Op0,
+ bool Op0IsKill, unsigned Op1,
+ bool Op1IsKill, uint64_t Imm) {
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ unsigned ResultReg = createResultReg(RC);
+ Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
+ Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
+ .addReg(Op0, getKillRegState(Op0IsKill))
+ .addReg(Op1, getKillRegState(Op1IsKill))
+ .addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+ const TargetRegisterClass *RC, uint64_t Imm) {
+ unsigned ResultReg = createResultReg(RC);
+ const MCInstrDesc &II = TII.get(MachineInstOpcode);
+
+ if (II.getNumDefs() >= 1)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
+ .addImm(Imm);
+ else {
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addImm(Imm);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(II.ImplicitDefs[0]);
+ }
+ return ResultReg;
+}
+
+unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
+ bool Op0IsKill, uint32_t Idx) {
+ unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+ assert(TargetRegisterInfo::isVirtualRegister(Op0) &&
+ "Cannot yet extract from physregs");
+ const TargetRegisterClass *RC = MRI.getRegClass(Op0);
+ MRI.constrainRegClass(Op0, TRI.getSubClassWithSubReg(RC, Idx));
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
+ ResultReg).addReg(Op0, getKillRegState(Op0IsKill), Idx);
+ return ResultReg;
+}
+
+/// Emit MachineInstrs to compute the value of Op with all but the least
+/// significant bit set to zero.
+unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
+ return fastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1);
+}
+
+/// handlePHINodesInSuccessorBlocks - Handle PHI nodes in successor blocks.
+/// Emit code to ensure constants are copied into registers when needed.
+/// Remember the virtual registers that need to be added to the Machine PHI
+/// nodes as input. We cannot just directly add them, because expansion
+/// might result in multiple MBB's for one BB. As such, the start of the
+/// BB might correspond to a different MBB than the end.
+bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+ const TerminatorInst *TI = LLVMBB->getTerminator();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+ FuncInfo.OrigNumPHINodesToUpdate = FuncInfo.PHINodesToUpdate.size();
+
+ // Check successor nodes' PHI nodes that expect a constant to be available
+ // from this block.
+ for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+ const BasicBlock *SuccBB = TI->getSuccessor(succ);
+ if (!isa<PHINode>(SuccBB->begin()))
+ continue;
+ MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB).second)
+ continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (BasicBlock::const_iterator I = SuccBB->begin();
+ const auto *PN = dyn_cast<PHINode>(I); ++I) {
+
+ // Ignore dead PHIs.
+ if (PN->use_empty())
+ continue;
+
+ // Only handle legal types. Two interesting things to note here. First,
+ // by bailing out early, we may leave behind some dead instructions,
+ // since SelectionDAG's HandlePHINodesInSuccessorBlocks will insert its
+ // own moves. Second, this check is necessary because FastISel doesn't
+ // use CreateRegs to create registers, so it always creates
+ // exactly one register for each non-void instruction.
+ EVT VT = TLI.getValueType(DL, PN->getType(), /*AllowUnknown=*/true);
+ if (VT == MVT::Other || !TLI.isTypeLegal(VT)) {
+ // Handle integer promotions, though, because they're common and easy.
+ if (!(VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)) {
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ return false;
+ }
+ }
+
+ const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+ // Set the DebugLoc for the copy. Prefer the location of the operand
+ // if there is one; use the location of the PHI otherwise.
+ DbgLoc = PN->getDebugLoc();
+ if (const auto *Inst = dyn_cast<Instruction>(PHIOp))
+ DbgLoc = Inst->getDebugLoc();
+
+ unsigned Reg = getRegForValue(PHIOp);
+ if (!Reg) {
+ FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
+ return false;
+ }
+ FuncInfo.PHINodesToUpdate.push_back(std::make_pair(&*MBBI++, Reg));
+ DbgLoc = DebugLoc();
+ }
+ }
+
+ return true;
+}
+
+bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
+ assert(LI->hasOneUse() &&
+ "tryToFoldLoad expected a LoadInst with a single use");
+ // We know that the load has a single use, but don't know what it is. If it
+ // isn't one of the folded instructions, then we can't succeed here. Handle
+ // this by scanning the single-use users of the load until we get to FoldInst.
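+ // (Illustrative chain: 'load -> sext -> add', with the add being FoldInst; we
+ // walk through the single-use sext to reach it.)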
+ unsigned MaxUsers = 6; // Don't scan down huge single-use chains of instrs.
+
+ const Instruction *TheUser = LI->user_back();
+ while (TheUser != FoldInst && // Scan up until we find FoldInst.
+ // Stay in the right block.
+ TheUser->getParent() == FoldInst->getParent() &&
+ --MaxUsers) { // Don't scan too far.
+ // If there are multiple or no uses of this instruction, then bail out.
+ if (!TheUser->hasOneUse())
+ return false;
+
+ TheUser = TheUser->user_back();
+ }
+
+ // If we didn't find the fold instruction, then we failed to collapse the
+ // sequence.
+ if (TheUser != FoldInst)
+ return false;
+
+ // Don't try to fold volatile loads. Target has to deal with alignment
+ // constraints.
+ if (LI->isVolatile())
+ return false;
+
+ // Figure out which vreg this is going into. If there is no assigned vreg yet
+ // then there actually was no reference to it. Perhaps the load is referenced
+ // by a dead instruction.
+ unsigned LoadReg = getRegForValue(LI);
+ if (!LoadReg)
+ return false;
+
+ // We can't fold if this vreg has no uses or more than one use. Multiple uses
+ // may mean that the instruction got lowered to multiple MIs, or the use of
+ // the loaded value ended up being multiple operands of the result.
+ if (!MRI.hasOneUse(LoadReg))
+ return false;
+
+ MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(LoadReg);
+ MachineInstr *User = RI->getParent();
+
+ // Set the insertion point properly. Folding the load can cause generation of
+ // other random instructions (like sign extends) for addressing modes; make
+ // sure they get inserted in a logical place before the new instruction.
+ FuncInfo.InsertPt = User;
+ FuncInfo.MBB = User->getParent();
+
+ // Ask the target to try folding the load.
+ return tryToFoldLoadIntoMI(User, RI.getOperandNo(), LI);
+}
+
+bool FastISel::canFoldAddIntoGEP(const User *GEP, const Value *Add) {
+ // Must be an add.
+ if (!isa<AddOperator>(Add))
+ return false;
+ // Type size needs to match.
+ if (DL.getTypeSizeInBits(GEP->getType()) !=
+ DL.getTypeSizeInBits(Add->getType()))
+ return false;
+ // Must be in the same basic block.
+ if (isa<Instruction>(Add) &&
+ FuncInfo.MBBMap[cast<Instruction>(Add)->getParent()] != FuncInfo.MBB)
+ return false;
+ // Must have a constant operand.
+ return isa<ConstantInt>(cast<AddOperator>(Add)->getOperand(1));
+}
+
+MachineMemOperand *
+FastISel::createMachineMemOperandFor(const Instruction *I) const {
+ const Value *Ptr;
+ Type *ValTy;
+ unsigned Alignment;
+ MachineMemOperand::Flags Flags;
+ bool IsVolatile;
+
+ if (const auto *LI = dyn_cast<LoadInst>(I)) {
+ Alignment = LI->getAlignment();
+ IsVolatile = LI->isVolatile();
+ Flags = MachineMemOperand::MOLoad;
+ Ptr = LI->getPointerOperand();
+ ValTy = LI->getType();
+ } else if (const auto *SI = dyn_cast<StoreInst>(I)) {
+ Alignment = SI->getAlignment();
+ IsVolatile = SI->isVolatile();
+ Flags = MachineMemOperand::MOStore;
+ Ptr = SI->getPointerOperand();
+ ValTy = SI->getValueOperand()->getType();
+ } else
+ return nullptr;
+
+ bool IsNonTemporal = I->getMetadata(LLVMContext::MD_nontemporal) != nullptr;
+ bool IsInvariant = I->getMetadata(LLVMContext::MD_invariant_load) != nullptr;
+ const MDNode *Ranges = I->getMetadata(LLVMContext::MD_range);
+
+ AAMDNodes AAInfo;
+ I->getAAMetadata(AAInfo);
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0.
+ Alignment = DL.getABITypeAlignment(ValTy);
+
+ unsigned Size = DL.getTypeStoreSize(ValTy);
+
+ if (IsVolatile)
+ Flags |= MachineMemOperand::MOVolatile;
+ if (IsNonTemporal)
+ Flags |= MachineMemOperand::MONonTemporal;
+ if (IsInvariant)
+ Flags |= MachineMemOperand::MOInvariant;
+
+ return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size,
+ Alignment, AAInfo, Ranges);
+}
+
+CmpInst::Predicate FastISel::optimizeCmpPredicate(const CmpInst *CI) const {
+ // If both operands are the same, then try to optimize or fold the cmp.
+ CmpInst::Predicate Predicate = CI->getPredicate();
+ if (CI->getOperand(0) != CI->getOperand(1))
+ return Predicate;
+
+ switch (Predicate) {
+ default: llvm_unreachable("Invalid predicate!");
+ case CmpInst::FCMP_FALSE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OEQ: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OGE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_OLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_OLE: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_ONE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::FCMP_ORD: Predicate = CmpInst::FCMP_ORD; break;
+ case CmpInst::FCMP_UNO: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UEQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UGT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_ULT: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::FCMP_UNE: Predicate = CmpInst::FCMP_UNO; break;
+ case CmpInst::FCMP_TRUE: Predicate = CmpInst::FCMP_TRUE; break;
+
+ case CmpInst::ICMP_EQ: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_NE: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_UGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_ULT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_ULE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SGT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SGE: Predicate = CmpInst::FCMP_TRUE; break;
+ case CmpInst::ICMP_SLT: Predicate = CmpInst::FCMP_FALSE; break;
+ case CmpInst::ICMP_SLE: Predicate = CmpInst::FCMP_TRUE; break;
+ }
+
+ return Predicate;
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
new file mode 100644
index 000000000000..e669ffc3d02a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -0,0 +1,615 @@
+//===-- FunctionLoweringInfo.cpp ------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating functions from LLVM IR into
+// Machine IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "function-lowering-info"
+
+/// isUsedOutsideOfDefiningBlock - Return true if this instruction is used by
+/// PHI nodes or outside of the basic block that defines it, or used by a
+/// switch or atomic instruction, which may expand to multiple basic blocks.
+static bool isUsedOutsideOfDefiningBlock(const Instruction *I) {
+ if (I->use_empty()) return false;
+ if (isa<PHINode>(I)) return true;
+ const BasicBlock *BB = I->getParent();
+ for (const User *U : I->users())
+ if (cast<Instruction>(U)->getParent() != BB || isa<PHINode>(U))
+ return true;
+
+ return false;
+}
+
+static ISD::NodeType getPreferredExtendForValue(const Value *V) {
+ // If the source value is used by compare instructions and the number of
+ // signed predicates among those users is greater than the number of
+ // unsigned predicates, we prefer to use SIGN_EXTEND.
+ //
+ // With this optimization, we would be able to reduce some redundant sign or
+ // zero extension instruction, and eventually more machine CSE opportunities
+ // can be exposed.
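+ // For example, a value feeding two 'icmp slt' users and one 'icmp ult' user
+ // has NumOfSigned == 2 > NumOfUnsigned == 1, so SIGN_EXTEND is chosen.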
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+ unsigned NumOfSigned = 0, NumOfUnsigned = 0;
+ for (const User *U : V->users()) {
+ if (const auto *CI = dyn_cast<CmpInst>(U)) {
+ NumOfSigned += CI->isSigned();
+ NumOfUnsigned += CI->isUnsigned();
+ }
+ }
+ if (NumOfSigned > NumOfUnsigned)
+ ExtendKind = ISD::SIGN_EXTEND;
+
+ return ExtendKind;
+}
+
+void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
+ SelectionDAG *DAG) {
+ Fn = &fn;
+ MF = &mf;
+ TLI = MF->getSubtarget().getTargetLowering();
+ RegInfo = &MF->getRegInfo();
+ MachineModuleInfo &MMI = MF->getMMI();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ unsigned StackAlign = TFI->getStackAlignment();
+
+ // Check whether the function can return without sret-demotion.
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(Fn->getReturnType(), Fn->getAttributes(), Outs, *TLI,
+ mf.getDataLayout());
+ CanLowerReturn = TLI->CanLowerReturn(Fn->getCallingConv(), *MF,
+ Fn->isVarArg(), Outs, Fn->getContext());
+
+ // If this personality uses funclets, we need to do a bit more work.
+ DenseMap<const AllocaInst *, int *> CatchObjects;
+ EHPersonality Personality = classifyEHPersonality(
+ Fn->hasPersonalityFn() ? Fn->getPersonalityFn() : nullptr);
+ if (isFuncletEHPersonality(Personality)) {
+ // Calculate state numbers if we haven't already.
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+ if (Personality == EHPersonality::MSVC_CXX)
+ calculateWinCXXEHStateNumbers(&fn, EHInfo);
+ else if (isAsynchronousEHPersonality(Personality))
+ calculateSEHStateNumbers(&fn, EHInfo);
+ else if (Personality == EHPersonality::CoreCLR)
+ calculateClrEHStateNumbers(&fn, EHInfo);
+
+ // Map all BB references in the WinEH data to MBBs.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (const AllocaInst *AI = H.CatchObj.Alloca)
+ CatchObjects.insert({AI, &H.CatchObj.FrameIndex});
+ else
+ H.CatchObj.FrameIndex = INT_MAX;
+ }
+ }
+ }
+
+ // Initialize the mapping of values to registers. This is only set up for
+ // instruction values that are used outside of the block that defines
+ // them.
+ Function::const_iterator BB = Fn->begin(), EB = Fn->end();
+ for (; BB != EB; ++BB)
+ for (BasicBlock::const_iterator I = BB->begin(), E = BB->end();
+ I != E; ++I) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ Type *Ty = AI->getAllocatedType();
+ unsigned Align =
+ std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty),
+ AI->getAlignment());
+
+ // Static allocas can be folded into the initial stack frame
+ // adjustment. For targets that don't realign the stack, don't
+ // do this if there is an extra alignment requirement.
+ if (AI->isStaticAlloca() &&
+ (TFI->isStackRealignable() || (Align <= StackAlign))) {
+ const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize());
+ uint64_t TySize = MF->getDataLayout().getTypeAllocSize(Ty);
+
+ TySize *= CUI->getZExtValue(); // Get total allocated size.
+ if (TySize == 0) TySize = 1; // Don't create zero-sized stack objects.
+ int FrameIndex = INT_MAX;
+ auto Iter = CatchObjects.find(AI);
+ if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) {
+ FrameIndex = MF->getFrameInfo()->CreateFixedObject(
+ TySize, 0, /*Immutable=*/false, /*isAliased=*/true);
+ MF->getFrameInfo()->setObjectAlignment(FrameIndex, Align);
+ } else {
+ FrameIndex =
+ MF->getFrameInfo()->CreateStackObject(TySize, Align, false, AI);
+ }
+
+ StaticAllocaMap[AI] = FrameIndex;
+ // Update the catch handler information.
+ if (Iter != CatchObjects.end())
+ *Iter->second = FrameIndex;
+ } else {
+ // FIXME: Overaligned static allocas should be grouped into
+ // a single dynamic allocation instead of using a separate
+ // stack allocation for each one.
+ if (Align <= StackAlign)
+ Align = 0;
+ // Inform the Frame Information that we have variable-sized objects.
+ MF->getFrameInfo()->CreateVariableSizedObject(Align ? Align : 1, AI);
+ }
+ }
+
+ // Look for inline asm that clobbers the SP register.
+ if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
+ ImmutableCallSite CS(&*I);
+ if (isa<InlineAsm>(CS.getCalledValue())) {
+ unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ std::vector<TargetLowering::AsmOperandInfo> Ops =
+ TLI->ParseConstraints(Fn->getParent()->getDataLayout(), TRI, CS);
+ for (size_t I = 0, E = Ops.size(); I != E; ++I) {
+ TargetLowering::AsmOperandInfo &Op = Ops[I];
+ if (Op.Type == InlineAsm::isClobber) {
+ // Clobbers don't have SDValue operands, hence SDValue().
+ TLI->ComputeConstraintToUse(Op, SDValue(), DAG);
+ std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+ TLI->getRegForInlineAsmConstraint(TRI, Op.ConstraintCode,
+ Op.ConstraintVT);
+ if (PhysReg.first == SP)
+ MF->getFrameInfo()->setHasOpaqueSPAdjustment(true);
+ }
+ }
+ }
+ }
+
+ // Look for calls to the @llvm.va_start intrinsic. We can omit some
+ // prologue boilerplate for variadic functions that don't examine their
+ // arguments.
+ if (const auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::vastart)
+ MF->getFrameInfo()->setHasVAStart(true);
+ }
+
+ // If we have a musttail call in a variadic function, we need to ensure we
+ // forward implicit register parameters.
+ if (const auto *CI = dyn_cast<CallInst>(I)) {
+ if (CI->isMustTailCall() && Fn->isVarArg())
+ MF->getFrameInfo()->setHasMustTailInVarArgFunc(true);
+ }
+
+ // Mark values used outside their block as exported, by allocating
+ // a virtual register for them.
+ if (isUsedOutsideOfDefiningBlock(&*I))
+ if (!isa<AllocaInst>(I) || !StaticAllocaMap.count(cast<AllocaInst>(I)))
+ InitializeRegForValue(&*I);
+
+ // Collect llvm.dbg.declare information. This is done now instead of
+ // during the initial isel pass through the IR so that it is done
+ // in a predictable order.
+ if (const DbgDeclareInst *DI = dyn_cast<DbgDeclareInst>(I)) {
+ assert(DI->getVariable() && "Missing variable");
+ assert(DI->getDebugLoc() && "Missing location");
+ if (MMI.hasDebugInfo()) {
+ // Don't handle byval struct arguments or VLAs, for example.
+ // Non-byval arguments are handled here (they refer to the stack
+ // temporary alloca at this point).
+ const Value *Address = DI->getAddress();
+ if (Address) {
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+ Address = BCI->getOperand(0);
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
+ DenseMap<const AllocaInst *, int>::iterator SI =
+ StaticAllocaMap.find(AI);
+ if (SI != StaticAllocaMap.end()) { // Check for VLAs.
+ int FI = SI->second;
+ MMI.setVariableDbgInfo(DI->getVariable(), DI->getExpression(),
+ FI, DI->getDebugLoc());
+ }
+ }
+ }
+ }
+ }
+
+ // Decide the preferred extend type for a value.
+ PreferredExtendType[&*I] = getPreferredExtendForValue(&*I);
+ }
+
+ // Create an initial MachineBasicBlock for each LLVM BasicBlock in F. This
+ // also creates the initial PHI MachineInstrs, though none of the input
+ // operands are populated.
+ for (BB = Fn->begin(); BB != EB; ++BB) {
+ // Don't create MachineBasicBlocks for imaginary EH pad blocks. These blocks
+ // are really data, and no instructions can live here.
+ if (BB->isEHPad()) {
+ const Instruction *I = BB->getFirstNonPHI();
+ // If this is a non-landingpad EH pad, mark this function as using
+ // funclets.
+ // FIXME: SEH catchpads do not create funclets, so we could avoid setting
+ // this in such cases in order to improve frame layout.
+ if (!isa<LandingPadInst>(I)) {
+ MMI.setHasEHFunclets(true);
+ MF->getFrameInfo()->setHasOpaqueSPAdjustment(true);
+ }
+ if (isa<CatchSwitchInst>(I)) {
+ assert(&*BB->begin() == I &&
+ "WinEHPrepare failed to remove PHIs from imaginary BBs");
+ continue;
+ }
+ if (isa<FuncletPadInst>(I))
+ assert(&*BB->begin() == I && "WinEHPrepare failed to demote PHIs");
+ }
+
+ MachineBasicBlock *MBB = mf.CreateMachineBasicBlock(&*BB);
+ MBBMap[&*BB] = MBB;
+ MF->push_back(MBB);
+
+ // Transfer the address-taken flag. This is necessary because there could
+ // be multiple MachineBasicBlocks corresponding to one BasicBlock, and only
+ // the first one should be marked.
+ if (BB->hasAddressTaken())
+ MBB->setHasAddressTaken();
+
+ // Create Machine PHI nodes for LLVM PHI nodes, lowering them as
+ // appropriate.
+ for (BasicBlock::const_iterator I = BB->begin();
+ const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+ if (PN->use_empty()) continue;
+
+ // Skip empty types
+ if (PN->getType()->isEmptyTy())
+ continue;
+
+ DebugLoc DL = PN->getDebugLoc();
+ unsigned PHIReg = ValueMap[PN];
+ assert(PHIReg && "PHI node does not have an assigned virtual register!");
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), PN->getType(), ValueVTs);
+ for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+ EVT VT = ValueVTs[vti];
+ unsigned NumRegisters = TLI->getNumRegisters(Fn->getContext(), VT);
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ for (unsigned i = 0; i != NumRegisters; ++i)
+ BuildMI(MBB, DL, TII->get(TargetOpcode::PHI), PHIReg + i);
+ PHIReg += NumRegisters;
+ }
+ }
+ }
+
+ // Mark landing pad blocks.
+ SmallVector<const LandingPadInst *, 4> LPads;
+ for (BB = Fn->begin(); BB != EB; ++BB) {
+ const Instruction *FNP = BB->getFirstNonPHI();
+ if (BB->isEHPad() && MBBMap.count(&*BB))
+ MBBMap[&*BB]->setIsEHPad();
+ if (const auto *LPI = dyn_cast<LandingPadInst>(FNP))
+ LPads.push_back(LPI);
+ }
+
+ if (!isFuncletEHPersonality(Personality))
+ return;
+
+ WinEHFuncInfo &EHInfo = *MF->getWinEHFuncInfo();
+
+ // Map all BB references in the WinEH data to MBBs.
+ for (WinEHTryBlockMapEntry &TBME : EHInfo.TryBlockMap) {
+ for (WinEHHandlerType &H : TBME.HandlerArray) {
+ if (H.Handler)
+ H.Handler = MBBMap[H.Handler.get<const BasicBlock *>()];
+ }
+ }
+ for (CxxUnwindMapEntry &UME : EHInfo.CxxUnwindMap)
+ if (UME.Cleanup)
+ UME.Cleanup = MBBMap[UME.Cleanup.get<const BasicBlock *>()];
+ for (SEHUnwindMapEntry &UME : EHInfo.SEHUnwindMap) {
+ const BasicBlock *BB = UME.Handler.get<const BasicBlock *>();
+ UME.Handler = MBBMap[BB];
+ }
+ for (ClrEHUnwindMapEntry &CME : EHInfo.ClrEHUnwindMap) {
+ const BasicBlock *BB = CME.Handler.get<const BasicBlock *>();
+ CME.Handler = MBBMap[BB];
+ }
+}
+
+/// clear - Clear out all the function-specific state. This returns this
+/// FunctionLoweringInfo to an empty state, ready to be used for a
+/// different function.
+void FunctionLoweringInfo::clear() {
+ MBBMap.clear();
+ ValueMap.clear();
+ StaticAllocaMap.clear();
+ LiveOutRegInfo.clear();
+ VisitedBBs.clear();
+ ArgDbgValues.clear();
+ ByValArgFrameIndexMap.clear();
+ RegFixups.clear();
+ StatepointStackSlots.clear();
+ StatepointSpillMaps.clear();
+ PreferredExtendType.clear();
+}
+
+/// CreateReg - Allocate a single virtual register for the given type.
+unsigned FunctionLoweringInfo::CreateReg(MVT VT) {
+ return RegInfo->createVirtualRegister(
+ MF->getSubtarget().getTargetLowering()->getRegClassFor(VT));
+}
+
+/// CreateRegs - Allocate the appropriate number of virtual registers of
+/// the correctly promoted or expanded types. Assign these registers
+/// consecutive vreg numbers and return the first assigned number.
+///
+/// In the case that the given value has struct or array type, this function
+/// will assign registers for each member or element.
+///
+unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) {
+ const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
+
+ unsigned FirstReg = 0;
+ for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ EVT ValueVT = ValueVTs[Value];
+ MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT);
+
+ unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ unsigned R = CreateReg(RegisterVT);
+ if (!FirstReg) FirstReg = R;
+ }
+ }
+ return FirstReg;
+}
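+
+// Illustrative example (hypothetical 32-bit target where i64 is expanded into
+// two i32 parts): for a value of type {i64, i32}, ComputeValueVTs yields
+// {i64, i32}, getNumRegisters reports 2 and 1 respectively, and CreateRegs
+// creates three consecutive virtual registers, returning the first of them.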
+
+/// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
+/// register is a PHI destination and the PHI's LiveOutInfo is not valid. If
+/// the register's LiveOutInfo is for a smaller bit width, it is extended to
+/// the larger bit width by zero extension. The bit width must be no smaller
+/// than the LiveOutInfo's existing bit width.
+const FunctionLoweringInfo::LiveOutInfo *
+FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
+ if (!LiveOutRegInfo.inBounds(Reg))
+ return nullptr;
+
+ LiveOutInfo *LOI = &LiveOutRegInfo[Reg];
+ if (!LOI->IsValid)
+ return nullptr;
+
+ if (BitWidth > LOI->KnownZero.getBitWidth()) {
+ LOI->NumSignBits = 1;
+ LOI->KnownZero = LOI->KnownZero.zextOrTrunc(BitWidth);
+ LOI->KnownOne = LOI->KnownOne.zextOrTrunc(BitWidth);
+ }
+
+ return LOI;
+}
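+
+// Illustrative example: if a register was recorded at 8 bits with
+// KnownZero == 0xF0 and KnownOne == 0x03, a caller requesting 16 bits sees the
+// masks zero-extended to 0x00F0 and 0x0003 and NumSignBits conservatively
+// reset to 1, matching the zero-extension convention described above.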
+
+/// ComputePHILiveOutRegInfo - Compute LiveOutInfo for a PHI's destination
+/// register based on the LiveOutInfo of its operands.
+void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
+ Type *Ty = PN->getType();
+ if (!Ty->isIntegerTy() || Ty->isVectorTy())
+ return;
+
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
+ assert(ValueVTs.size() == 1 &&
+ "PHIs with non-vector integer types should have a single VT.");
+ EVT IntVT = ValueVTs[0];
+
+ if (TLI->getNumRegisters(PN->getContext(), IntVT) != 1)
+ return;
+ IntVT = TLI->getTypeToTransformTo(PN->getContext(), IntVT);
+ unsigned BitWidth = IntVT.getSizeInBits();
+
+ unsigned DestReg = ValueMap[PN];
+ if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+ return;
+ LiveOutRegInfo.grow(DestReg);
+ LiveOutInfo &DestLOI = LiveOutRegInfo[DestReg];
+
+ Value *V = PN->getIncomingValue(0);
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ APInt Zero(BitWidth, 0);
+ DestLOI.KnownZero = Zero;
+ DestLOI.KnownOne = Zero;
+ return;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+ DestLOI.NumSignBits = Val.getNumSignBits();
+ DestLOI.KnownZero = ~Val;
+ DestLOI.KnownOne = Val;
+ } else {
+ assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+ "its CopyToReg node was created.");
+ unsigned SrcReg = ValueMap[V];
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ DestLOI = *SrcLOI;
+ }
+
+ assert(DestLOI.KnownZero.getBitWidth() == BitWidth &&
+ DestLOI.KnownOne.getBitWidth() == BitWidth &&
+ "Masks should have the same bit width as the type.");
+
+ for (unsigned i = 1, e = PN->getNumIncomingValues(); i != e; ++i) {
+ Value *V = PN->getIncomingValue(i);
+ if (isa<UndefValue>(V) || isa<ConstantExpr>(V)) {
+ DestLOI.NumSignBits = 1;
+ APInt Zero(BitWidth, 0);
+ DestLOI.KnownZero = Zero;
+ DestLOI.KnownOne = Zero;
+ return;
+ }
+
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ APInt Val = CI->getValue().zextOrTrunc(BitWidth);
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, Val.getNumSignBits());
+ DestLOI.KnownZero &= ~Val;
+ DestLOI.KnownOne &= Val;
+ continue;
+ }
+
+ assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
+ "its CopyToReg node was created.");
+ unsigned SrcReg = ValueMap[V];
+ if (!TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ const LiveOutInfo *SrcLOI = GetLiveOutRegInfo(SrcReg, BitWidth);
+ if (!SrcLOI) {
+ DestLOI.IsValid = false;
+ return;
+ }
+ DestLOI.NumSignBits = std::min(DestLOI.NumSignBits, SrcLOI->NumSignBits);
+ DestLOI.KnownZero &= SrcLOI->KnownZero;
+ DestLOI.KnownOne &= SrcLOI->KnownOne;
+ }
+}
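+
+// Illustrative example: for an integer PHI whose incoming values are the
+// constants 0x0F and 0x03 (zero-extended to the transformed bit width above),
+// the merge loop leaves KnownOne == 0x03 and KnownZero == ~0x0F, so only bits
+// 2 and 3 of the destination remain unknown.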
+
+/// setArgumentFrameIndex - Record the frame index for the byval
+/// argument. This overrides the previous frame index entry for this argument,
+/// if any.
+void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A,
+ int FI) {
+ ByValArgFrameIndexMap[A] = FI;
+}
+
+/// getArgumentFrameIndex - Get frame index for the byval argument.
+/// If the argument does not have any assigned frame index then 0 is
+/// returned.
+int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
+ DenseMap<const Argument *, int>::iterator I =
+ ByValArgFrameIndexMap.find(A);
+ if (I != ByValArgFrameIndexMap.end())
+ return I->second;
+ DEBUG(dbgs() << "Argument does not have assigned frame index!\n");
+ return 0;
+}
+
+unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
+ const Value *CPI, const TargetRegisterClass *RC) {
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ auto I = CatchPadExceptionPointers.insert({CPI, 0});
+ unsigned &VReg = I.first->second;
+ if (I.second)
+ VReg = MRI.createVirtualRegister(RC);
+ assert(VReg && "null vreg in exception pointer table!");
+ return VReg;
+}
+
+/// ComputeUsesVAFloatArgument - Determine if any floating-point values are
+/// being passed to this variadic function, and set the MachineModuleInfo's
+/// usesVAFloatArgument flag if so. This flag is used to emit an undefined
+/// reference to _fltused on Windows, which will link in MSVCRT's
+/// floating-point support.
+void llvm::ComputeUsesVAFloatArgument(const CallInst &I,
+ MachineModuleInfo *MMI)
+{
+ FunctionType *FT = cast<FunctionType>(
+ I.getCalledValue()->getType()->getContainedType(0));
+ if (FT->isVarArg() && !MMI->usesVAFloatArgument()) {
+ for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+ Type* T = I.getArgOperand(i)->getType();
+ for (auto i : post_order(T)) {
+ if (i->isFloatingPointTy()) {
+ MMI->setUsesVAFloatArgument(true);
+ return;
+ }
+ }
+ }
+ }
+}
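+
+// Illustrative example: a variadic call such as printf("%f", someDouble)
+// passes a floating-point argument, so the walk above sets
+// usesVAFloatArgument and the _fltused reference described in the comment is
+// emitted when targeting Windows.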
+
+/// AddLandingPadInfo - Extract the exception handling information from the
+/// landingpad instruction and add them to the specified machine module info.
+void llvm::AddLandingPadInfo(const LandingPadInst &I, MachineModuleInfo &MMI,
+ MachineBasicBlock *MBB) {
+ if (const auto *PF = dyn_cast<Function>(
+ I.getParent()->getParent()->getPersonalityFn()->stripPointerCasts()))
+ MMI.addPersonality(PF);
+
+ if (I.isCleanup())
+ MMI.addCleanup(MBB);
+
+ // FIXME: New EH - Add the clauses in reverse order. This isn't 100% correct,
+ // but we need to do it this way because of how the DWARF EH emitter
+ // processes the clauses.
+ for (unsigned i = I.getNumClauses(); i != 0; --i) {
+ Value *Val = I.getClause(i - 1);
+ if (I.isCatch(i - 1)) {
+ MMI.addCatchTypeInfo(MBB,
+ dyn_cast<GlobalValue>(Val->stripPointerCasts()));
+ } else {
+ // Add filters in a list.
+ Constant *CVal = cast<Constant>(Val);
+ SmallVector<const GlobalValue*, 4> FilterList;
+ for (User::op_iterator
+ II = CVal->op_begin(), IE = CVal->op_end(); II != IE; ++II)
+ FilterList.push_back(cast<GlobalValue>((*II)->stripPointerCasts()));
+
+ MMI.addFilterTypeInfo(MBB, FilterList);
+ }
+ }
+}
+
+unsigned FunctionLoweringInfo::findSwiftErrorVReg(const MachineBasicBlock *MBB,
+ const Value* Val) const {
+ // Find the index in SwiftErrorVals.
+ SwiftErrorValues::const_iterator I =
+ std::find(SwiftErrorVals.begin(), SwiftErrorVals.end(), Val);
+ assert(I != SwiftErrorVals.end() && "Can't find value in SwiftErrorVals");
+ return SwiftErrorMap.lookup(MBB)[I - SwiftErrorVals.begin()];
+}
+
+void FunctionLoweringInfo::setSwiftErrorVReg(const MachineBasicBlock *MBB,
+ const Value* Val, unsigned VReg) {
+ // Find the index in SwiftErrorVals.
+ SwiftErrorValues::iterator I =
+ std::find(SwiftErrorVals.begin(), SwiftErrorVals.end(), Val);
+ assert(I != SwiftErrorVals.end() && "Can't find value in SwiftErrorVals");
+ SwiftErrorMap[MBB][I - SwiftErrorVals.begin()] = VReg;
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
new file mode 100644
index 000000000000..c8af73a3b441
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -0,0 +1,1050 @@
+//==--- InstrEmitter.cpp - Emit MachineInstrs for the SelectionDAG class ---==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "instr-emitter"
+
+/// MinRCSize - Smallest register class we allow when constraining virtual
+/// registers. If satisfying all register class constraints would require
+/// using a smaller register class, emit a COPY to a new virtual register
+/// instead.
+const unsigned MinRCSize = 4;
+
+/// CountResults - The results of target nodes have register or immediate
+/// operands first, then an optional chain, and optional glue operands (which do
+/// not go into the resulting MachineInstr).
+unsigned InstrEmitter::CountResults(SDNode *Node) {
+ unsigned N = Node->getNumValues();
+ while (N && Node->getValueType(N - 1) == MVT::Glue)
+ --N;
+ if (N && Node->getValueType(N - 1) == MVT::Other)
+ --N; // Skip over chain result.
+ return N;
+}
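+
+// Illustrative example: a machine node whose value list is (i32, f64, ch,
+// glue) reports 2 from CountResults -- the trailing glue and chain values are
+// stripped before counting.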
+
+/// countOperands - The inputs to target nodes have any actual inputs first,
+/// followed by an optional chain operand, then an optional glue operand.
+/// Compute the number of actual operands that will go into the resulting
+/// MachineInstr.
+///
+/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding
+/// the chain and glue. These operands may be implicit on the machine instr.
+static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
+ unsigned &NumImpUses) {
+ unsigned N = Node->getNumOperands();
+ while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue)
+ --N;
+ if (N && Node->getOperand(N - 1).getValueType() == MVT::Other)
+ --N; // Ignore chain if it exists.
+
+ // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses.
+ NumImpUses = N - NumExpUses;
+ for (unsigned I = N; I > NumExpUses; --I) {
+ if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1)))
+ continue;
+ if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1)))
+ if (TargetRegisterInfo::isPhysicalRegister(RN->getReg()))
+ continue;
+ NumImpUses = N - I;
+ break;
+ }
+
+ return N;
+}
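+
+// Illustrative example: a call-like node with operands (target-symbol, %vreg,
+// physical RegisterSDNode, RegisterMaskSDNode, chain, glue) and
+// NumExpUses == 2 yields N == 4 (chain and glue are dropped) and
+// NumImpUses == 2 for the trailing physreg and register-mask operands.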
+
+/// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+/// implicit physical register output.
+void InstrEmitter::
+EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
+ unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned VRBase = 0;
+ if (TargetRegisterInfo::isVirtualRegister(SrcReg)) {
+ // Just use the input register directly!
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, SrcReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ return;
+ }
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ bool MatchReg = true;
+ const TargetRegisterClass *UseRC = nullptr;
+ MVT VT = Node->getSimpleValueType(ResNo);
+
+ // Stick to the preferred register classes for legal types.
+ if (TLI->isTypeLegal(VT))
+ UseRC = TLI->getRegClassFor(VT);
+
+ if (!IsClone && !IsCloned)
+ for (SDNode *User : Node->uses()) {
+ bool Match = true;
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == ResNo) {
+ unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ VRBase = DestReg;
+ Match = false;
+ } else if (DestReg != SrcReg)
+ Match = false;
+ } else {
+ for (unsigned i = 0, e = User->getNumOperands(); i != e; ++i) {
+ SDValue Op = User->getOperand(i);
+ if (Op.getNode() != Node || Op.getResNo() != ResNo)
+ continue;
+ MVT VT = Node->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Other || VT == MVT::Glue)
+ continue;
+ Match = false;
+ if (User->isMachineOpcode()) {
+ const MCInstrDesc &II = TII->get(User->getMachineOpcode());
+ const TargetRegisterClass *RC = nullptr;
+ if (i+II.getNumDefs() < II.getNumOperands()) {
+ RC = TRI->getAllocatableClass(
+ TII->getRegClass(II, i+II.getNumDefs(), TRI, *MF));
+ }
+ if (!UseRC)
+ UseRC = RC;
+ else if (RC) {
+ const TargetRegisterClass *ComRC =
+ TRI->getCommonSubClass(UseRC, RC, VT.SimpleTy);
+ // If multiple uses expect disjoint register classes, we emit
+ // copies in AddRegisterOperand.
+ if (ComRC)
+ UseRC = ComRC;
+ }
+ }
+ }
+ }
+ MatchReg &= Match;
+ if (VRBase)
+ break;
+ }
+
+ const TargetRegisterClass *SrcRC = nullptr, *DstRC = nullptr;
+ SrcRC = TRI->getMinimalPhysRegClass(SrcReg, VT);
+
+ // Figure out the register class to create for the destreg.
+ if (VRBase) {
+ DstRC = MRI->getRegClass(VRBase);
+ } else if (UseRC) {
+ assert(UseRC->hasType(VT) && "Incompatible phys register def and uses!");
+ DstRC = UseRC;
+ } else {
+ DstRC = TLI->getRegClassFor(VT);
+ }
+
+ // If all uses are reading from the src physical register and copying the
+ // register is either impossible or very expensive, then don't create a copy.
+ if (MatchReg && SrcRC->getCopyCost() < 0) {
+ VRBase = SrcReg;
+ } else {
+ // Create the reg, emit the copy.
+ VRBase = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ VRBase).addReg(SrcReg);
+ }
+
+ SDValue Op(Node, ResNo);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// getDstOfOnlyCopyToRegUse - If the only use of the specified result number
+/// of the node is a CopyToReg, return its destination register. Return 0
+/// otherwise.
+unsigned InstrEmitter::getDstOfOnlyCopyToRegUse(SDNode *Node,
+ unsigned ResNo) const {
+ if (!Node->hasOneUse())
+ return 0;
+
+ SDNode *User = *Node->use_begin();
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == ResNo) {
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return Reg;
+ }
+ return 0;
+}
+
+void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
+ MachineInstrBuilder &MIB,
+ const MCInstrDesc &II,
+ bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
+ "IMPLICIT_DEF should have been handled as a special case elsewhere!");
+
+ unsigned NumResults = CountResults(Node);
+ for (unsigned i = 0; i < II.getNumDefs(); ++i) {
+ // If the specific node value is only used by a CopyToReg and the dest reg
+ // is a vreg in the same register class, use the CopyToReg'd destination
+ // register instead of creating a new vreg.
+ unsigned VRBase = 0;
+ const TargetRegisterClass *RC =
+ TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
+ // Always let the value type influence the used register class. The
+ // constraints on the instruction may be too lax to represent the value
+ // type correctly. For example, a 64-bit float (X86::FR64) can't live in
+ // the 32-bit float super-class (X86::FR32).
+ if (i < NumResults && TLI->isTypeLegal(Node->getSimpleValueType(i))) {
+ const TargetRegisterClass *VTRC =
+ TLI->getRegClassFor(Node->getSimpleValueType(i));
+ if (RC)
+ VTRC = TRI->getCommonSubClass(RC, VTRC);
+ if (VTRC)
+ RC = VTRC;
+ }
+
+ if (II.OpInfo[i].isOptionalDef()) {
+ // Optional def must be a physical register.
+ unsigned NumResults = CountResults(Node);
+ VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
+ assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
+ MIB.addReg(VRBase, RegState::Define);
+ }
+
+ if (!VRBase && !IsClone && !IsCloned)
+ for (SDNode *User : Node->uses()) {
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node &&
+ User->getOperand(2).getResNo() == i) {
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ const TargetRegisterClass *RegRC = MRI->getRegClass(Reg);
+ if (RegRC == RC) {
+ VRBase = Reg;
+ MIB.addReg(VRBase, RegState::Define);
+ break;
+ }
+ }
+ }
+ }
+
+ // Create the result registers for this node and add the result regs to
+ // the machine instruction.
+ if (VRBase == 0) {
+ assert(RC && "Isn't a register operand!");
+ VRBase = MRI->createVirtualRegister(RC);
+ MIB.addReg(VRBase, RegState::Define);
+ }
+
+ // If this def corresponds to a result of the SDNode insert the VRBase into
+ // the lookup map.
+ if (i < NumResults) {
+ SDValue Op(Node, i);
+ if (IsClone)
+ VRBaseMap.erase(Op);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ }
+ }
+}
+
+/// getVR - Return the virtual register corresponding to the specified result
+/// of the specified node.
+unsigned InstrEmitter::getVR(SDValue Op,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ if (Op.isMachineOpcode() &&
+ Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ // Add an IMPLICIT_DEF instruction before every use.
+ unsigned VReg = getDstOfOnlyCopyToRegUse(Op.getNode(), Op.getResNo());
+ // IMPLICIT_DEF can produce any type of result so its MCInstrDesc
+ // does not include operand register class info.
+ if (!VReg) {
+ const TargetRegisterClass *RC =
+ TLI->getRegClassFor(Op.getSimpleValueType());
+ VReg = MRI->createVirtualRegister(RC);
+ }
+ BuildMI(*MBB, InsertPos, Op.getDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+ return VReg;
+ }
+
+ DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+ assert(I != VRBaseMap.end() && "Node emitted out of order - late");
+ return I->second;
+}
+
+
+/// AddRegisterOperand - Add the specified register as an operand to the
+/// specified machine instr. Insert register copies if the register is
+/// not in the required register class.
+void
+InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned) {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Glue &&
+ "Chain and glue operands should occur at end of operand list!");
+ // Get/emit the operand.
+ unsigned VReg = getVR(Op, VRBaseMap);
+
+ const MCInstrDesc &MCID = MIB->getDesc();
+ bool isOptDef = IIOpNum < MCID.getNumOperands() &&
+ MCID.OpInfo[IIOpNum].isOptionalDef();
+
+ // If the instruction requires a register in a different class, create
+ // a new virtual register and copy the value into it, but first attempt to
+ // shrink VReg's register class within reason. For example, if VReg == GR32
+ // and II requires a GR32_NOSP, just constrain VReg to GR32_NOSP.
+ if (II) {
+ const TargetRegisterClass *DstRC = nullptr;
+ if (IIOpNum < II->getNumOperands())
+ DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
+ assert((!DstRC || TargetRegisterInfo::isVirtualRegister(VReg)) &&
+ "Expected VReg");
+ if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
+ unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewVReg).addReg(VReg);
+ VReg = NewVReg;
+ }
+ }
+
+ // If this value has only one use, that use is a kill. This is a
+ // conservative approximation. InstrEmitter does trivial coalescing
+ // with CopyFromReg nodes, so don't emit kill flags for them.
+ // Avoid kill flags on Schedule cloned nodes, since there will be
+ // multiple uses.
+ // Tied operands are never killed, so we need to check that. And that
+ // means we need to determine the index of the operand.
+ bool isKill = Op.hasOneUse() &&
+ Op.getNode()->getOpcode() != ISD::CopyFromReg &&
+ !IsDebug &&
+ !(IsClone || IsCloned);
+ if (isKill) {
+ unsigned Idx = MIB->getNumOperands();
+ while (Idx > 0 &&
+ MIB->getOperand(Idx-1).isReg() &&
+ MIB->getOperand(Idx-1).isImplicit())
+ --Idx;
+ bool isTied = MCID.getOperandConstraint(Idx, MCOI::TIED_TO) != -1;
+ if (isTied)
+ isKill = false;
+ }
+
+ MIB.addReg(VReg, getDefRegState(isOptDef) | getKillRegState(isKill) |
+ getDebugRegState(IsDebug));
+}
+
+/// AddOperand - Add the specified operand to the specified machine instr. II
+/// specifies the instruction information for the node, and IIOpNum is the
+/// operand number (in the II) that we are adding.
+void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned) {
+ if (Op.isMachineOpcode()) {
+ AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+ IsDebug, IsClone, IsCloned);
+ } else if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ MIB.addImm(C->getSExtValue());
+ } else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
+ MIB.addFPImm(F->getConstantFPValue());
+ } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
+ // Turn additional physreg operands into implicit uses on non-variadic
+ // instructions. This is used by call and return instructions passing
+ // arguments in registers.
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic());
+ MIB.addReg(R->getReg(), getImplRegState(Imp));
+ } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) {
+ MIB.addRegMask(RM->getRegMask());
+ } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) {
+ MIB.addGlobalAddress(TGA->getGlobal(), TGA->getOffset(),
+ TGA->getTargetFlags());
+ } else if (BasicBlockSDNode *BBNode = dyn_cast<BasicBlockSDNode>(Op)) {
+ MIB.addMBB(BBNode->getBasicBlock());
+ } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Op)) {
+ MIB.addFrameIndex(FI->getIndex());
+ } else if (JumpTableSDNode *JT = dyn_cast<JumpTableSDNode>(Op)) {
+ MIB.addJumpTableIndex(JT->getIndex(), JT->getTargetFlags());
+ } else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
+ int Offset = CP->getOffset();
+ unsigned Align = CP->getAlignment();
+ Type *Type = CP->getType();
+ // MachineConstantPool wants an explicit alignment.
+ if (Align == 0) {
+ Align = MF->getDataLayout().getPrefTypeAlignment(Type);
+ if (Align == 0) {
+ // Alignment of vector types. FIXME!
+ Align = MF->getDataLayout().getTypeAllocSize(Type);
+ }
+ }
+
+ unsigned Idx;
+ MachineConstantPool *MCP = MF->getConstantPool();
+ if (CP->isMachineConstantPoolEntry())
+ Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Align);
+ else
+ Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Align);
+ MIB.addConstantPoolIndex(Idx, Offset, CP->getTargetFlags());
+ } else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
+ MIB.addExternalSymbol(ES->getSymbol(), ES->getTargetFlags());
+ } else if (auto *SymNode = dyn_cast<MCSymbolSDNode>(Op)) {
+ MIB.addSym(SymNode->getMCSymbol());
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
+ MIB.addBlockAddress(BA->getBlockAddress(),
+ BA->getOffset(),
+ BA->getTargetFlags());
+ } else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
+ MIB.addTargetIndex(TI->getIndex(), TI->getOffset(), TI->getTargetFlags());
+ } else {
+ assert(Op.getValueType() != MVT::Other &&
+ Op.getValueType() != MVT::Glue &&
+ "Chain and glue operands should occur at end of operand list!");
+ AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
+ IsDebug, IsClone, IsCloned);
+ }
+}
+
+unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+ MVT VT, const DebugLoc &DL) {
+ const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
+ const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
+
+ // RC is a sub-class of VRC that supports SubIdx. Try to constrain VReg
+ // within reason.
+ if (RC && RC != VRC)
+ RC = MRI->constrainRegClass(VReg, RC, MinRCSize);
+
+ // VReg has been adjusted. It can be used with SubIdx operands now.
+ if (RC)
+ return VReg;
+
+ // VReg couldn't be reasonably constrained. Emit a COPY to a new virtual
+ // register instead.
+ RC = TRI->getSubClassWithSubReg(TLI->getRegClassFor(VT), SubIdx);
+ assert(RC && "No legal register class for VT supports that SubIdx");
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ BuildMI(*MBB, InsertPos, DL, TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VReg);
+ return NewReg;
+}
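+
+// Note: getSubClassWithSubReg returns the largest legal sub-class of VRC whose
+// registers all support the SubIdx sub-register, and the MinRCSize argument
+// keeps constrainRegClass from shrinking the class so far that the allocator
+// is left with too few registers; only when that fails is the COPY above
+// emitted.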
+
+/// EmitSubregNode - Generate machine code for subreg nodes.
+///
+void InstrEmitter::EmitSubregNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsClone, bool IsCloned) {
+ unsigned VRBase = 0;
+ unsigned Opc = Node->getMachineOpcode();
+
+ // If the node is only used by a CopyToReg and the dest reg is a vreg, use
+ // the CopyToReg'd destination register instead of creating a new vreg.
+ for (SDNode *User : Node->uses()) {
+ if (User->getOpcode() == ISD::CopyToReg &&
+ User->getOperand(2).getNode() == Node) {
+ unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(DestReg)) {
+ VRBase = DestReg;
+ break;
+ }
+ }
+ }
+
+ if (Opc == TargetOpcode::EXTRACT_SUBREG) {
+ // EXTRACT_SUBREG is lowered as %dst = COPY %src:sub. There are no
+ // constraints on the %dst register; COPY can target all legal register
+ // classes.
+ unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *TRC =
+ TLI->getRegClassFor(Node->getSimpleValueType(0));
+
+ unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+ unsigned SrcReg, DstReg, DefSubIdx;
+ if (DefMI &&
+ TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
+ SubIdx == DefSubIdx &&
+ TRC == MRI->getRegClass(SrcReg)) {
+ // Optimize these:
+ // r1025 = s/zext r1024, 4
+ // r1026 = extract_subreg r1025, 4
+ // to a copy
+ // r1026 = copy r1024
+ VRBase = MRI->createVirtualRegister(TRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
+ MRI->clearKillFlags(SrcReg);
+ } else {
+ // VReg may not support a SubIdx sub-register, and we may need to
+ // constrain its register class or issue a COPY to a compatible register
+ // class.
+ VReg = ConstrainForSubReg(VReg, SubIdx,
+ Node->getOperand(0).getSimpleValueType(),
+ Node->getDebugLoc());
+
+ // Create the destreg if it is missing.
+ if (VRBase == 0)
+ VRBase = MRI->createVirtualRegister(TRC);
+
+ // Create the extract_subreg machine instruction.
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), VRBase).addReg(VReg, 0, SubIdx);
+ }
+ } else if (Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG) {
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+ SDValue N2 = Node->getOperand(2);
+ unsigned SubIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+ // Figure out the register class to create for the destreg. It should be
+ // the largest legal register class supporting SubIdx sub-registers.
+ // RegisterCoalescer will constrain it further if it decides to eliminate
+ // the INSERT_SUBREG instruction.
+ //
+ // %dst = INSERT_SUBREG %src, %sub, SubIdx
+ //
+ // is lowered by TwoAddressInstructionPass to:
+ //
+ // %dst = COPY %src
+ // %dst:SubIdx = COPY %sub
+ //
+ // There is no constraint on the %src register class.
+ //
+ const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0));
+ SRC = TRI->getSubClassWithSubReg(SRC, SubIdx);
+ assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG");
+
+ if (VRBase == 0 || !SRC->hasSubClassEq(MRI->getRegClass(VRBase)))
+ VRBase = MRI->createVirtualRegister(SRC);
+
+ // Create the insert_subreg or subreg_to_reg machine instruction.
+ MachineInstrBuilder MIB =
+ BuildMI(*MF, Node->getDebugLoc(), TII->get(Opc), VRBase);
+
+ // If creating a subreg_to_reg, then the first input operand
+ // is an implicit value immediate; otherwise it's a register.
+ if (Opc == TargetOpcode::SUBREG_TO_REG) {
+ const ConstantSDNode *SD = cast<ConstantSDNode>(N0);
+ MIB.addImm(SD->getZExtValue());
+ } else
+ AddOperand(MIB, N0, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ // Add the subregister being inserted.
+ AddOperand(MIB, N1, 0, nullptr, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ MIB.addImm(SubIdx);
+ MBB->insert(InsertPos, MIB);
+ } else
+ llvm_unreachable("Node is not insert_subreg, extract_subreg, or subreg_to_reg");
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
+
+/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+/// COPY_TO_REGCLASS is just a normal copy, except that the destination
+/// register is constrained to be in a particular register class.
+///
+void
+InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
+
+ // Create the new VReg in the destination class and emit a copy.
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ const TargetRegisterClass *DstRC =
+ TRI->getAllocatableClass(TRI->getRegClass(DstRCIdx));
+ unsigned NewVReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ NewVReg).addReg(VReg);
+
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
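+
+// Illustrative example: (COPY_TO_REGCLASS %src, <rc-id>) becomes
+//   %newvreg = COPY %src
+// where %newvreg is created in the register class named by the second
+// operand; the register coalescer may later remove the copy if %src already
+// satisfies that class.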
+
+/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+///
+void InstrEmitter::EmitRegSequence(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsClone, bool IsCloned) {
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+ unsigned NewVReg = MRI->createVirtualRegister(TRI->getAllocatableClass(RC));
+ const MCInstrDesc &II = TII->get(TargetOpcode::REG_SEQUENCE);
+ MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II, NewVReg);
+ unsigned NumOps = Node->getNumOperands();
+ assert((NumOps & 1) == 1 &&
+ "REG_SEQUENCE must have an odd number of operands!");
+ for (unsigned i = 1; i != NumOps; ++i) {
+ SDValue Op = Node->getOperand(i);
+ if ((i & 1) == 0) {
+ RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(i-1));
+ // Skip physical registers as they don't have a vreg to get and we'll
+ // insert copies for them in TwoAddressInstructionPass anyway.
+ if (!R || !TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+ unsigned SubIdx = cast<ConstantSDNode>(Op)->getZExtValue();
+ unsigned SubReg = getVR(Node->getOperand(i-1), VRBaseMap);
+ const TargetRegisterClass *TRC = MRI->getRegClass(SubReg);
+ const TargetRegisterClass *SRC =
+ TRI->getMatchingSuperRegClass(RC, TRC, SubIdx);
+ if (SRC && SRC != RC) {
+ MRI->setRegClass(NewVReg, SRC);
+ RC = SRC;
+ }
+ }
+ }
+ AddOperand(MIB, Op, i+1, &II, VRBaseMap, /*IsDebug=*/false,
+ IsClone, IsCloned);
+ }
+
+ MBB->insert(InsertPos, MIB);
+ SDValue Op(Node, 0);
+ bool isNew = VRBaseMap.insert(std::make_pair(Op, NewVReg)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+}
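+
+// Illustrative example: a node such as
+//   REG_SEQUENCE <rc-id>, %lo, <subidx0>, %hi, <subidx1>
+// (a register class id followed by value/sub-register-index pairs, hence the
+// odd operand count asserted above) is emitted as a single REG_SEQUENCE
+// machine instruction defining one new vreg, whose register class may be
+// adjusted as each pair is processed so the sub-register insertions stay
+// legal.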
+
+/// EmitDbgValue - Generate machine instruction for a dbg_value node.
+///
+MachineInstr *
+InstrEmitter::EmitDbgValue(SDDbgValue *SD,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ uint64_t Offset = SD->getOffset();
+ MDNode *Var = SD->getVariable();
+ MDNode *Expr = SD->getExpression();
+ DebugLoc DL = SD->getDebugLoc();
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+
+ if (SD->getKind() == SDDbgValue::FRAMEIX) {
+ // Stack address; this needs to be lowered in target-dependent fashion.
+ // EmitTargetCodeForFrameDebugValue is responsible for allocation.
+ return BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
+ .addFrameIndex(SD->getFrameIx())
+ .addImm(Offset)
+ .addMetadata(Var)
+ .addMetadata(Expr);
+ }
+ // Otherwise, we're going to create an instruction here.
+ const MCInstrDesc &II = TII->get(TargetOpcode::DBG_VALUE);
+ MachineInstrBuilder MIB = BuildMI(*MF, DL, II);
+ if (SD->getKind() == SDDbgValue::SDNODE) {
+ SDNode *Node = SD->getSDNode();
+ SDValue Op = SDValue(Node, SD->getResNo());
+ // It's possible we replaced this SDNode with other(s) and therefore
+ // didn't generate code for it. It's better to catch these cases where
+ // they happen and transfer the debug info, but trying to guarantee that
+ // in all cases would be very fragile; this is a safeguard for any
+ // that were missed.
+ DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+ if (I==VRBaseMap.end())
+ MIB.addReg(0U); // undef
+ else
+ AddOperand(MIB, Op, (*MIB).getNumOperands(), &II, VRBaseMap,
+ /*IsDebug=*/true, /*IsClone=*/false, /*IsCloned=*/false);
+ } else if (SD->getKind() == SDDbgValue::CONST) {
+ const Value *V = SD->getConst();
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+ if (CI->getBitWidth() > 64)
+ MIB.addCImm(CI);
+ else
+ MIB.addImm(CI->getSExtValue());
+ } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) {
+ MIB.addFPImm(CF);
+ } else {
+ // Could be an Undef. In any case insert an Undef so we can see what we
+ // dropped.
+ MIB.addReg(0U);
+ }
+ } else {
+ // Insert an Undef so we can see what we dropped.
+ MIB.addReg(0U);
+ }
+
+ // Indirect addressing is indicated by an Imm as the second parameter.
+ if (SD->isIndirect())
+ MIB.addImm(Offset);
+ else {
+ assert(Offset == 0 && "direct value cannot have an offset");
+ MIB.addReg(0U, RegState::Debug);
+ }
+
+ MIB.addMetadata(Var);
+ MIB.addMetadata(Expr);
+
+ return &*MIB;
+}
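+
+// Note: the emitted DBG_VALUE carries four operands -- the location (a frame
+// index, a virtual register, an immediate, or an undef register when the
+// value was optimized away), an offset immediate (or a dummy register operand
+// for direct values), and the variable and expression metadata.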
+
+/// EmitMachineNode - Generate machine code for a target-specific node and
+/// needed dependencies.
+///
+void InstrEmitter::
+EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ unsigned Opc = Node->getMachineOpcode();
+
+ // Handle subreg insert/extract specially
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG) {
+ EmitSubregNode(Node, VRBaseMap, IsClone, IsCloned);
+ return;
+ }
+
+ // Handle COPY_TO_REGCLASS specially.
+ if (Opc == TargetOpcode::COPY_TO_REGCLASS) {
+ EmitCopyToRegClassNode(Node, VRBaseMap);
+ return;
+ }
+
+ // Handle REG_SEQUENCE specially.
+ if (Opc == TargetOpcode::REG_SEQUENCE) {
+ EmitRegSequence(Node, VRBaseMap, IsClone, IsCloned);
+ return;
+ }
+
+ if (Opc == TargetOpcode::IMPLICIT_DEF)
+ // We want a unique VR for each IMPLICIT_DEF use.
+ return;
+
+ const MCInstrDesc &II = TII->get(Opc);
+ unsigned NumResults = CountResults(Node);
+ unsigned NumDefs = II.getNumDefs();
+ const MCPhysReg *ScratchRegs = nullptr;
+
+ // Handle STACKMAP and PATCHPOINT specially and then use the generic code.
+ if (Opc == TargetOpcode::STACKMAP || Opc == TargetOpcode::PATCHPOINT) {
+ // Stackmaps do not have arguments and do not preserve their calling
+ // convention. However, to simplify runtime support, they clobber the same
+ // scratch registers as AnyRegCC.
+ unsigned CC = CallingConv::AnyReg;
+ if (Opc == TargetOpcode::PATCHPOINT) {
+ CC = Node->getConstantOperandVal(PatchPointOpers::CCPos);
+ NumDefs = NumResults;
+ }
+ ScratchRegs = TLI->getScratchRegisters((CallingConv::ID) CC);
+ }
+
+ unsigned NumImpUses = 0;
+ unsigned NodeOperands =
+ countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
+ bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=nullptr;
+#ifndef NDEBUG
+ unsigned NumMIOperands = NodeOperands + NumResults;
+ if (II.isVariadic())
+ assert(NumMIOperands >= II.getNumOperands() &&
+ "Too few operands for a variadic node!");
+ else
+ assert(NumMIOperands >= II.getNumOperands() &&
+ NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() +
+ NumImpUses &&
+ "#operands for dag node doesn't match .td file!");
+#endif
+
+ // Create the new machine instruction.
+ MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(), II);
+
+ // Add result register values for things that are defined by this
+ // instruction.
+ if (NumResults)
+ CreateVirtualRegisters(Node, MIB, II, IsClone, IsCloned, VRBaseMap);
+
+ // Emit all of the actual operands of this instruction, adding them to the
+ // instruction as appropriate.
+ bool HasOptPRefs = NumDefs > NumResults;
+ assert((!HasOptPRefs || !HasPhysRegOuts) &&
+ "Unable to cope with optional defs and phys regs defs!");
+ unsigned NumSkip = HasOptPRefs ? NumDefs - NumResults : 0;
+ for (unsigned i = NumSkip; i != NodeOperands; ++i)
+ AddOperand(MIB, Node->getOperand(i), i-NumSkip+NumDefs, &II,
+ VRBaseMap, /*IsDebug=*/false, IsClone, IsCloned);
+
+ // Add scratch registers as implicit def and early clobber
+ if (ScratchRegs)
+ for (unsigned i = 0; ScratchRegs[i]; ++i)
+ MIB.addReg(ScratchRegs[i], RegState::ImplicitDefine |
+ RegState::EarlyClobber);
+
+ // Transfer all of the memory reference descriptions of this instruction.
+ MIB.setMemRefs(cast<MachineSDNode>(Node)->memoperands_begin(),
+ cast<MachineSDNode>(Node)->memoperands_end());
+
+ // Insert the instruction into position in the block. This needs to
+ // happen before any custom inserter hook is called so that the
+ // hook knows where in the block to insert the replacement code.
+ MBB->insert(InsertPos, MIB);
+
+ // The MachineInstr may also define physregs instead of virtregs. These
+ // physreg values can reach other instructions in different ways:
+ //
+ // 1. When there is a use of a Node value beyond the explicitly defined
+ // virtual registers, we emit a CopyFromReg for one of the implicitly
+ // defined physregs. This only happens when HasPhysRegOuts is true.
+ //
+ // 2. A CopyFromReg reading a physreg may be glued to this instruction.
+ //
+ // 3. A glued instruction may implicitly use a physreg.
+ //
+ // 4. A glued instruction may use a RegisterSDNode operand.
+ //
+ // Collect all the used physreg defs, and make sure that any unused physreg
+ // defs are marked as dead.
+ SmallVector<unsigned, 8> UsedRegs;
+
+ // Additional results must be physical register defs.
+ if (HasPhysRegOuts) {
+ for (unsigned i = NumDefs; i < NumResults; ++i) {
+ unsigned Reg = II.getImplicitDefs()[i - NumDefs];
+ if (!Node->hasAnyUseOfValue(i))
+ continue;
+ // This implicitly defined physreg has a use.
+ UsedRegs.push_back(Reg);
+ EmitCopyFromReg(Node, i, IsClone, IsCloned, Reg, VRBaseMap);
+ }
+ }
+
+ // Scan the glue chain for any used physregs.
+ if (Node->getValueType(Node->getNumValues()-1) == MVT::Glue) {
+ for (SDNode *F = Node->getGluedUser(); F; F = F->getGluedUser()) {
+ if (F->getOpcode() == ISD::CopyFromReg) {
+ UsedRegs.push_back(cast<RegisterSDNode>(F->getOperand(1))->getReg());
+ continue;
+ } else if (F->getOpcode() == ISD::CopyToReg) {
+ // Skip CopyToReg nodes that are internal to the glue chain.
+ continue;
+ }
+ // Collect declared implicit uses.
+ const MCInstrDesc &MCID = TII->get(F->getMachineOpcode());
+ UsedRegs.append(MCID.getImplicitUses(),
+ MCID.getImplicitUses() + MCID.getNumImplicitUses());
+ // In addition to declared implicit uses, we must also check for
+ // direct RegisterSDNode operands.
+ for (unsigned i = 0, e = F->getNumOperands(); i != e; ++i)
+ if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(F->getOperand(i))) {
+ unsigned Reg = R->getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ UsedRegs.push_back(Reg);
+ }
+ }
+ }
+
+ // Finally mark unused registers as dead.
+ if (!UsedRegs.empty() || II.getImplicitDefs())
+ MIB->setPhysRegsDeadExcept(UsedRegs, *TRI);
+
+ // Run post-isel target hook to adjust this instruction if needed.
+ if (II.hasPostISelHook())
+ TLI->AdjustInstrPostInstrSelection(*MIB, Node);
+}
+
+/// EmitSpecialNode - Generate machine code for a target-independent node and
+/// needed dependencies.
+void InstrEmitter::
+EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ Node->dump();
+#endif
+ llvm_unreachable("This target-independent node should have been selected!");
+ case ISD::EntryToken:
+ llvm_unreachable("EntryToken should have been excluded from the schedule!");
+ case ISD::MERGE_VALUES:
+ case ISD::TokenFactor: // fall thru
+ break;
+ case ISD::CopyToReg: {
+ unsigned SrcReg;
+ SDValue SrcVal = Node->getOperand(2);
+ if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
+ SrcReg = R->getReg();
+ else
+ SrcReg = getVR(SrcVal, VRBaseMap);
+
+ unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ if (SrcReg == DestReg) // Coalesced away the copy? Ignore.
+ break;
+
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TargetOpcode::COPY),
+ DestReg).addReg(SrcReg);
+ break;
+ }
+ case ISD::CopyFromReg: {
+ unsigned SrcReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ EmitCopyFromReg(Node, 0, IsClone, IsCloned, SrcReg, VRBaseMap);
+ break;
+ }
+ case ISD::EH_LABEL: {
+ MCSymbol *S = cast<EHLabelSDNode>(Node)->getLabel();
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+ TII->get(TargetOpcode::EH_LABEL)).addSym(S);
+ break;
+ }
+
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END: {
+ unsigned TarOp = (Node->getOpcode() == ISD::LIFETIME_START) ?
+ TargetOpcode::LIFETIME_START : TargetOpcode::LIFETIME_END;
+
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Node->getOperand(1));
+ BuildMI(*MBB, InsertPos, Node->getDebugLoc(), TII->get(TarOp))
+ .addFrameIndex(FI->getIndex());
+ break;
+ }
+
+ case ISD::INLINEASM: {
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
+ // Create the inline asm machine instruction.
+ MachineInstrBuilder MIB = BuildMI(*MF, Node->getDebugLoc(),
+ TII->get(TargetOpcode::INLINEASM));
+
+ // Add the asm string as an external symbol operand.
+ SDValue AsmStrV = Node->getOperand(InlineAsm::Op_AsmString);
+ const char *AsmStr = cast<ExternalSymbolSDNode>(AsmStrV)->getSymbol();
+ MIB.addExternalSymbol(AsmStr);
+
+ // Add the HasSideEffect, isAlignStack, AsmDialect, MayLoad and MayStore
+ // bits.
+ int64_t ExtraInfo =
+ cast<ConstantSDNode>(Node->getOperand(InlineAsm::Op_ExtraInfo))->
+ getZExtValue();
+ MIB.addImm(ExtraInfo);
+
+ // Remember the operand index of the group flags.
+ SmallVector<unsigned, 8> GroupIdx;
+
+ // Remember registers that are part of early-clobber defs.
+ SmallVector<unsigned, 8> ECRegs;
+
+ // Add all of the operand registers to the instruction.
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ const unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ GroupIdx.push_back(MIB->getNumOperands());
+ MIB.addImm(Flags);
+ ++i; // Skip the ID value.
+
+ switch (InlineAsm::getKind(Flags)) {
+ default: llvm_unreachable("Bad flags!");
+ case InlineAsm::Kind_RegDef:
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ // FIXME: Add dead flags for physical and virtual registers defined.
+ // For now, mark physical register defs as implicit to help fast
+ // regalloc. This makes inline asm look a lot like calls.
+ MIB.addReg(Reg, RegState::Define |
+ getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+ }
+ break;
+ case InlineAsm::Kind_RegDefEarlyClobber:
+ case InlineAsm::Kind_Clobber:
+ for (unsigned j = 0; j != NumVals; ++j, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ MIB.addReg(Reg, RegState::Define | RegState::EarlyClobber |
+ getImplRegState(TargetRegisterInfo::isPhysicalRegister(Reg)));
+ ECRegs.push_back(Reg);
+ }
+ break;
+ case InlineAsm::Kind_RegUse: // Use of register.
+ case InlineAsm::Kind_Imm: // Immediate.
+ case InlineAsm::Kind_Mem: // Addressing mode.
+ // The addressing mode has been selected, just add all of the
+ // operands to the machine instruction.
+ for (unsigned j = 0; j != NumVals; ++j, ++i)
+ AddOperand(MIB, Node->getOperand(i), 0, nullptr, VRBaseMap,
+ /*IsDebug=*/false, IsClone, IsCloned);
+
+ // Manually set isTied bits.
+ if (InlineAsm::getKind(Flags) == InlineAsm::Kind_RegUse) {
+ unsigned DefGroup = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, DefGroup)) {
+ unsigned DefIdx = GroupIdx[DefGroup] + 1;
+ unsigned UseIdx = GroupIdx.back() + 1;
+ for (unsigned j = 0; j != NumVals; ++j)
+ MIB->tieOperands(DefIdx + j, UseIdx + j);
+ }
+ }
+ break;
+ }
+ }
+
+ // GCC inline assembly allows input operands to also be early-clobber
+ // output operands (so long as the operand is written only after it's
+ // used), but this does not match the semantics of our early-clobber flag.
+ // If an early-clobber operand register is also an input operand register,
+ // then remove the early-clobber flag.
+ for (unsigned Reg : ECRegs) {
+ if (MIB->readsRegister(Reg, TRI)) {
+ MachineOperand *MO = MIB->findRegisterDefOperand(Reg, false, TRI);
+ assert(MO && "No def operand for clobbered register?");
+ MO->setIsEarlyClobber(false);
+ }
+ }
+
+ // Get the mdnode from the asm if it exists and add it to the instruction.
+ SDValue MDV = Node->getOperand(InlineAsm::Op_MDNode);
+ const MDNode *MD = cast<MDNodeSDNode>(MDV)->getMD();
+ if (MD)
+ MIB.addMetadata(MD);
+
+ MBB->insert(InsertPos, MIB);
+ break;
+ }
+ }
+}
+
+/// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+/// at the given position in the given block.
+InstrEmitter::InstrEmitter(MachineBasicBlock *mbb,
+ MachineBasicBlock::iterator insertpos)
+ : MF(mbb->getParent()), MRI(&MF->getRegInfo()),
+ TII(MF->getSubtarget().getInstrInfo()),
+ TRI(MF->getSubtarget().getRegisterInfo()),
+ TLI(MF->getSubtarget().getTargetLowering()), MBB(mbb),
+ InsertPos(insertpos) {}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
new file mode 100644
index 000000000000..8a8a1bbd18f7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -0,0 +1,145 @@
+//===- InstrEmitter.h - Emit MachineInstrs for the SelectionDAG -*- C++ -*--==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This declares the Emit routines for the SelectionDAG class, which creates
+// MachineInstrs based on the decisions of the SelectionDAG instruction
+// selection.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_INSTREMITTER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+
+namespace llvm {
+
+class MachineInstrBuilder;
+class MCInstrDesc;
+class SDDbgValue;
+
+class LLVM_LIBRARY_VISIBILITY InstrEmitter {
+ MachineFunction *MF;
+ MachineRegisterInfo *MRI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const TargetLowering *TLI;
+
+ MachineBasicBlock *MBB;
+ MachineBasicBlock::iterator InsertPos;
+
+ /// EmitCopyFromReg - Generate machine code for a CopyFromReg node or an
+ /// implicit physical register output.
+ void EmitCopyFromReg(SDNode *Node, unsigned ResNo,
+ bool IsClone, bool IsCloned,
+ unsigned SrcReg,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// getDstOfOnlyCopyToRegUse - If the only use of the specified result number
+ /// of the node is a CopyToReg, return its destination register. Return 0
+ /// otherwise.
+ unsigned getDstOfOnlyCopyToRegUse(SDNode *Node,
+ unsigned ResNo) const;
+
+ void CreateVirtualRegisters(SDNode *Node,
+ MachineInstrBuilder &MIB,
+ const MCInstrDesc &II,
+ bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// getVR - Return the virtual register corresponding to the specified result
+ /// of the specified node.
+ unsigned getVR(SDValue Op,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// AddRegisterOperand - Add the specified register as an operand to the
+ /// specified machine instr. Insert register copies if the register is
+ /// not in the required register class.
+ void AddRegisterOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned);
+
+ /// AddOperand - Add the specified operand to the specified machine instr. II
+ /// specifies the instruction information for the node, and IIOpNum is the
+ /// operand number (in the II) that we are adding. IIOpNum and II are used for
+ /// assertions only.
+ void AddOperand(MachineInstrBuilder &MIB,
+ SDValue Op,
+ unsigned IIOpNum,
+ const MCInstrDesc *II,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsDebug, bool IsClone, bool IsCloned);
+
+ /// ConstrainForSubReg - Try to constrain VReg to a register class that
+ /// supports SubIdx sub-registers. Emit a copy if that isn't possible.
+ /// Return the virtual register to use.
+ unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, MVT VT,
+ const DebugLoc &DL);
+
+ /// EmitSubregNode - Generate machine code for subreg nodes.
+ ///
+ void EmitSubregNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsClone, bool IsCloned);
+
+ /// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
+ /// COPY_TO_REGCLASS is just a normal copy, except that the destination
+ /// register is constrained to be in a particular register class.
+ ///
+ void EmitCopyToRegClassNode(SDNode *Node,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
+ ///
+ void EmitRegSequence(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+ bool IsClone, bool IsCloned);
+public:
+ /// CountResults - The results of target nodes have register or immediate
+ /// operands first, then an optional chain, and optional glue operands
+ /// (which do not go into the machine instrs).
+ static unsigned CountResults(SDNode *Node);
+
+ /// EmitDbgValue - Generate machine instruction for a dbg_value node.
+ ///
+ MachineInstr *EmitDbgValue(SDDbgValue *SD,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+
+ /// EmitNode - Generate machine code for a node and needed dependencies.
+ ///
+ void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap) {
+ if (Node->isMachineOpcode())
+ EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap);
+ else
+ EmitSpecialNode(Node, IsClone, IsCloned, VRBaseMap);
+ }
+
+ /// getBlock - Return the current basic block.
+ MachineBasicBlock *getBlock() { return MBB; }
+
+ /// getInsertPos - Return the current insertion position.
+ MachineBasicBlock::iterator getInsertPos() { return InsertPos; }
+
+ /// InstrEmitter - Construct an InstrEmitter and set it to start inserting
+ /// at the given position in the given block.
+ InstrEmitter(MachineBasicBlock *mbb, MachineBasicBlock::iterator insertpos);
+
+private:
+ void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+ void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
+ DenseMap<SDValue, unsigned> &VRBaseMap);
+};
+
+}
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
new file mode 100644
index 000000000000..81634096c1ba
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -0,0 +1,4422 @@
+//===-- LegalizeDAG.cpp - Implement SelectionDAG::Legalize ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::Legalize method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalizedag"
+
+namespace {
+
+struct FloatSignAsInt;
+
+//===----------------------------------------------------------------------===//
+/// This takes an arbitrary SelectionDAG as input and
+/// hacks on it until the target machine can handle it. This involves
+/// eliminating value sizes the machine cannot handle (promoting small sizes to
+/// large sizes or splitting up large values into small values) as well as
+/// eliminating operations the machine cannot handle.
+///
+/// This code also does a small amount of optimization and recognition of idioms
+/// as part of its processing. For example, if a target does not support a
+/// 'setcc' instruction efficiently, but does support the 'brcc' instruction,
+/// this will attempt to merge the setcc and branch instructions into brcc's.
+///
+class SelectionDAGLegalize {
+ const TargetMachine &TM;
+ const TargetLowering &TLI;
+ SelectionDAG &DAG;
+
+ /// \brief The set of nodes which have already been legalized. We hold a
+ /// reference to it in order to update as necessary on node deletion.
+ SmallPtrSetImpl<SDNode *> &LegalizedNodes;
+
+ /// \brief A set of all the nodes updated during legalization.
+ SmallSetVector<SDNode *, 16> *UpdatedNodes;
+
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+
+ // Libcall insertion helpers.
+
+public:
+ SelectionDAGLegalize(SelectionDAG &DAG,
+ SmallPtrSetImpl<SDNode *> &LegalizedNodes,
+ SmallSetVector<SDNode *, 16> *UpdatedNodes = nullptr)
+ : TM(DAG.getTarget()), TLI(DAG.getTargetLoweringInfo()), DAG(DAG),
+ LegalizedNodes(LegalizedNodes), UpdatedNodes(UpdatedNodes) {}
+
+ /// \brief Legalizes the given operation.
+ void LegalizeOp(SDNode *Node);
+
+private:
+ SDValue OptimizeFloatStore(StoreSDNode *ST);
+
+ void LegalizeLoadOps(SDNode *Node);
+ void LegalizeStoreOps(SDNode *Node);
+
+ /// Some targets cannot handle a variable
+ /// insertion index for the INSERT_VECTOR_ELT instruction. In this case, it
+ /// is necessary to spill the vector being inserted into to memory, perform
+ /// the insert there, and then read the result back.
+ SDValue PerformInsertVectorEltInMemory(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
+ SDValue ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val, SDValue Idx,
+ const SDLoc &dl);
+
+ /// Return a vector shuffle operation which
+ /// performs the same shuffle in terms of order of result bytes, but on a type
+ /// whose vector element type is narrower than the original shuffle type.
+ /// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
+ SDValue ShuffleWithNarrowerEltType(EVT NVT, EVT VT, const SDLoc &dl,
+ SDValue N1, SDValue N2,
+ ArrayRef<int> Mask) const;
+
+ bool LegalizeSetCCCondCode(EVT VT, SDValue &LHS, SDValue &RHS, SDValue &CC,
+ bool &NeedInvert, const SDLoc &dl);
+
+ SDValue ExpandLibCall(RTLIB::Libcall LC, SDNode *Node, bool isSigned);
+ SDValue ExpandLibCall(RTLIB::Libcall LC, EVT RetVT, const SDValue *Ops,
+ unsigned NumOps, bool isSigned, const SDLoc &dl);
+
+ std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+ SDNode *Node, bool isSigned);
+ SDValue ExpandFPLibCall(SDNode *Node, RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64, RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128);
+ SDValue ExpandIntLibCall(SDNode *Node, bool isSigned,
+ RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128);
+ void ExpandDivRemLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandSinCosLibCall(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+
+ SDValue EmitStackConvert(SDValue SrcOp, EVT SlotVT, EVT DestVT,
+ const SDLoc &dl);
+ SDValue ExpandBUILD_VECTOR(SDNode *Node);
+ SDValue ExpandSCALAR_TO_VECTOR(SDNode *Node);
+ void ExpandDYNAMIC_STACKALLOC(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results);
+ void getSignAsIntValue(FloatSignAsInt &State, const SDLoc &DL,
+ SDValue Value) const;
+ SDValue modifySignAsInt(const FloatSignAsInt &State, const SDLoc &DL,
+ SDValue NewIntValue) const;
+ SDValue ExpandFCOPYSIGN(SDNode *Node) const;
+ SDValue ExpandFABS(SDNode *Node) const;
+ SDValue ExpandLegalINT_TO_FP(bool isSigned, SDValue LegalOp, EVT DestVT,
+ const SDLoc &dl);
+ SDValue PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT, bool isSigned,
+ const SDLoc &dl);
+ SDValue PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT, bool isSigned,
+ const SDLoc &dl);
+
+ SDValue ExpandBITREVERSE(SDValue Op, const SDLoc &dl);
+ SDValue ExpandBSWAP(SDValue Op, const SDLoc &dl);
+ SDValue ExpandBitCount(unsigned Opc, SDValue Op, const SDLoc &dl);
+
+ SDValue ExpandExtractFromVectorThroughStack(SDValue Op);
+ SDValue ExpandInsertToVectorThroughStack(SDValue Op);
+ SDValue ExpandVectorBuildThroughStack(SDNode* Node);
+
+ SDValue ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP);
+ SDValue ExpandConstant(ConstantSDNode *CP);
+
+ // if ExpandNode returns false, LegalizeOp falls back to ConvertNodeToLibcall
+ bool ExpandNode(SDNode *Node);
+ void ConvertNodeToLibcall(SDNode *Node);
+ void PromoteNode(SDNode *Node);
+
+public:
+ // Node replacement helpers
+ void ReplacedNode(SDNode *N) {
+ LegalizedNodes.erase(N);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(N);
+ }
+ void ReplaceNode(SDNode *Old, SDNode *New) {
+ DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ assert(Old->getNumValues() == New->getNumValues() &&
+ "Replacing one node with another that produces a different number "
+ "of values!");
+ DAG.ReplaceAllUsesWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New);
+ ReplacedNode(Old);
+ }
+ void ReplaceNode(SDValue Old, SDValue New) {
+ DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG);
+ dbgs() << " with: "; New->dump(&DAG));
+
+ DAG.ReplaceAllUsesWith(Old, New);
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New.getNode());
+ ReplacedNode(Old.getNode());
+ }
+ void ReplaceNode(SDNode *Old, const SDValue *New) {
+ DEBUG(dbgs() << " ... replacing: "; Old->dump(&DAG));
+
+ DAG.ReplaceAllUsesWith(Old, New);
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
+ DEBUG(dbgs() << (i == 0 ? " with: "
+ : " and: ");
+ New[i]->dump(&DAG));
+ if (UpdatedNodes)
+ UpdatedNodes->insert(New[i].getNode());
+ }
+ ReplacedNode(Old);
+ }
+};
+}
+
+/// Return a vector shuffle operation which
+/// performs the same shuffle in terms of order of result bytes, but on a type
+/// whose vector element type is narrower than the original shuffle type.
+/// e.g. <v4i32> <0, 1, 0, 1> -> v8i16 <0, 1, 2, 3, 0, 1, 2, 3>
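+/// Undef lanes (-1) are simply widened to undef lanes; e.g. a v2i32 mask
+/// <1, -1> widened to v4i16 becomes <2, 3, -1, -1>.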
+SDValue SelectionDAGLegalize::ShuffleWithNarrowerEltType(
+ EVT NVT, EVT VT, const SDLoc &dl, SDValue N1, SDValue N2,
+ ArrayRef<int> Mask) const {
+ unsigned NumMaskElts = VT.getVectorNumElements();
+ unsigned NumDestElts = NVT.getVectorNumElements();
+ unsigned NumEltsGrowth = NumDestElts / NumMaskElts;
+
+ assert(NumEltsGrowth && "Cannot promote to vector type with fewer elts!");
+
+ if (NumEltsGrowth == 1)
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, Mask);
+
+ SmallVector<int, 8> NewMask;
+ for (unsigned i = 0; i != NumMaskElts; ++i) {
+ int Idx = Mask[i];
+ for (unsigned j = 0; j != NumEltsGrowth; ++j) {
+ if (Idx < 0)
+ NewMask.push_back(-1);
+ else
+ NewMask.push_back(Idx * NumEltsGrowth + j);
+ }
+ }
+ assert(NewMask.size() == NumDestElts && "Non-integer NumEltsGrowth?");
+ assert(TLI.isShuffleMaskLegal(NewMask, NVT) && "Shuffle not legal?");
+ return DAG.getVectorShuffle(NVT, dl, N1, N2, NewMask);
+}
+
+/// Expands the ConstantFP node to an integer constant or
+/// a load from the constant pool.
+SDValue
+SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
+ bool Extend = false;
+ SDLoc dl(CFP);
+
+ // If a FP immediate is precise when represented as a float and if the
+ // target can do an extending load from float to double, we put it into
+ // the constant pool as a float, even if it is statically typed as a
+ // double. This shrinks FP constants and canonicalizes them for targets where
+ // an FP extending load is the same cost as a normal load (such as on the x87
+ // fp stack or PPC FP unit).
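+ // For example, a double constant of 0.5 is exactly representable as a
+ // float, so it can be emitted as a 4-byte f32 constant-pool entry and
+ // materialized with an fp-extending load to f64.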
+ EVT VT = CFP->getValueType(0);
+ ConstantFP *LLVMC = const_cast<ConstantFP*>(CFP->getConstantFPValue());
+ if (!UseCP) {
+ assert((VT == MVT::f64 || VT == MVT::f32) && "Invalid type expansion");
+ return DAG.getConstant(LLVMC->getValueAPF().bitcastToAPInt(), dl,
+ (VT == MVT::f64) ? MVT::i64 : MVT::i32);
+ }
+
+ EVT OrigVT = VT;
+ EVT SVT = VT;
+ while (SVT != MVT::f32 && SVT != MVT::f16) {
+ SVT = (MVT::SimpleValueType)(SVT.getSimpleVT().SimpleTy - 1);
+ if (ConstantFPSDNode::isValueValidForType(SVT, CFP->getValueAPF()) &&
+ // Only do this if the target has a native EXTLOAD instruction from
+ // smaller type.
+ TLI.isLoadExtLegal(ISD::EXTLOAD, OrigVT, SVT) &&
+ TLI.ShouldShrinkFPConstant(OrigVT)) {
+ Type *SType = SVT.getTypeForEVT(*DAG.getContext());
+ LLVMC = cast<ConstantFP>(ConstantExpr::getFPTrunc(LLVMC, SType));
+ VT = SVT;
+ Extend = true;
+ }
+ }
+
+ SDValue CPIdx =
+ DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ if (Extend) {
+ SDValue Result = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), VT,
+ Alignment);
+ return Result;
+ }
+ SDValue Result = DAG.getLoad(
+ OrigVT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+ return Result;
+}
+
+/// Expands the Constant node to a load from the constant pool.
+SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
+ SDLoc dl(CP);
+ EVT VT = CP->getValueType(0);
+ SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(),
+ TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ SDValue Result = DAG.getLoad(
+ VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
+ return Result;
+}
+
+/// Some targets cannot handle a variable insertion index for the
+/// INSERT_VECTOR_ELT instruction. In this case, it
+/// is necessary to spill the vector being inserted into to memory, perform
+/// the insert there, and then read the result back.
+SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
+ SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
+ SDValue Tmp1 = Vec;
+ SDValue Tmp2 = Val;
+ SDValue Tmp3 = Idx;
+
+ // If the target doesn't support this, we have to spill the input vector
+ // to a temporary stack slot, update the element, then reload it. This is
+ // badness. We could instead load the value into a vector register (either
+ // with a "move to register" or "extload into register" instruction), then
+ // permute it into place, if the idx is a constant and the target supports
+ // doing so.
+ EVT VT = Tmp1.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ EVT IdxVT = Tmp3.getValueType();
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue StackPtr = DAG.CreateStackTemporary(VT);
+
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+
+ // Store the vector.
+ SDValue Ch = DAG.getStore(
+ DAG.getEntryNode(), dl, Tmp1, StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
+
+ // Truncate or zero extend offset to target pointer type.
+ Tmp3 = DAG.getZExtOrTrunc(Tmp3, dl, PtrVT);
+ // Add the offset to the index.
+ unsigned EltSize = EltVT.getSizeInBits()/8;
+ Tmp3 = DAG.getNode(ISD::MUL, dl, IdxVT, Tmp3,
+ DAG.getConstant(EltSize, dl, IdxVT));
+ SDValue StackPtr2 = DAG.getNode(ISD::ADD, dl, IdxVT, Tmp3, StackPtr);
+ // Store the scalar value.
+ Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT);
+ // Load the updated vector.
+ return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), SPFI));
+}
+
+SDValue SelectionDAGLegalize::ExpandINSERT_VECTOR_ELT(SDValue Vec, SDValue Val,
+ SDValue Idx,
+ const SDLoc &dl) {
+ if (ConstantSDNode *InsertPos = dyn_cast<ConstantSDNode>(Idx)) {
+ // SCALAR_TO_VECTOR requires that the type of the value being inserted
+ // match the element type of the vector being created, except for
+ // integers, in which case the inserted value may be wider than the element type.
+ EVT EltVT = Vec.getValueType().getVectorElementType();
+ if (Val.getValueType() == EltVT ||
+ (EltVT.isInteger() && Val.getValueType().bitsGE(EltVT))) {
+ SDValue ScVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl,
+ Vec.getValueType(), Val);
+
+ unsigned NumElts = Vec.getValueType().getVectorNumElements();
+ // We generate a shuffle of InVec and ScVec, so the shuffle mask
+ // should be 0,1,2,3,4,5... with the appropriate element replaced with
+ // elt 0 of the RHS.
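+ // For example, inserting into element 2 of a 4-element vector yields the
+ // mask <0, 1, 4, 3>, where index 4 selects element 0 of ScVec.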
+ SmallVector<int, 8> ShufOps;
+ for (unsigned i = 0; i != NumElts; ++i)
+ ShufOps.push_back(i != InsertPos->getZExtValue() ? i : NumElts);
+
+ return DAG.getVectorShuffle(Vec.getValueType(), dl, Vec, ScVec, ShufOps);
+ }
+ }
+ return PerformInsertVectorEltInMemory(Vec, Val, Idx, dl);
+}
+
+SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
+ // Turn 'store float 1.0, Ptr' -> 'store int 0x3F800000, Ptr'
+ // FIXME: We shouldn't do this for TargetConstantFP's.
+ // FIXME: move this to the DAG Combiner! Note that we can't regress due
+ // to phase ordering between legalized code and the dag combiner. This
+ // probably means that we need to integrate dag combiner and legalizer
+ // together.
+ // We generally can't do this one for long doubles.
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ unsigned Alignment = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+ SDLoc dl(ST);
+ if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) {
+ if (CFP->getValueType(0) == MVT::f32 &&
+ TLI.isTypeLegal(MVT::i32)) {
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().
+ bitcastToAPInt().zextOrTrunc(32),
+ SDLoc(CFP), MVT::i32);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), Alignment,
+ MMOFlags, AAInfo);
+ }
+
+ if (CFP->getValueType(0) == MVT::f64) {
+ // If this target supports 64-bit registers, do a single 64-bit store.
+ if (TLI.isTypeLegal(MVT::i64)) {
+ SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
+ zextOrTrunc(64), SDLoc(CFP), MVT::i64);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
+ }
+
+ if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
+ // Otherwise, if the target supports 32-bit registers, use 2 32-bit
+ // stores. If the target supports neither 32- nor 64-bits, this
+ // xform is certainly not worth it.
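+ // For example, on a little-endian target a store of double 1.0
+ // (bit pattern 0x3FF0000000000000) becomes an i32 store of 0x00000000 at
+ // offset 0 and an i32 store of 0x3FF00000 at offset 4.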
+ const APInt &IntVal = CFP->getValueAPF().bitcastToAPInt();
+ SDValue Lo = DAG.getConstant(IntVal.trunc(32), dl, MVT::i32);
+ SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), dl, MVT::i32);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), Alignment,
+ MMOFlags, AAInfo);
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(4, dl, Ptr.getValueType()));
+ Hi = DAG.getStore(Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(4),
+ MinAlign(Alignment, 4U), MMOFlags, AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ }
+ }
+ }
+ return SDValue(nullptr, 0);
+}
+
+void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
+ StoreSDNode *ST = cast<StoreSDNode>(Node);
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDLoc dl(Node);
+
+ unsigned Alignment = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+
+ if (!ST->isTruncatingStore()) {
+ if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) {
+ ReplaceNode(ST, OptStore);
+ return;
+ }
+
+ {
+ SDValue Value = ST->getValue();
+ MVT VT = Value.getSimpleValueType();
+ switch (TLI.getOperationAction(ISD::STORE, VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ EVT MemVT = ST->getMemoryVT();
+ unsigned AS = ST->getAddressSpace();
+ unsigned Align = ST->getAlignment();
+ const DataLayout &DL = DAG.getDataLayout();
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ }
+ break;
+ }
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res && Res != SDValue(Node, 0))
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Promote: {
+ MVT NVT = TLI.getTypeToPromoteTo(ISD::STORE, VT);
+ assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Can only promote stores to same size type");
+ Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
+ SDValue Result =
+ DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+ return;
+ }
+ } else {
+ SDValue Value = ST->getValue();
+
+ EVT StVT = ST->getMemoryVT();
+ unsigned StWidth = StVT.getSizeInBits();
+ auto &DL = DAG.getDataLayout();
+
+ if (StWidth != StVT.getStoreSizeInBits()) {
+ // Promote to a byte-sized store with upper bits zero if not
+ // storing an integral number of bytes. For example, promote
+ // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1)
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+ StVT.getStoreSizeInBits());
+ Value = DAG.getZeroExtendInReg(Value, dl, StVT);
+ SDValue Result =
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
+ Alignment, MMOFlags, AAInfo);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else if (StWidth & (StWidth - 1)) {
+ // If not storing a power-of-2 number of bits, expand as two stores.
+ assert(!StVT.isVector() && "Unsupported truncstore!");
+ unsigned RoundWidth = 1 << Log2_32(StWidth);
+ assert(RoundWidth < StWidth);
+ unsigned ExtraWidth = StWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Store size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi;
+ unsigned IncrementSize;
+
+ if (DL.isLittleEndian()) {
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
+ // Store the bottom RoundWidth bits.
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ RoundVT, Alignment, MMOFlags, AAInfo);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl,
+ Ptr.getValueType()));
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
+ Hi = DAG.getTruncStore(
+ Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ } else {
+ // Big endian - avoid unaligned stores.
+ // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
+ // Store the top RoundWidth bits.
+ Hi = DAG.getNode(
+ ISD::SRL, dl, Value.getValueType(), Value,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Value.getValueType(), DL)));
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
+ RoundVT, Alignment, MMOFlags, AAInfo);
+
+ // Store the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl,
+ Ptr.getValueType()));
+ Lo = DAG.getTruncStore(
+ Chain, dl, Value, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ }
+
+ // The order of the stores doesn't matter.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ ReplaceNode(SDValue(Node, 0), Result);
+ } else {
+ switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ EVT MemVT = ST->getMemoryVT();
+ unsigned AS = ST->getAddressSpace();
+ unsigned Align = ST->getAlignment();
+ // If this is an unaligned store and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ SDValue Result = TLI.expandUnalignedStore(ST, DAG);
+ ReplaceNode(SDValue(ST, 0), Result);
+ }
+ break;
+ }
+ case TargetLowering::Custom: {
+ SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG);
+ if (Res && Res != SDValue(Node, 0))
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+ case TargetLowering::Expand:
+ assert(!StVT.isVector() &&
+ "Vector Stores are handled in LegalizeVectorOps");
+
+ // TRUNCSTORE:i16 i32 -> STORE i16
+ assert(TLI.isTypeLegal(StVT) &&
+ "Do not know how to expand this store!");
+ Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
+ SDValue Result =
+ DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ Alignment, MMOFlags, AAInfo);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ }
+ }
+}
+
+void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
+ LoadSDNode *LD = cast<LoadSDNode>(Node);
+ SDValue Chain = LD->getChain(); // The chain.
+ SDValue Ptr = LD->getBasePtr(); // The base pointer.
+ SDValue Value; // The value returned by the load op.
+ SDLoc dl(Node);
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (ExtType == ISD::NON_EXTLOAD) {
+ MVT VT = Node->getSimpleValueType(0);
+ SDValue RVal = SDValue(Node, 0);
+ SDValue RChain = SDValue(Node, 1);
+
+ switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal: {
+ EVT MemVT = LD->getMemoryVT();
+ unsigned AS = LD->getAddressSpace();
+ unsigned Align = LD->getAlignment();
+ const DataLayout &DL = DAG.getDataLayout();
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ std::tie(RVal, RChain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
+ break;
+ }
+ case TargetLowering::Custom: {
+ if (SDValue Res = TLI.LowerOperation(RVal, DAG)) {
+ RVal = Res;
+ RChain = Res.getValue(1);
+ }
+ break;
+ }
+ case TargetLowering::Promote: {
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT);
+ assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
+ "Can only promote loads to same size type");
+
+ SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getMemOperand());
+ RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res);
+ RChain = Res.getValue(1);
+ break;
+ }
+ }
+ if (RChain.getNode() != Node) {
+ assert(RVal.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(RVal.getNode());
+ UpdatedNodes->insert(RChain.getNode());
+ }
+ ReplacedNode(Node);
+ }
+ return;
+ }
+
+ EVT SrcVT = LD->getMemoryVT();
+ unsigned SrcWidth = SrcVT.getSizeInBits();
+ unsigned Alignment = LD->getAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ if (SrcWidth != SrcVT.getStoreSizeInBits() &&
+ // Some targets pretend to have an i1 loading operation, and actually
+ // load an i8. This trick is correct for ZEXTLOAD because the top 7
+ // bits are guaranteed to be zero; it helps the optimizers understand
+ // that these bits are zero. It is also useful for EXTLOAD, since it
+ // tells the optimizers that those bits are undefined. It would be
+ // nice to have an effective generic way of getting these benefits...
+ // Until such a way is found, don't insist on promoting i1 here.
+ (SrcVT != MVT::i1 ||
+ TLI.getLoadExtAction(ExtType, Node->getValueType(0), MVT::i1) ==
+ TargetLowering::Promote)) {
+ // Promote to a byte-sized load if not loading an integral number of
+ // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24.
+ unsigned NewWidth = SrcVT.getStoreSizeInBits();
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth);
+ SDValue Ch;
+
+ // The extra bits are guaranteed to be zero, since we stored them that
+ // way. A zext load from NVT thus automatically gives zext from SrcVT.
+
+ ISD::LoadExtType NewExtType =
+ ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
+
+ SDValue Result =
+ DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo);
+
+ Ch = Result.getValue(1); // The chain.
+
+ if (ExtType == ISD::SEXTLOAD)
+ // Having the top bits zero doesn't help when sign extending.
+ Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType())
+ // All the top bits are guaranteed to be zero - inform the optimizers.
+ Result = DAG.getNode(ISD::AssertZext, dl,
+ Result.getValueType(), Result,
+ DAG.getValueType(SrcVT));
+
+ Value = Result;
+ Chain = Ch;
+ } else if (SrcWidth & (SrcWidth - 1)) {
+ // If not loading a power-of-2 number of bits, expand as two loads.
+ assert(!SrcVT.isVector() && "Unsupported extload!");
+ unsigned RoundWidth = 1 << Log2_32(SrcWidth);
+ assert(RoundWidth < SrcWidth);
+ unsigned ExtraWidth = SrcWidth - RoundWidth;
+ assert(ExtraWidth < RoundWidth);
+ assert(!(RoundWidth % 8) && !(ExtraWidth % 8) &&
+ "Load size not an integral number of bytes!");
+ EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth);
+ EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth);
+ SDValue Lo, Hi, Ch;
+ unsigned IncrementSize;
+ auto &DL = DAG.getDataLayout();
+
+ if (DL.isLittleEndian()) {
+ // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
+ // Load the bottom RoundWidth bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
+ AAInfo);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl,
+ Ptr.getValueType()));
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
+ AAInfo);
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(RoundWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ } else {
+ // Big endian - avoid unaligned loads.
+ // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
+ // Load the top RoundWidth bits.
+ Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
+ AAInfo);
+
+ // Load the remaining ExtraWidth bits.
+ IncrementSize = RoundWidth / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl,
+ Ptr.getValueType()));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
+ AAInfo);
+
+ // Build a factor node to remember that this load is independent of
+ // the other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Move the top bits to the right place.
+ Hi = DAG.getNode(
+ ISD::SHL, dl, Hi.getValueType(), Hi,
+ DAG.getConstant(ExtraWidth, dl,
+ TLI.getShiftAmountTy(Hi.getValueType(), DL)));
+
+ // Join the hi and lo parts.
+ Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi);
+ }
+
+ Chain = Ch;
+ } else {
+ bool isCustom = false;
+ switch (TLI.getLoadExtAction(ExtType, Node->getValueType(0),
+ SrcVT.getSimpleVT())) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Custom:
+ isCustom = true;
+ // FALLTHROUGH
+ case TargetLowering::Legal: {
+ Value = SDValue(Node, 0);
+ Chain = SDValue(Node, 1);
+
+ if (isCustom) {
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
+ Value = Res;
+ Chain = Res.getValue(1);
+ }
+ } else {
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ EVT MemVT = LD->getMemoryVT();
+ unsigned AS = LD->getAddressSpace();
+ unsigned Align = LD->getAlignment();
+ const DataLayout &DL = DAG.getDataLayout();
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DL, MemVT, AS, Align)) {
+ std::tie(Value, Chain) = TLI.expandUnalignedLoad(LD, DAG);
+ }
+ }
+ break;
+ }
+ case TargetLowering::Expand:
+ EVT DestVT = Node->getValueType(0);
+ if (!TLI.isLoadExtLegal(ISD::EXTLOAD, DestVT, SrcVT)) {
+ // If the source type is not legal, see if there is a legal extload to
+ // an intermediate type that we can then extend further.
+ EVT LoadVT = TLI.getRegisterType(SrcVT.getSimpleVT());
+ if (TLI.isTypeLegal(SrcVT) || // Same as SrcVT == LoadVT?
+ TLI.isLoadExtLegal(ExtType, LoadVT, SrcVT)) {
+ // If we are loading a legal type, this is a non-extload followed by a
+ // full extend.
+ ISD::LoadExtType MidExtType =
+ (LoadVT == SrcVT) ? ISD::NON_EXTLOAD : ExtType;
+
+ SDValue Load = DAG.getExtLoad(MidExtType, dl, LoadVT, Chain, Ptr,
+ SrcVT, LD->getMemOperand());
+ unsigned ExtendOp =
+ ISD::getExtForLoadExtType(SrcVT.isFloatingPoint(), ExtType);
+ Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load);
+ Chain = Load.getValue(1);
+ break;
+ }
+
+ // Handle the special case of fp16 extloads. EXTLOAD doesn't have the
+ // normal undefined upper bits behavior to allow using an in-reg extend
+ // with the illegal FP type, so load as an integer and do the
+ // from-integer conversion.
+ if (SrcVT.getScalarType() == MVT::f16) {
+ EVT ISrcVT = SrcVT.changeTypeToInteger();
+ EVT IDestVT = DestVT.changeTypeToInteger();
+ EVT LoadVT = TLI.getRegisterType(IDestVT.getSimpleVT());
+
+ SDValue Result = DAG.getExtLoad(ISD::ZEXTLOAD, dl, LoadVT,
+ Chain, Ptr, ISrcVT,
+ LD->getMemOperand());
+ Value = DAG.getNode(ISD::FP16_TO_FP, dl, DestVT, Result);
+ Chain = Result.getValue(1);
+ break;
+ }
+ }
+
+ assert(!SrcVT.isVector() &&
+ "Vector Loads are handled in LegalizeVectorOps");
+
+ // FIXME: This does not work for vectors on most targets. Sign-
+ // and zero-extend operations are currently folded into extending
+ // loads, whether they are legal or not, and then we end up here
+ // without any support for legalizing them.
+ assert(ExtType != ISD::EXTLOAD &&
+ "EXTLOAD should always be supported!");
+ // Turn the unsupported load into an EXTLOAD followed by an
+ // explicit zero/sign extend inreg.
+ SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl,
+ Node->getValueType(0),
+ Chain, Ptr, SrcVT,
+ LD->getMemOperand());
+ SDValue ValRes;
+ if (ExtType == ISD::SEXTLOAD)
+ ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl,
+ Result.getValueType(),
+ Result, DAG.getValueType(SrcVT));
+ else
+ ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
+ Value = ValRes;
+ Chain = Result.getValue(1);
+ break;
+ }
+ }
+
+ // Since loads produce two values, make sure to remember that we legalized
+ // both of them.
+ if (Chain.getNode() != Node) {
+ assert(Value.getNode() != Node && "Load must be completely replaced");
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(Value.getNode());
+ UpdatedNodes->insert(Chain.getNode());
+ }
+ ReplacedNode(Node);
+ }
+}
+
+/// Return a legal replacement for the given operation, with all legal operands.
+void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
+ DEBUG(dbgs() << "\nLegalizing: "; Node->dump(&DAG));
+
+ if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes.
+ return;
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ assert((TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) ==
+ TargetLowering::TypeLegal ||
+ TLI.isTypeLegal(Node->getValueType(i))) &&
+ "Unexpected illegal type!");
+
+ for (const SDValue &Op : Node->op_values())
+ assert((TLI.getTypeAction(*DAG.getContext(), Op.getValueType()) ==
+ TargetLowering::TypeLegal ||
+ TLI.isTypeLegal(Op.getValueType()) ||
+ Op.getOpcode() == ISD::TargetConstant) &&
+ "Unexpected illegal type!");
+#endif
+
+ // Figure out the correct action; the way to query this varies by opcode
+ TargetLowering::LegalizeAction Action = TargetLowering::Legal;
+ bool SimpleFinishLegalizing = true;
+ switch (Node->getOpcode()) {
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ case ISD::STACKSAVE:
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+ break;
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ break;
+ case ISD::VAARG:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ if (Action != TargetLowering::Promote)
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::Other);
+ break;
+ case ISD::FP_TO_FP16:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::EXTRACT_VECTOR_ELT:
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(0).getValueType());
+ break;
+ case ISD::FP_ROUND_INREG:
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT InnerType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Action = TLI.getOperationAction(Node->getOpcode(), InnerType);
+ break;
+ }
+ case ISD::ATOMIC_STORE: {
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getOperand(2).getValueType());
+ break;
+ }
+ case ISD::SELECT_CC:
+ case ISD::SETCC:
+ case ISD::BR_CC: {
+ unsigned CCOperand = Node->getOpcode() == ISD::SELECT_CC ? 4 :
+ Node->getOpcode() == ISD::SETCC ? 2 :
+ Node->getOpcode() == ISD::SETCCE ? 3 : 1;
+ unsigned CompareOperand = Node->getOpcode() == ISD::BR_CC ? 2 : 0;
+ MVT OpVT = Node->getOperand(CompareOperand).getSimpleValueType();
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(CCOperand))->get();
+ Action = TLI.getCondCodeAction(CCCode, OpVT);
+ if (Action == TargetLowering::Legal) {
+ if (Node->getOpcode() == ISD::SELECT_CC)
+ Action = TLI.getOperationAction(Node->getOpcode(),
+ Node->getValueType(0));
+ else
+ Action = TLI.getOperationAction(Node->getOpcode(), OpVT);
+ }
+ break;
+ }
+ case ISD::LOAD:
+ case ISD::STORE:
+ // FIXME: Model these properly. LOAD and STORE are complicated, and
+ // STORE expects the unlegalized operand in some cases.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END:
+ // FIXME: This shouldn't be necessary. These nodes have special properties
+ // dealing with the recursive nature of legalization. Removing this
+ // special case should be done as part of making LegalizeDAG non-recursive.
+ SimpleFinishLegalizing = false;
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ case ISD::FLT_ROUNDS_:
+ case ISD::FPOWI:
+ case ISD::MERGE_VALUES:
+ case ISD::EH_RETURN:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ case ISD::EH_SJLJ_SETJMP:
+ case ISD::EH_SJLJ_LONGJMP:
+ case ISD::EH_SJLJ_SETUP_DISPATCH:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be expanded.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Expand;
+ break;
+ case ISD::INIT_TRAMPOLINE:
+ case ISD::ADJUST_TRAMPOLINE:
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ // These operations lie about being legal: when they claim to be legal,
+ // they should actually be custom-lowered.
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Legal)
+ Action = TargetLowering::Custom;
+ break;
+ case ISD::READCYCLECOUNTER:
+ // READCYCLECOUNTER returns an i64, even if type legalization might have
+ // expanded that to several smaller types.
+ Action = TLI.getOperationAction(Node->getOpcode(), MVT::i64);
+ break;
+ case ISD::READ_REGISTER:
+ case ISD::WRITE_REGISTER:
+ // Named registers are legal in the DAG, but are blocked by register-name
+ // selection if not implemented by the target (to choose the correct
+ // register). They'll be converted to Copy(To/From)Reg.
+ Action = TargetLowering::Legal;
+ break;
+ case ISD::DEBUGTRAP:
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ if (Action == TargetLowering::Expand) {
+ // replace ISD::DEBUGTRAP with ISD::TRAP
+ SDValue NewVal;
+ NewVal = DAG.getNode(ISD::TRAP, SDLoc(Node), Node->getVTList(),
+ Node->getOperand(0));
+ ReplaceNode(Node, NewVal.getNode());
+ LegalizeOp(NewVal.getNode());
+ return;
+ }
+ break;
+
+ default:
+ if (Node->getOpcode() >= ISD::BUILTIN_OP_END) {
+ Action = TargetLowering::Legal;
+ } else {
+ Action = TLI.getOperationAction(Node->getOpcode(), Node->getValueType(0));
+ }
+ break;
+ }
+
+ if (SimpleFinishLegalizing) {
+ SDNode *NewNode = Node;
+ switch (Node->getOpcode()) {
+ default: break;
+ case ISD::SHL:
+ case ISD::SRL:
+ case ISD::SRA:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ // Legalizing shifts/rotates requires adjusting the shift amount
+ // to the appropriate width.
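+ // For example, an i64 shift whose amount was built as an i64 value may
+ // need that amount zero-extended or truncated to the target's preferred
+ // shift-amount type before the shift itself is legalized.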
+ if (!Node->getOperand(1).getValueType().isVector()) {
+ SDValue SAO =
+ DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+ Node->getOperand(1));
+ HandleSDNode Handle(SAO);
+ LegalizeOp(SAO.getNode());
+ NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+ Handle.getValue());
+ }
+ break;
+ case ISD::SRL_PARTS:
+ case ISD::SRA_PARTS:
+ case ISD::SHL_PARTS:
+ // Legalizing shifts/rotates requires adjusting the shift amount
+ // to the appropriate width.
+ if (!Node->getOperand(2).getValueType().isVector()) {
+ SDValue SAO =
+ DAG.getShiftAmountOperand(Node->getOperand(0).getValueType(),
+ Node->getOperand(2));
+ HandleSDNode Handle(SAO);
+ LegalizeOp(SAO.getNode());
+ NewNode = DAG.UpdateNodeOperands(Node, Node->getOperand(0),
+ Node->getOperand(1),
+ Handle.getValue());
+ }
+ break;
+ }
+
+ if (NewNode != Node) {
+ ReplaceNode(Node, NewNode);
+ Node = NewNode;
+ }
+ switch (Action) {
+ case TargetLowering::Legal:
+ return;
+ case TargetLowering::Custom: {
+ // FIXME: The handling for custom lowering with multiple results is
+ // a complete mess.
+ if (SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG)) {
+ if (!(Res.getNode() != Node || Res.getResNo() != 0))
+ return;
+
+ if (Node->getNumValues() == 1) {
+ // We can just directly replace this node with the lowered value.
+ ReplaceNode(SDValue(Node, 0), Res);
+ return;
+ }
+
+ SmallVector<SDValue, 8> ResultVals;
+ for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i)
+ ResultVals.push_back(Res.getValue(i));
+ ReplaceNode(Node, ResultVals.data());
+ return;
+ }
+ }
+ // FALL THROUGH
+ case TargetLowering::Expand:
+ if (ExpandNode(Node))
+ return;
+ // FALL THROUGH
+ case TargetLowering::LibCall:
+ ConvertNodeToLibcall(Node);
+ return;
+ case TargetLowering::Promote:
+ PromoteNode(Node);
+ return;
+ }
+ }
+
+ switch (Node->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "NODE: ";
+ Node->dump( &DAG);
+ dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to legalize this operator!");
+
+ case ISD::CALLSEQ_START:
+ case ISD::CALLSEQ_END:
+ break;
+ case ISD::LOAD: {
+ return LegalizeLoadOps(Node);
+ }
+ case ISD::STORE: {
+ return LegalizeStoreOps(Node);
+ }
+ }
+}
+
+SDValue SelectionDAGLegalize::ExpandExtractFromVectorThroughStack(SDValue Op) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ SDLoc dl(Op);
+
+ // Before we generate a new store to a temporary stack slot, see if there is
+ // already one that we can use. There often is because when we scalarize
+ // vector operations (using SelectionDAG::UnrollVectorOp for example) a whole
+ // series of EXTRACT_VECTOR_ELT nodes are generated, one for each element in
+ // the vector. If all are expanded here, we don't want one store per vector
+ // element.
+
+ // Caches for hasPredecessorHelper
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(Idx.getNode());
+ SDValue StackPtr, Ch;
+ for (SDNode::use_iterator UI = Vec.getNode()->use_begin(),
+ UE = Vec.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (StoreSDNode *ST = dyn_cast<StoreSDNode>(User)) {
+ if (ST->isIndexed() || ST->isTruncatingStore() ||
+ ST->getValue() != Vec)
+ continue;
+
+ // Make sure that nothing else could have stored into the destination of
+ // this store.
+ if (!ST->getChain().reachesChainWithoutSideEffects(DAG.getEntryNode()))
+ continue;
+
+ // If the index is dependent on the store we will introduce a cycle when
+ // creating the load (the load uses the index, and by replacing the chain
+ // we will make the index dependent on the load).
+ if (SDNode::hasPredecessorHelper(ST, Visited, Worklist))
+ continue;
+
+ StackPtr = ST->getBasePtr();
+ Ch = SDValue(ST, 0);
+ break;
+ }
+ }
+
+ if (!Ch.getNode()) {
+ // Store the value to a temporary stack slot, then LOAD the returned part.
+ StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+ Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr,
+ MachinePointerInfo());
+ }
+
+ // Add the offset to the index.
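+ // For example, extracting element 3 of a v4i32 scales the index by the
+ // 4-byte element size, giving a byte offset of 12 into the stack slot.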
+ unsigned EltSize =
+ Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+ Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType()));
+
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout()));
+ StackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, StackPtr);
+
+ SDValue NewLoad;
+
+ if (Op.getValueType().isVector())
+ NewLoad =
+ DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, MachinePointerInfo());
+ else
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, dl, Op.getValueType(), Ch, StackPtr,
+ MachinePointerInfo(),
+ Vec.getValueType().getVectorElementType());
+
+ // Replace the chain going out of the store with the one out of the load.
+ DAG.ReplaceAllUsesOfValueWith(Ch, SDValue(NewLoad.getNode(), 1));
+
+ // We introduced a cycle though, so update the load's operands, making sure
+ // to use the original store's chain as an incoming chain.
+ SmallVector<SDValue, 6> NewLoadOperands(NewLoad->op_begin(),
+ NewLoad->op_end());
+ NewLoadOperands[0] = Ch;
+ NewLoad =
+ SDValue(DAG.UpdateNodeOperands(NewLoad.getNode(), NewLoadOperands), 0);
+ return NewLoad;
+}
+
+SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
+ assert(Op.getValueType().isVector() && "Non-vector insert subvector!");
+
+ SDValue Vec = Op.getOperand(0);
+ SDValue Part = Op.getOperand(1);
+ SDValue Idx = Op.getOperand(2);
+ SDLoc dl(Op);
+
+ // Store the value to a temporary stack slot, then LOAD the returned part.
+
+ SDValue StackPtr = DAG.CreateStackTemporary(Vec.getValueType());
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+ // First store the whole vector.
+ SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+
+ // Then store the inserted part.
+
+ // Add the offset to the index.
+ unsigned EltSize =
+ Vec.getValueType().getVectorElementType().getSizeInBits()/8;
+
+ Idx = DAG.getNode(ISD::MUL, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(EltSize, SDLoc(Vec), Idx.getValueType()));
+ Idx = DAG.getZExtOrTrunc(Idx, dl, TLI.getPointerTy(DAG.getDataLayout()));
+
+ SDValue SubStackPtr = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+ StackPtr);
+
+ // Store the subvector.
+ Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo());
+
+ // Finally, load the updated vector.
+ return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
+}
+
+SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
+ // We can't handle this case efficiently. Allocate a sufficiently
+ // aligned object on the stack, store each element into it, then load
+ // the result as a vector.
+ // Create the stack frame object.
+ EVT VT = Node->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(Node);
+ SDValue FIPtr = DAG.CreateStackTemporary(VT);
+ int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI);
+
+ // Emit a store of each element to the stack slot.
+ SmallVector<SDValue, 8> Stores;
+ unsigned TypeByteSize = EltVT.getSizeInBits() / 8;
+ // Store (in the right endianness) the elements to memory.
+ for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
+ // Ignore undef elements.
+ if (Node->getOperand(i).isUndef()) continue;
+
+ unsigned Offset = TypeByteSize*i;
+
+ SDValue Idx = DAG.getConstant(Offset, dl, FIPtr.getValueType());
+ Idx = DAG.getNode(ISD::ADD, dl, FIPtr.getValueType(), FIPtr, Idx);
+
+ // If the destination vector element type is narrower than the source
+ // element type, only store the bits necessary.
+ if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
+ Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
+ Node->getOperand(i), Idx,
+ PtrInfo.getWithOffset(Offset), EltVT));
+ } else
+ Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i),
+ Idx, PtrInfo.getWithOffset(Offset)));
+ }
+
+ SDValue StoreChain;
+ if (!Stores.empty()) // Not all undef elements?
+ StoreChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ else
+ StoreChain = DAG.getEntryNode();
+
+ // Result is a load from the stack slot.
+ return DAG.getLoad(VT, dl, StoreChain, FIPtr, PtrInfo);
+}
+
+namespace {
+/// Keeps track of state when getting the sign of a floating-point value as an
+/// integer.
+struct FloatSignAsInt {
+ EVT FloatVT;
+ SDValue Chain;
+ SDValue FloatPtr;
+ SDValue IntPtr;
+ MachinePointerInfo IntPointerInfo;
+ MachinePointerInfo FloatPointerInfo;
+ SDValue IntValue;
+ APInt SignMask;
+ uint8_t SignBit;
+};
+}
+
+/// Bitcast a floating-point value to an integer value. Only bitcast the part
+/// containing the sign bit if the target has no integer value capable of
+/// holding all bits of the floating-point value.
+void SelectionDAGLegalize::getSignAsIntValue(FloatSignAsInt &State,
+ const SDLoc &DL,
+ SDValue Value) const {
+ EVT FloatVT = Value.getValueType();
+ unsigned NumBits = FloatVT.getSizeInBits();
+ State.FloatVT = FloatVT;
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), NumBits);
+ // Convert to an integer of the same size.
+ if (TLI.isTypeLegal(IVT)) {
+ State.IntValue = DAG.getNode(ISD::BITCAST, DL, IVT, Value);
+ State.SignMask = APInt::getSignBit(NumBits);
+ State.SignBit = NumBits - 1;
+ return;
+ }
+
+ auto &DataLayout = DAG.getDataLayout();
+ // Store the float to memory, then load the sign part out as an integer.
+ MVT LoadTy = TLI.getRegisterType(*DAG.getContext(), MVT::i8);
+ // First create a temporary that is aligned for both the load and store.
+ SDValue StackPtr = DAG.CreateStackTemporary(FloatVT, LoadTy);
+ int FI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ // Then store the float to it.
+ State.FloatPtr = StackPtr;
+ MachineFunction &MF = DAG.getMachineFunction();
+ State.FloatPointerInfo = MachinePointerInfo::getFixedStack(MF, FI);
+ State.Chain = DAG.getStore(DAG.getEntryNode(), DL, Value, State.FloatPtr,
+ State.FloatPointerInfo);
+
+ SDValue IntPtr;
+ if (DataLayout.isBigEndian()) {
+ assert(FloatVT.isByteSized() && "Unsupported floating point type!");
+ // Load out a legal integer with the same sign bit as the float.
+ IntPtr = StackPtr;
+ State.IntPointerInfo = State.FloatPointerInfo;
+ } else {
+ // Advance the pointer so that the loaded byte will contain the sign bit.
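+ // For example, for a little-endian f64 the sign bit lives in byte 7 of
+ // the stored value, so ByteOffset is 7.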
+ unsigned ByteOffset = (FloatVT.getSizeInBits() / 8) - 1;
+ IntPtr = DAG.getNode(ISD::ADD, DL, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(ByteOffset, DL, StackPtr.getValueType()));
+ State.IntPointerInfo = MachinePointerInfo::getFixedStack(MF, FI,
+ ByteOffset);
+ }
+
+ State.IntPtr = IntPtr;
+ State.IntValue = DAG.getExtLoad(ISD::EXTLOAD, DL, LoadTy, State.Chain, IntPtr,
+ State.IntPointerInfo, MVT::i8);
+ State.SignMask = APInt::getOneBitSet(LoadTy.getSizeInBits(), 7);
+ State.SignBit = 7;
+}
+
+/// Replace the integer value produced by getSignAsIntValue() with a new value
+/// and cast the result back to a floating-point type.
+SDValue SelectionDAGLegalize::modifySignAsInt(const FloatSignAsInt &State,
+ const SDLoc &DL,
+ SDValue NewIntValue) const {
+ if (!State.Chain)
+ return DAG.getNode(ISD::BITCAST, DL, State.FloatVT, NewIntValue);
+
+ // Override the part containing the sign bit in the value stored on the stack.
+ SDValue Chain = DAG.getTruncStore(State.Chain, DL, NewIntValue, State.IntPtr,
+ State.IntPointerInfo, MVT::i8);
+ return DAG.getLoad(State.FloatVT, DL, Chain, State.FloatPtr,
+ State.FloatPointerInfo);
+}
+
+SDValue SelectionDAGLegalize::ExpandFCOPYSIGN(SDNode *Node) const {
+ SDLoc DL(Node);
+ SDValue Mag = Node->getOperand(0);
+ SDValue Sign = Node->getOperand(1);
+
+ // Get sign bit into an integer value.
+ FloatSignAsInt SignAsInt;
+ getSignAsIntValue(SignAsInt, DL, Sign);
+
+ EVT IntVT = SignAsInt.IntValue.getValueType();
+ SDValue SignMask = DAG.getConstant(SignAsInt.SignMask, DL, IntVT);
+ SDValue SignBit = DAG.getNode(ISD::AND, DL, IntVT, SignAsInt.IntValue,
+ SignMask);
+
+ // If FABS is legal, transform FCOPYSIGN(x, y) => sign(y) ? -FABS(x) : FABS(x)
+ EVT FloatVT = Mag.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FABS, FloatVT) &&
+ TLI.isOperationLegalOrCustom(ISD::FNEG, FloatVT)) {
+ SDValue AbsValue = DAG.getNode(ISD::FABS, DL, FloatVT, Mag);
+ SDValue NegValue = DAG.getNode(ISD::FNEG, DL, FloatVT, AbsValue);
+ SDValue Cond = DAG.getSetCC(DL, getSetCCResultType(IntVT), SignBit,
+ DAG.getConstant(0, DL, IntVT), ISD::SETNE);
+ return DAG.getSelect(DL, FloatVT, Cond, NegValue, AbsValue);
+ }
+
+ // Transform Mag value to integer, and clear the sign bit.
+ FloatSignAsInt MagAsInt;
+ getSignAsIntValue(MagAsInt, DL, Mag);
+ EVT MagVT = MagAsInt.IntValue.getValueType();
+ SDValue ClearSignMask = DAG.getConstant(~MagAsInt.SignMask, DL, MagVT);
+ SDValue ClearedSign = DAG.getNode(ISD::AND, DL, MagVT, MagAsInt.IntValue,
+ ClearSignMask);
+
+ // Get the signbit at the right position for MagAsInt.
+ int ShiftAmount = SignAsInt.SignBit - MagAsInt.SignBit;
+ if (SignBit.getValueSizeInBits() > ClearedSign.getValueSizeInBits()) {
+ if (ShiftAmount > 0) {
+ SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, IntVT);
+ SignBit = DAG.getNode(ISD::SRL, DL, IntVT, SignBit, ShiftCnst);
+ } else if (ShiftAmount < 0) {
+ SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, IntVT);
+ SignBit = DAG.getNode(ISD::SHL, DL, IntVT, SignBit, ShiftCnst);
+ }
+ SignBit = DAG.getNode(ISD::TRUNCATE, DL, MagVT, SignBit);
+ } else if (SignBit.getValueSizeInBits() < ClearedSign.getValueSizeInBits()) {
+ SignBit = DAG.getNode(ISD::ZERO_EXTEND, DL, MagVT, SignBit);
+ if (ShiftAmount > 0) {
+ SDValue ShiftCnst = DAG.getConstant(ShiftAmount, DL, MagVT);
+ SignBit = DAG.getNode(ISD::SRL, DL, MagVT, SignBit, ShiftCnst);
+ } else if (ShiftAmount < 0) {
+ SDValue ShiftCnst = DAG.getConstant(-ShiftAmount, DL, MagVT);
+ SignBit = DAG.getNode(ISD::SHL, DL, MagVT, SignBit, ShiftCnst);
+ }
+ }
+
+ // Store the part with the modified sign and convert back to float.
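+ // For example, when both operands are f32 and i32 is legal, this computes
+ // (bitcast(x) & 0x7FFFFFFF) | (bitcast(y) & 0x80000000).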
+ SDValue CopiedSign = DAG.getNode(ISD::OR, DL, MagVT, ClearedSign, SignBit);
+ return modifySignAsInt(MagAsInt, DL, CopiedSign);
+}
+
+SDValue SelectionDAGLegalize::ExpandFABS(SDNode *Node) const {
+ SDLoc DL(Node);
+ SDValue Value = Node->getOperand(0);
+
+ // Transform FABS(x) => FCOPYSIGN(x, 0.0) if FCOPYSIGN is legal.
+ EVT FloatVT = Value.getValueType();
+ if (TLI.isOperationLegalOrCustom(ISD::FCOPYSIGN, FloatVT)) {
+ SDValue Zero = DAG.getConstantFP(0.0, DL, FloatVT);
+ return DAG.getNode(ISD::FCOPYSIGN, DL, FloatVT, Value, Zero);
+ }
+
+ // Transform value to integer, clear the sign bit and transform back.
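+ // For example, for an f32 input with i32 legal this computes
+ // bitcast(bitcast(x) & 0x7FFFFFFF).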
+ FloatSignAsInt ValueAsInt;
+ getSignAsIntValue(ValueAsInt, DL, Value);
+ EVT IntVT = ValueAsInt.IntValue.getValueType();
+ SDValue ClearSignMask = DAG.getConstant(~ValueAsInt.SignMask, DL, IntVT);
+ SDValue ClearedSign = DAG.getNode(ISD::AND, DL, IntVT, ValueAsInt.IntValue,
+ ClearSignMask);
+ return modifySignAsInt(ValueAsInt, DL, ClearedSign);
+}
+
+void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
+ SmallVectorImpl<SDValue> &Results) {
+ unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+ assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
+ " not tell us which reg is the stack pointer!");
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = SDValue(Node, 0);
+ SDValue Tmp2 = SDValue(Node, 1);
+ SDValue Tmp3 = Node->getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
+ unsigned StackAlign =
+ DAG.getSubtarget().getFrameLowering()->getStackAlignment();
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ if (Align > StackAlign)
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Align, dl, VT));
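+ // For example, with a requested alignment of 32 and a 16-byte stack
+ // alignment, the new stack pointer is rounded down via (SP - Size) & -32.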
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+
+ Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ Results.push_back(Tmp1);
+ Results.push_back(Tmp2);
+}
+
+/// Legalize a SETCC with given LHS and RHS and condition code CC on the current
+/// target.
+///
+/// If the SETCC has been legalized using AND / OR, then the legalized node
+/// will be stored in LHS. RHS and CC will be set to SDValue(). NeedInvert
+/// will be set to false.
+///
+/// If the SETCC has been legalized by using getSetCCSwappedOperands(),
+/// then the values of LHS and RHS will be swapped, CC will be set to the
+/// new condition, and NeedInvert will be set to false.
+///
+/// If the SETCC has been legalized using the inverse condcode, then LHS and
+/// RHS will be unchanged, CC will be set to the inverted condcode, and NeedInvert
+/// will be set to true. The caller must invert the result of the SETCC with
+/// SelectionDAG::getLogicalNOT() or take equivalent action to swap the effect
+/// of a true/false result.
+///
+/// \returns true if the SetCC has been legalized, false if it hasn't.
+bool SelectionDAGLegalize::LegalizeSetCCCondCode(EVT VT, SDValue &LHS,
+ SDValue &RHS, SDValue &CC,
+ bool &NeedInvert,
+ const SDLoc &dl) {
+ MVT OpVT = LHS.getSimpleValueType();
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(CC)->get();
+ NeedInvert = false;
+ switch (TLI.getCondCodeAction(CCCode, OpVT)) {
+ default: llvm_unreachable("Unknown condition code action!");
+ case TargetLowering::Legal:
+ // Nothing to do.
+ break;
+ case TargetLowering::Expand: {
+ ISD::CondCode InvCC = ISD::getSetCCSwappedOperands(CCCode);
+ if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+ std::swap(LHS, RHS);
+ CC = DAG.getCondCode(InvCC);
+ return true;
+ }
+ ISD::CondCode CC1 = ISD::SETCC_INVALID, CC2 = ISD::SETCC_INVALID;
+ unsigned Opc = 0;
+ switch (CCCode) {
+ default: llvm_unreachable("Don't know how to expand this condition!");
+ case ISD::SETO:
+ assert(TLI.getCondCodeAction(ISD::SETOEQ, OpVT)
+ == TargetLowering::Legal
+ && "If SETO is expanded, SETOEQ must be legal!");
+ CC1 = ISD::SETOEQ; CC2 = ISD::SETOEQ; Opc = ISD::AND; break;
+ case ISD::SETUO:
+ assert(TLI.getCondCodeAction(ISD::SETUNE, OpVT)
+ == TargetLowering::Legal
+ && "If SETUO is expanded, SETUNE must be legal!");
+ CC1 = ISD::SETUNE; CC2 = ISD::SETUNE; Opc = ISD::OR; break;
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ case ISD::SETUEQ:
+ case ISD::SETUNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE:
+ // If we are floating point, assign and break, otherwise fall through.
+ if (!OpVT.isInteger()) {
+ // We can use the 4th bit to tell if we are the unordered
+ // or ordered version of the opcode.
+ CC2 = ((unsigned)CCCode & 0x8U) ? ISD::SETUO : ISD::SETO;
+ Opc = ((unsigned)CCCode & 0x8U) ? ISD::OR : ISD::AND;
+ CC1 = (ISD::CondCode)(((int)CCCode & 0x7) | 0x10);
+ break;
+ }
+ // Fallthrough if we are an unsigned integer.
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE:
+ case ISD::SETLT:
+ // We only support using the inverted operation (computed above); there is
+ // no other way to expand these cases.
+ llvm_unreachable("Don't know how to expand this condition!");
+ case ISD::SETNE:
+ case ISD::SETEQ:
+ // Try inverting the result of the inverse condition.
+ InvCC = CCCode == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ;
+ if (TLI.isCondCodeLegal(InvCC, OpVT)) {
+ CC = DAG.getCondCode(InvCC);
+ NeedInvert = true;
+ return true;
+ }
+ // If inverting the condition didn't work then we have no means to expand
+ // the condition.
+ llvm_unreachable("Don't know how to expand this condition!");
+ }
+
+ SDValue SetCC1, SetCC2;
+ if (CCCode != ISD::SETO && CCCode != ISD::SETUO) {
+ // If we aren't the ordered or unordered operation,
+ // then the pattern is (LHS CC1 RHS) Opc (LHS CC2 RHS).
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, RHS, CC1);
+ SetCC2 = DAG.getSetCC(dl, VT, LHS, RHS, CC2);
+ } else {
+ // Otherwise, the pattern is (LHS CC1 LHS) Opc (RHS CC2 RHS)
+ SetCC1 = DAG.getSetCC(dl, VT, LHS, LHS, CC1);
+ SetCC2 = DAG.getSetCC(dl, VT, RHS, RHS, CC2);
+ }
+ LHS = DAG.getNode(Opc, dl, VT, SetCC1, SetCC2);
+ RHS = SDValue();
+ CC = SDValue();
+ return true;
+ }
+ }
+ return false;
+}
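
A hedged sketch of how a caller is expected to honor the contract documented above, in particular the NeedInvert flag (a fragment for illustration, not code from this patch):

    bool NeedInvert;
    if (LegalizeSetCCCondCode(VT, LHS, RHS, CC, NeedInvert, dl)) {
      // If CC is still set, the condition was merely swapped or inverted and a
      // real SETCC must still be built; otherwise LHS already holds the
      // AND/OR-expanded value.
      SDValue Res = CC.getNode()
                        ? DAG.getSetCC(dl, VT, LHS, RHS,
                                       cast<CondCodeSDNode>(CC)->get())
                        : LHS;
      if (NeedInvert)
        Res = DAG.getLogicalNOT(dl, Res, Res.getValueType());
    }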
+
+/// Emit a store/load combination to the stack. This stores
+/// SrcOp to a stack slot of type SlotVT, truncating it if needed. It then does
+/// a load from the stack slot to DestVT, extending it if needed.
+/// The resultant code need not be legal.
+SDValue SelectionDAGLegalize::EmitStackConvert(SDValue SrcOp, EVT SlotVT,
+ EVT DestVT, const SDLoc &dl) {
+ // Create the stack frame object.
+ unsigned SrcAlign = DAG.getDataLayout().getPrefTypeAlignment(
+ SrcOp.getValueType().getTypeForEVT(*DAG.getContext()));
+ SDValue FIPtr = DAG.CreateStackTemporary(SlotVT, SrcAlign);
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(FIPtr);
+ int SPFI = StackPtrFI->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ unsigned SrcSize = SrcOp.getValueType().getSizeInBits();
+ unsigned SlotSize = SlotVT.getSizeInBits();
+ unsigned DestSize = DestVT.getSizeInBits();
+ Type *DestType = DestVT.getTypeForEVT(*DAG.getContext());
+ unsigned DestAlign = DAG.getDataLayout().getPrefTypeAlignment(DestType);
+
+ // Emit a store to the stack slot. Use a truncstore if the input value is
+ // wider than the slot type (SlotVT).
+ SDValue Store;
+
+ if (SrcSize > SlotSize)
+ Store = DAG.getTruncStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo,
+ SlotVT, SrcAlign);
+ else {
+ assert(SrcSize == SlotSize && "Invalid store");
+ Store =
+ DAG.getStore(DAG.getEntryNode(), dl, SrcOp, FIPtr, PtrInfo, SrcAlign);
+ }
+
+ // Result is a load from the stack slot.
+ if (SlotSize == DestSize)
+ return DAG.getLoad(DestVT, dl, Store, FIPtr, PtrInfo, DestAlign);
+
+ assert(SlotSize < DestSize && "Unknown extension!");
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, DestVT, Store, FIPtr, PtrInfo, SlotVT,
+ DestAlign);
+}
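
When the three sizes match, the store/load pair above is simply a bit-preserving conversion through memory; the scalar analogue is the familiar memcpy bitcast (a sketch for the same-size case only; the DAG version also handles truncating stores and extending loads):

    #include <cstdint>
    #include <cstring>

    static double bitcastI64ToF64(uint64_t Src) {
      unsigned char Slot[sizeof(double)];      // the stack temporary
      std::memcpy(Slot, &Src, sizeof(Slot));   // store to the slot
      double Dest;
      std::memcpy(&Dest, Slot, sizeof(Dest));  // load from the slot
      return Dest;
    }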
+
+SDValue SelectionDAGLegalize::ExpandSCALAR_TO_VECTOR(SDNode *Node) {
+ SDLoc dl(Node);
+ // Create a vector sized/aligned stack slot, store the value to element #0,
+ // then load the whole vector back out.
+ SDValue StackPtr = DAG.CreateStackTemporary(Node->getValueType(0));
+
+ FrameIndexSDNode *StackPtrFI = cast<FrameIndexSDNode>(StackPtr);
+ int SPFI = StackPtrFI->getIndex();
+
+ SDValue Ch = DAG.getTruncStore(
+ DAG.getEntryNode(), dl, Node->getOperand(0), StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI),
+ Node->getValueType(0).getVectorElementType());
+ return DAG.getLoad(
+ Node->getValueType(0), dl, Ch, StackPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI));
+}
+
+static bool
+ExpandBVWithShuffles(SDNode *Node, SelectionDAG &DAG,
+ const TargetLowering &TLI, SDValue &Res) {
+ unsigned NumElems = Node->getNumOperands();
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+
+ // Try to group the scalars into pairs, shuffle the pairs together, then
+ // shuffle the pairs of pairs together, etc. until the vector has
+ // been built. This will work only if all of the necessary shuffle masks
+ // are legal.
+
+ // We do this in two phases; first to check the legality of the shuffles,
+ // and next, assuming that all shuffles are legal, to create the new nodes.
+ for (int Phase = 0; Phase < 2; ++Phase) {
+ SmallVector<std::pair<SDValue, SmallVector<int, 16> >, 16> IntermedVals,
+ NewIntermedVals;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+
+ SDValue Vec;
+ if (Phase)
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, V);
+ IntermedVals.push_back(std::make_pair(Vec, SmallVector<int, 16>(1, i)));
+ }
+
+ while (IntermedVals.size() > 2) {
+ NewIntermedVals.clear();
+ for (unsigned i = 0, e = (IntermedVals.size() & ~1u); i < e; i += 2) {
+ // This vector and the next vector are shuffled together (simply to
+ // append the one to the other).
+ SmallVector<int, 16> ShuffleVec(NumElems, -1);
+
+ SmallVector<int, 16> FinalIndices;
+ FinalIndices.reserve(IntermedVals[i].second.size() +
+ IntermedVals[i+1].second.size());
+
+ int k = 0;
+ for (unsigned j = 0, f = IntermedVals[i].second.size(); j != f;
+ ++j, ++k) {
+ ShuffleVec[k] = j;
+ FinalIndices.push_back(IntermedVals[i].second[j]);
+ }
+ for (unsigned j = 0, f = IntermedVals[i+1].second.size(); j != f;
+ ++j, ++k) {
+ ShuffleVec[k] = NumElems + j;
+ FinalIndices.push_back(IntermedVals[i+1].second[j]);
+ }
+
+ SDValue Shuffle;
+ if (Phase)
+ Shuffle = DAG.getVectorShuffle(VT, dl, IntermedVals[i].first,
+ IntermedVals[i+1].first,
+ ShuffleVec);
+ else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
+ return false;
+ NewIntermedVals.push_back(
+ std::make_pair(Shuffle, std::move(FinalIndices)));
+ }
+
+ // If we had an odd number of defined values, then append the last
+ // element to the array of new vectors.
+ if ((IntermedVals.size() & 1) != 0)
+ NewIntermedVals.push_back(IntermedVals.back());
+
+ IntermedVals.swap(NewIntermedVals);
+ }
+
+ assert(IntermedVals.size() <= 2 && IntermedVals.size() > 0 &&
+ "Invalid number of intermediate vectors");
+ SDValue Vec1 = IntermedVals[0].first;
+ SDValue Vec2;
+ if (IntermedVals.size() > 1)
+ Vec2 = IntermedVals[1].first;
+ else if (Phase)
+ Vec2 = DAG.getUNDEF(VT);
+
+ SmallVector<int, 16> ShuffleVec(NumElems, -1);
+ for (unsigned i = 0, e = IntermedVals[0].second.size(); i != e; ++i)
+ ShuffleVec[IntermedVals[0].second[i]] = i;
+ for (unsigned i = 0, e = IntermedVals[1].second.size(); i != e; ++i)
+ ShuffleVec[IntermedVals[1].second[i]] = NumElems + i;
+
+ if (Phase)
+ Res = DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
+ else if (!TLI.isShuffleMaskLegal(ShuffleVec, VT))
+ return false;
+ }
+
+ return true;
+}
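
Worked by hand for a four-element build of <a, b, c, d> (an example, not taken from the patch): phase 0 only queries isShuffleMaskLegal, phase 1 then wraps each scalar in a SCALAR_TO_VECTOR, shuffles the pairs with mask <0, 4, -1, -1> to produce <a, b, u, u> and <c, d, u, u>, and finally combines those two vectors with mask <0, 1, 4, 5> to obtain <a, b, c, d>.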
+
+/// Expand a BUILD_VECTOR node on targets that don't
+/// support the operation, but do support the resultant vector type.
+SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
+ unsigned NumElems = Node->getNumOperands();
+ SDValue Value1, Value2;
+ SDLoc dl(Node);
+ EVT VT = Node->getValueType(0);
+ EVT OpVT = Node->getOperand(0).getValueType();
+ EVT EltVT = VT.getVectorElementType();
+
+ // If the only non-undef value is the low element, turn this into a
+ // SCALAR_TO_VECTOR node. If this is { X, X, X, X }, determine X.
+ bool isOnlyLowElement = true;
+ bool MoreThanTwoValues = false;
+ bool isConstant = true;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+ if (i > 0)
+ isOnlyLowElement = false;
+ if (!isa<ConstantFPSDNode>(V) && !isa<ConstantSDNode>(V))
+ isConstant = false;
+
+ if (!Value1.getNode()) {
+ Value1 = V;
+ } else if (!Value2.getNode()) {
+ if (V != Value1)
+ Value2 = V;
+ } else if (V != Value1 && V != Value2) {
+ MoreThanTwoValues = true;
+ }
+ }
+
+ if (!Value1.getNode())
+ return DAG.getUNDEF(VT);
+
+ if (isOnlyLowElement)
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Node->getOperand(0));
+
+ // If all elements are constants, create a load from the constant pool.
+ if (isConstant) {
+ SmallVector<Constant*, 16> CV;
+ for (unsigned i = 0, e = NumElems; i != e; ++i) {
+ if (ConstantFPSDNode *V =
+ dyn_cast<ConstantFPSDNode>(Node->getOperand(i))) {
+ CV.push_back(const_cast<ConstantFP *>(V->getConstantFPValue()));
+ } else if (ConstantSDNode *V =
+ dyn_cast<ConstantSDNode>(Node->getOperand(i))) {
+ if (OpVT==EltVT)
+ CV.push_back(const_cast<ConstantInt *>(V->getConstantIntValue()));
+ else {
+ // If OpVT and EltVT don't match, EltVT is not legal and the
+ // element values have been promoted/truncated earlier. Undo this;
+ // we don't want a v16i8 to become a v16i32 for example.
+ const ConstantInt *CI = V->getConstantIntValue();
+ CV.push_back(ConstantInt::get(EltVT.getTypeForEVT(*DAG.getContext()),
+ CI->getZExtValue()));
+ }
+ } else {
+ assert(Node->getOperand(i).isUndef());
+ Type *OpNTy = EltVT.getTypeForEVT(*DAG.getContext());
+ CV.push_back(UndefValue::get(OpNTy));
+ }
+ }
+ Constant *CP = ConstantVector::get(CV);
+ SDValue CPIdx =
+ DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ return DAG.getLoad(
+ VT, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ }
+
+ SmallSet<SDValue, 16> DefinedValues;
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (Node->getOperand(i).isUndef())
+ continue;
+ DefinedValues.insert(Node->getOperand(i));
+ }
+
+ if (TLI.shouldExpandBuildVectorWithShuffles(VT, DefinedValues.size())) {
+ if (!MoreThanTwoValues) {
+ SmallVector<int, 8> ShuffleVec(NumElems, -1);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue V = Node->getOperand(i);
+ if (V.isUndef())
+ continue;
+ ShuffleVec[i] = V == Value1 ? 0 : NumElems;
+ }
+ if (TLI.isShuffleMaskLegal(ShuffleVec, Node->getValueType(0))) {
+ // Get the splatted value into the low element of a vector register.
+ SDValue Vec1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value1);
+ SDValue Vec2;
+ if (Value2.getNode())
+ Vec2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Value2);
+ else
+ Vec2 = DAG.getUNDEF(VT);
+
+ // Return shuffle(LowValVec, undef, <0,0,0,0>)
+ return DAG.getVectorShuffle(VT, dl, Vec1, Vec2, ShuffleVec);
+ }
+ } else {
+ SDValue Res;
+ if (ExpandBVWithShuffles(Node, DAG, TLI, Res))
+ return Res;
+ }
+ }
+
+ // Otherwise, we can't handle this case efficiently.
+ return ExpandVectorBuildThroughStack(Node);
+}
+
+// Expand a node into a call to a libcall. If the result value
+// does not fit into a register, return the lo part and set the hi part to the
+// by-reg argument. If it does fit into a single register, return the result
+// and leave the Hi part unset.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
+ bool isSigned) {
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : Node->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+
+ // By default, the input chain to this libcall is the entry node of the
+ // function. If the libcall is going to be emitted as a tail call then
+ // TLI.isUsedByReturnOnly will change it to the right chain if the return
+ // node which is being folded has a non-entry input chain.
+ SDValue InChain = DAG.getEntryNode();
+
+ // isTailCall may be true since the callee does not reference the caller's
+ // stack frame. Check if it's in the right position and that the return types
+ // match.
+ SDValue TCChain = InChain;
+ const Function *F = DAG.getMachineFunction().getFunction();
+ bool isTailCall =
+ TLI.isInTailCallPosition(DAG, Node, TCChain) &&
+ (RetTy == F->getReturnType() || F->getReturnType()->isVoidTy());
+ if (isTailCall)
+ InChain = TCChain;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ if (!CallInfo.second.getNode())
+ // It's a tailcall, return the chain (which is the DAG root).
+ return DAG.getRoot();
+
+ return CallInfo.first;
+}
+
+/// Generate a libcall taking the given operands as arguments
+/// and returning a result of type RetVT.
+SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
+ const SDValue *Ops, unsigned NumOps,
+ bool isSigned, const SDLoc &dl) {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumOps);
+
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ Entry.Node = Ops[i];
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ return CallInfo.first;
+}
+
+// Expand a node into a call to a libcall. Similar to
+// ExpandLibCall except that the first operand is the in-chain.
+std::pair<SDValue, SDValue>
+SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
+ SDNode *Node,
+ bool isSigned) {
+ SDValue InChain = Node->getOperand(0);
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Node->getOperand(i).getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Node->getOperand(i);
+ Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ return CallInfo;
+}
+
+SDValue SelectionDAGLegalize::ExpandFPLibCall(SDNode* Node,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::f32: LC = Call_F32; break;
+ case MVT::f64: LC = Call_F64; break;
+ case MVT::f80: LC = Call_F80; break;
+ case MVT::f128: LC = Call_F128; break;
+ case MVT::ppcf128: LC = Call_PPCF128; break;
+ }
+ return ExpandLibCall(LC, Node, false);
+}
+
+SDValue SelectionDAGLegalize::ExpandIntLibCall(SDNode* Node, bool isSigned,
+ RTLIB::Libcall Call_I8,
+ RTLIB::Libcall Call_I16,
+ RTLIB::Libcall Call_I32,
+ RTLIB::Libcall Call_I64,
+ RTLIB::Libcall Call_I128) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8: LC = Call_I8; break;
+ case MVT::i16: LC = Call_I16; break;
+ case MVT::i32: LC = Call_I32; break;
+ case MVT::i64: LC = Call_I64; break;
+ case MVT::i128: LC = Call_I128; break;
+ }
+ return ExpandLibCall(LC, Node, isSigned);
+}
+
+/// Issue libcalls to __{u}divmod to compute div / rem pairs.
+void
+SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ unsigned Opcode = Node->getOpcode();
+ bool isSigned = Opcode == ISD::SDIVREM;
+
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::i8: LC= isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8; break;
+ case MVT::i16: LC= isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16; break;
+ case MVT::i32: LC= isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32; break;
+ case MVT::i64: LC= isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64; break;
+ case MVT::i128: LC= isSigned ? RTLIB::SDIVREM_I128:RTLIB::UDIVREM_I128; break;
+ }
+
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add the necessary dependence on
+ // the previous call.
+ SDValue InChain = DAG.getEntryNode();
+
+ EVT RetVT = Node->getValueType(0);
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : Node->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+
+ // Also pass the return address of the remainder.
+ SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = FIPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ SDLoc dl(Node);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ // Remainder is loaded back from the stack frame.
+ SDValue Rem =
+ DAG.getLoad(RetVT, dl, CallInfo.second, FIPtr, MachinePointerInfo());
+ Results.push_back(CallInfo.first);
+ Results.push_back(Rem);
+}
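
At the source level the call shape built above matches compiler-rt's combined divmod helpers: the quotient comes back in the return value and the remainder is written through the extra pointer argument (the stack temporary FIPtr). A sketch, assuming the target maps RTLIB::SDIVREM_I64 to __divmoddi4:

    #include <cstdint>

    extern "C" int64_t __divmoddi4(int64_t A, int64_t B, int64_t *Rem);

    static void divrem64(int64_t A, int64_t B, int64_t &Quot, int64_t &Rem) {
      int64_t R;                    // plays the role of the stack slot
      Quot = __divmoddi4(A, B, &R);
      Rem = R;                      // "remainder is loaded back from the stack"
    }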
+
+/// Return true if sincos libcall is available.
+static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::f32: LC = RTLIB::SINCOS_F32; break;
+ case MVT::f64: LC = RTLIB::SINCOS_F64; break;
+ case MVT::f80: LC = RTLIB::SINCOS_F80; break;
+ case MVT::f128: LC = RTLIB::SINCOS_F128; break;
+ case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+ }
+ return TLI.getLibcallName(LC) != nullptr;
+}
+
+/// Return true if sincos libcall is available and can be used to combine sin
+/// and cos.
+static bool canCombineSinCosLibcall(SDNode *Node, const TargetLowering &TLI,
+ const TargetMachine &TM) {
+ if (!isSinCosLibcallAvailable(Node, TLI))
+ return false;
+ // GNU sin/cos functions set errno while sincos does not. Therefore
+ // combining sin and cos is only safe if unsafe-fpmath is enabled.
+ if (TM.getTargetTriple().isGNUEnvironment() && !TM.Options.UnsafeFPMath)
+ return false;
+ return true;
+}
+
+/// Only issue sincos libcall if both sin and cos are needed.
+static bool useSinCos(SDNode *Node) {
+ unsigned OtherOpcode = Node->getOpcode() == ISD::FSIN
+ ? ISD::FCOS : ISD::FSIN;
+
+ SDValue Op0 = Node->getOperand(0);
+ for (SDNode::use_iterator UI = Op0.getNode()->use_begin(),
+ UE = Op0.getNode()->use_end(); UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User == Node)
+ continue;
+ // The other user might have been turned into sincos already.
+ if (User->getOpcode() == OtherOpcode || User->getOpcode() == ISD::FSINCOS)
+ return true;
+ }
+ return false;
+}
+
+/// Issue libcalls to sincos to compute sin / cos pairs.
+void
+SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ RTLIB::Libcall LC;
+ switch (Node->getSimpleValueType(0).SimpleTy) {
+ default: llvm_unreachable("Unexpected request for libcall!");
+ case MVT::f32: LC = RTLIB::SINCOS_F32; break;
+ case MVT::f64: LC = RTLIB::SINCOS_F64; break;
+ case MVT::f80: LC = RTLIB::SINCOS_F80; break;
+ case MVT::f128: LC = RTLIB::SINCOS_F128; break;
+ case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break;
+ }
+
+ // The input chain to this libcall is the entry node of the function.
+ // Legalizing the call will automatically add the necessary dependence on
+ // the previous call.
+ SDValue InChain = DAG.getEntryNode();
+
+ EVT RetVT = Node->getValueType(0);
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+
+ // Pass the argument.
+ Entry.Node = Node->getOperand(0);
+ Entry.Ty = RetTy;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ // Pass the return address of sin.
+ SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = SinPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ // Also pass the return address of the cos.
+ SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
+ Entry.Node = CosPtr;
+ Entry.Ty = RetTy->getPointerTo();
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ SDLoc dl(Node);
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(InChain)
+ .setCallee(TLI.getLibcallCallingConv(LC),
+ Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo()));
+ Results.push_back(
+ DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo()));
+}
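
The expansion above is the DAG-level equivalent of calling the sincos extension once with two out-pointers instead of calling sin and cos separately; a source-level sketch (sincos is a GNU/BSD libm extension, which is exactly what isSinCosLibcallAvailable checks for):

    #include <cmath>

    static void sinAndCos(double X, double &S, double &C) {
      double SinSlot, CosSlot;          // the two stack temporaries
      ::sincos(X, &SinSlot, &CosSlot);  // void sincos(double, double*, double*)
      S = SinSlot;
      C = CosSlot;
    }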
+
+/// This function is responsible for legalizing a
+/// INT_TO_FP operation of the specified operand when the target requests that
+/// we expand it. At this point, we know that the result and operand types are
+/// legal for the target.
+SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(bool isSigned, SDValue Op0,
+ EVT DestVT,
+ const SDLoc &dl) {
+ // TODO: Should any fast-math-flags be set for the created nodes?
+
+ if (Op0.getValueType() == MVT::i32 && TLI.isTypeLegal(MVT::f64)) {
+ // simple 32-bit [signed|unsigned] integer to float/double expansion
+
+ // Get the stack frame index of an 8-byte buffer.
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
+
+ // word offset constant for Hi/Lo address computation
+ SDValue WordOff = DAG.getConstant(sizeof(int), dl,
+ StackSlot.getValueType());
+ // set up Hi and Lo (into buffer) address based on endian
+ SDValue Hi = StackSlot;
+ SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
+ StackSlot, WordOff);
+ if (DAG.getDataLayout().isLittleEndian())
+ std::swap(Hi, Lo);
+
+ // if signed map to unsigned space
+ SDValue Op0Mapped;
+ if (isSigned) {
+ // constant used to invert sign bit (signed to unsigned mapping)
+ SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32);
+ Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
+ } else {
+ Op0Mapped = Op0;
+ }
+ // store the lo of the constructed double - based on integer input
+ SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo,
+ MachinePointerInfo());
+ // initial hi portion of constructed double
+ SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32);
+ // store the hi of the constructed double - biased exponent
+ SDValue Store2 =
+ DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo());
+ // load the constructed double
+ SDValue Load =
+ DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo());
+ // FP constant to bias correct the final result
+ SDValue Bias = DAG.getConstantFP(isSigned ?
+ BitsToDouble(0x4330000080000000ULL) :
+ BitsToDouble(0x4330000000000000ULL),
+ dl, MVT::f64);
+ // subtract the bias
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Load, Bias);
+ // final result
+ SDValue Result;
+ // handle final rounding
+ if (DestVT == MVT::f64) {
+ // do nothing
+ Result = Sub;
+ } else if (DestVT.bitsLT(MVT::f64)) {
+ Result = DAG.getNode(ISD::FP_ROUND, dl, DestVT, Sub,
+ DAG.getIntPtrConstant(0, dl));
+ } else if (DestVT.bitsGT(MVT::f64)) {
+ Result = DAG.getNode(ISD::FP_EXTEND, dl, DestVT, Sub);
+ }
+ return Result;
+ }
+ assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+ // Code below here assumes !isSigned without checking again.
+
+ // Implementation of unsigned i64 to f64 following the algorithm in
+ // __floatundidf in compiler_rt. This implementation has the advantage
+ // of performing rounding correctly, both in the default rounding mode
+ // and in all alternate rounding modes.
+ // TODO: Generalize this for use with other types.
+ if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f64) {
+ SDValue TwoP52 =
+ DAG.getConstant(UINT64_C(0x4330000000000000), dl, MVT::i64);
+ SDValue TwoP84PlusTwoP52 =
+ DAG.getConstantFP(BitsToDouble(UINT64_C(0x4530000000100000)), dl,
+ MVT::f64);
+ SDValue TwoP84 =
+ DAG.getConstant(UINT64_C(0x4530000000000000), dl, MVT::i64);
+
+ SDValue Lo = DAG.getZeroExtendInReg(Op0, dl, MVT::i32);
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0,
+ DAG.getConstant(32, dl, MVT::i64));
+ SDValue LoOr = DAG.getNode(ISD::OR, dl, MVT::i64, Lo, TwoP52);
+ SDValue HiOr = DAG.getNode(ISD::OR, dl, MVT::i64, Hi, TwoP84);
+ SDValue LoFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, LoOr);
+ SDValue HiFlt = DAG.getNode(ISD::BITCAST, dl, MVT::f64, HiOr);
+ SDValue HiSub = DAG.getNode(ISD::FSUB, dl, MVT::f64, HiFlt,
+ TwoP84PlusTwoP52);
+ return DAG.getNode(ISD::FADD, dl, MVT::f64, LoFlt, HiSub);
+ }
+
+ // Implementation of unsigned i64 to f32.
+ // TODO: Generalize this for use with other types.
+ if (Op0.getValueType() == MVT::i64 && DestVT == MVT::f32) {
+ // For unsigned conversions, convert them to signed conversions using the
+ // algorithm from the x86_64 __floatundidf in compiler_rt.
+ if (!isSigned) {
+ SDValue Fast = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Op0);
+
+ SDValue ShiftConst = DAG.getConstant(
+ 1, dl, TLI.getShiftAmountTy(Op0.getValueType(), DAG.getDataLayout()));
+ SDValue Shr = DAG.getNode(ISD::SRL, dl, MVT::i64, Op0, ShiftConst);
+ SDValue AndConst = DAG.getConstant(1, dl, MVT::i64);
+ SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0, AndConst);
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And, Shr);
+
+ SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Or);
+ SDValue Slow = DAG.getNode(ISD::FADD, dl, MVT::f32, SignCvt, SignCvt);
+
+ // TODO: This really should be implemented using a branch rather than a
+ // select. We happen to get lucky and machinesink does the right
+ // thing most of the time. This would be a good candidate for a
+ // pseudo-op, or, even better, for whole-function isel.
+ SDValue SignBitTest = DAG.getSetCC(dl, getSetCCResultType(MVT::i64),
+ Op0, DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ return DAG.getSelect(dl, MVT::f32, SignBitTest, Slow, Fast);
+ }
+
+ // Otherwise, implement the fully general conversion.
+
+ SDValue And = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+ DAG.getConstant(UINT64_C(0xfffffffffffff800), dl, MVT::i64));
+ SDValue Or = DAG.getNode(ISD::OR, dl, MVT::i64, And,
+ DAG.getConstant(UINT64_C(0x800), dl, MVT::i64));
+ SDValue And2 = DAG.getNode(ISD::AND, dl, MVT::i64, Op0,
+ DAG.getConstant(UINT64_C(0x7ff), dl, MVT::i64));
+ SDValue Ne = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), And2,
+ DAG.getConstant(UINT64_C(0), dl, MVT::i64),
+ ISD::SETNE);
+ SDValue Sel = DAG.getSelect(dl, MVT::i64, Ne, Or, Op0);
+ SDValue Ge = DAG.getSetCC(dl, getSetCCResultType(MVT::i64), Op0,
+ DAG.getConstant(UINT64_C(0x0020000000000000), dl,
+ MVT::i64),
+ ISD::SETUGE);
+ SDValue Sel2 = DAG.getSelect(dl, MVT::i64, Ge, Sel, Op0);
+ EVT SHVT = TLI.getShiftAmountTy(Sel2.getValueType(), DAG.getDataLayout());
+
+ SDValue Sh = DAG.getNode(ISD::SRL, dl, MVT::i64, Sel2,
+ DAG.getConstant(32, dl, SHVT));
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sh);
+ SDValue Fcvt = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Trunc);
+ SDValue TwoP32 =
+ DAG.getConstantFP(BitsToDouble(UINT64_C(0x41f0000000000000)), dl,
+ MVT::f64);
+ SDValue Fmul = DAG.getNode(ISD::FMUL, dl, MVT::f64, TwoP32, Fcvt);
+ SDValue Lo = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Sel2);
+ SDValue Fcvt2 = DAG.getNode(ISD::UINT_TO_FP, dl, MVT::f64, Lo);
+ SDValue Fadd = DAG.getNode(ISD::FADD, dl, MVT::f64, Fmul, Fcvt2);
+ return DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Fadd,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ SDValue Tmp1 = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+
+ SDValue SignSet = DAG.getSetCC(dl, getSetCCResultType(Op0.getValueType()),
+ Op0,
+ DAG.getConstant(0, dl, Op0.getValueType()),
+ ISD::SETLT);
+ SDValue Zero = DAG.getIntPtrConstant(0, dl),
+ Four = DAG.getIntPtrConstant(4, dl);
+ SDValue CstOffset = DAG.getSelect(dl, Zero.getValueType(),
+ SignSet, Four, Zero);
+
+ // If the sign bit of the integer is set, the large number will be treated
+ // as a negative number. To counteract this, the dynamic code adds an
+ // offset depending on the data type.
+ uint64_t FF;
+ switch (Op0.getSimpleValueType().SimpleTy) {
+ default: llvm_unreachable("Unsupported integer type!");
+ case MVT::i8 : FF = 0x43800000ULL; break; // 2^8 (as a float)
+ case MVT::i16: FF = 0x47800000ULL; break; // 2^16 (as a float)
+ case MVT::i32: FF = 0x4F800000ULL; break; // 2^32 (as a float)
+ case MVT::i64: FF = 0x5F800000ULL; break; // 2^64 (as a float)
+ }
+ if (DAG.getDataLayout().isLittleEndian())
+ FF <<= 32;
+ Constant *FudgeFactor = ConstantInt::get(
+ Type::getInt64Ty(*DAG.getContext()), FF);
+
+ SDValue CPIdx =
+ DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
+ unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
+ Alignment = std::min(Alignment, 4u);
+ SDValue FudgeInReg;
+ if (DestVT == MVT::f32)
+ FudgeInReg = DAG.getLoad(
+ MVT::f32, dl, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ Alignment);
+ else {
+ SDValue Load = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, DestVT, DAG.getEntryNode(), CPIdx,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ Alignment);
+ HandleSDNode Handle(Load);
+ LegalizeOp(Load.getNode());
+ FudgeInReg = Handle.getValue();
+ }
+
+ return DAG.getNode(ISD::FADD, dl, DestVT, Tmp1, FudgeInReg);
+}
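
The 32-bit path above is the classic exponent-bias trick: splice the integer into the low word of the double whose bit pattern is 0x4330000000000000 (that is, 2^52), then subtract 2^52 so only the integer value remains. A scalar sketch of the unsigned case:

    #include <cstdint>
    #include <cstring>

    static double u32ToDouble(uint32_t X) {
      uint64_t Bits = (UINT64_C(0x43300000) << 32) | X;  // bits of 2^52 + X
      double D;
      std::memcpy(&D, &Bits, sizeof(D));
      return D - 4503599627370496.0;                     // subtract 2^52
    }

For the signed case the code additionally XORs the input with 0x80000000 and subtracts the bias 2^52 + 2^31 instead, which is where the constant 0x4330000080000000 comes from.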
+
+/// This function is responsible for legalizing a
+/// *INT_TO_FP operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal UINT_TO_FP or SINT_TO_FP
+/// operation that takes a larger input.
+SDValue SelectionDAGLegalize::PromoteLegalINT_TO_FP(SDValue LegalOp, EVT DestVT,
+ bool isSigned,
+ const SDLoc &dl) {
+ // First step, figure out the appropriate *INT_TO_FP operation to use.
+ EVT NewInTy = LegalOp.getValueType();
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (1) {
+ NewInTy = (MVT::SimpleValueType)(NewInTy.getSimpleVT().SimpleTy+1);
+ assert(NewInTy.isInteger() && "Ran out of possibilities!");
+
+ // If the target supports SINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::SINT_TO_FP, NewInTy)) {
+ OpToUse = ISD::SINT_TO_FP;
+ break;
+ }
+ if (isSigned) continue;
+
+ // If the target supports UINT_TO_FP of this type, use it.
+ if (TLI.isOperationLegalOrCustom(ISD::UINT_TO_FP, NewInTy)) {
+ OpToUse = ISD::UINT_TO_FP;
+ break;
+ }
+
+ // Otherwise, try a larger type.
+ }
+
+ // Okay, we found the operation and type to use. Zero extend our input to the
+ // desired type then run the operation on it.
+ return DAG.getNode(OpToUse, dl, DestVT,
+ DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND,
+ dl, NewInTy, LegalOp));
+}
+
+/// This function is responsible for legalizing a
+/// FP_TO_*INT operation of the specified operand when the target requests that
+/// we promote it. At this point, we know that the result and operand types are
+/// legal for the target, and that there is a legal FP_TO_UINT or FP_TO_SINT
+/// operation that returns a larger result.
+SDValue SelectionDAGLegalize::PromoteLegalFP_TO_INT(SDValue LegalOp, EVT DestVT,
+ bool isSigned,
+ const SDLoc &dl) {
+ // First step, figure out the appropriate FP_TO*INT operation to use.
+ EVT NewOutTy = DestVT;
+
+ unsigned OpToUse = 0;
+
+ // Scan for the appropriate larger type to use.
+ while (1) {
+ NewOutTy = (MVT::SimpleValueType)(NewOutTy.getSimpleVT().SimpleTy+1);
+ assert(NewOutTy.isInteger() && "Ran out of possibilities!");
+
+ // A larger signed type can hold all unsigned values of the requested type,
+ // so using FP_TO_SINT is valid
+ if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewOutTy)) {
+ OpToUse = ISD::FP_TO_SINT;
+ break;
+ }
+
+ // However, if the value may be < 0.0, we *must* use some FP_TO_SINT.
+ if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewOutTy)) {
+ OpToUse = ISD::FP_TO_UINT;
+ break;
+ }
+
+ // Otherwise, try a larger type.
+ }
+
+
+ // Okay, we found the operation and type to use.
+ SDValue Operation = DAG.getNode(OpToUse, dl, NewOutTy, LegalOp);
+
+ // Truncate the result of the extended FP_TO_*INT operation to the desired
+ // size.
+ return DAG.getNode(ISD::TRUNCATE, dl, DestVT, Operation);
+}
+
+/// Open code the operations for BITREVERSE.
+SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Sz = VT.getScalarSizeInBits();
+
+ SDValue Tmp, Tmp2;
+ Tmp = DAG.getConstant(0, dl, VT);
+ for (unsigned I = 0, J = Sz-1; I < Sz; ++I, --J) {
+ if (I < J)
+ Tmp2 =
+ DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(J - I, dl, SHVT));
+ else
+ Tmp2 =
+ DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(I - J, dl, SHVT));
+
+ APInt Shift(Sz, 1);
+ Shift = Shift.shl(J);
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(Shift, dl, VT));
+ Tmp = DAG.getNode(ISD::OR, dl, VT, Tmp, Tmp2);
+ }
+
+ return Tmp;
+}
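
A scalar rendering of the loop above for a 32-bit value, showing that bit I of the input lands at bit Sz-1-I of the result:

    #include <cstdint>

    static uint32_t reverseBits32(uint32_t V) {
      uint32_t R = 0;
      for (unsigned I = 0; I < 32; ++I)
        R |= ((V >> I) & 1u) << (31 - I);   // move bit I to bit 31 - I
      return R;
    }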
+
+/// Open code the operations for BSWAP of the specified operation.
+SDValue SelectionDAGLegalize::ExpandBSWAP(SDValue Op, const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ EVT SHVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4, Tmp5, Tmp6, Tmp7, Tmp8;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default: llvm_unreachable("Unhandled Expand type in BSWAP!");
+ case MVT::i16:
+ Tmp2 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ return DAG.getNode(ISD::OR, dl, VT, Tmp1, Tmp2);
+ case MVT::i32:
+ Tmp4 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
+ DAG.getConstant(0xFF0000, dl, VT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2, DAG.getConstant(0xFF00, dl, VT));
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ case MVT::i64:
+ Tmp8 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
+ Tmp7 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
+ Tmp6 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp5 = DAG.getNode(ISD::SHL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp4 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(8, dl, SHVT));
+ Tmp3 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(24, dl, SHVT));
+ Tmp2 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(40, dl, SHVT));
+ Tmp1 = DAG.getNode(ISD::SRL, dl, VT, Op, DAG.getConstant(56, dl, SHVT));
+ Tmp7 = DAG.getNode(ISD::AND, dl, VT, Tmp7,
+ DAG.getConstant(255ULL<<48, dl, VT));
+ Tmp6 = DAG.getNode(ISD::AND, dl, VT, Tmp6,
+ DAG.getConstant(255ULL<<40, dl, VT));
+ Tmp5 = DAG.getNode(ISD::AND, dl, VT, Tmp5,
+ DAG.getConstant(255ULL<<32, dl, VT));
+ Tmp4 = DAG.getNode(ISD::AND, dl, VT, Tmp4,
+ DAG.getConstant(255ULL<<24, dl, VT));
+ Tmp3 = DAG.getNode(ISD::AND, dl, VT, Tmp3,
+ DAG.getConstant(255ULL<<16, dl, VT));
+ Tmp2 = DAG.getNode(ISD::AND, dl, VT, Tmp2,
+ DAG.getConstant(255ULL<<8 , dl, VT));
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp7);
+ Tmp6 = DAG.getNode(ISD::OR, dl, VT, Tmp6, Tmp5);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp3);
+ Tmp2 = DAG.getNode(ISD::OR, dl, VT, Tmp2, Tmp1);
+ Tmp8 = DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp6);
+ Tmp4 = DAG.getNode(ISD::OR, dl, VT, Tmp4, Tmp2);
+ return DAG.getNode(ISD::OR, dl, VT, Tmp8, Tmp4);
+ }
+}
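
The MVT::i32 case above is the usual four-shift byte swap; written as plain C++ it is:

    #include <cstdint>

    static uint32_t bswap32(uint32_t V) {
      return (V << 24) |
             ((V << 8) & 0x00FF0000u) |
             ((V >> 8) & 0x0000FF00u) |
             (V >> 24);
    }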
+
+/// Expand the specified bitcount instruction into operations.
+SDValue SelectionDAGLegalize::ExpandBitCount(unsigned Opc, SDValue Op,
+ const SDLoc &dl) {
+ switch (Opc) {
+ default: llvm_unreachable("Cannot expand this yet!");
+ case ISD::CTPOP: {
+ EVT VT = Op.getValueType();
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ unsigned Len = VT.getSizeInBits();
+
+ assert(VT.isInteger() && Len <= 128 && Len % 8 == 0 &&
+ "CTPOP not implemented for this type.");
+
+ // This is the "best" algorithm from
+ // http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetParallel
+
+ SDValue Mask55 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x55)),
+ dl, VT);
+ SDValue Mask33 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x33)),
+ dl, VT);
+ SDValue Mask0F = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x0F)),
+ dl, VT);
+ SDValue Mask01 = DAG.getConstant(APInt::getSplat(Len, APInt(8, 0x01)),
+ dl, VT);
+
+ // v = v - ((v >> 1) & 0x55555555...)
+ Op = DAG.getNode(ISD::SUB, dl, VT, Op,
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(1, dl, ShVT)),
+ Mask55));
+ // v = (v & 0x33333333...) + ((v >> 2) & 0x33333333...)
+ Op = DAG.getNode(ISD::ADD, dl, VT,
+ DAG.getNode(ISD::AND, dl, VT, Op, Mask33),
+ DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(2, dl, ShVT)),
+ Mask33));
+ // v = (v + (v >> 4)) & 0x0F0F0F0F...
+ Op = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ADD, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op,
+ DAG.getConstant(4, dl, ShVT))),
+ Mask0F);
+ // v = (v * 0x01010101...) >> (Len - 8)
+ Op = DAG.getNode(ISD::SRL, dl, VT,
+ DAG.getNode(ISD::MUL, dl, VT, Op, Mask01),
+ DAG.getConstant(Len - 8, dl, ShVT));
+
+ return Op;
+ }
+ case ISD::CTLZ_ZERO_UNDEF:
+ // This trivially expands to CTLZ.
+ return DAG.getNode(ISD::CTLZ, dl, Op.getValueType(), Op);
+ case ISD::CTLZ: {
+ EVT VT = Op.getValueType();
+ unsigned len = VT.getSizeInBits();
+
+ if (TLI.isOperationLegalOrCustom(ISD::CTLZ_ZERO_UNDEF, VT)) {
+ EVT SetCCVT = getSetCCResultType(VT);
+ SDValue CTLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, VT, Op);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
+ SDValue SrcIsZero = DAG.getSetCC(dl, SetCCVT, Op, Zero, ISD::SETEQ);
+ return DAG.getNode(ISD::SELECT, dl, VT, SrcIsZero,
+ DAG.getConstant(len, dl, VT), CTLZ);
+ }
+
+ // for now, we do this:
+ // x = x | (x >> 1);
+ // x = x | (x >> 2);
+ // ...
+ // x = x | (x >>16);
+ // x = x | (x >>32); // for 64-bit input
+ // return popcount(~x);
+ //
+ // Ref: "Hacker's Delight" by Henry Warren
+ EVT ShVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ for (unsigned i = 0; (1U << i) <= (len / 2); ++i) {
+ SDValue Tmp3 = DAG.getConstant(1ULL << i, dl, ShVT);
+ Op = DAG.getNode(ISD::OR, dl, VT, Op,
+ DAG.getNode(ISD::SRL, dl, VT, Op, Tmp3));
+ }
+ Op = DAG.getNOT(dl, Op, VT);
+ return DAG.getNode(ISD::CTPOP, dl, VT, Op);
+ }
+ case ISD::CTTZ_ZERO_UNDEF:
+ // This trivially expands to CTTZ.
+ return DAG.getNode(ISD::CTTZ, dl, Op.getValueType(), Op);
+ case ISD::CTTZ: {
+ // for now, we use: { return popcount(~x & (x - 1)); }
+ // unless the target has ctlz but not ctpop, in which case we use:
+ // { return 32 - nlz(~x & (x-1)); }
+ // Ref: "Hacker's Delight" by Henry Warren
+ EVT VT = Op.getValueType();
+ SDValue Tmp3 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNOT(dl, Op, VT),
+ DAG.getNode(ISD::SUB, dl, VT, Op,
+ DAG.getConstant(1, dl, VT)));
+ // If ISD::CTLZ is legal and CTPOP isn't, then do that instead.
+ if (!TLI.isOperationLegalOrCustom(ISD::CTPOP, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::CTLZ, VT))
+ return DAG.getNode(ISD::SUB, dl, VT,
+ DAG.getConstant(VT.getSizeInBits(), dl, VT),
+ DAG.getNode(ISD::CTLZ, dl, VT, Tmp3));
+ return DAG.getNode(ISD::CTPOP, dl, VT, Tmp3);
+ }
+ }
+}
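
Scalar versions of the two main expansions above, for a 32-bit value (sketches of the same bithacks, not code from the patch):

    #include <cstdint>

    static uint32_t popcount32(uint32_t V) {
      V = V - ((V >> 1) & 0x55555555u);                  // 2-bit sums
      V = (V & 0x33333333u) + ((V >> 2) & 0x33333333u);  // 4-bit sums
      V = (V + (V >> 4)) & 0x0F0F0F0Fu;                  // 8-bit sums
      return (V * 0x01010101u) >> 24;                    // total in top byte
    }

    static uint32_t cttz32(uint32_t V) {
      return popcount32(~V & (V - 1));                   // CTTZ expansion
    }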
+
+bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
+ SmallVector<SDValue, 8> Results;
+ SDLoc dl(Node);
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ bool NeedInvert;
+ switch (Node->getOpcode()) {
+ case ISD::CTPOP:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ Tmp1 = ExpandBitCount(Node->getOpcode(), Node->getOperand(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BITREVERSE:
+ Results.push_back(ExpandBITREVERSE(Node->getOperand(0), dl));
+ break;
+ case ISD::BSWAP:
+ Results.push_back(ExpandBSWAP(Node->getOperand(0), dl));
+ break;
+ case ISD::FRAMEADDR:
+ case ISD::RETURNADDR:
+ case ISD::FRAME_TO_ARGS_OFFSET:
+ Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
+ break;
+ case ISD::FLT_ROUNDS_:
+ Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0)));
+ break;
+ case ISD::EH_RETURN:
+ case ISD::EH_LABEL:
+ case ISD::PREFETCH:
+ case ISD::VAEND:
+ case ISD::EH_SJLJ_LONGJMP:
+ // If the target didn't expand these, there's nothing to do, so just
+ // preserve the chain and be done.
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::READCYCLECOUNTER:
+ // If the target didn't expand this, just return 'zero' and preserve the
+ // chain.
+ Results.append(Node->getNumValues() - 1,
+ DAG.getConstant(0, dl, Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::EH_SJLJ_SETJMP:
+ // If the target didn't expand this, just return 'zero' and preserve the
+ // chain.
+ Results.push_back(DAG.getConstant(0, dl, MVT::i32));
+ Results.push_back(Node->getOperand(0));
+ break;
+ case ISD::ATOMIC_LOAD: {
+ // There is no libcall for atomic load; fake it with ATOMIC_CMP_SWAP.
+ SDValue Zero = DAG.getConstant(0, dl, Node->getValueType(0));
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Zero, Zero,
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(0));
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
+ case ISD::ATOMIC_STORE: {
+ // There is no libcall for atomic store; fake it with ATOMIC_SWAP.
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(Node)->getMemoryVT(),
+ Node->getOperand(0),
+ Node->getOperand(1), Node->getOperand(2),
+ cast<AtomicSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+ Results.push_back(Swap.getValue(1));
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ // Expanding an ATOMIC_CMP_SWAP_WITH_SUCCESS produces an ATOMIC_CMP_SWAP and
+ // splits out the success value as a comparison. Expanding the resulting
+ // ATOMIC_CMP_SWAP will produce a libcall.
+ SDVTList VTs = DAG.getVTList(Node->getValueType(0), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, dl, cast<AtomicSDNode>(Node)->getMemoryVT(), VTs,
+ Node->getOperand(0), Node->getOperand(1), Node->getOperand(2),
+ Node->getOperand(3), cast<MemSDNode>(Node)->getMemOperand(),
+ cast<AtomicSDNode>(Node)->getSuccessOrdering(),
+ cast<AtomicSDNode>(Node)->getFailureOrdering(),
+ cast<AtomicSDNode>(Node)->getSynchScope());
+
+ SDValue ExtRes = Res;
+ SDValue LHS = Res;
+ SDValue RHS = Node->getOperand(1);
+
+ EVT AtomicType = cast<AtomicSDNode>(Node)->getMemoryVT();
+ EVT OuterType = Node->getValueType(0);
+ switch (TLI.getExtendForAtomicOps()) {
+ case ISD::SIGN_EXTEND:
+ LHS = DAG.getNode(ISD::AssertSext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, OuterType,
+ Node->getOperand(2), DAG.getValueType(AtomicType));
+ ExtRes = LHS;
+ break;
+ case ISD::ZERO_EXTEND:
+ LHS = DAG.getNode(ISD::AssertZext, dl, OuterType, Res,
+ DAG.getValueType(AtomicType));
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ ExtRes = LHS;
+ break;
+ case ISD::ANY_EXTEND:
+ LHS = DAG.getZeroExtendInReg(Res, dl, AtomicType);
+ RHS = DAG.getNode(ISD::ZERO_EXTEND, dl, OuterType, Node->getOperand(2));
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
+
+ SDValue Success =
+ DAG.getSetCC(dl, Node->getValueType(1), LHS, RHS, ISD::SETEQ);
+
+ Results.push_back(ExtRes.getValue(0));
+ Results.push_back(Success);
+ Results.push_back(Res.getValue(1));
+ break;
+ }
+ case ISD::DYNAMIC_STACKALLOC:
+ ExpandDYNAMIC_STACKALLOC(Node, Results);
+ break;
+ case ISD::MERGE_VALUES:
+ for (unsigned i = 0; i < Node->getNumValues(); i++)
+ Results.push_back(Node->getOperand(i));
+ break;
+ case ISD::UNDEF: {
+ EVT VT = Node->getValueType(0);
+ if (VT.isInteger())
+ Results.push_back(DAG.getConstant(0, dl, VT));
+ else {
+ assert(VT.isFloatingPoint() && "Unknown value type!");
+ Results.push_back(DAG.getConstantFP(0, dl, VT));
+ }
+ break;
+ }
+ case ISD::FP_ROUND:
+ case ISD::BITCAST:
+ Tmp1 = EmitStackConvert(Node->getOperand(0), Node->getValueType(0),
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FP_EXTEND:
+ Tmp1 = EmitStackConvert(Node->getOperand(0),
+ Node->getOperand(0).getValueType(),
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ // NOTE: we could fall back on load/store here too for targets without
+ // SAR. However, it is doubtful that any exist.
+ EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ EVT VT = Node->getValueType(0);
+ EVT ShiftAmountTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ if (VT.isVector())
+ ShiftAmountTy = VT;
+ unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
+ ExtraVT.getScalarType().getSizeInBits();
+ SDValue ShiftCst = DAG.getConstant(BitsDiff, dl, ShiftAmountTy);
+ Tmp1 = DAG.getNode(ISD::SHL, dl, Node->getValueType(0),
+ Node->getOperand(0), ShiftCst);
+ Tmp1 = DAG.getNode(ISD::SRA, dl, Node->getValueType(0), Tmp1, ShiftCst);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::FP_ROUND_INREG: {
+ // The only way we can lower this is to turn it into a TRUNCSTORE,
+ // EXTLOAD pair, targeting a temporary location (a stack slot).
+
+ // NOTE: there is a choice here between constantly creating new stack
+ // slots and always reusing the same one. We currently always create
+ // new ones, as reuse may inhibit scheduling.
+ EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ Tmp1 = EmitStackConvert(Node->getOperand(0), ExtraVT,
+ Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ Tmp1 = ExpandLegalINT_TO_FP(Node->getOpcode() == ISD::SINT_TO_FP,
+ Node->getOperand(0), Node->getValueType(0), dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FP_TO_SINT:
+ if (TLI.expandFP_TO_SINT(Node, Tmp1, DAG))
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FP_TO_UINT: {
+ SDValue True, False;
+ EVT VT = Node->getOperand(0).getValueType();
+ EVT NVT = Node->getValueType(0);
+ APFloat apf(DAG.EVTToAPFloatSemantics(VT),
+ APInt::getNullValue(VT.getSizeInBits()));
+ APInt x = APInt::getSignBit(NVT.getSizeInBits());
+ (void)apf.convertFromAPInt(x, false, APFloat::rmNearestTiesToEven);
+ Tmp1 = DAG.getConstantFP(apf, dl, VT);
+ Tmp2 = DAG.getSetCC(dl, getSetCCResultType(VT),
+ Node->getOperand(0),
+ Tmp1, ISD::SETLT);
+ True = DAG.getNode(ISD::FP_TO_SINT, dl, NVT, Node->getOperand(0));
+ // TODO: Should any fast-math-flags be set for the FSUB?
+ False = DAG.getNode(ISD::FP_TO_SINT, dl, NVT,
+ DAG.getNode(ISD::FSUB, dl, VT,
+ Node->getOperand(0), Tmp1));
+ False = DAG.getNode(ISD::XOR, dl, NVT, False,
+ DAG.getConstant(x, dl, NVT));
+ Tmp1 = DAG.getSelect(dl, NVT, Tmp2, True, False);
+ Results.push_back(Tmp1);
+ break;
+ }
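
A scalar sketch of this FP_TO_UINT expansion for f32 -> i32: inputs below 2^31 take the plain signed conversion, larger inputs are rebased by 2^31 and the sign bit is XORed back in (in-range inputs assumed):

    #include <cstdint>

    static uint32_t fptoui32(float X) {
      const float Two31 = 2147483648.0f;                    // the APFloat Tmp1
      if (X < Two31)                                        // the SETLT select
        return (uint32_t)(int32_t)X;                        // "True" arm
      return (uint32_t)(int32_t)(X - Two31) ^ 0x80000000u;  // "False" arm
    }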
+ case ISD::VAARG:
+ Results.push_back(DAG.expandVAArg(Node));
+ Results.push_back(Results[0].getValue(1));
+ break;
+ case ISD::VACOPY:
+ Results.push_back(DAG.expandVACopy(Node));
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ if (Node->getOperand(0).getValueType().getVectorNumElements() == 1)
+ // This must be an access of the only element. Return it.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ else
+ Tmp1 = ExpandExtractFromVectorThroughStack(SDValue(Node, 0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::EXTRACT_SUBVECTOR:
+ Results.push_back(ExpandExtractFromVectorThroughStack(SDValue(Node, 0)));
+ break;
+ case ISD::INSERT_SUBVECTOR:
+ Results.push_back(ExpandInsertToVectorThroughStack(SDValue(Node, 0)));
+ break;
+ case ISD::CONCAT_VECTORS: {
+ Results.push_back(ExpandVectorBuildThroughStack(Node));
+ break;
+ }
+ case ISD::SCALAR_TO_VECTOR:
+ Results.push_back(ExpandSCALAR_TO_VECTOR(Node));
+ break;
+ case ISD::INSERT_VECTOR_ELT:
+ Results.push_back(ExpandINSERT_VECTOR_ELT(Node->getOperand(0),
+ Node->getOperand(1),
+ Node->getOperand(2), dl));
+ break;
+ case ISD::VECTOR_SHUFFLE: {
+ SmallVector<int, 32> NewMask;
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+ EVT VT = Node->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDValue Op0 = Node->getOperand(0);
+ SDValue Op1 = Node->getOperand(1);
+ if (!TLI.isTypeLegal(EltVT)) {
+
+ EVT NewEltVT = TLI.getTypeToTransformTo(*DAG.getContext(), EltVT);
+
+ // BUILD_VECTOR operands are allowed to be wider than the element type.
+ // But if NewEltVT is smaller than EltVT, the BUILD_VECTOR does not accept
+ // it.
+ if (NewEltVT.bitsLT(EltVT)) {
+
+ // Convert shuffle node.
+ // If original node was v4i64 and the new EltVT is i32,
+ // cast operands to v8i32 and re-build the mask.
+
+ // Calculate the new VT; its size should be equal to that of the original VT.
+ EVT NewVT =
+ EVT::getVectorVT(*DAG.getContext(), NewEltVT,
+ VT.getSizeInBits() / NewEltVT.getSizeInBits());
+ assert(NewVT.bitsEq(VT));
+
+ // cast operands to new VT
+ Op0 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op0);
+ Op1 = DAG.getNode(ISD::BITCAST, dl, NewVT, Op1);
+
+ // Convert the shuffle mask
+ unsigned int factor =
+ NewVT.getVectorNumElements()/VT.getVectorNumElements();
+
+ // EltVT gets smaller
+ assert(factor > 0);
+
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i) {
+ if (Mask[i] < 0) {
+ for (unsigned fi = 0; fi < factor; ++fi)
+ NewMask.push_back(Mask[i]);
+ }
+ else {
+ for (unsigned fi = 0; fi < factor; ++fi)
+ NewMask.push_back(Mask[i]*factor+fi);
+ }
+ }
+ Mask = NewMask;
+ VT = NewVT;
+ }
+ EltVT = NewEltVT;
+ }
+ unsigned NumElems = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ if (Mask[i] < 0) {
+ Ops.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+ unsigned Idx = Mask[i];
+ if (Idx < NumElems)
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ else
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
+ DAG.getConstant(Idx - NumElems, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ }
+
+ Tmp1 = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ // We may have changed the BUILD_VECTOR type. Cast it back to the Node type.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, Node->getValueType(0), Tmp1);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ EVT OpTy = Node->getOperand(0).getValueType();
+ if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ // 1 -> Hi
+ Tmp1 = DAG.getNode(ISD::SRL, dl, OpTy, Node->getOperand(0),
+ DAG.getConstant(OpTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(
+ Node->getOperand(0).getValueType(),
+ DAG.getDataLayout())));
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0), Tmp1);
+ } else {
+ // 0 -> Lo
+ Tmp1 = DAG.getNode(ISD::TRUNCATE, dl, Node->getValueType(0),
+ Node->getOperand(0));
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::STACKSAVE:
+ // Expand to CopyFromReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyFromReg(Node->getOperand(0), dl, SP,
+ Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(1));
+ } else {
+ Results.push_back(DAG.getUNDEF(Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::STACKRESTORE:
+ // Expand to CopyToReg if the target set
+ // StackPointerRegisterToSaveRestore.
+ if (unsigned SP = TLI.getStackPointerRegisterToSaveRestore()) {
+ Results.push_back(DAG.getCopyToReg(Node->getOperand(0), dl, SP,
+ Node->getOperand(1)));
+ } else {
+ Results.push_back(Node->getOperand(0));
+ }
+ break;
+ case ISD::GET_DYNAMIC_AREA_OFFSET:
+ Results.push_back(DAG.getConstant(0, dl, Node->getValueType(0)));
+ Results.push_back(Results[0].getValue(0));
+ break;
+ case ISD::FCOPYSIGN:
+ Results.push_back(ExpandFCOPYSIGN(Node));
+ break;
+ case ISD::FNEG:
+ // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+ Tmp1 = DAG.getConstantFP(-0.0, dl, Node->getValueType(0));
+ // TODO: If FNEG has fast-math-flags, propagate them to the FSUB.
+ Tmp1 = DAG.getNode(ISD::FSUB, dl, Node->getValueType(0), Tmp1,
+ Node->getOperand(0));
+ Results.push_back(Tmp1);
+ break;
+ case ISD::FABS:
+ Results.push_back(ExpandFABS(Node));
+ break;
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: {
+ // Expand Y = MAX(A, B) -> Y = (A > B) ? A : B
+ ISD::CondCode Pred;
+ switch (Node->getOpcode()) {
+ default: llvm_unreachable("How did we get here?");
+ case ISD::SMAX: Pred = ISD::SETGT; break;
+ case ISD::SMIN: Pred = ISD::SETLT; break;
+ case ISD::UMAX: Pred = ISD::SETUGT; break;
+ case ISD::UMIN: Pred = ISD::SETULT; break;
+ }
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp1, Tmp2, Pred);
+ Results.push_back(Tmp1);
+ break;
+ }
+
+ case ISD::FSIN:
+ case ISD::FCOS: {
+ EVT VT = Node->getValueType(0);
+ // Turn fsin / fcos into an ISD::FSINCOS node if there is a pair of fsin /
+ // fcos that share the same operand and both are used.
+ if ((TLI.isOperationLegalOrCustom(ISD::FSINCOS, VT) ||
+ canCombineSinCosLibcall(Node, TLI, TM))
+ && useSinCos(Node)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Tmp1 = DAG.getNode(ISD::FSINCOS, dl, VTs, Node->getOperand(0));
+ if (Node->getOpcode() == ISD::FCOS)
+ Tmp1 = Tmp1.getValue(1);
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::FMAD:
+ llvm_unreachable("Illegal fmad should never be formed");
+
+ case ISD::FP16_TO_FP:
+ if (Node->getValueType(0) != MVT::f32) {
+ // We can extend to types bigger than f32 in two steps without changing
+ // the result. Since "f16 -> f32" is much more commonly available, give
+ // CodeGen the option of emitting that before resorting to a libcall.
+ SDValue Res =
+ DAG.getNode(ISD::FP16_TO_FP, dl, MVT::f32, Node->getOperand(0));
+ Results.push_back(
+ DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
+ }
+ break;
+ case ISD::FP_TO_FP16:
+ if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
+ SDValue Op = Node->getOperand(0);
+ MVT SVT = Op.getSimpleValueType();
+ if ((SVT == MVT::f64 || SVT == MVT::f80) &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_FP16, MVT::f32)) {
+ // Under fastmath, we can expand this node into a fround followed by
+ // a float-half conversion.
+ SDValue FloatVal = DAG.getNode(ISD::FP_ROUND, dl, MVT::f32, Op,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(
+ DAG.getNode(ISD::FP_TO_FP16, dl, Node->getValueType(0), FloatVal));
+ }
+ }
+ break;
+ case ISD::ConstantFP: {
+ ConstantFPSDNode *CFP = cast<ConstantFPSDNode>(Node);
+ // Check to see if this FP immediate is already legal.
+ // If it is not, expand it into a load from the constant pool.
+ if (!TLI.isFPImmLegal(CFP->getValueAPF(), Node->getValueType(0)))
+ Results.push_back(ExpandConstantFP(CFP, true));
+ break;
+ }
+ case ISD::Constant: {
+ ConstantSDNode *CP = cast<ConstantSDNode>(Node);
+ Results.push_back(ExpandConstant(CP));
+ break;
+ }
+ case ISD::FSUB: {
+ EVT VT = Node->getValueType(0);
+ if (TLI.isOperationLegalOrCustom(ISD::FADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::FNEG, VT)) {
+ const SDNodeFlags *Flags = &cast<BinaryWithFlagsSDNode>(Node)->Flags;
+ Tmp1 = DAG.getNode(ISD::FNEG, dl, VT, Node->getOperand(1));
+ Tmp1 = DAG.getNode(ISD::FADD, dl, VT, Node->getOperand(0), Tmp1, Flags);
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::SUB: {
+ EVT VT = Node->getValueType(0);
+ assert(TLI.isOperationLegalOrCustom(ISD::ADD, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::XOR, VT) &&
+ "Don't know how to expand this subtraction!");
+ Tmp1 = DAG.getNode(ISD::XOR, dl, VT, Node->getOperand(1),
+ DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), dl,
+ VT));
+ Tmp1 = DAG.getNode(ISD::ADD, dl, VT, Tmp1, DAG.getConstant(1, dl, VT));
+ Results.push_back(DAG.getNode(ISD::ADD, dl, VT, Node->getOperand(0), Tmp1));
+ break;
+ }
+ case ISD::UREM:
+ case ISD::SREM: {
+ EVT VT = Node->getValueType(0);
+ bool isSigned = Node->getOpcode() == ISD::SREM;
+ unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ Tmp2 = Node->getOperand(0);
+ Tmp3 = Node->getOperand(1);
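+ // Prefer a combined divrem node and take its remainder result; otherwise
+ // compute the remainder as X - (X / Y) * Y.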
+ if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
+ Results.push_back(Tmp1);
+ } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
+ // X % Y -> X-X/Y*Y
+ Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
+ Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
+ Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::UDIV:
+ case ISD::SDIV: {
+ bool isSigned = Node->getOpcode() == ISD::SDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ EVT VT = Node->getValueType(0);
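+ // If a combined divrem node is available, use its quotient result.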
+ if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ Results.push_back(Tmp1);
+ }
+ break;
+ }
+ case ISD::MULHU:
+ case ISD::MULHS: {
+ unsigned ExpandOpcode = Node->getOpcode() == ISD::MULHU ? ISD::UMUL_LOHI :
+ ISD::SMUL_LOHI;
+ EVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
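+ // MULH is the high half of a full-width multiply, i.e. result 1 of the
+ // corresponding xMUL_LOHI node.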
+ assert(TLI.isOperationLegalOrCustom(ExpandOpcode, VT) &&
+ "If this wasn't legal, it shouldn't have been created!");
+ Tmp1 = DAG.getNode(ExpandOpcode, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1));
+ Results.push_back(Tmp1.getValue(1));
+ break;
+ }
+ case ISD::MUL: {
+ EVT VT = Node->getValueType(0);
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ // See if the multiply can be lowered using two-result operations.
+ // We just need the low half of the multiply; try both the signed
+ // and unsigned forms. If the target supports both SMUL_LOHI and
+ // UMUL_LOHI, form a preference by checking which forms of plain
+ // MULH it supports.
+ bool HasSMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::SMUL_LOHI, VT);
+ bool HasUMUL_LOHI = TLI.isOperationLegalOrCustom(ISD::UMUL_LOHI, VT);
+ bool HasMULHS = TLI.isOperationLegalOrCustom(ISD::MULHS, VT);
+ bool HasMULHU = TLI.isOperationLegalOrCustom(ISD::MULHU, VT);
+ unsigned OpToUse = 0;
+ if (HasSMUL_LOHI && !HasMULHS) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI && !HasMULHU) {
+ OpToUse = ISD::UMUL_LOHI;
+ } else if (HasSMUL_LOHI) {
+ OpToUse = ISD::SMUL_LOHI;
+ } else if (HasUMUL_LOHI) {
+ OpToUse = ISD::UMUL_LOHI;
+ }
+ if (OpToUse) {
+ Results.push_back(DAG.getNode(OpToUse, dl, VTs, Node->getOperand(0),
+ Node->getOperand(1)));
+ break;
+ }
+
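+ // Otherwise, try expanding the multiply in a half-sized integer type and
+ // recombine the halves: zero-extend the low half, shift the high half into
+ // place, and OR them together.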
+ SDValue Lo, Hi;
+ EVT HalfType = VT.getHalfSizedIntegerVT(*DAG.getContext());
+ if (TLI.isOperationLegalOrCustom(ISD::ZERO_EXTEND, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::SHL, VT) &&
+ TLI.isOperationLegalOrCustom(ISD::OR, VT) &&
+ TLI.expandMUL(Node, Lo, Hi, HalfType, DAG)) {
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dl, VT, Hi);
+ SDValue Shift =
+ DAG.getConstant(HalfType.getSizeInBits(), dl,
+ TLI.getShiftAmountTy(HalfType, DAG.getDataLayout()));
+ Hi = DAG.getNode(ISD::SHL, dl, VT, Hi, Shift);
+ Results.push_back(DAG.getNode(ISD::OR, dl, VT, Lo, Hi));
+ }
+ break;
+ }
+ case ISD::SADDO:
+ case ISD::SSUBO: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ Results.push_back(Sum);
+ EVT ResultType = Node->getValueType(1);
+ EVT OType = getSetCCResultType(Node->getValueType(0));
+
+ SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
+
+ // LHSSign -> LHS >= 0
+ // RHSSign -> RHS >= 0
+ // SumSign -> Sum >= 0
+ //
+ // Add:
+ // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Sub:
+ // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ //
+ SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+ SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+ SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+ Node->getOpcode() == ISD::SADDO ?
+ ISD::SETEQ : ISD::SETNE);
+
+ SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+ SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+ SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+ Results.push_back(DAG.getBoolExtOrTrunc(Cmp, dl, ResultType, ResultType));
+ break;
+ }
+ case ISD::UADDO:
+ case ISD::USUBO: {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::UADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ Results.push_back(Sum);
+
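+ // Unsigned overflow occurred exactly when the sum is less than the LHS for
+ // an add, or greater than the LHS for a subtract.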
+ EVT ResultType = Node->getValueType(1);
+ EVT SetCCType = getSetCCResultType(Node->getValueType(0));
+ ISD::CondCode CC
+ = Node->getOpcode() == ISD::UADDO ? ISD::SETULT : ISD::SETUGT;
+ SDValue SetCC = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
+
+ Results.push_back(DAG.getBoolExtOrTrunc(SetCC, dl, ResultType, ResultType));
+ break;
+ }
+ case ISD::UMULO:
+ case ISD::SMULO: {
+ EVT VT = Node->getValueType(0);
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits() * 2);
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDValue BottomHalf;
+ SDValue TopHalf;
+ static const unsigned Ops[2][3] =
+ { { ISD::MULHU, ISD::UMUL_LOHI, ISD::ZERO_EXTEND },
+ { ISD::MULHS, ISD::SMUL_LOHI, ISD::SIGN_EXTEND }};
+ bool isSigned = Node->getOpcode() == ISD::SMULO;
+ if (TLI.isOperationLegalOrCustom(Ops[isSigned][0], VT)) {
+ BottomHalf = DAG.getNode(ISD::MUL, dl, VT, LHS, RHS);
+ TopHalf = DAG.getNode(Ops[isSigned][0], dl, VT, LHS, RHS);
+ } else if (TLI.isOperationLegalOrCustom(Ops[isSigned][1], VT)) {
+ BottomHalf = DAG.getNode(Ops[isSigned][1], dl, DAG.getVTList(VT, VT), LHS,
+ RHS);
+ TopHalf = BottomHalf.getValue(1);
+ } else if (TLI.isTypeLegal(WideVT)) {
+ LHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, LHS);
+ RHS = DAG.getNode(Ops[isSigned][2], dl, WideVT, RHS);
+ Tmp1 = DAG.getNode(ISD::MUL, dl, WideVT, LHS, RHS);
+ BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+ DAG.getIntPtrConstant(0, dl));
+ TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Tmp1,
+ DAG.getIntPtrConstant(1, dl));
+ } else {
+ // We can fall back to a libcall with an illegal type for the MUL if we
+ // have a libcall big enough.
+ // Also, we can fall back to a division in some cases, but that's a big
+ // performance hit in the general case.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (WideVT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (WideVT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (WideVT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (WideVT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Cannot expand this operation!");
+
+ // The high part is obtained by SRA'ing all but one of the bits of the low
+ // part.
+ unsigned LoSize = VT.getSizeInBits();
+ SDValue HiLHS =
+ DAG.getNode(ISD::SRA, dl, VT, RHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ SDValue HiRHS =
+ DAG.getNode(ISD::SRA, dl, VT, LHS,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+
+ // Here we're passing the 2 arguments explicitly as 4 arguments that are
+ // pre-lowered to the correct types. This all depends upon WideVT not
+ // being a legal type for the architecture, and thus having to be split
+ // into two arguments.
+ SDValue Args[] = { LHS, HiLHS, RHS, HiRHS };
+ SDValue Ret = ExpandLibCall(LC, WideVT, Args, 4, isSigned, dl);
+ BottomHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+ DAG.getIntPtrConstant(0, dl));
+ TopHalf = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, VT, Ret,
+ DAG.getIntPtrConstant(1, dl));
+ // Ret is a node with an illegal type. Because such things are not
+ // generally permitted during this phase of legalization, make sure the
+ // node has no more uses. The above EXTRACT_ELEMENT nodes should have been
+ // folded.
+ assert(Ret->use_empty() &&
+ "Unexpected uses of illegally type from expanded lib call.");
+ }
+
+ if (isSigned) {
+ Tmp1 = DAG.getConstant(
+ VT.getSizeInBits() - 1, dl,
+ TLI.getShiftAmountTy(BottomHalf.getValueType(), DAG.getDataLayout()));
+ Tmp1 = DAG.getNode(ISD::SRA, dl, VT, BottomHalf, Tmp1);
+ TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf, Tmp1,
+ ISD::SETNE);
+ } else {
+ TopHalf = DAG.getSetCC(dl, getSetCCResultType(VT), TopHalf,
+ DAG.getConstant(0, dl, VT), ISD::SETNE);
+ }
+ Results.push_back(BottomHalf);
+ Results.push_back(TopHalf);
+ break;
+ }
+ case ISD::BUILD_PAIR: {
+ EVT PairTy = Node->getValueType(0);
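+ // Combine the two halves: zero-extend the low operand, shift the
+ // any-extended high operand into the upper half, and OR them together.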
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, PairTy, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::ANY_EXTEND, dl, PairTy, Node->getOperand(1));
+ Tmp2 = DAG.getNode(
+ ISD::SHL, dl, PairTy, Tmp2,
+ DAG.getConstant(PairTy.getSizeInBits() / 2, dl,
+ TLI.getShiftAmountTy(PairTy, DAG.getDataLayout())));
+ Results.push_back(DAG.getNode(ISD::OR, dl, PairTy, Tmp1, Tmp2));
+ break;
+ }
+ case ISD::SELECT:
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = Node->getOperand(2);
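+ // Lower SELECT into SELECT_CC, either reusing the comparison from an
+ // existing SETCC condition or testing the condition against zero.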
+ if (Tmp1.getOpcode() == ISD::SETCC) {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1.getOperand(0), Tmp1.getOperand(1),
+ Tmp2, Tmp3,
+ cast<CondCodeSDNode>(Tmp1.getOperand(2))->get());
+ } else {
+ Tmp1 = DAG.getSelectCC(dl, Tmp1,
+ DAG.getConstant(0, dl, Tmp1.getValueType()),
+ Tmp2, Tmp3, ISD::SETNE);
+ }
+ Results.push_back(Tmp1);
+ break;
+ case ISD::BR_JT: {
+ SDValue Chain = Node->getOperand(0);
+ SDValue Table = Node->getOperand(1);
+ SDValue Index = Node->getOperand(2);
+
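+ // Scale the index by the jump-table entry size, load the destination
+ // address from the table, and branch to it indirectly.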
+ EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
+
+ const DataLayout &TD = DAG.getDataLayout();
+ unsigned EntrySize =
+ DAG.getMachineFunction().getJumpTableInfo()->getEntrySize(TD);
+
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+ DAG.getConstant(EntrySize, dl, Index.getValueType()));
+ SDValue Addr = DAG.getNode(ISD::ADD, dl, Index.getValueType(),
+ Index, Table);
+
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(), EntrySize * 8);
+ SDValue LD = DAG.getExtLoad(
+ ISD::SEXTLOAD, dl, PTy, Chain, Addr,
+ MachinePointerInfo::getJumpTable(DAG.getMachineFunction()), MemVT);
+ Addr = LD;
+ if (TM.isPositionIndependent()) {
+ // For PIC, the sequence is:
+ // BRIND(load(Jumptable + index) + RelocBase)
+ // RelocBase can be JumpTable, GOT or some sort of global base.
+ Addr = DAG.getNode(ISD::ADD, dl, PTy, Addr,
+ TLI.getPICJumpTableRelocBase(Table, DAG));
+ }
+ Tmp1 = DAG.getNode(ISD::BRIND, dl, MVT::Other, LD.getValue(1), Addr);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BRCOND:
+ // Expand brcond's setcc into its constituent parts and create a BR_CC
+ // Node.
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ if (Tmp2.getOpcode() == ISD::SETCC) {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other,
+ Tmp1, Tmp2.getOperand(2),
+ Tmp2.getOperand(0), Tmp2.getOperand(1),
+ Node->getOperand(2));
+ } else {
+ // We test only the i1 bit. Skip the AND if UNDEF.
+ Tmp3 = (Tmp2.isUndef()) ? Tmp2 :
+ DAG.getNode(ISD::AND, dl, Tmp2.getValueType(), Tmp2,
+ DAG.getConstant(1, dl, Tmp2.getValueType()));
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, MVT::Other, Tmp1,
+ DAG.getCondCode(ISD::SETNE), Tmp3,
+ DAG.getConstant(0, dl, Tmp3.getValueType()),
+ Node->getOperand(2));
+ }
+ Results.push_back(Tmp1);
+ break;
+ case ISD::SETCC: {
+ Tmp1 = Node->getOperand(0);
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = Node->getOperand(2);
+ bool Legalized = LegalizeSetCCCondCode(Node->getValueType(0), Tmp1, Tmp2,
+ Tmp3, NeedInvert, dl);
+
+ if (Legalized) {
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SETCC node.
+ if (Tmp3.getNode())
+ Tmp1 = DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Tmp3);
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert)
+ Tmp1 = DAG.getLogicalNOT(dl, Tmp1, Tmp1->getValueType(0));
+
+ Results.push_back(Tmp1);
+ break;
+ }
+
+ // Otherwise, SETCC for the given comparison type must be completely
+ // illegal; expand it into a SELECT_CC.
+ EVT VT = Node->getValueType(0);
+ int TrueValue;
+ switch (TLI.getBooleanContents(Tmp1->getValueType(0))) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::UndefinedBooleanContent:
+ TrueValue = 1;
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ TrueValue = -1;
+ break;
+ }
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, VT, Tmp1, Tmp2,
+ DAG.getConstant(TrueValue, dl, VT),
+ DAG.getConstant(0, dl, VT),
+ Tmp3);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SELECT_CC: {
+ Tmp1 = Node->getOperand(0); // LHS
+ Tmp2 = Node->getOperand(1); // RHS
+ Tmp3 = Node->getOperand(2); // True
+ Tmp4 = Node->getOperand(3); // False
+ EVT VT = Node->getValueType(0);
+ SDValue CC = Node->getOperand(4);
+ ISD::CondCode CCOp = cast<CondCodeSDNode>(CC)->get();
+
+ if (TLI.isCondCodeLegal(CCOp, Tmp1.getSimpleValueType())) {
+ // If the condition code is legal, then we need to expand this
+ // node using SETCC and SELECT.
+ EVT CmpVT = Tmp1.getValueType();
+ assert(!TLI.isOperationExpand(ISD::SELECT, VT) &&
+ "Cannot expand ISD::SELECT_CC when ISD::SELECT also needs to be "
+ "expanded.");
+ EVT CCVT =
+ TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), CmpVT);
+ SDValue Cond = DAG.getNode(ISD::SETCC, dl, CCVT, Tmp1, Tmp2, CC);
+ Results.push_back(DAG.getSelect(dl, VT, Cond, Tmp3, Tmp4));
+ break;
+ }
+
+ // SELECT_CC is legal, so the condition code must not be.
+ bool Legalized = false;
+ // Try to legalize by inverting the condition. This is for targets that
+ // might support an ordered version of a condition, but not the unordered
+ // version (or vice versa).
+ ISD::CondCode InvCC = ISD::getSetCCInverse(CCOp,
+ Tmp1.getValueType().isInteger());
+ if (TLI.isCondCodeLegal(InvCC, Tmp1.getSimpleValueType())) {
+ // Use the new condition code and swap true and false
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp1, Tmp2, Tmp4, Tmp3, InvCC);
+ } else {
+ // If the inverse is not legal, then try to swap the arguments using
+ // the inverse condition code.
+ ISD::CondCode SwapInvCC = ISD::getSetCCSwappedOperands(InvCC);
+ if (TLI.isCondCodeLegal(SwapInvCC, Tmp1.getSimpleValueType())) {
+ // The swapped inverse condition is legal, so swap true and false,
+ // lhs and rhs.
+ Legalized = true;
+ Tmp1 = DAG.getSelectCC(dl, Tmp2, Tmp1, Tmp4, Tmp3, SwapInvCC);
+ }
+ }
+
+ if (!Legalized) {
+ Legalized = LegalizeSetCCCondCode(
+ getSetCCResultType(Tmp1.getValueType()), Tmp1, Tmp2, CC, NeedInvert,
+ dl);
+
+ assert(Legalized && "Can't legalize SELECT_CC with legal condition!");
+
+ // If we expanded the SETCC by inverting the condition code, then swap
+ // the True/False operands to match.
+ if (NeedInvert)
+ std::swap(Tmp3, Tmp4);
+
+ // If we expanded the SETCC by swapping LHS and RHS, or by inverting the
+ // condition code, create a new SELECT_CC node.
+ if (CC.getNode()) {
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Tmp3, Tmp4, CC);
+ } else {
+ Tmp2 = DAG.getConstant(0, dl, Tmp1.getValueType());
+ CC = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::SELECT_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp2, Tmp3, Tmp4, CC);
+ }
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BR_CC: {
+ Tmp1 = Node->getOperand(0); // Chain
+ Tmp2 = Node->getOperand(2); // LHS
+ Tmp3 = Node->getOperand(3); // RHS
+ Tmp4 = Node->getOperand(1); // CC
+
+ bool Legalized = LegalizeSetCCCondCode(getSetCCResultType(
+ Tmp2.getValueType()), Tmp2, Tmp3, Tmp4, NeedInvert, dl);
+ (void)Legalized;
+ assert(Legalized && "Can't legalize BR_CC with legal condition!");
+
+ // If we expanded the SETCC by inverting the condition code, then wrap
+ // the existing SETCC in a NOT to restore the intended condition.
+ if (NeedInvert)
+ Tmp4 = DAG.getNOT(dl, Tmp4, Tmp4->getValueType(0));
+
+ // If we expanded the SETCC by swapping LHS and RHS, create a new BR_CC
+ // node.
+ if (Tmp4.getNode()) {
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1,
+ Tmp4, Tmp2, Tmp3, Node->getOperand(4));
+ } else {
+ Tmp3 = DAG.getConstant(0, dl, Tmp2.getValueType());
+ Tmp4 = DAG.getCondCode(ISD::SETNE);
+ Tmp1 = DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0), Tmp1, Tmp4,
+ Tmp2, Tmp3, Node->getOperand(4));
+ }
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::BUILD_VECTOR:
+ Results.push_back(ExpandBUILD_VECTOR(Node));
+ break;
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::SHL: {
+ // Scalarize vector SRA/SRL/SHL.
+ EVT VT = Node->getValueType(0);
+ assert(VT.isVector() && "Unable to legalize non-vector shift");
+ assert(TLI.isTypeLegal(VT.getScalarType())&& "Element type must be legal");
+ unsigned NumElem = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 8> Scalars;
+ for (unsigned Idx = 0; Idx < NumElem; Idx++) {
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0),
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Sh = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1),
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
+ VT.getScalarType(), Ex, Sh));
+ }
+ SDValue Result =
+ DAG.getNode(ISD::BUILD_VECTOR, dl, Node->getValueType(0), Scalars);
+ ReplaceNode(SDValue(Node, 0), Result);
+ break;
+ }
+ case ISD::GLOBAL_OFFSET_TABLE:
+ case ISD::GlobalAddress:
+ case ISD::GlobalTLSAddress:
+ case ISD::ExternalSymbol:
+ case ISD::ConstantPool:
+ case ISD::JumpTable:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ // FIXME: Custom lowering for these operations shouldn't return null!
+ break;
+ }
+
+ // Replace the original node with the legalized result.
+ if (Results.empty())
+ return false;
+
+ ReplaceNode(Node, Results.data());
+ return true;
+}
+
+void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
+ SmallVector<SDValue, 8> Results;
+ SDLoc dl(Node);
+ SDValue Tmp1, Tmp2, Tmp3, Tmp4;
+ unsigned Opc = Node->getOpcode();
+ switch (Opc) {
+ case ISD::ATOMIC_FENCE: {
+ // If the target didn't lower this, lower it to a '__sync_synchronize()' call.
+ // FIXME: handle "fence singlethread" more efficiently.
+ TargetLowering::ArgListTy Args;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("__sync_synchronize",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ Results.push_back(CallResult.second);
+ break;
+ }
+ // By default, atomic intrinsics are marked Legal and lowered. Targets
+ // which don't support them directly, however, may want libcalls, in which
+ // case they mark them Expand, and we get here.
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_CMP_SWAP: {
+ MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
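+ // Map the atomic opcode and memory width to the matching __sync_* libcall.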
+ RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");
+
+ std::pair<SDValue, SDValue> Tmp = ExpandChainLibCall(LC, Node, false);
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ break;
+ }
+ case ISD::TRAP: {
+ // If this operation is not supported, lower it to an 'abort()' call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl)
+ .setChain(Node->getOperand(0))
+ .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+ DAG.getExternalSymbol("abort",
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+ std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+
+ Results.push_back(CallResult.second);
+ break;
+ }
+ case ISD::FMINNUM:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128));
+ break;
+ case ISD::FMAXNUM:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128));
+ break;
+ case ISD::FSQRT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128));
+ break;
+ case ISD::FSIN:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128));
+ break;
+ case ISD::FCOS:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_F128,
+ RTLIB::COS_PPCF128));
+ break;
+ case ISD::FSINCOS:
+ // Expand into sincos libcall.
+ ExpandSinCosLibCall(Node, Results);
+ break;
+ case ISD::FLOG:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128));
+ break;
+ case ISD::FLOG2:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128));
+ break;
+ case ISD::FLOG10:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128));
+ break;
+ case ISD::FEXP:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128));
+ break;
+ case ISD::FEXP2:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128));
+ break;
+ case ISD::FTRUNC:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128));
+ break;
+ case ISD::FFLOOR:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128));
+ break;
+ case ISD::FCEIL:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128));
+ break;
+ case ISD::FRINT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128));
+ break;
+ case ISD::FNEARBYINT:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128));
+ break;
+ case ISD::FROUND:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128));
+ break;
+ case ISD::FPOWI:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POWI_F32, RTLIB::POWI_F64,
+ RTLIB::POWI_F80, RTLIB::POWI_F128,
+ RTLIB::POWI_PPCF128));
+ break;
+ case ISD::FPOW:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_F128,
+ RTLIB::POW_PPCF128));
+ break;
+ case ISD::FDIV:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::DIV_F32, RTLIB::DIV_F64,
+ RTLIB::DIV_F80, RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128));
+ break;
+ case ISD::FREM:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::REM_F32, RTLIB::REM_F64,
+ RTLIB::REM_F80, RTLIB::REM_F128,
+ RTLIB::REM_PPCF128));
+ break;
+ case ISD::FMA:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::FMA_F32, RTLIB::FMA_F64,
+ RTLIB::FMA_F80, RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128));
+ break;
+ case ISD::FADD:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::ADD_F32, RTLIB::ADD_F64,
+ RTLIB::ADD_F80, RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128));
+ break;
+ case ISD::FMUL:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::MUL_F32, RTLIB::MUL_F64,
+ RTLIB::MUL_F80, RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128));
+ break;
+ case ISD::FP16_TO_FP:
+ if (Node->getValueType(0) == MVT::f32) {
+ Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
+ }
+ break;
+ case ISD::FP_TO_FP16: {
+ RTLIB::Libcall LC =
+ RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unable to expand fp_to_fp16");
+ Results.push_back(ExpandLibCall(LC, Node, false));
+ break;
+ }
+ case ISD::FSUB:
+ Results.push_back(ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
+ RTLIB::SUB_F80, RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128));
+ break;
+ case ISD::SREM:
+ Results.push_back(ExpandIntLibCall(Node, true,
+ RTLIB::SREM_I8,
+ RTLIB::SREM_I16, RTLIB::SREM_I32,
+ RTLIB::SREM_I64, RTLIB::SREM_I128));
+ break;
+ case ISD::UREM:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::UREM_I8,
+ RTLIB::UREM_I16, RTLIB::UREM_I32,
+ RTLIB::UREM_I64, RTLIB::UREM_I128));
+ break;
+ case ISD::SDIV:
+ Results.push_back(ExpandIntLibCall(Node, true,
+ RTLIB::SDIV_I8,
+ RTLIB::SDIV_I16, RTLIB::SDIV_I32,
+ RTLIB::SDIV_I64, RTLIB::SDIV_I128));
+ break;
+ case ISD::UDIV:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::UDIV_I8,
+ RTLIB::UDIV_I16, RTLIB::UDIV_I32,
+ RTLIB::UDIV_I64, RTLIB::UDIV_I128));
+ break;
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ // Expand into divrem libcall
+ ExpandDivRemLibCall(Node, Results);
+ break;
+ case ISD::MUL:
+ Results.push_back(ExpandIntLibCall(Node, false,
+ RTLIB::MUL_I8,
+ RTLIB::MUL_I16, RTLIB::MUL_I32,
+ RTLIB::MUL_I64, RTLIB::MUL_I128));
+ break;
+ }
+
+ // Replace the original node with the legalized result.
+ if (!Results.empty())
+ ReplaceNode(Node, Results.data());
+}
+
+// Determine the vector type to use in place of an original scalar element when
+// promoting equally sized vectors.
+static MVT getPromotedVectorElementType(const TargetLowering &TLI,
+ MVT EltVT, MVT NewEltVT) {
+ unsigned OldEltsPerNewElt = EltVT.getSizeInBits() / NewEltVT.getSizeInBits();
+ MVT MidVT = MVT::getVectorVT(NewEltVT, OldEltsPerNewElt);
+ assert(TLI.isTypeLegal(MidVT) && "unexpected");
+ return MidVT;
+}
+
+void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
+ SmallVector<SDValue, 8> Results;
+ MVT OVT = Node->getSimpleValueType(0);
+ if (Node->getOpcode() == ISD::UINT_TO_FP ||
+ Node->getOpcode() == ISD::SINT_TO_FP ||
+ Node->getOpcode() == ISD::SETCC ||
+ Node->getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Node->getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ OVT = Node->getOperand(0).getSimpleValueType();
+ }
+ if (Node->getOpcode() == ISD::BR_CC)
+ OVT = Node->getOperand(2).getSimpleValueType();
+ MVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), OVT);
+ SDLoc dl(Node);
+ SDValue Tmp1, Tmp2, Tmp3;
+ switch (Node->getOpcode()) {
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ // Zero extend the argument.
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ if (Node->getOpcode() == ISD::CTTZ) {
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
+ auto TopBit = APInt::getOneBitSet(NVT.getSizeInBits(),
+ OVT.getSizeInBits());
+ Tmp1 = DAG.getNode(ISD::OR, dl, NVT, Tmp1,
+ DAG.getConstant(TopBit, dl, NVT));
+ }
+ // Perform the larger operation. For CTPOP and CTTZ_ZERO_UNDEF, this is
+ // already the correct result.
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+ if (Node->getOpcode() == ISD::CTLZ ||
+ Node->getOpcode() == ISD::CTLZ_ZERO_UNDEF) {
+ // Tmp1 = Tmp1 - (sizeinbits(NVT) - sizeinbits(Old VT))
+ Tmp1 = DAG.getNode(ISD::SUB, dl, NVT, Tmp1,
+ DAG.getConstant(NVT.getSizeInBits() -
+ OVT.getSizeInBits(), dl, NVT));
+ }
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, dl, OVT, Tmp1));
+ break;
+ case ISD::BSWAP: {
+ unsigned DiffBits = NVT.getSizeInBits() - OVT.getSizeInBits();
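+ // Byte-swapping the zero-extended value leaves the interesting bytes in
+ // the high end of the wider type; shift them back down afterwards.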
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp1 = DAG.getNode(ISD::BSWAP, dl, NVT, Tmp1);
+ Tmp1 = DAG.getNode(
+ ISD::SRL, dl, NVT, Tmp1,
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ Tmp1 = PromoteLegalFP_TO_INT(Node->getOperand(0), Node->getValueType(0),
+ Node->getOpcode() == ISD::FP_TO_SINT, dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ Tmp1 = PromoteLegalINT_TO_FP(Node->getOperand(0), Node->getValueType(0),
+ Node->getOpcode() == ISD::SINT_TO_FP, dl);
+ Results.push_back(Tmp1);
+ break;
+ case ISD::VAARG: {
+ SDValue Chain = Node->getOperand(0); // Get the chain.
+ SDValue Ptr = Node->getOperand(1); // Get the pointer.
+
+ unsigned TruncOp;
+ if (OVT.isVector()) {
+ TruncOp = ISD::BITCAST;
+ } else {
+ assert(OVT.isInteger()
+ && "VAARG promotion is supported only for vectors or integer types");
+ TruncOp = ISD::TRUNCATE;
+ }
+
+ // Perform the larger operation, then convert back
+ Tmp1 = DAG.getVAArg(NVT, dl, Chain, Ptr, Node->getOperand(2),
+ Node->getConstantOperandVal(3));
+ Chain = Tmp1.getValue(1);
+
+ Tmp2 = DAG.getNode(TruncOp, dl, OVT, Tmp1);
+
+ // Modified the chain result - switch anything that used the old chain to
+ // use the new one.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp2);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain);
+ if (UpdatedNodes) {
+ UpdatedNodes->insert(Tmp2.getNode());
+ UpdatedNodes->insert(Chain.getNode());
+ }
+ ReplacedNode(Node);
+ break;
+ }
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: {
+ unsigned ExtOp, TruncOp;
+ if (OVT.isVector()) {
+ ExtOp = ISD::BITCAST;
+ TruncOp = ISD::BITCAST;
+ } else {
+ assert(OVT.isInteger() && "Cannot promote logic operation");
+ ExtOp = ISD::ANY_EXTEND;
+ TruncOp = ISD::TRUNCATE;
+ }
+ // Promote each of the values to the new type.
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ // Perform the larger operation, then convert back
+ Tmp1 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+ Results.push_back(DAG.getNode(TruncOp, dl, OVT, Tmp1));
+ break;
+ }
+ case ISD::SELECT: {
+ unsigned ExtOp, TruncOp;
+ if (Node->getValueType(0).isVector() ||
+ Node->getValueType(0).getSizeInBits() == NVT.getSizeInBits()) {
+ ExtOp = ISD::BITCAST;
+ TruncOp = ISD::BITCAST;
+ } else if (Node->getValueType(0).isInteger()) {
+ ExtOp = ISD::ANY_EXTEND;
+ TruncOp = ISD::TRUNCATE;
+ } else {
+ ExtOp = ISD::FP_EXTEND;
+ TruncOp = ISD::FP_ROUND;
+ }
+ Tmp1 = Node->getOperand(0);
+ // Promote each of the values to the new type.
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ // Perform the larger operation, then round down.
+ Tmp1 = DAG.getSelect(dl, NVT, Tmp1, Tmp2, Tmp3);
+ if (TruncOp != ISD::FP_ROUND)
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1);
+ else
+ Tmp1 = DAG.getNode(TruncOp, dl, Node->getValueType(0), Tmp1,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Node)->getMask();
+
+ // Cast the two input vectors.
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::BITCAST, dl, NVT, Node->getOperand(1));
+
+ // Convert the shuffle mask to the right # elements.
+ Tmp1 = ShuffleWithNarrowerEltType(NVT, OVT, dl, Tmp1, Tmp2, Mask);
+ Tmp1 = DAG.getNode(ISD::BITCAST, dl, OVT, Tmp1);
+ Results.push_back(Tmp1);
+ break;
+ }
+ case ISD::SETCC: {
+ unsigned ExtOp = ISD::FP_EXTEND;
+ if (NVT.isInteger()) {
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(2))->get();
+ ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(1));
+ Results.push_back(DAG.getNode(ISD::SETCC, dl, Node->getValueType(0),
+ Tmp1, Tmp2, Node->getOperand(2)));
+ break;
+ }
+ case ISD::BR_CC: {
+ unsigned ExtOp = ISD::FP_EXTEND;
+ if (NVT.isInteger()) {
+ ISD::CondCode CCCode =
+ cast<CondCodeSDNode>(Node->getOperand(1))->get();
+ ExtOp = isSignedIntSetCC(CCCode) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ }
+ Tmp1 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(2));
+ Tmp2 = DAG.getNode(ExtOp, dl, NVT, Node->getOperand(3));
+ Results.push_back(DAG.getNode(ISD::BR_CC, dl, Node->getValueType(0),
+ Node->getOperand(0), Node->getOperand(1),
+ Tmp1, Tmp2, Node->getOperand(4)));
+ break;
+ }
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FPOW: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2,
+ Node->getFlags());
+ Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
+ Tmp3, DAG.getIntPtrConstant(0, dl)));
+ break;
+ }
+ case ISD::FMA: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1));
+ Tmp3 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(2));
+ Results.push_back(
+ DAG.getNode(ISD::FP_ROUND, dl, OVT,
+ DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, Tmp3),
+ DAG.getIntPtrConstant(0, dl)));
+ break;
+ }
+ case ISD::FCOPYSIGN:
+ case ISD::FPOWI: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = Node->getOperand(1);
+ Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2);
+
+ // fcopysign doesn't change anything but the sign bit, so
+ // (fp_round (fcopysign (fpext a), b))
+ // is as precise as
+ // (fp_round (fpext a))
+ // which is a no-op. Mark it as a TRUNCating FP_ROUND.
+ const bool isTrunc = (Node->getOpcode() == ISD::FCOPYSIGN);
+ Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
+ Tmp3, DAG.getIntPtrConstant(isTrunc, dl)));
+ break;
+ }
+ case ISD::FFLOOR:
+ case ISD::FCEIL:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FTRUNC:
+ case ISD::FNEG:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FABS:
+ case ISD::FEXP:
+ case ISD::FEXP2: {
+ Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0));
+ Tmp2 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1);
+ Results.push_back(DAG.getNode(ISD::FP_ROUND, dl, OVT,
+ Tmp2, DAG.getIntPtrConstant(0, dl)));
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size
+ //
+ // e.g. v2i64 = build_vector i64:x, i64:y => v4i32
+ // =>
+ // v4i32 = concat_vectors (v2i32 (bitcast i64:x)), (v2i32 (bitcast i64:y))
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for build_vector");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0, E = Node->getNumOperands(); I != E; ++I) {
+ SDValue Op = Node->getOperand(I);
+ NewOps.push_back(DAG.getNode(ISD::BITCAST, SDLoc(Op), MidVT, Op));
+ }
+
+ SDLoc SL(Node);
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewOps);
+ SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
+ Results.push_back(CvtVec);
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size.
+ //
+ // e.g. i64 = extract_vector_elt x:v2i64, y:i32
+ // =>
+ // v4i32:castx = bitcast x:v2i64
+ //
+ // i64 = bitcast
+ // (v2i32 build_vector (i32 (extract_vector_elt castx, (2 * y))),
+ // (i32 (extract_vector_elt castx, (2 * y + 1))))
+ //
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for extract_vector_elt");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();
+
+ SDValue Idx = Node->getOperand(1);
+ EVT IdxVT = Idx.getValueType();
+ SDLoc SL(Node);
+ SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SL, IdxVT);
+ SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);
+
+ SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
+
+ SmallVector<SDValue, 8> NewOps;
+ for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+ SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
+ SDValue TmpIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
+ CastVec, TmpIdx);
+ NewOps.push_back(Elt);
+ }
+
+ SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, SL, MidVT, NewOps);
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, EltVT, NewVec));
+ break;
+ }
+ case ISD::INSERT_VECTOR_ELT: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size
+ //
+ // e.g. v2i64 = insert_vector_elt x:v2i64, y:i64, z:i32
+ // =>
+ // v4i32:castx = bitcast x:v2i64
+ // v2i32:casty = bitcast y:i64
+ //
+ // v2i64 = bitcast
+ // (v4i32 insert_vector_elt
+ // (v4i32 insert_vector_elt v4i32:castx,
+ // (extract_vector_elt casty, 0), 2 * z),
+ // (extract_vector_elt casty, 1), (2 * z + 1))
+
+ assert(NVT.isVector() && OVT.getSizeInBits() == NVT.getSizeInBits() &&
+ "Invalid promote type for insert_vector_elt");
+ assert(NewEltVT.bitsLT(EltVT) && "not handled");
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ unsigned NewEltsPerOldElt = MidVT.getVectorNumElements();
+
+ SDValue Val = Node->getOperand(1);
+ SDValue Idx = Node->getOperand(2);
+ EVT IdxVT = Idx.getValueType();
+ SDLoc SL(Node);
+
+ SDValue Factor = DAG.getConstant(NewEltsPerOldElt, SDLoc(), IdxVT);
+ SDValue NewBaseIdx = DAG.getNode(ISD::MUL, SL, IdxVT, Idx, Factor);
+
+ SDValue CastVec = DAG.getNode(ISD::BITCAST, SL, NVT, Node->getOperand(0));
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
+
+ SDValue NewVec = CastVec;
+ for (unsigned I = 0; I < NewEltsPerOldElt; ++I) {
+ SDValue IdxOffset = DAG.getConstant(I, SL, IdxVT);
+ SDValue InEltIdx = DAG.getNode(ISD::ADD, SL, IdxVT, NewBaseIdx, IdxOffset);
+
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, NewEltVT,
+ CastVal, IdxOffset);
+
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, SL, NVT,
+ NewVec, Elt, InEltIdx);
+ }
+
+ Results.push_back(DAG.getNode(ISD::BITCAST, SL, OVT, NewVec));
+ break;
+ }
+ case ISD::SCALAR_TO_VECTOR: {
+ MVT EltVT = OVT.getVectorElementType();
+ MVT NewEltVT = NVT.getVectorElementType();
+
+ // Handle bitcasts to a different vector type with the same total bit size.
+ //
+ // e.g. v2i64 = scalar_to_vector x:i64
+ // =>
+ // concat_vectors (v2i32 bitcast x:i64), (v2i32 undef)
+ //
+
+ MVT MidVT = getPromotedVectorElementType(TLI, EltVT, NewEltVT);
+ SDValue Val = Node->getOperand(0);
+ SDLoc SL(Node);
+
+ SDValue CastVal = DAG.getNode(ISD::BITCAST, SL, MidVT, Val);
+ SDValue Undef = DAG.getUNDEF(MidVT);
+
+ SmallVector<SDValue, 8> NewElts;
+ NewElts.push_back(CastVal);
+ for (unsigned I = 1, NElts = OVT.getVectorNumElements(); I != NElts; ++I)
+ NewElts.push_back(Undef);
+
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, SL, NVT, NewElts);
+ SDValue CvtVec = DAG.getNode(ISD::BITCAST, SL, OVT, Concat);
+ Results.push_back(CvtVec);
+ break;
+ }
+ }
+
+ // Replace the original node with the legalized result.
+ if (!Results.empty())
+ ReplaceNode(Node, Results.data());
+}
+
+/// This is the entry point for the file: legalize every node in the DAG.
+void SelectionDAG::Legalize() {
+ AssignTopologicalOrder();
+
+ SmallPtrSet<SDNode *, 16> LegalizedNodes;
+ SelectionDAGLegalize Legalizer(*this, LegalizedNodes);
+
+ // Visit all the nodes. We start in topological order, so that we see
+ // nodes with their original operands intact. Legalization can produce
+ // new nodes which may themselves need to be legalized. Iterate until all
+ // nodes have been legalized.
+ for (;;) {
+ bool AnyLegalized = false;
+ for (auto NI = allnodes_end(); NI != allnodes_begin();) {
+ --NI;
+
+ SDNode *N = &*NI;
+ if (N->use_empty() && N != getRoot().getNode()) {
+ ++NI;
+ DeleteNode(N);
+ continue;
+ }
+
+ if (LegalizedNodes.insert(N).second) {
+ AnyLegalized = true;
+ Legalizer.LegalizeOp(N);
+
+ if (N->use_empty() && N != getRoot().getNode()) {
+ ++NI;
+ DeleteNode(N);
+ }
+ }
+ }
+ if (!AnyLegalized)
+ break;
+
+ }
+
+ // Remove dead nodes now.
+ RemoveDeadNodes();
+}
+
+bool SelectionDAG::LegalizeOp(SDNode *N,
+ SmallSetVector<SDNode *, 16> &UpdatedNodes) {
+ SmallPtrSet<SDNode *, 16> LegalizedNodes;
+ SelectionDAGLegalize Legalizer(*this, LegalizedNodes, &UpdatedNodes);
+
+ // Directly insert the node in question, and legalize it. This will recurse
+ // as needed through operands.
+ LegalizedNodes.insert(N);
+ Legalizer.LegalizeOp(N);
+
+ return LegalizedNodes.count(N);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
new file mode 100644
index 000000000000..31ebf7bbec13
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -0,0 +1,2139 @@
+//===-------- LegalizeFloatTypes.cpp - Legalization of float types --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements float type expansion and softening for LegalizeTypes.
+// Softening is the act of turning a computation in an illegal floating point
+// type into a computation in an integer type of the same size; also known as
+// "soft float". For example, turning f32 arithmetic into operations using i32.
+// The resulting integer value is the same as what you would get by performing
+// the floating point operation and bitcasting the result to the integer type.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type. For example,
+// implementing ppcf128 arithmetic in two f64 registers.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+/// GetFPLibCall - Return the right libcall for the given floating point type.
+static RTLIB::Libcall GetFPLibCall(EVT VT,
+ RTLIB::Libcall Call_F32,
+ RTLIB::Libcall Call_F64,
+ RTLIB::Libcall Call_F80,
+ RTLIB::Libcall Call_F128,
+ RTLIB::Libcall Call_PPCF128) {
+ return
+ VT == MVT::f32 ? Call_F32 :
+ VT == MVT::f64 ? Call_F64 :
+ VT == MVT::f80 ? Call_F80 :
+ VT == MVT::f128 ? Call_F128 :
+ VT == MVT::ppcf128 ? Call_PPCF128 :
+ RTLIB::UNKNOWN_LIBCALL;
+}
+
+//===----------------------------------------------------------------------===//
+// Convert Float Results to Integer for Non-HW-supported Operations.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Soften float result " << ResNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftenFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to soften the result of this operator!");
+
+ case ISD::Register:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ assert(isLegalInHWReg(N->getValueType(ResNo)) &&
+ "Unsupported SoftenFloatRes opcode!");
+ // Only when isLegalInHWReg returns true can we skip checking the operands.
+ R = SDValue(N, ResNo);
+ break;
+ case ISD::MERGE_VALUES:R = SoftenFloatRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::BITCAST: R = SoftenFloatRes_BITCAST(N, ResNo); break;
+ case ISD::BUILD_PAIR: R = SoftenFloatRes_BUILD_PAIR(N); break;
+ case ISD::ConstantFP: R = SoftenFloatRes_ConstantFP(N, ResNo); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftenFloatRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FABS: R = SoftenFloatRes_FABS(N, ResNo); break;
+ case ISD::FMINNUM: R = SoftenFloatRes_FMINNUM(N); break;
+ case ISD::FMAXNUM: R = SoftenFloatRes_FMAXNUM(N); break;
+ case ISD::FADD: R = SoftenFloatRes_FADD(N); break;
+ case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break;
+ case ISD::FCOPYSIGN: R = SoftenFloatRes_FCOPYSIGN(N, ResNo); break;
+ case ISD::FCOS: R = SoftenFloatRes_FCOS(N); break;
+ case ISD::FDIV: R = SoftenFloatRes_FDIV(N); break;
+ case ISD::FEXP: R = SoftenFloatRes_FEXP(N); break;
+ case ISD::FEXP2: R = SoftenFloatRes_FEXP2(N); break;
+ case ISD::FFLOOR: R = SoftenFloatRes_FFLOOR(N); break;
+ case ISD::FLOG: R = SoftenFloatRes_FLOG(N); break;
+ case ISD::FLOG2: R = SoftenFloatRes_FLOG2(N); break;
+ case ISD::FLOG10: R = SoftenFloatRes_FLOG10(N); break;
+ case ISD::FMA: R = SoftenFloatRes_FMA(N); break;
+ case ISD::FMUL: R = SoftenFloatRes_FMUL(N); break;
+ case ISD::FNEARBYINT: R = SoftenFloatRes_FNEARBYINT(N); break;
+ case ISD::FNEG: R = SoftenFloatRes_FNEG(N, ResNo); break;
+ case ISD::FP_EXTEND: R = SoftenFloatRes_FP_EXTEND(N); break;
+ case ISD::FP_ROUND: R = SoftenFloatRes_FP_ROUND(N); break;
+ case ISD::FP16_TO_FP: R = SoftenFloatRes_FP16_TO_FP(N); break;
+ case ISD::FPOW: R = SoftenFloatRes_FPOW(N); break;
+ case ISD::FPOWI: R = SoftenFloatRes_FPOWI(N); break;
+ case ISD::FREM: R = SoftenFloatRes_FREM(N); break;
+ case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break;
+ case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break;
+ case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break;
+ case ISD::FSQRT: R = SoftenFloatRes_FSQRT(N); break;
+ case ISD::FSUB: R = SoftenFloatRes_FSUB(N); break;
+ case ISD::FTRUNC: R = SoftenFloatRes_FTRUNC(N); break;
+ case ISD::LOAD: R = SoftenFloatRes_LOAD(N, ResNo); break;
+ case ISD::SELECT: R = SoftenFloatRes_SELECT(N, ResNo); break;
+ case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N, ResNo); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftenFloatRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftenFloatRes_UNDEF(N); break;
+ case ISD::VAARG: R = SoftenFloatRes_VAARG(N); break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode()) {
+ SetSoftenedFloat(SDValue(N, ResNo), R);
+ ReplaceSoftenFloatResult(N, ResNo, R);
+ }
+ // Return true only if the node is changed,
+ // assuming that the operands are also converted when necessary.
+ // Otherwise, return false to tell the caller to scan the operands.
+ return R.getNode() && R.getNode() != N;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo) {
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return BitConvertToInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_BUILD_PAIR(SDNode *N) {
+ // Convert the inputs to integers, and build a new pair out of them.
+ return DAG.getNode(ISD::BUILD_PAIR, SDLoc(N),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)),
+ BitConvertToInteger(N->getOperand(0)),
+ BitConvertToInteger(N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo) {
+ // When the type is legal in a hardware register, the FP constant is
+ // better loaded directly from the constant pool, so leave it unsoftened.
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+ // In ppcf128, the high 64 bits are always first in memory regardless
+ // of Endianness. LLVM's APFloat representation is not Endian sensitive,
+ // and so always converts into a 128-bit APInt in a non-Endian-sensitive
+ // way. However, APInt's are serialized in an Endian-sensitive fashion,
+ // so on big-Endian targets, the two doubles are output in the wrong
+ // order. Fix this by manually flipping the order of the high 64 bits
+ // and the low 64 bits here.
+ if (DAG.getDataLayout().isBigEndian() &&
+ CN->getValueType(0).getSimpleVT() == llvm::MVT::ppcf128) {
+ uint64_t words[2] = { CN->getValueAPF().bitcastToAPInt().getRawData()[1],
+ CN->getValueAPF().bitcastToAPInt().getRawData()[0] };
+ APInt Val(128, words);
+ return DAG.getConstant(Val, SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ } else {
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ CN->getValueType(0)));
+ }
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ NewOp.getValueType().getVectorElementType(),
+ NewOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FABS(SDNode *N, unsigned ResNo) {
+ // When the type is legal in a hardware register, FABS can be implemented
+ // with native bitwise operations.
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned Size = NVT.getSizeInBits();
+
+ // Mask = ~(1 << (Size-1))
+ APInt API = APInt::getAllOnesValue(Size);
+ API.clearBit(Size - 1);
+ SDValue Mask = DAG.getConstant(API, SDLoc(N), NVT);
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return DAG.getNode(ISD::AND, SDLoc(N), NVT, Op, Mask);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMINNUM(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMIN_F32,
+ RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80,
+ RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMAXNUM(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMAX_F32,
+ RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80,
+ RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FADD(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32,
+ RTLIB::ADD_F64,
+ RTLIB::ADD_F80,
+ RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCEIL(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32,
+ RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80,
+ RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo) {
+ // When the type is legal in a hardware register, FCOPYSIGN can be
+ // implemented with native bitwise operations.
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ SDValue LHS = GetSoftenedFloat(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of the second operand.
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift the sign bit into the sign position of the first operand's type
+ // if the two operands have different sizes.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FCOS(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32,
+ RTLIB::COS_F64,
+ RTLIB::COS_F80,
+ RTLIB::COS_F128,
+ RTLIB::COS_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FDIV(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32,
+ RTLIB::EXP_F64,
+ RTLIB::EXP_F80,
+ RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FEXP2(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32,
+ RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80,
+ RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FFLOOR(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FLOOR_F32,
+ RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80,
+ RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32,
+ RTLIB::LOG_F64,
+ RTLIB::LOG_F80,
+ RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG2(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32,
+ RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80,
+ RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FLOG10(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG10_F32,
+ RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80,
+ RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMA(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[3] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)),
+ GetSoftenedFloat(N->getOperand(2)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMA_F32,
+ RTLIB::FMA_F64,
+ RTLIB::FMA_F80,
+ RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FMUL(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEARBYINT(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo) {
+ // When LegalInHWReg, FNEG can be implemented as native bitwise operations.
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ // Expand Y = FNEG(X) -> Y = SUB -0.0, X
+ SDValue Ops[2] = { DAG.getConstantFP(-0.0, dl, N->getValueType(0)),
+ GetSoftenedFloat(N->getOperand(0)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128),
+ NVT, Ops, false, dl).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = N->getOperand(0);
+
+ // There's only a libcall for f16 -> f32, so proceed in two stages. Also, it's
+ // entirely possible for both f16 and f32 to be legal, so use the fully
+ // hard-float FP_EXTEND rather than FP16_TO_FP.
+ if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {
+ Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
+ if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat)
+ SoftenFloatResult(Op.getNode(), 0);
+ }
+
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) {
+ Op = GetPromotedFloat(Op);
+ // If the promotion did the FP_EXTEND to the destination type for us,
+ // there's nothing left to do here.
+ if (Op.getValueType() == N->getValueType(0)) {
+ return BitConvertToInteger(Op);
+ }
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat)
+ Op = GetSoftenedFloat(Op);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+ return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
+}
+
+// FIXME: Should we just use 'normal' FP_EXTEND / FP_TRUNC instead of special
+// nodes?
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP16_TO_FP(SDNode *N) {
+ EVT MidVT = TLI.getTypeToTransformTo(*DAG.getContext(), MVT::f32);
+ SDValue Op = N->getOperand(0);
+ SDValue Res32 = TLI.makeLibCall(DAG, RTLIB::FPEXT_F16_F32, MidVT, Op,
+ false, SDLoc(N)).first;
+ if (N->getValueType(0) == MVT::f32)
+ return Res32;
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ RTLIB::Libcall LC = RTLIB::getFPEXT(MVT::f32, N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
+ return TLI.makeLibCall(DAG, LC, NVT, Res32, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FP_ROUND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = N->getOperand(0);
+ if (N->getValueType(0) == MVT::f16) {
+ // Semi-soften first, to FP_TO_FP16, so that targets which support f16 as a
+ // storage-only type get a chance to select things.
+ return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, Op);
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(Op.getValueType(), N->getValueType(0));
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND!");
+ return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOW(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32,
+ RTLIB::POW_F64,
+ RTLIB::POW_F80,
+ RTLIB::POW_F128,
+ RTLIB::POW_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FPOWI(SDNode *N) {
+ assert(N->getOperand(1).getValueType() == MVT::i32 &&
+ "Unsupported power type!");
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)), N->getOperand(1) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::POWI_F32,
+ RTLIB::POWI_F64,
+ RTLIB::POWI_F80,
+ RTLIB::POWI_F128,
+ RTLIB::POWI_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREM(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::REM_F32,
+ RTLIB::REM_F64,
+ RTLIB::REM_F80,
+ RTLIB::REM_F128,
+ RTLIB::REM_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FRINT(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32,
+ RTLIB::RINT_F64,
+ RTLIB::RINT_F80,
+ RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32,
+ RTLIB::SIN_F64,
+ RTLIB::SIN_F80,
+ RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSQRT(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32,
+ RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80,
+ RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FSUB(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Ops[2] = { GetSoftenedFloat(N->getOperand(0)),
+ GetSoftenedFloat(N->getOperand(1)) };
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128),
+ NVT, Ops, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_FTRUNC(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ if (N->getValueType(0) == MVT::f16)
+ return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), NVT, N->getOperand(0));
+
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32,
+ RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80,
+ RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128),
+ NVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo) {
+ bool LegalInHWReg = isLegalInHWReg(N->getValueType(ResNo));
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ auto MMOFlags =
+ L->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
+ SDValue NewL;
+ if (L->getExtensionType() == ISD::NON_EXTLOAD) {
+ NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl,
+ L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), NVT, L->getAlignment(), MMOFlags,
+ L->getAAInfo());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ if (N != NewL.getValue(1).getNode())
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+ }
+
+ // Do a non-extending load followed by FP_EXTEND.
+ NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(),
+ dl, L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), L->getMemoryVT(), L->getAlignment(),
+ MMOFlags, L->getAAInfo());
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ auto ExtendNode = DAG.getNode(ISD::FP_EXTEND, dl, VT, NewL);
+ if (LegalInHWReg)
+ return ExtendNode;
+ return BitConvertToInteger(ExtendNode);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo) {
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ SDValue LHS = GetSoftenedFloat(N->getOperand(1));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), N->getOperand(0), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo) {
+ if (isLegalInHWReg(N->getValueType(ResNo)))
+ return SDValue(N, ResNo);
+ SDValue LHS = GetSoftenedFloat(N->getOperand(2));
+ SDValue RHS = GetSoftenedFloat(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ SDValue NewVAARG;
+ NewVAARG = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2),
+ N->getConstantOperandVal(3));
+
+ // Legalized the chain result - switch anything that used the old chain to
+ // use the new one.
+ if (N != NewVAARG.getValue(1).getNode())
+ ReplaceValueWith(SDValue(N, 1), NewVAARG.getValue(1));
+ return NewVAARG;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatRes_XINT_TO_FP(SDNode *N) {
+ bool Signed = N->getOpcode() == ISD::SINT_TO_FP;
+ EVT SVT = N->getOperand(0).getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(N);
+
+ // If the input is not legal, e.g. i1 -> fp, then it needs to be promoted to
+ // a larger type, e.g. i8 -> fp. Even if it is legal, no libcall may exactly
+ // match. Look for an appropriate libcall.
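+ // The loop below walks the integer types from narrowest to widest and stops
+ // at the first one that is wide enough for the operand and has a libcall.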
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned t = MVT::FIRST_INTEGER_VALUETYPE;
+ t <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL; ++t) {
+ NVT = (MVT::SimpleValueType)t;
+ // The source type needs to be big enough to hold the operand.
+ if (NVT.bitsGE(SVT))
+ LC = Signed ? RTLIB::getSINTTOFP(NVT, RVT) : RTLIB::getUINTTOFP(NVT, RVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ // Sign/zero extend the argument if the libcall takes a larger type.
+ SDValue Op = DAG.getNode(Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ NVT, N->getOperand(0));
+ return TLI.makeLibCall(DAG, LC,
+ TLI.getTypeToTransformTo(*DAG.getContext(), RVT),
+ Op, Signed, dl).first;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Convert Float Operand to Integer for Non-HW-supported Operations.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftenFloatOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Soften float operand " << OpNo << ": "; N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+ if (CanSkipSoftenFloatOperand(N, OpNo))
+ return false;
+#ifndef NDEBUG
+ dbgs() << "SoftenFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to soften this operator's operand!");
+
+ case ISD::BITCAST: Res = SoftenFloatOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = SoftenFloatOp_BR_CC(N); break;
+ case ISD::FP_EXTEND: Res = SoftenFloatOp_FP_EXTEND(N); break;
+ case ISD::FP_TO_FP16: // Same as FP_ROUND for softening purposes
+ case ISD::FP_ROUND: Res = SoftenFloatOp_FP_ROUND(N); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = SoftenFloatOp_FP_TO_XINT(N); break;
+ case ISD::SELECT_CC: Res = SoftenFloatOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = SoftenFloatOp_SETCC(N); break;
+ case ISD::STORE:
+ Res = SoftenFloatOp_STORE(N, OpNo);
+ // Do not try to analyze or soften this node again if the value is
+ // or can be held in a register. In that case, Res.getNode() should
+ // be equal to N.
+ if (Res.getNode() == N &&
+ isLegalInHWReg(N->getOperand(OpNo).getValueType()))
+ return false;
+ // Otherwise, we need to reanalyze and lower the new Res nodes.
+ break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this to re-analyze.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+bool DAGTypeLegalizer::CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo) {
+ if (!isLegalInHWReg(N->getOperand(OpNo).getValueType()))
+ return false;
+ // When the operand type can be kept in registers, SoftenFloatResult
+ // will call ReplaceValueWith to replace all references and we can
+ // skip softening this operand.
+ switch (N->getOperand(OpNo).getOpcode()) {
+ case ISD::BITCAST:
+ case ISD::ConstantFP:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ case ISD::FABS:
+ case ISD::FCOPYSIGN:
+ case ISD::FNEG:
+ case ISD::Register:
+ case ISD::SELECT:
+ case ISD::SELECT_CC:
+ return true;
+ }
+ // For some opcodes, SoftenFloatResult handles all of the softening and
+ // operand replacement itself, so there is no need to soften the operands
+ // again, although such a node may still be scanned for other illegal operands.
+ switch (N->getOpcode()) {
+ case ISD::ConstantFP:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ case ISD::FABS:
+ case ISD::FCOPYSIGN:
+ case ISD::FNEG:
+ case ISD::Register:
+ case ISD::SELECT:
+ return true;
+ }
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BITCAST(SDNode *N) {
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
+ GetSoftenedFloat(N->getOperand(0)));
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_EXTEND(SDNode *N) {
+ // If we get here, the result must be legal but the source illegal.
+ EVT SVT = N->getOperand(0).getValueType();
+ EVT RVT = N->getValueType(0);
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+
+ if (SVT == MVT::f16)
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), RVT, Op);
+
+ RTLIB::Libcall LC = RTLIB::getFPEXT(SVT, RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND libcall");
+
+ return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first;
+}
+
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_ROUND(SDNode *N) {
+ // We actually deal with the partially-softened FP_TO_FP16 node too, which
+ // returns an i16 so doesn't meet the constraints necessary for FP_ROUND.
+ assert(N->getOpcode() == ISD::FP_ROUND || N->getOpcode() == ISD::FP_TO_FP16);
+
+ EVT SVT = N->getOperand(0).getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT FloatRVT = N->getOpcode() == ISD::FP_TO_FP16 ? MVT::f16 : RVT;
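+ // For FP_TO_FP16 the result type is i16, but the libcall is chosen by the
+ // equivalent floating point type, f16.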
+
+ RTLIB::Libcall LC = RTLIB::getFPROUND(SVT, FloatRVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_ROUND libcall");
+
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ return TLI.makeLibCall(DAG, LC, RVT, Op, false, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+
+ EVT VT = NewLHS.getValueType();
+ NewLHS = GetSoftenedFloat(NewLHS);
+ NewRHS = GetSoftenedFloat(NewRHS);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_FP_TO_XINT(SDNode *N) {
+ bool Signed = N->getOpcode() == ISD::FP_TO_SINT;
+ EVT SVT = N->getOperand(0).getValueType();
+ EVT RVT = N->getValueType(0);
+ EVT NVT = EVT();
+ SDLoc dl(N);
+
+ // If the result is not legal, e.g. fp -> i1, then it needs to be promoted to
+ // a larger type, e.g. fp -> i32. Even if it is legal, no libcall may exactly
+ // match, e.g. we don't have fp -> i8 conversions.
+ // Look for an appropriate libcall.
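+ // The loop below walks the integer types from narrowest to widest and stops
+ // at the first one that can hold the result and has a libcall.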
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ for (unsigned IntVT = MVT::FIRST_INTEGER_VALUETYPE;
+ IntVT <= MVT::LAST_INTEGER_VALUETYPE && LC == RTLIB::UNKNOWN_LIBCALL;
+ ++IntVT) {
+ NVT = (MVT::SimpleValueType)IntVT;
+ // The type needs to be big enough to hold the result.
+ if (NVT.bitsGE(RVT))
+ LC = Signed ? RTLIB::getFPTOSINT(SVT, NVT) : RTLIB::getFPTOUINT(SVT, NVT);
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_XINT!");
+
+ SDValue Op = GetSoftenedFloat(N->getOperand(0));
+ SDValue Res = TLI.makeLibCall(DAG, LC, NVT, Op, false, dl).first;
+
+ // Truncate the result if the libcall returns a larger type.
+ return DAG.getNode(ISD::TRUNCATE, dl, RVT, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+
+ EVT VT = NewLHS.getValueType();
+ NewLHS = GetSoftenedFloat(NewLHS);
+ NewRHS = GetSoftenedFloat(NewRHS);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ EVT VT = NewLHS.getValueType();
+ NewLHS = GetSoftenedFloat(NewLHS);
+ NewRHS = GetSoftenedFloat(NewRHS);
+ TLI.softenSetCCOperands(DAG, VT, NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If softenSetCCOperands returned a scalar, use it.
+ if (!NewRHS.getNode()) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ DAG.getCondCode(CCCode)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::SoftenFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc dl(N);
+
+ if (ST->isTruncatingStore())
+ // Do an FP_ROUND followed by a non-truncating store.
+ Val = BitConvertToInteger(DAG.getNode(ISD::FP_ROUND, dl, ST->getMemoryVT(),
+ Val, DAG.getIntPtrConstant(0, dl)));
+ else
+ Val = GetSoftenedFloat(Val);
+
+ return DAG.getStore(ST->getChain(), dl, Val, ST->getBasePtr(),
+ ST->getMemOperand());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Float Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatResult - This method is called when the specified result of the
+/// specified node is found to need expansion. At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Expand float result: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandFloatResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to expand the result of this operator!");
+
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+
+ case ISD::MERGE_VALUES: ExpandRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ConstantFP: ExpandFloatRes_ConstantFP(N, Lo, Hi); break;
+ case ISD::FABS: ExpandFloatRes_FABS(N, Lo, Hi); break;
+ case ISD::FMINNUM: ExpandFloatRes_FMINNUM(N, Lo, Hi); break;
+ case ISD::FMAXNUM: ExpandFloatRes_FMAXNUM(N, Lo, Hi); break;
+ case ISD::FADD: ExpandFloatRes_FADD(N, Lo, Hi); break;
+ case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break;
+ case ISD::FCOPYSIGN: ExpandFloatRes_FCOPYSIGN(N, Lo, Hi); break;
+ case ISD::FCOS: ExpandFloatRes_FCOS(N, Lo, Hi); break;
+ case ISD::FDIV: ExpandFloatRes_FDIV(N, Lo, Hi); break;
+ case ISD::FEXP: ExpandFloatRes_FEXP(N, Lo, Hi); break;
+ case ISD::FEXP2: ExpandFloatRes_FEXP2(N, Lo, Hi); break;
+ case ISD::FFLOOR: ExpandFloatRes_FFLOOR(N, Lo, Hi); break;
+ case ISD::FLOG: ExpandFloatRes_FLOG(N, Lo, Hi); break;
+ case ISD::FLOG2: ExpandFloatRes_FLOG2(N, Lo, Hi); break;
+ case ISD::FLOG10: ExpandFloatRes_FLOG10(N, Lo, Hi); break;
+ case ISD::FMA: ExpandFloatRes_FMA(N, Lo, Hi); break;
+ case ISD::FMUL: ExpandFloatRes_FMUL(N, Lo, Hi); break;
+ case ISD::FNEARBYINT: ExpandFloatRes_FNEARBYINT(N, Lo, Hi); break;
+ case ISD::FNEG: ExpandFloatRes_FNEG(N, Lo, Hi); break;
+ case ISD::FP_EXTEND: ExpandFloatRes_FP_EXTEND(N, Lo, Hi); break;
+ case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break;
+ case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+ case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break;
+ case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break;
+ case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break;
+ case ISD::FSQRT: ExpandFloatRes_FSQRT(N, Lo, Hi); break;
+ case ISD::FSUB: ExpandFloatRes_FSUB(N, Lo, Hi); break;
+ case ISD::FTRUNC: ExpandFloatRes_FTRUNC(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandFloatRes_LOAD(N, Lo, Hi); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: ExpandFloatRes_XINT_TO_FP(N, Lo, Hi); break;
+ case ISD::FREM: ExpandFloatRes_FREM(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedFloat(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ assert(NVT.getSizeInBits() == integerPartWidth &&
+ "Do not know how to expand this float constant!");
+ APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
+ SDLoc dl(N);
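+ // The two 64-bit halves of the bit pattern are the two doubles that make up
+ // the ppcf128 value.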
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(integerPartWidth, C.getRawData()[1])),
+ dl, NVT);
+ Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(integerPartWidth, C.getRawData()[0])),
+ dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FABS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDLoc dl(N);
+ SDValue Tmp;
+ GetExpandedFloat(N->getOperand(0), Lo, Tmp);
+ Hi = DAG.getNode(ISD::FABS, dl, Tmp.getValueType(), Tmp);
+ // Lo = Hi==fabs(Hi) ? Lo : -Lo;
+ Lo = DAG.getSelectCC(dl, Tmp, Hi, Lo,
+ DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo),
+ ISD::SETEQ);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMINNUM(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::FMIN_F32, RTLIB::FMIN_F64,
+ RTLIB::FMIN_F80, RTLIB::FMIN_F128,
+ RTLIB::FMIN_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMAXNUM(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::FMAX_F32, RTLIB::FMAX_F64,
+ RTLIB::FMAX_F80, RTLIB::FMAX_F128,
+ RTLIB::FMAX_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FADD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::ADD_F32, RTLIB::ADD_F64,
+ RTLIB::ADD_F80, RTLIB::ADD_F128,
+ RTLIB::ADD_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCEIL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::CEIL_F32, RTLIB::CEIL_F64,
+ RTLIB::CEIL_F80, RTLIB::CEIL_F128,
+ RTLIB::CEIL_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOPYSIGN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::COPYSIGN_F32,
+ RTLIB::COPYSIGN_F64,
+ RTLIB::COPYSIGN_F80,
+ RTLIB::COPYSIGN_F128,
+ RTLIB::COPYSIGN_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FCOS(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::COS_F32, RTLIB::COS_F64,
+ RTLIB::COS_F80, RTLIB::COS_F128,
+ RTLIB::COS_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FDIV(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::DIV_F32,
+ RTLIB::DIV_F64,
+ RTLIB::DIV_F80,
+ RTLIB::DIV_F128,
+ RTLIB::DIV_PPCF128),
+ N->getValueType(0), Ops, false,
+ SDLoc(N)).first;
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP_F32, RTLIB::EXP_F64,
+ RTLIB::EXP_F80, RTLIB::EXP_F128,
+ RTLIB::EXP_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FEXP2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::EXP2_F32, RTLIB::EXP2_F64,
+ RTLIB::EXP2_F80, RTLIB::EXP2_F128,
+ RTLIB::EXP2_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FFLOOR(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::FLOOR_F32, RTLIB::FLOOR_F64,
+ RTLIB::FLOOR_F80, RTLIB::FLOOR_F128,
+ RTLIB::FLOOR_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG_F32, RTLIB::LOG_F64,
+ RTLIB::LOG_F80, RTLIB::LOG_F128,
+ RTLIB::LOG_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG2(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG2_F32, RTLIB::LOG2_F64,
+ RTLIB::LOG2_F80, RTLIB::LOG2_F128,
+ RTLIB::LOG2_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FLOG10(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::LOG10_F32, RTLIB::LOG10_F64,
+ RTLIB::LOG10_F80, RTLIB::LOG10_F128,
+ RTLIB::LOG10_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMA(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[3] = { N->getOperand(0), N->getOperand(1), N->getOperand(2) };
+ SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::FMA_F32,
+ RTLIB::FMA_F64,
+ RTLIB::FMA_F80,
+ RTLIB::FMA_F128,
+ RTLIB::FMA_PPCF128),
+ N->getValueType(0), Ops, false,
+ SDLoc(N)).first;
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FMUL(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::MUL_F32,
+ RTLIB::MUL_F64,
+ RTLIB::MUL_F80,
+ RTLIB::MUL_F128,
+ RTLIB::MUL_PPCF128),
+ N->getValueType(0), Ops, false,
+ SDLoc(N)).first;
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEARBYINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::NEARBYINT_F32,
+ RTLIB::NEARBYINT_F64,
+ RTLIB::NEARBYINT_F80,
+ RTLIB::NEARBYINT_F128,
+ RTLIB::NEARBYINT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FNEG(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FNEG, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::FNEG, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FP_EXTEND(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ Hi = DAG.getNode(ISD::FP_EXTEND, dl, NVT, N->getOperand(0));
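+ // The extended value fits exactly in the high double of the expanded pair,
+ // so the low double is zero.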
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOW(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::POW_F32, RTLIB::POW_F64,
+ RTLIB::POW_F80, RTLIB::POW_F128,
+ RTLIB::POW_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::POWI_F32, RTLIB::POWI_F64,
+ RTLIB::POWI_F80, RTLIB::POWI_F128,
+ RTLIB::POWI_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::REM_F32, RTLIB::REM_F64,
+ RTLIB::REM_F80, RTLIB::REM_F128,
+ RTLIB::REM_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FRINT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::RINT_F32, RTLIB::RINT_F64,
+ RTLIB::RINT_F80, RTLIB::RINT_F128,
+ RTLIB::RINT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUND_F32,
+ RTLIB::ROUND_F64,
+ RTLIB::ROUND_F80,
+ RTLIB::ROUND_F128,
+ RTLIB::ROUND_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::SIN_F32, RTLIB::SIN_F64,
+ RTLIB::SIN_F80, RTLIB::SIN_F128,
+ RTLIB::SIN_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSQRT(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::SQRT_F32, RTLIB::SQRT_F64,
+ RTLIB::SQRT_F80, RTLIB::SQRT_F128,
+ RTLIB::SQRT_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FSUB(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Call = TLI.makeLibCall(DAG, GetFPLibCall(N->getValueType(0),
+ RTLIB::SUB_F32,
+ RTLIB::SUB_F64,
+ RTLIB::SUB_F80,
+ RTLIB::SUB_F128,
+ RTLIB::SUB_PPCF128),
+ N->getValueType(0), Ops, false,
+ SDLoc(N)).first;
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_FTRUNC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Call = LibCallify(GetFPLibCall(N->getValueType(0),
+ RTLIB::TRUNC_F32, RTLIB::TRUNC_F64,
+ RTLIB::TRUNC_F80, RTLIB::TRUNC_F128,
+ RTLIB::TRUNC_PPCF128),
+ N, false);
+ GetPairElements(Call, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_LOAD(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), LD->getValueType(0));
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(LD->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+
+ Hi = DAG.getExtLoad(LD->getExtensionType(), dl, NVT, Chain, Ptr,
+ LD->getMemoryVT(), LD->getMemOperand());
+
+ // Remember the chain.
+ Chain = Hi.getValue(1);
+
+ // The low part is zero.
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+
+ // Modified the chain - switch anything that used the old chain to use the
+ // new one.
+ ReplaceValueWith(SDValue(LD, 1), Chain);
+}
+
+void DAGTypeLegalizer::ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 && "Unsupported XINT_TO_FP!");
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ bool isSigned = N->getOpcode() == ISD::SINT_TO_FP;
+ SDLoc dl(N);
+
+ // First do an SINT_TO_FP, whether the original was signed or unsigned.
+ // When promoting partial word types to i32 we must honor the signedness,
+ // though.
+ if (SrcVT.bitsLE(MVT::i32)) {
+ // The integer can be represented exactly in an f64.
+ Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ MVT::i32, Src);
+ Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
+ APInt(NVT.getSizeInBits(), 0)), dl, NVT);
+ Hi = DAG.getNode(ISD::SINT_TO_FP, dl, NVT, Src);
+ } else {
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (SrcVT.bitsLE(MVT::i64)) {
+ Src = DAG.getNode(isSigned ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND, dl,
+ MVT::i64, Src);
+ LC = RTLIB::SINTTOFP_I64_PPCF128;
+ } else if (SrcVT.bitsLE(MVT::i128)) {
+ Src = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i128, Src);
+ LC = RTLIB::SINTTOFP_I128_PPCF128;
+ }
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XINT_TO_FP!");
+
+ Hi = TLI.makeLibCall(DAG, LC, VT, Src, true, dl).first;
+ GetPairElements(Hi, Lo, Hi);
+ }
+
+ if (isSigned)
+ return;
+
+ // Unsigned - fix up the SINT_TO_FP value just calculated.
+ Hi = DAG.getNode(ISD::BUILD_PAIR, dl, VT, Lo, Hi);
+ SrcVT = Src.getValueType();
+
+ // x>=0 ? (ppcf128)(iN)x : (ppcf128)(iN)x + 2^N; N=32,64,128.
+ static const uint64_t TwoE32[] = { 0x41f0000000000000LL, 0 };
+ static const uint64_t TwoE64[] = { 0x43f0000000000000LL, 0 };
+ static const uint64_t TwoE128[] = { 0x47f0000000000000LL, 0 };
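+ // Each constant is 2^N as a ppcf128 value, i.e. the double-double pair
+ // (2^N, 0.0); 0x41f0000000000000 is 2^32 encoded as an IEEE-754 double.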
+ ArrayRef<uint64_t> Parts;
+
+ switch (SrcVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("Unsupported UINT_TO_FP!");
+ case MVT::i32:
+ Parts = TwoE32;
+ break;
+ case MVT::i64:
+ Parts = TwoE64;
+ break;
+ case MVT::i128:
+ Parts = TwoE128;
+ break;
+ }
+
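+ // If the source is negative when interpreted as a signed value, the
+ // SINT_TO_FP result above is 2^N too small, so add 2^N back and select on
+ // the sign of the source.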
+ // TODO: Are there fast-math-flags to propagate to this FADD?
+ Lo = DAG.getNode(ISD::FADD, dl, VT, Hi,
+ DAG.getConstantFP(APFloat(APFloat::PPCDoubleDouble,
+ APInt(128, Parts)),
+ dl, MVT::ppcf128));
+ Lo = DAG.getSelectCC(dl, Src, DAG.getConstant(0, dl, SrcVT),
+ Lo, Hi, ISD::SETLT);
+ GetPairElements(Lo, Lo, Hi);
+}
+
+
+//===----------------------------------------------------------------------===//
+// Float Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandFloatOperand - This method is called when the specified operand of the
+/// specified node is found to need expansion. At this point, all of the result
+/// types of the node are known to be legal, but other operands of the node may
+/// need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandFloatOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Expand float operand: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandFloatOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to expand this operator's operand!");
+
+ case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+
+ case ISD::BR_CC: Res = ExpandFloatOp_BR_CC(N); break;
+ case ISD::FCOPYSIGN: Res = ExpandFloatOp_FCOPYSIGN(N); break;
+ case ISD::FP_ROUND: Res = ExpandFloatOp_FP_ROUND(N); break;
+ case ISD::FP_TO_SINT: Res = ExpandFloatOp_FP_TO_SINT(N); break;
+ case ISD::FP_TO_UINT: Res = ExpandFloatOp_FP_TO_UINT(N); break;
+ case ISD::SELECT_CC: Res = ExpandFloatOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandFloatOp_SETCC(N); break;
+ case ISD::STORE: Res = ExpandFloatOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// FloatExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::FloatExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedFloat(NewLHS, LHSLo, LHSHi);
+ GetExpandedFloat(NewRHS, RHSLo, RHSHi);
+
+ assert(NewLHS.getValueType() == MVT::ppcf128 && "Unsupported setcc type!");
+
+ // FIXME: This generated code sucks. We want to generate
+ // FCMPU crN, hi1, hi2
+ // BNE crN, L:
+ // FCMPU crN, lo1, lo2
+ // The following can be improved, but not that much.
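+ // The expansion below computes
+ //   (Hi1 == Hi2 && Lo1 CC Lo2) || (Hi1 != Hi2 && Hi1 CC Hi2).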
+ SDValue Tmp1, Tmp2, Tmp3;
+ Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETOEQ);
+ Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, CCCode);
+ Tmp3 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETUNE);
+ Tmp2 = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, CCCode);
+ Tmp1 = DAG.getNode(ISD::AND, dl, Tmp1.getValueType(), Tmp1, Tmp2);
+ NewLHS = DAG.getNode(ISD::OR, dl, Tmp1.getValueType(), Tmp1, Tmp3);
+ NewRHS = SDValue(); // LHS is the result, not a compare.
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If FloatExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FCOPYSIGN(SDNode *N) {
+ assert(N->getOperand(1).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(1), Lo, Hi);
+ // The ppcf128 value is providing only the sign; take it from the
+ // higher-order double (which must have the larger magnitude).
+ return DAG.getNode(ISD::FCOPYSIGN, SDLoc(N),
+ N->getValueType(0), N->getOperand(0), Hi);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_ROUND(SDNode *N) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Lo, Hi;
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ // Round it the rest of the way (e.g. to f32) if needed.
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
+ N->getValueType(0), Hi, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_SINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
+ if (RVT == MVT::i32) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ SDValue Res = DAG.getNode(ISD::FP_ROUND_INREG, dl, MVT::ppcf128,
+ N->getOperand(0), DAG.getValueType(MVT::f64));
+ Res = DAG.getNode(ISD::FP_ROUND, dl, MVT::f64, Res,
+ DAG.getIntPtrConstant(1, dl));
+ return DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Res);
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_SINT!");
+ return TLI.makeLibCall(DAG, LC, RVT, N->getOperand(0), false, dl).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_FP_TO_UINT(SDNode *N) {
+ EVT RVT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
+ // PPC (the libcall is not available). FIXME: Do this in a less hacky way.
+ if (RVT == MVT::i32) {
+ assert(N->getOperand(0).getValueType() == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+ const uint64_t TwoE31[] = {0x41e0000000000000LL, 0};
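+ // 0x41e0000000000000 is 2^31 encoded as an IEEE-754 double; the second
+ // double of the ppcf128 pair is zero.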
+ APFloat APF = APFloat(APFloat::PPCDoubleDouble, APInt(128, TwoE31));
+ SDValue Tmp = DAG.getConstantFP(APF, dl, MVT::ppcf128);
+ // X>=2^31 ? (int)(X-2^31)+0x80000000 : (int)X
+ // FIXME: generated code sucks.
+ // TODO: Are there fast-math-flags to propagate to this FSUB?
+ return DAG.getSelectCC(dl, N->getOperand(0), Tmp,
+ DAG.getNode(ISD::ADD, dl, MVT::i32,
+ DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32,
+ DAG.getNode(ISD::FSUB, dl,
+ MVT::ppcf128,
+ N->getOperand(0),
+ Tmp)),
+ DAG.getConstant(0x80000000, dl,
+ MVT::i32)),
+ DAG.getNode(ISD::FP_TO_SINT, dl,
+ MVT::i32, N->getOperand(0)),
+ ISD::SETGE);
+ }
+
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(N->getOperand(0).getValueType(), RVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_TO_UINT!");
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), N->getOperand(0),
+ false, dl).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If FloatExpandSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ FloatExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If FloatExpandSetCCOperands returned a scalar, use it.
+ if (!NewRHS.getNode()) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(),
+ ST->getValue().getValueType());
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ assert(ST->getMemoryVT().bitsLE(NVT) && "Float type not round?");
+ (void)NVT;
+
+ SDValue Lo, Hi;
+ GetExpandedOp(ST->getValue(), Lo, Hi);
+
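+ // The high double of the ppcf128 pair carries the value rounded to double
+ // precision, so it is enough to truncating-store Hi.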
+ return DAG.getTruncStore(Chain, SDLoc(N), Hi, Ptr,
+ ST->getMemoryVT(), ST->getMemOperand());
+}
+
+//===----------------------------------------------------------------------===//
+// Float Operand Promotion
+//===----------------------------------------------------------------------===//
+
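+// Returns the conversion opcode used between the given types: FP16_TO_FP when
+// extending an f16 operand, FP_TO_FP16 when the result type is f16.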
+static ISD::NodeType GetPromotionOpcode(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ return ISD::FP16_TO_FP;
+ } else if (RetVT == MVT::f16) {
+ return ISD::FP_TO_FP16;
+ }
+
+ report_fatal_error("Attempt at an invalid promotion-related conversion");
+}
+
+bool DAGTypeLegalizer::PromoteFloatOperand(SDNode *N, unsigned OpNo) {
+ SDValue R = SDValue();
+
+ // Nodes that use a promotion-requiring floating point operand, but don't
+ // produce a promotion-requiring floating point result, need to be legalized
+ // to use the promoted float operand. Nodes that produce at least one
+ // promotion-requiring floating point result have their operands legalized as
+ // a part of PromoteFloatResult.
+ switch (N->getOpcode()) {
+ default:
+ llvm_unreachable("Do not know how to promote this operator's operand!");
+
+ case ISD::BITCAST: R = PromoteFloatOp_BITCAST(N, OpNo); break;
+ case ISD::FCOPYSIGN: R = PromoteFloatOp_FCOPYSIGN(N, OpNo); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: R = PromoteFloatOp_FP_TO_XINT(N, OpNo); break;
+ case ISD::FP_EXTEND: R = PromoteFloatOp_FP_EXTEND(N, OpNo); break;
+ case ISD::SELECT_CC: R = PromoteFloatOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: R = PromoteFloatOp_SETCC(N, OpNo); break;
+ case ISD::STORE: R = PromoteFloatOp_STORE(N, OpNo); break;
+ }
+
+ if (R.getNode())
+ ReplaceValueWith(SDValue(N, 0), R);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo) {
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op->getValueType(0);
+
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), OpVT.getSizeInBits());
+ assert(IVT == N->getValueType(0) && "Bitcast to type of different size");
+
+ SDValue Promoted = GetPromotedFloat(N->getOperand(0));
+ EVT PromotedVT = Promoted->getValueType(0);
+
+ // Convert the promoted float value to the desired IVT.
+ return DAG.getNode(GetPromotionOpcode(PromotedVT, OpVT), SDLoc(N), IVT,
+ Promoted);
+}
+
+// Promote Operand 1 of FCOPYSIGN. Operand 0 ought to be handled by
+// PromoteFloatRes_FCOPYSIGN.
+SDValue DAGTypeLegalizer::PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Only operand 1 needs promotion here");
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0),
+ N->getOperand(0), Op1);
+}
+
+// Convert the promoted float value to the desired integer type
+SDValue DAGTypeLegalizer::PromoteFloatOp_FP_TO_XINT(SDNode *N, unsigned OpNo) {
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo) {
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+ EVT VT = N->getValueType(0);
+
+ // Desired VT is the same as the promoted type. Use the promoted float directly.
+ if (VT == Op->getValueType(0))
+ return Op;
+
+ // Else, extend the promoted float value to the desired VT.
+ return DAG.getNode(ISD::FP_EXTEND, SDLoc(N), VT, Op);
+}
+
+// Promote the float operands used for comparison. The true- and false-
+// operands have the same type as the result and are promoted, if needed, by
+// PromoteFloatRes_SELECT_CC
+SDValue DAGTypeLegalizer::PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+ SDValue LHS = GetPromotedFloat(N->getOperand(0));
+ SDValue RHS = GetPromotedFloat(N->getOperand(1));
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+ LHS, RHS, N->getOperand(2), N->getOperand(3),
+ N->getOperand(4));
+}
+
+// Construct a SETCC that compares the promoted values and sets the conditional
+// code.
+SDValue DAGTypeLegalizer::PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+ return DAG.getSetCC(SDLoc(N), NVT, Op0, Op1, CCCode);
+}
+
+// Lower the promoted float down to the integer value of the same size and
+// construct a STORE of the integer value.
+SDValue DAGTypeLegalizer::PromoteFloatOp_STORE(SDNode *N, unsigned OpNo) {
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc DL(N);
+
+ SDValue Promoted = GetPromotedFloat(Val);
+ EVT VT = ST->getOperand(1)->getValueType(0);
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ SDValue NewVal;
+ NewVal = DAG.getNode(GetPromotionOpcode(Promoted.getValueType(), VT), DL,
+ IVT, Promoted);
+
+ return DAG.getStore(ST->getChain(), DL, NewVal, ST->getBasePtr(),
+ ST->getMemOperand());
+}
+
+//===----------------------------------------------------------------------===//
+// Float Result Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ // These opcodes cannot appear if promotion of FP16 is done in the backend
+ // instead of in Clang.
+ case ISD::FP16_TO_FP:
+ case ISD::FP_TO_FP16:
+ default:
+ llvm_unreachable("Do not know how to promote this operator's result!");
+
+ case ISD::BITCAST: R = PromoteFloatRes_BITCAST(N); break;
+ case ISD::ConstantFP: R = PromoteFloatRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = PromoteFloatRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FCOPYSIGN: R = PromoteFloatRes_FCOPYSIGN(N); break;
+
+ // Unary FP Operations
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC: R = PromoteFloatRes_UnaryOp(N); break;
+
+ // Binary FP Operations
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMAXNAN:
+ case ISD::FMINNAN:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break;
+
+ case ISD::FMA: // FMA is the same as FMAD
+ case ISD::FMAD: R = PromoteFloatRes_FMAD(N); break;
+
+ case ISD::FPOWI: R = PromoteFloatRes_FPOWI(N); break;
+
+ case ISD::FP_ROUND: R = PromoteFloatRes_FP_ROUND(N); break;
+ case ISD::LOAD: R = PromoteFloatRes_LOAD(N); break;
+ case ISD::SELECT: R = PromoteFloatRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = PromoteFloatRes_SELECT_CC(N); break;
+
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = PromoteFloatRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = PromoteFloatRes_UNDEF(N); break;
+ }
+
+ if (R.getNode())
+ SetPromotedFloat(SDValue(N, ResNo), R);
+}
+
+ // Bitcast from i16 to f16: convert the i16 to an f32 value instead.
+ // At this point, it is not possible to determine whether the bitcast value is
+ // eventually stored to memory or promoted to f32 or to a higher-precision
+ // floating-point type. Some of these cases are handled by the FP_EXTEND and
+ // STORE promotion handlers.
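+ // E.g. (f16 (bitcast i16:x)) becomes (f32 (FP16_TO_FP i16:x)) on a target
+ // that promotes f16 to f32; the value is then carried as f32 until a store
+ // or an explicit rounding narrows it again.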
+SDValue DAGTypeLegalizer::PromoteFloatRes_BITCAST(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT,
+ N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CFPNode = cast<ConstantFPSDNode>(N);
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ // Get the (bit-cast) APInt of the APFloat and build an integer constant
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ SDValue C = DAG.getConstant(CFPNode->getValueAPF().bitcastToAPInt(), DL,
+ IVT);
+
+ // Convert the Constant to the desired FP type
+ // FIXME We might be able to do the conversion during compilation and get rid
+ // of it from the object code
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, C);
+}
+
+ // If the Index operand is a constant, try to redirect the extract operation to
+ // the correct legalized vector. If not, bit-convert the input vector to an
+ // equivalent integer vector. Extract the element as a (bit-cast) integer
+ // value and convert it to the promoted type.
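+ // E.g. extracting an element of a v4f16 with a non-constant index goes
+ // through the bitcast v4i16: extract the i16 element and convert it to the
+ // promoted f32 type (assuming f16 is promoted to f32).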
+SDValue DAGTypeLegalizer::PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc DL(N);
+
+ // If the index is constant, try to extract the value from the legalized
+ // vector type.
+ if (isa<ConstantSDNode>(N->getOperand(1))) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ EVT VecVT = Vec->getValueType(0);
+ EVT EltVT = VecVT.getVectorElementType();
+
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ switch (getTypeAction(VecVT)) {
+ default: break;
+ case TargetLowering::TypeScalarizeVector: {
+ SDValue Res = GetScalarizedVector(N->getOperand(0));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+ case TargetLowering::TypeWidenVector: {
+ Vec = GetWidenedVector(Vec);
+ SDValue Res = DAG.getNode(N->getOpcode(), DL, EltVT, Vec, Idx);
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+ case TargetLowering::TypeSplitVector: {
+ SDValue Lo, Hi;
+ GetSplitVector(Vec, Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ SDValue Res;
+ if (IdxVal < LoElts)
+ Res = DAG.getNode(N->getOpcode(), DL, EltVT, Lo, Idx);
+ else
+ Res = DAG.getNode(N->getOpcode(), DL, EltVT, Hi,
+ DAG.getConstant(IdxVal - LoElts, DL,
+ Idx.getValueType()));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+
+ }
+ }
+
+ // Bit-convert the input vector to the equivalent integer vector
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ EVT IVT = NewOp.getValueType().getVectorElementType();
+
+ // Extract the element as a (bit-cast) integer value
+ SDValue NewVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IVT,
+ NewOp, N->getOperand(1));
+
+ // Convert the element to the desired FP type
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, NewVal);
+}
+
+// FCOPYSIGN(X, Y) returns the value of X with the sign of Y. If the result
+// needs promotion, so does the argument X. Note that Y, if needed, will be
+// handled during operand promotion.
+SDValue DAGTypeLegalizer::PromoteFloatRes_FCOPYSIGN(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+
+ SDValue Op1 = N->getOperand(1);
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1);
+}
+
+// Unary operation where the result and the operand have PromoteFloat type
+// action. Construct a new SDNode with the promoted float value of the old
+// operand.
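+ // E.g. (f16 (fsqrt %x)) becomes an f32 fsqrt of the promoted operand; no
+ // rounding back to half precision is inserted here.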
+SDValue DAGTypeLegalizer::PromoteFloatRes_UnaryOp(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op = GetPromotedFloat(N->getOperand(0));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op);
+}
+
+// Binary operations where the result and both operands have PromoteFloat type
+// action. Construct a new SDNode with the promoted float values of the old
+// operands.
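+ // E.g. (f16 (fadd %a, %b)) is computed as an f32 fadd of the promoted
+ // operands; the result is rounded only when it is stored or explicitly
+ // converted.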
+SDValue DAGTypeLegalizer::PromoteFloatRes_BinOp(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_FMAD(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = GetPromotedFloat(N->getOperand(1));
+ SDValue Op2 = GetPromotedFloat(N->getOperand(2));
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1, Op2);
+}
+
+// Promote the Float (first) operand and retain the Integer (second) operand
+SDValue DAGTypeLegalizer::PromoteFloatRes_FPOWI(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Op0 = GetPromotedFloat(N->getOperand(0));
+ SDValue Op1 = N->getOperand(1);
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), NVT, Op0, Op1);
+}
+
+// Explicit operation to reduce precision. Reduce the value to half precision
+// and promote it back to the legal type.
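+ // E.g. (f16 (fp_round f32:%x)) becomes FP_TO_FP16 of %x (the rounded 16-bit
+ // pattern) followed by FP16_TO_FP back to f32, so the result has genuine
+ // half precision even though it is carried in an f32 (assuming the usual
+ // f16-to-f32 promotion).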
+SDValue DAGTypeLegalizer::PromoteFloatRes_FP_ROUND(SDNode *N) {
+ SDLoc DL(N);
+
+ SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT OpVT = Op->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+
+ // Round promoted float to desired precision
+ SDValue Round = DAG.getNode(GetPromotionOpcode(OpVT, VT), DL, IVT, Op);
+ // Promote it back to the legal output type
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), DL, NVT, Round);
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+ EVT VT = N->getValueType(0);
+
+ // Load the value as an integer value with the same number of bits.
+ EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ auto MMOFlags =
+ L->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
+ SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), IVT,
+ SDLoc(N), L->getChain(), L->getBasePtr(),
+ L->getOffset(), L->getPointerInfo(), IVT,
+ L->getAlignment(), MMOFlags, L->getAAInfo());
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
+
+ // Convert the integer value to the desired FP type
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return DAG.getNode(GetPromotionOpcode(VT, NVT), SDLoc(N), NVT, newL);
+}
+
+// Construct a new SELECT node with the promoted true- and false- values.
+SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT(SDNode *N) {
+ SDValue TrueVal = GetPromotedFloat(N->getOperand(1));
+ SDValue FalseVal = GetPromotedFloat(N->getOperand(2));
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), TrueVal->getValueType(0),
+ N->getOperand(0), TrueVal, FalseVal);
+}
+
+// Construct a new SELECT_CC node with the promoted true- and false- values.
+// The operands used for comparison are promoted by PromoteFloatOp_SELECT_CC.
+SDValue DAGTypeLegalizer::PromoteFloatRes_SELECT_CC(SDNode *N) {
+ SDValue TrueVal = GetPromotedFloat(N->getOperand(2));
+ SDValue FalseVal = GetPromotedFloat(N->getOperand(3));
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1), TrueVal, FalseVal,
+ N->getOperand(4));
+}
+
+ // Construct an SDNode that converts the SINT or UINT operand to the promoted
+ // float type.
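+ // E.g. (f16 (sint_to_fp i32:%x)) is converted straight to f32 and then sent
+ // through an FP_ROUND to f16 and an FP_EXTEND back to f32, so the extra
+ // precision of the wider type does not change the rounded result.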
+SDValue DAGTypeLegalizer::PromoteFloatRes_XINT_TO_FP(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue NV = DAG.getNode(N->getOpcode(), DL, NVT, N->getOperand(0));
+ // Round the value to the desired precision (that of the source type).
+ return DAG.getNode(
+ ISD::FP_EXTEND, DL, NVT,
+ DAG.getNode(ISD::FP_ROUND, DL, VT, NV, DAG.getIntPtrConstant(0, DL)));
+}
+
+SDValue DAGTypeLegalizer::PromoteFloatRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
new file mode 100644
index 000000000000..3ab9459c8af7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -0,0 +1,3391 @@
+//===----- LegalizeIntegerTypes.cpp - Legalization of integer types -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements integer type expansion and promotion for LegalizeTypes.
+// Promotion is the act of changing a computation in an illegal type into a
+// computation in a larger type. For example, implementing i8 arithmetic in an
+// i32 register (often needed on powerpc).
+// Expansion is the act of changing a computation in an illegal type into a
+// computation in two identical registers of a smaller type. For example,
+// implementing i64 arithmetic in two i32 registers (often needed on 32-bit
+// targets).
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Integer Result Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerResult - This method is called when a result of a node is
+/// found to be in need of promotion to a larger type. At this point, the node
+/// may also have invalid operands or may have other results that need
+/// expansion; we just know that (at least) one result needs promotion.
+void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Promote integer result: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "PromoteIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to promote this operator!");
+ case ISD::MERGE_VALUES:Res = PromoteIntRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::AssertSext: Res = PromoteIntRes_AssertSext(N); break;
+ case ISD::AssertZext: Res = PromoteIntRes_AssertZext(N); break;
+ case ISD::BITCAST: Res = PromoteIntRes_BITCAST(N); break;
+ case ISD::BITREVERSE: Res = PromoteIntRes_BITREVERSE(N); break;
+ case ISD::BSWAP: Res = PromoteIntRes_BSWAP(N); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntRes_BUILD_PAIR(N); break;
+ case ISD::Constant: Res = PromoteIntRes_Constant(N); break;
+ case ISD::CONVERT_RNDSAT:
+ Res = PromoteIntRes_CONVERT_RNDSAT(N); break;
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTLZ: Res = PromoteIntRes_CTLZ(N); break;
+ case ISD::CTPOP: Res = PromoteIntRes_CTPOP(N); break;
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTTZ: Res = PromoteIntRes_CTTZ(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = PromoteIntRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = PromoteIntRes_LOAD(cast<LoadSDNode>(N)); break;
+ case ISD::MLOAD: Res = PromoteIntRes_MLOAD(cast<MaskedLoadSDNode>(N));
+ break;
+ case ISD::MGATHER: Res = PromoteIntRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
+ case ISD::SELECT: Res = PromoteIntRes_SELECT(N); break;
+ case ISD::VSELECT: Res = PromoteIntRes_VSELECT(N); break;
+ case ISD::SELECT_CC: Res = PromoteIntRes_SELECT_CC(N); break;
+ case ISD::SETCC: Res = PromoteIntRes_SETCC(N); break;
+ case ISD::SMIN:
+ case ISD::SMAX: Res = PromoteIntRes_SExtIntBinOp(N); break;
+ case ISD::UMIN:
+ case ISD::UMAX: Res = PromoteIntRes_ZExtIntBinOp(N); break;
+
+ case ISD::SHL: Res = PromoteIntRes_SHL(N); break;
+ case ISD::SIGN_EXTEND_INREG:
+ Res = PromoteIntRes_SIGN_EXTEND_INREG(N); break;
+ case ISD::SRA: Res = PromoteIntRes_SRA(N); break;
+ case ISD::SRL: Res = PromoteIntRes_SRL(N); break;
+ case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
+ case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
+ case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
+
+ case ISD::EXTRACT_SUBVECTOR:
+ Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::VECTOR_SHUFFLE:
+ Res = PromoteIntRes_VECTOR_SHUFFLE(N); break;
+ case ISD::INSERT_VECTOR_ELT:
+ Res = PromoteIntRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::BUILD_VECTOR:
+ Res = PromoteIntRes_BUILD_VECTOR(N); break;
+ case ISD::SCALAR_TO_VECTOR:
+ Res = PromoteIntRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS:
+ Res = PromoteIntRes_CONCAT_VECTORS(N); break;
+
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND: Res = PromoteIntRes_INT_EXTEND(N); break;
+
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = PromoteIntRes_FP_TO_XINT(N); break;
+
+ case ISD::FP_TO_FP16: Res = PromoteIntRes_FP_TO_FP16(N); break;
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL: Res = PromoteIntRes_SimpleIntBinOp(N); break;
+
+ case ISD::SDIV:
+ case ISD::SREM: Res = PromoteIntRes_SExtIntBinOp(N); break;
+
+ case ISD::UDIV:
+ case ISD::UREM: Res = PromoteIntRes_ZExtIntBinOp(N); break;
+
+ case ISD::SADDO:
+ case ISD::SSUBO: Res = PromoteIntRes_SADDSUBO(N, ResNo); break;
+ case ISD::UADDO:
+ case ISD::USUBO: Res = PromoteIntRes_UADDSUBO(N, ResNo); break;
+ case ISD::SMULO:
+ case ISD::UMULO: Res = PromoteIntRes_XMULO(N, ResNo); break;
+
+ case ISD::ATOMIC_LOAD:
+ Res = PromoteIntRes_Atomic0(cast<AtomicSDNode>(N)); break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_SWAP:
+ Res = PromoteIntRes_Atomic1(cast<AtomicSDNode>(N)); break;
+
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ Res = PromoteIntRes_AtomicCmpSwap(cast<AtomicSDNode>(N), ResNo);
+ break;
+ }
+
+ // If the result is null then the sub-method took care of registering it.
+ if (Res.getNode())
+ SetPromotedInteger(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetPromotedInteger(Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertSext(SDNode *N) {
+ // Sign-extend the new bits, and continue the assertion.
+ SDValue Op = SExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertSext, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AssertZext(SDNode *N) {
+ // Zero the new bits, and continue the assertion.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::AssertZext, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic0(AtomicSDNode *N) {
+ EVT ResVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
+ N->getMemoryVT(), ResVT,
+ N->getChain(), N->getBasePtr(),
+ N->getMemOperand(), N->getOrdering(),
+ N->getSynchScope());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Atomic1(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ SDValue Res = DAG.getAtomic(N->getOpcode(), SDLoc(N),
+ N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(),
+ Op2, N->getMemOperand(), N->getOrdering(),
+ N->getSynchScope());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N,
+ unsigned ResNo) {
+ if (ResNo == 1) {
+ assert(N->getOpcode() == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ EVT SVT = getSetCCResultType(N->getOperand(2).getValueType());
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+
+ // Only use the result of getSetCCResultType if it is legal,
+ // otherwise just use the promoted result type (NVT).
+ if (!TLI.isTypeLegal(SVT))
+ SVT = NVT;
+
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), SVT, MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, SDLoc(N), N->getMemoryVT(), VTs,
+ N->getChain(), N->getBasePtr(), N->getOperand(2), N->getOperand(3),
+ N->getMemOperand(), N->getSuccessOrdering(), N->getFailureOrdering(),
+ N->getSynchScope());
+ ReplaceValueWith(SDValue(N, 0), Res.getValue(0));
+ ReplaceValueWith(SDValue(N, 2), Res.getValue(2));
+ return Res.getValue(1);
+ }
+
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+ SDVTList VTs =
+ DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other);
+ SDValue Res = DAG.getAtomicCmpSwap(
+ N->getOpcode(), SDLoc(N), N->getMemoryVT(), VTs, N->getChain(),
+ N->getBasePtr(), Op2, Op3, N->getMemOperand(), N->getSuccessOrdering(),
+ N->getFailureOrdering(), N->getSynchScope());
+ // Update the use to N with the newly created Res.
+ for (unsigned i = 1, NumResults = N->getNumValues(); i < NumResults; ++i)
+ ReplaceValueWith(SDValue(N, i), Res.getValue(i));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT NInVT = TLI.getTypeToTransformTo(*DAG.getContext(), InVT);
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ break;
+ case TargetLowering::TypePromoteInteger:
+ if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector() && !NInVT.isVector())
+ // The input promotes to the same size. Convert the promoted value.
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetPromotedInteger(InOp));
+ break;
+ case TargetLowering::TypeSoftenFloat:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+ case TargetLowering::TypePromoteFloat: {
+ // Convert the promoted float by hand.
+ SDValue PromotedOp = GetPromotedFloat(InOp);
+ return DAG.getNode(ISD::FP_TO_FP16, dl, NOutVT, PromotedOp);
+ break;
+ }
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ break;
+ case TargetLowering::TypeScalarizeVector:
+ // Convert the element to an integer and promote it by hand.
+ if (!NOutVT.isVector())
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ BitConvertToInteger(GetScalarizedVector(InOp)));
+ break;
+ case TargetLowering::TypeSplitVector: {
+ // For example, i32 = BITCAST v2i16 on alpha. Convert the split
+ // pieces of the input into integers and reassemble in the final type.
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ InOp = DAG.getNode(ISD::ANY_EXTEND, dl,
+ EVT::getIntegerVT(*DAG.getContext(),
+ NOutVT.getSizeInBits()),
+ JoinIntegers(Lo, Hi));
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, InOp);
+ }
+ case TargetLowering::TypeWidenVector:
+ // The input is widened to the same size. Convert to the widened value.
+ // Make sure that the outgoing value is not a vector, because this would
+ // make us bitcast between two vectors which are legalized in different ways.
+ if (NOutVT.bitsEq(NInVT) && !NOutVT.isVector())
+ return DAG.getNode(ISD::BITCAST, dl, NOutVT, GetWidenedVector(InOp));
+ }
+
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
+ CreateStackStoreLoad(InOp, OutVT));
+}
+
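+ // The promoted input holds the original value in its low bits, so a BSWAP in
+ // the wider type leaves those bytes in the high part; shift them back down.
+ // E.g. bswap of an i16 promoted to i32 is an i32 BSWAP followed by a logical
+ // shift right of 16.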
+SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+
+ unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ return DAG.getNode(
+ ISD::SRL, dl, NVT, DAG.getNode(ISD::BSWAP, dl, NVT, Op),
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+}
+
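+ // Same trick as BSWAP above: bit-reversing in the wider type leaves the
+ // original bits in the high part, so shift right by the number of bits added
+ // by promotion.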
+SDValue DAGTypeLegalizer::PromoteIntRes_BITREVERSE(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+
+ unsigned DiffBits = NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits();
+ return DAG.getNode(
+ ISD::SRL, dl, NVT, DAG.getNode(ISD::BITREVERSE, dl, NVT, Op),
+ DAG.getConstant(DiffBits, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout())));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_PAIR(SDNode *N) {
+ // The pair element type may be legal, or may not promote to the same type as
+ // the result, for example i14 = BUILD_PAIR (i7, i7). Handle all cases.
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N),
+ TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)), JoinIntegers(N->getOperand(0),
+ N->getOperand(1)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_Constant(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ // FIXME there is no actual debug info here
+ SDLoc dl(N);
+ // Zero extend things like i1, sign extend everything else. It shouldn't
+ // matter in theory which one we pick, but this tends to give better code?
+ unsigned Opc = VT.isByteSized() ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ SDValue Result = DAG.getNode(Opc, dl,
+ TLI.getTypeToTransformTo(*DAG.getContext(), VT),
+ SDValue(N, 0));
+ assert(isa<ConstantSDNode>(Result) && "Didn't constant fold ext?");
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONVERT_RNDSAT(SDNode *N) {
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+ assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+ CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+ CvtCode == ISD::CVT_SF || CvtCode == ISD::CVT_UF) &&
+ "can only promote integers");
+ EVT OutVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getConvertRndSat(OutVT, SDLoc(N), N->getOperand(0),
+ N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
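+ // The leading-zero count in the wider type includes the bits added by
+ // promotion, so subtract them off. E.g. for an i8 promoted to i32, ctlz is
+ // computed as ctlz32(zext x) - 24.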
+SDValue DAGTypeLegalizer::PromoteIntRes_CTLZ(SDNode *N) {
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ SDLoc dl(N);
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ Op = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+ // Subtract off the extra leading bits in the bigger type.
+ return DAG.getNode(
+ ISD::SUB, dl, NVT, Op,
+ DAG.getConstant(NVT.getScalarSizeInBits() - OVT.getScalarSizeInBits(), dl,
+ NVT));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTPOP(SDNode *N) {
+ // Zero extend to the promoted type and do the count there.
+ SDValue Op = ZExtPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::CTPOP, SDLoc(N), Op.getValueType(), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CTTZ(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ EVT OVT = N->getValueType(0);
+ EVT NVT = Op.getValueType();
+ SDLoc dl(N);
+ if (N->getOpcode() == ISD::CTTZ) {
+ // The count is the same in the promoted type except if the original
+ // value was zero. This can be handled by setting the bit just off
+ // the top of the original type.
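+ // E.g. for an i8 promoted to i32, OR in 0x100 so that a cttz of an original
+ // zero yields 8, matching the i8 semantics.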
+ auto TopBit = APInt::getOneBitSet(NVT.getScalarSizeInBits(),
+ OVT.getScalarSizeInBits());
+ Op = DAG.getNode(ISD::OR, dl, NVT, Op, DAG.getConstant(TopBit, dl, NVT));
+ }
+ return DAG.getNode(N->getOpcode(), dl, NVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc dl(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NVT, N->getOperand(0),
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_XINT(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned NewOpc = N->getOpcode();
+ SDLoc dl(N);
+
+ // If we're promoting a UINT to a larger size and the larger FP_TO_UINT is
+ // not Legal, check to see if we can use FP_TO_SINT instead. (If both UINT
+ // and SINT conversions are Custom, there is no way to tell which is
+ // preferable. We choose SINT because that's the right thing on PPC.)
+ if (N->getOpcode() == ISD::FP_TO_UINT &&
+ !TLI.isOperationLegal(ISD::FP_TO_UINT, NVT) &&
+ TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NVT))
+ NewOpc = ISD::FP_TO_SINT;
+
+ SDValue Res = DAG.getNode(NewOpc, dl, NVT, N->getOperand(0));
+
+ // Assert that the converted value fits in the original type. If it doesn't
+ // (e.g. because the value being converted is too big), then the result of the
+ // original operation was undefined anyway, so the assert is still correct.
+ return DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ?
+ ISD::AssertZext : ISD::AssertSext, dl, NVT, Res,
+ DAG.getValueType(N->getValueType(0).getScalarType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_FP_TO_FP16(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ if (getTypeAction(N->getOperand(0).getValueType())
+ == TargetLowering::TypePromoteInteger) {
+ SDValue Res = GetPromotedInteger(N->getOperand(0));
+ assert(Res.getValueType().bitsLE(NVT) && "Extension doesn't make sense!");
+
+ // If the result and operand types are the same after promotion, simplify
+ // to an in-register extension.
+ if (NVT == Res.getValueType()) {
+ // The high bits are not guaranteed to be anything. Insert an extend.
+ if (N->getOpcode() == ISD::SIGN_EXTEND)
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(N->getOperand(0).getValueType()));
+ if (N->getOpcode() == ISD::ZERO_EXTEND)
+ return DAG.getZeroExtendInReg(Res, dl,
+ N->getOperand(0).getValueType().getScalarType());
+ assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
+ return Res;
+ }
+ }
+
+ // Otherwise, just extend the original operand all the way to the larger type.
+ return DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_LOAD(LoadSDNode *N) {
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ ISD::LoadExtType ExtType =
+ ISD::isNON_EXTLoad(N) ? ISD::EXTLOAD : N->getExtensionType();
+ SDLoc dl(N);
+ SDValue Res = DAG.getExtLoad(ExtType, dl, NVT, N->getChain(), N->getBasePtr(),
+ N->getMemoryVT(), N->getMemOperand());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MLOAD(MaskedLoadSDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue ExtSrc0 = GetPromotedInteger(N->getSrc0());
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getMaskedLoad(NVT, dl, N->getChain(), N->getBasePtr(),
+ N->getMask(), ExtSrc0, N->getMemoryVT(),
+ N->getMemOperand(), ISD::SEXTLOAD);
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_MGATHER(MaskedGatherSDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue ExtSrc0 = GetPromotedInteger(N->getValue());
+ assert(NVT == ExtSrc0.getValueType() &&
+ "Gather result type and the passThru argument type should be the same");
+
+ SDLoc dl(N);
+ SDValue Ops[] = {N->getChain(), ExtSrc0, N->getMask(), N->getBasePtr(),
+ N->getIndex()};
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(NVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+/// Promote the overflow flag of an overflowing arithmetic node.
+SDValue DAGTypeLegalizer::PromoteIntRes_Overflow(SDNode *N) {
+ // Simply change the return type of the boolean result.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(1));
+ EVT ValueVTs[] = { N->getValueType(0), NVT };
+ SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+ SDValue Res = DAG.getNode(N->getOpcode(), SDLoc(N),
+ DAG.getVTList(ValueVTs), Ops);
+
+ // Modified the sum result - switch anything that used the old sum to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 0), Res);
+
+ return SDValue(Res.getNode(), 1);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // sign extension of its truncation to the original type.
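+ // E.g. for an i8 saddo promoted to i32: add the sign-extended operands in
+ // i32, sign_extend_inreg the sum from i8, and report overflow iff that
+ // differs from the i32 sum.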
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = LHS.getValueType();
+ SDLoc dl(N);
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::SADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: sign extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
+ DAG.getValueType(OVT));
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(1));
+ SDValue RHS = GetPromotedInteger(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), N->getOperand(0), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VSELECT(SDNode *N) {
+ SDValue Mask = N->getOperand(0);
+ EVT OpTy = N->getOperand(1).getValueType();
+
+ // Promote all the way up to the canonical SetCC type.
+ Mask = PromoteTargetBoolean(Mask, OpTy);
+ SDValue LHS = GetPromotedInteger(N->getOperand(1));
+ SDValue RHS = GetPromotedInteger(N->getOperand(2));
+ return DAG.getNode(ISD::VSELECT, SDLoc(N),
+ LHS.getValueType(), Mask, LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetPromotedInteger(N->getOperand(2));
+ SDValue RHS = GetPromotedInteger(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ LHS.getValueType(), N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SETCC(SDNode *N) {
+ EVT SVT = getSetCCResultType(N->getOperand(0).getValueType());
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ // Only use the result of getSetCCResultType if it is legal,
+ // otherwise just use the promoted result type (NVT).
+ if (!TLI.isTypeLegal(SVT))
+ SVT = NVT;
+
+ SDLoc dl(N);
+ assert(SVT.isVector() == N->getOperand(0).getValueType().isVector() &&
+ "Vector compare must return a vector result!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (LHS.getValueType() != RHS.getValueType()) {
+ if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger &&
+ !LHS.getValueType().isVector())
+ LHS = GetPromotedInteger(LHS);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger &&
+ !RHS.getValueType().isVector())
+ RHS = GetPromotedInteger(RHS);
+ }
+
+ // Get the SETCC result using the canonical SETCC type.
+ SDValue SetCC = DAG.getNode(N->getOpcode(), dl, SVT, LHS, RHS,
+ N->getOperand(2));
+
+ assert(NVT.bitsLE(SVT) && "Integer type overpromoted?");
+ // Convert to the expected type.
+ return DAG.getNode(ISD::TRUNCATE, dl, NVT, SetCC);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SHL(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ LHS = GetPromotedInteger(LHS);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ return DAG.getNode(ISD::SHL, SDLoc(N), LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SimpleIntBinOp(SDNode *N) {
+ // The input may have strange things in the top bits of the registers, but
+ // these operations don't care. They may have weird bits going out, but
+ // that too is okay if they are integer operations.
+ SDValue LHS = GetPromotedInteger(N->getOperand(0));
+ SDValue RHS = GetPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SExtIntBinOp(SDNode *N) {
+ // Sign extend the input.
+ SDValue LHS = SExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = SExtPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_ZExtIntBinOp(SDNode *N) {
+ // Zero extend the input.
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRA(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ // The input value must be properly sign extended.
+ if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ LHS = SExtPromotedInteger(LHS);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ return DAG.getNode(ISD::SRA, SDLoc(N), LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SRL(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ // The input value must be properly zero extended.
+ if (getTypeAction(LHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ LHS = ZExtPromotedInteger(LHS);
+ if (getTypeAction(RHS.getValueType()) == TargetLowering::TypePromoteInteger)
+ RHS = ZExtPromotedInteger(RHS);
+ return DAG.getNode(ISD::SRL, SDLoc(N), LHS.getValueType(), LHS, RHS);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res;
+ SDValue InOp = N->getOperand(0);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InOp.getValueType())) {
+ default: llvm_unreachable("Unknown type action!");
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypeExpandInteger:
+ Res = InOp;
+ break;
+ case TargetLowering::TypePromoteInteger:
+ Res = GetPromotedInteger(InOp);
+ break;
+ case TargetLowering::TypeSplitVector:
+ EVT InVT = InOp.getValueType();
+ assert(InVT.isVector() && "Cannot split scalar types");
+ unsigned NumElts = InVT.getVectorNumElements();
+ assert(NumElts == NVT.getVectorNumElements() &&
+ "Dst and Src must have the same number of elements");
+ assert(isPowerOf2_32(NumElts) &&
+ "Promoted vector type must be a power of two");
+
+ SDValue EOp1, EOp2;
+ GetSplitVector(InOp, EOp1, EOp2);
+
+ EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(),
+ NumElts/2);
+ EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1);
+ EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
+ }
+
+ // Truncate to NVT instead of VT
+ return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ // The operation overflowed iff the result in the larger type is not the
+ // zero extension of its truncation to the original type.
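+ // E.g. for an i8 uaddo promoted to i32: add the zero-extended operands in
+ // i32, mask the sum back to its low 8 bits, and report overflow iff that
+ // differs from the i32 sum.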
+ SDValue LHS = ZExtPromotedInteger(N->getOperand(0));
+ SDValue RHS = ZExtPromotedInteger(N->getOperand(1));
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = LHS.getValueType();
+ SDLoc dl(N);
+
+ // Do the arithmetic in the larger type.
+ unsigned Opcode = N->getOpcode() == ISD::UADDO ? ISD::ADD : ISD::SUB;
+ SDValue Res = DAG.getNode(Opcode, dl, NVT, LHS, RHS);
+
+ // Calculate the overflow flag: zero extend the arithmetic result from
+ // the original type.
+ SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
+ // Overflowed if and only if this is not equal to Res.
+ Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_XMULO(SDNode *N, unsigned ResNo) {
+ // Promote the overflow bit trivially.
+ if (ResNo == 1)
+ return PromoteIntRes_Overflow(N);
+
+ SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+ SDLoc DL(N);
+ EVT SmallVT = LHS.getValueType();
+
+ // To determine if the result overflowed in a larger type, we extend the
+ // input to the larger type, do the multiply (checking if it overflows),
+ // then also check the high bits of the result to see if overflow happened
+ // there.
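+ // E.g. for an i8 umulo promoted to i32: multiply the zero-extended operands
+ // with an i32 UMULO, then flag overflow if bits [31:8] of the product are
+ // non-zero or if the i32 multiply itself overflowed.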
+ if (N->getOpcode() == ISD::SMULO) {
+ LHS = SExtPromotedInteger(LHS);
+ RHS = SExtPromotedInteger(RHS);
+ } else {
+ LHS = ZExtPromotedInteger(LHS);
+ RHS = ZExtPromotedInteger(RHS);
+ }
+ SDVTList VTs = DAG.getVTList(LHS.getValueType(), N->getValueType(1));
+ SDValue Mul = DAG.getNode(N->getOpcode(), DL, VTs, LHS, RHS);
+
+ // Overflow occurred if it occurred in the larger type, or if the high part
+ // of the result does not zero/sign-extend the low part. Check this second
+ // possibility first.
+ SDValue Overflow;
+ if (N->getOpcode() == ISD::UMULO) {
+ // Unsigned overflow occurred if the high part is non-zero.
+ SDValue Hi = DAG.getNode(ISD::SRL, DL, Mul.getValueType(), Mul,
+ DAG.getIntPtrConstant(SmallVT.getSizeInBits(),
+ DL));
+ Overflow = DAG.getSetCC(DL, N->getValueType(1), Hi,
+ DAG.getConstant(0, DL, Hi.getValueType()),
+ ISD::SETNE);
+ } else {
+ // Signed overflow occurred if the high part does not sign extend the low.
+ SDValue SExt = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Mul.getValueType(),
+ Mul, DAG.getValueType(SmallVT));
+ Overflow = DAG.getSetCC(DL, N->getValueType(1), SExt, Mul, ISD::SETNE);
+ }
+
+ // The only other way for overflow to occur is if the multiplication in the
+ // larger type itself overflowed.
+ Overflow = DAG.getNode(ISD::OR, DL, N->getValueType(1), Overflow,
+ SDValue(Mul.getNode(), 1));
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return Mul;
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(TLI.getTypeToTransformTo(*DAG.getContext(),
+ N->getValueType(0)));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
+ SDValue Chain = N->getOperand(0); // Get the chain.
+ SDValue Ptr = N->getOperand(1); // Get the pointer.
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ MVT RegVT = TLI.getRegisterType(*DAG.getContext(), VT);
+ unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), VT);
+ // The argument is passed as NumRegs registers of type RegVT.
+
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned i = 0; i < NumRegs; ++i) {
+ Parts[i] = DAG.getVAArg(RegVT, dl, Chain, Ptr, N->getOperand(2),
+ N->getConstantOperandVal(3));
+ Chain = Parts[i].getValue(1);
+ }
+
+ // Handle endianness of the load.
+ if (DAG.getDataLayout().isBigEndian())
+ std::reverse(Parts.begin(), Parts.end());
+
+ // Assemble the parts in the promoted type.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Res = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[0]);
+ for (unsigned i = 1; i < NumRegs; ++i) {
+ SDValue Part = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Parts[i]);
+ // Shift it to the right position and "or" it in.
+ Part = DAG.getNode(ISD::SHL, dl, NVT, Part,
+ DAG.getConstant(i * RegVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Res = DAG.getNode(ISD::OR, dl, NVT, Res, Part);
+ }
+
+ // Modified the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+
+ return Res;
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Promotion
+//===----------------------------------------------------------------------===//
+
+/// PromoteIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need promotion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "PromoteIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ llvm_unreachable("Do not know how to promote this operator's operand!");
+
+ case ISD::ANY_EXTEND: Res = PromoteIntOp_ANY_EXTEND(N); break;
+ case ISD::ATOMIC_STORE:
+ Res = PromoteIntOp_ATOMIC_STORE(cast<AtomicSDNode>(N));
+ break;
+ case ISD::BITCAST: Res = PromoteIntOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = PromoteIntOp_BR_CC(N, OpNo); break;
+ case ISD::BRCOND: Res = PromoteIntOp_BRCOND(N, OpNo); break;
+ case ISD::BUILD_PAIR: Res = PromoteIntOp_BUILD_PAIR(N); break;
+ case ISD::BUILD_VECTOR: Res = PromoteIntOp_BUILD_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS: Res = PromoteIntOp_CONCAT_VECTORS(N); break;
+ case ISD::EXTRACT_VECTOR_ELT: Res = PromoteIntOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::CONVERT_RNDSAT:
+ Res = PromoteIntOp_CONVERT_RNDSAT(N); break;
+ case ISD::INSERT_VECTOR_ELT:
+ Res = PromoteIntOp_INSERT_VECTOR_ELT(N, OpNo);break;
+ case ISD::SCALAR_TO_VECTOR:
+ Res = PromoteIntOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::VSELECT:
+ case ISD::SELECT: Res = PromoteIntOp_SELECT(N, OpNo); break;
+ case ISD::SELECT_CC: Res = PromoteIntOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: Res = PromoteIntOp_SETCC(N, OpNo); break;
+ case ISD::SIGN_EXTEND: Res = PromoteIntOp_SIGN_EXTEND(N); break;
+ case ISD::SINT_TO_FP: Res = PromoteIntOp_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = PromoteIntOp_STORE(cast<StoreSDNode>(N),
+ OpNo); break;
+ case ISD::MSTORE: Res = PromoteIntOp_MSTORE(cast<MaskedStoreSDNode>(N),
+ OpNo); break;
+ case ISD::MLOAD: Res = PromoteIntOp_MLOAD(cast<MaskedLoadSDNode>(N),
+ OpNo); break;
+ case ISD::MGATHER: Res = PromoteIntOp_MGATHER(cast<MaskedGatherSDNode>(N),
+ OpNo); break;
+ case ISD::MSCATTER: Res = PromoteIntOp_MSCATTER(cast<MaskedScatterSDNode>(N),
+ OpNo); break;
+ case ISD::TRUNCATE: Res = PromoteIntOp_TRUNCATE(N); break;
+ case ISD::FP16_TO_FP:
+ case ISD::UINT_TO_FP: Res = PromoteIntOp_UINT_TO_FP(N); break;
+ case ISD::ZERO_EXTEND: Res = PromoteIntOp_ZERO_EXTEND(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = PromoteIntOp_EXTRACT_SUBVECTOR(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = PromoteIntOp_Shift(N); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// PromoteSetCCOperands - Promote the operands of a comparison. This code is
+/// shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::PromoteSetCCOperands(SDValue &NewLHS,SDValue &NewRHS,
+ ISD::CondCode CCCode) {
+ // We have to insert explicit sign or zero extends. Note that we could
+ // insert sign extends for ALL conditions, but zero extend is cheaper on
+ // many machines (an AND instead of two shifts), so prefer it.
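+ // E.g. an i8 unsigned compare promoted to i32 compares the zero-extended
+ // operands, a signed compare uses the sign-extended ones, and an equality
+ // compare accepts either (zero extension is preferred unless both promoted
+ // operands are already AssertSext of the original type).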
+ switch (CCCode) {
+ default: llvm_unreachable("Unknown integer comparison!");
+ case ISD::SETEQ:
+ case ISD::SETNE: {
+ SDValue OpL = GetPromotedInteger(NewLHS);
+ SDValue OpR = GetPromotedInteger(NewRHS);
+
+ // We would prefer to promote the comparison operands with sign extension
+ // if we find that an operand is really the truncation of an AssertSext. With
+ // this optimization we can avoid inserting a real truncate instruction,
+ // which would eventually be redundant.
+ if (OpL->getOpcode() == ISD::AssertSext &&
+ cast<VTSDNode>(OpL->getOperand(1))->getVT() == NewLHS.getValueType() &&
+ OpR->getOpcode() == ISD::AssertSext &&
+ cast<VTSDNode>(OpR->getOperand(1))->getVT() == NewRHS.getValueType()) {
+ NewLHS = OpL;
+ NewRHS = OpR;
+ } else {
+ NewLHS = ZExtPromotedInteger(NewLHS);
+ NewRHS = ZExtPromotedInteger(NewRHS);
+ }
+ break;
+ }
+ case ISD::SETUGE:
+ case ISD::SETUGT:
+ case ISD::SETULE:
+ case ISD::SETULT:
+ // ALL of these operations will work if we either sign or zero extend
+ // the operands (including the unsigned comparisons!). Zero extend is
+ // usually a simpler/cheaper operation, so prefer it.
+ NewLHS = ZExtPromotedInteger(NewLHS);
+ NewRHS = ZExtPromotedInteger(NewRHS);
+ break;
+ case ISD::SETGE:
+ case ISD::SETGT:
+ case ISD::SETLT:
+ case ISD::SETLE:
+ NewLHS = SExtPromotedInteger(NewLHS);
+ NewRHS = SExtPromotedInteger(NewRHS);
+ break;
+ }
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ANY_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N) {
+ SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ return DAG.getAtomic(N->getOpcode(), SDLoc(N), N->getMemoryVT(),
+ N->getChain(), N->getBasePtr(), Op2, N->getMemOperand(),
+ N->getOrdering(), N->getSynchScope());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BITCAST(SDNode *N) {
+ // This should only occur in unusual situations like bitcasting to an
+ // x86_fp80, so just turn it into a store+load
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 2 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(2);
+ SDValue RHS = N->getOperand(3);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(1))->get());
+
+ // The chain (Op#0), CC (#1) and basic block destination (Op#4) are always
+ // legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ N->getOperand(1), LHS, RHS, N->getOperand(4)),
+ 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "only know how to promote condition");
+
+ // Promote all the way up to the canonical SetCC type.
+ SDValue Cond = PromoteTargetBoolean(N->getOperand(1), MVT::Other);
+
+ // The chain (Op#0) and basic block destination (Op#2) are always legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Cond,
+ N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_PAIR(SDNode *N) {
+ // Since the result type is legal, the operands must promote to it.
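+ // E.g. (i16 (build_pair i8:%lo, i8:%hi)) with i8 promoted to i16 becomes
+ // (zext %lo) | ((promoted %hi) << 8); the shift discards whatever the
+ // promotion left in %hi's high bits.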
+ EVT OVT = N->getOperand(0).getValueType();
+ SDValue Lo = ZExtPromotedInteger(N->getOperand(0));
+ SDValue Hi = GetPromotedInteger(N->getOperand(1));
+ assert(Lo.getValueType() == N->getValueType(0) && "Operand over promoted?");
+ SDLoc dl(N);
+
+ Hi = DAG.getNode(ISD::SHL, dl, N->getValueType(0), Hi,
+ DAG.getConstant(OVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::OR, dl, N->getValueType(0), Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type is not. This implies
+ // that the vector is a power-of-two in length and that the element
+ // type does not have a strange size (e.g. it is not i1).
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ assert(!((NumElts & 1) && (!TLI.isTypeLegal(VecVT))) &&
+ "Legal vector of one illegal element?");
+
+ // Promote the inserted value. The type does not need to match the
+ // vector element type. Check that any extra bits introduced will be
+ // truncated away.
+ assert(N->getOperand(0).getValueType().getSizeInBits() >=
+ N->getValueType(0).getVectorElementType().getSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+
+ SmallVector<SDValue, 16> NewOps;
+ for (unsigned i = 0; i < NumElts; ++i)
+ NewOps.push_back(GetPromotedInteger(N->getOperand(i)));
+
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONVERT_RNDSAT(SDNode *N) {
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+ assert ((CvtCode == ISD::CVT_SS || CvtCode == ISD::CVT_SU ||
+ CvtCode == ISD::CVT_US || CvtCode == ISD::CVT_UU ||
+ CvtCode == ISD::CVT_FS || CvtCode == ISD::CVT_FU) &&
+ "can only promote integer arguments");
+ SDValue InOp = GetPromotedInteger(N->getOperand(0));
+ return DAG.getConvertRndSat(N->getValueType(0), SDLoc(N), InOp,
+ N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), N->getOperand(4), CvtCode);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N,
+ unsigned OpNo) {
+ if (OpNo == 1) {
+ // Promote the inserted value. This is valid because the type does not
+ // have to match the vector element type.
+
+ // Check that any extra bits introduced will be truncated away.
+ assert(N->getOperand(1).getValueType().getSizeInBits() >=
+ N->getValueType(0).getVectorElementType().getSizeInBits() &&
+ "Type of inserted value narrower than vector element type!");
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ GetPromotedInteger(N->getOperand(1)),
+ N->getOperand(2)),
+ 0);
+ }
+
+ assert(OpNo == 2 && "Different operand and result vector types?");
+
+ // Promote the index.
+ SDValue Idx = DAG.getZExtOrTrunc(N->getOperand(2), SDLoc(N),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ N->getOperand(1), Idx), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N) {
+ // Integer SCALAR_TO_VECTOR operands are implicitly truncated, so just promote
+ // the operand in place.
+ return SDValue(DAG.UpdateNodeOperands(N,
+ GetPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Only know how to promote the condition!");
+ SDValue Cond = N->getOperand(0);
+ EVT OpTy = N->getOperand(1).getValueType();
+
+ // Promote all the way up to the canonical SetCC type.
+ EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
+ Cond = PromoteTargetBoolean(Cond, OpVT);
+
+ return SDValue(DAG.UpdateNodeOperands(N, Cond, N->getOperand(1),
+ N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(4))->get());
+
+ // The CC (#4) and the possible return values (#2 and #3) have legal types.
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2),
+ N->getOperand(3), N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SETCC(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 0 && "Don't know how to promote this operand!");
+
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ PromoteSetCCOperands(LHS, RHS, cast<CondCodeSDNode>(N->getOperand(2))->get());
+
+ // The CC (#2) is always legal.
+ return SDValue(DAG.UpdateNodeOperands(N, LHS, RHS, N->getOperand(2)), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_Shift(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ ZExtPromotedInteger(N->getOperand(1))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SIGN_EXTEND(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ SDLoc dl(N);
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(),
+ Op, DAG.getValueType(N->getOperand(0).getValueType()));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_SINT_TO_FP(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N,
+ SExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo){
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ SDValue Ch = N->getChain(), Ptr = N->getBasePtr();
+ SDLoc dl(N);
+
+ SDValue Val = GetPromotedInteger(N->getValue()); // Get promoted value.
+
+ // Truncate the value and store the result.
+ return DAG.getTruncStore(Ch, dl, Val, Ptr,
+ N->getMemoryVT(), N->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSTORE(MaskedStoreSDNode *N,
+ unsigned OpNo) {
+
+ SDValue DataOp = N->getValue();
+ EVT DataVT = DataOp.getValueType();
+ SDValue Mask = N->getMask();
+ SDLoc dl(N);
+
+ bool TruncateStore = false;
+ if (OpNo == 2) {
+ // The mask comes before the data operand. If the data operand is legal, we
+ // just promote the mask.
+ // When the data operand has an illegal type, we should legalize the data
+ // operand first. The mask will then be promoted/split/widened according to
+ // the data operand's type.
+ if (TLI.isTypeLegal(DataVT))
+ Mask = PromoteTargetBoolean(Mask, DataVT);
+ else {
+ if (getTypeAction(DataVT) == TargetLowering::TypePromoteInteger)
+ return PromoteIntOp_MSTORE(N, 3);
+
+ else if (getTypeAction(DataVT) == TargetLowering::TypeWidenVector)
+ return WidenVecOp_MSTORE(N, 3);
+
+ else {
+ assert (getTypeAction(DataVT) == TargetLowering::TypeSplitVector);
+ return SplitVecOp_MSTORE(N, 3);
+ }
+ }
+ } else { // Data operand
+ assert(OpNo == 3 && "Unexpected operand for promotion");
+ DataOp = GetPromotedInteger(DataOp);
+ Mask = PromoteTargetBoolean(Mask, DataOp.getValueType());
+ TruncateStore = true;
+ }
+
+ return DAG.getMaskedStore(N->getChain(), dl, DataOp, N->getBasePtr(), Mask,
+ N->getMemoryVT(), N->getMemOperand(),
+ TruncateStore);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 2 && "Only know how to promote the mask!");
+ EVT DataVT = N->getValueType(0);
+ SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
+ NewOps[OpNo] = Mask;
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
+ unsigned OpNo) {
+
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValueType(0);
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
+ unsigned OpNo) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ if (OpNo == 2) {
+ // The Mask
+ EVT DataVT = N->getValue().getValueType();
+ NewOps[OpNo] = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
+ } else
+ NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
+ return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_TRUNCATE(SDNode *N) {
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_UINT_TO_FP(SDNode *N) {
+ return SDValue(DAG.UpdateNodeOperands(N,
+ ZExtPromotedInteger(N->getOperand(0))), 0);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Op = GetPromotedInteger(N->getOperand(0));
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
+ return DAG.getZeroExtendInReg(Op, dl,
+ N->getOperand(0).getValueType().getScalarType());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Integer Result Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerResult - This method is called when the specified result of the
+/// specified node is found to need expansion. At this point, the node may also
+/// have invalid operands or may have other results that need promotion; we just
+/// know that (at least) one result needs expansion.
+void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Expand integer result: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Lo, Hi;
+ Lo = Hi = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ExpandIntegerResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to expand the result of this operator!");
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+
+ case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
+ case ISD::EXTRACT_ELEMENT: ExpandRes_EXTRACT_ELEMENT(N, Lo, Hi); break;
+ case ISD::EXTRACT_VECTOR_ELT: ExpandRes_EXTRACT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::VAARG: ExpandRes_VAARG(N, Lo, Hi); break;
+
+ case ISD::ANY_EXTEND: ExpandIntRes_ANY_EXTEND(N, Lo, Hi); break;
+ case ISD::AssertSext: ExpandIntRes_AssertSext(N, Lo, Hi); break;
+ case ISD::AssertZext: ExpandIntRes_AssertZext(N, Lo, Hi); break;
+ case ISD::BITREVERSE: ExpandIntRes_BITREVERSE(N, Lo, Hi); break;
+ case ISD::BSWAP: ExpandIntRes_BSWAP(N, Lo, Hi); break;
+ case ISD::Constant: ExpandIntRes_Constant(N, Lo, Hi); break;
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTLZ: ExpandIntRes_CTLZ(N, Lo, Hi); break;
+ case ISD::CTPOP: ExpandIntRes_CTPOP(N, Lo, Hi); break;
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTTZ: ExpandIntRes_CTTZ(N, Lo, Hi); break;
+ case ISD::FP_TO_SINT: ExpandIntRes_FP_TO_SINT(N, Lo, Hi); break;
+ case ISD::FP_TO_UINT: ExpandIntRes_FP_TO_UINT(N, Lo, Hi); break;
+ case ISD::LOAD: ExpandIntRes_LOAD(cast<LoadSDNode>(N), Lo, Hi); break;
+ case ISD::MUL: ExpandIntRes_MUL(N, Lo, Hi); break;
+ case ISD::READCYCLECOUNTER: ExpandIntRes_READCYCLECOUNTER(N, Lo, Hi); break;
+ case ISD::SDIV: ExpandIntRes_SDIV(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND: ExpandIntRes_SIGN_EXTEND(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND_INREG: ExpandIntRes_SIGN_EXTEND_INREG(N, Lo, Hi); break;
+ case ISD::SREM: ExpandIntRes_SREM(N, Lo, Hi); break;
+ case ISD::TRUNCATE: ExpandIntRes_TRUNCATE(N, Lo, Hi); break;
+ case ISD::UDIV: ExpandIntRes_UDIV(N, Lo, Hi); break;
+ case ISD::UREM: ExpandIntRes_UREM(N, Lo, Hi); break;
+ case ISD::ZERO_EXTEND: ExpandIntRes_ZERO_EXTEND(N, Lo, Hi); break;
+ case ISD::ATOMIC_LOAD: ExpandIntRes_ATOMIC_LOAD(N, Lo, Hi); break;
+
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_CMP_SWAP: {
+ std::pair<SDValue, SDValue> Tmp = ExpandAtomic(N);
+ SplitInteger(Tmp.first, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Tmp.second);
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: {
+ AtomicSDNode *AN = cast<AtomicSDNode>(N);
+ SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::Other);
+ SDValue Tmp = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP, SDLoc(N), AN->getMemoryVT(), VTs,
+ N->getOperand(0), N->getOperand(1), N->getOperand(2), N->getOperand(3),
+ AN->getMemOperand(), AN->getSuccessOrdering(), AN->getFailureOrdering(),
+ AN->getSynchScope());
+
+ // Expanding to the strong ATOMIC_CMP_SWAP node means we can determine
+ // success simply by comparing the loaded value against the ingoing
+ // comparison.
+ SDValue Success = DAG.getSetCC(SDLoc(N), N->getValueType(1), Tmp,
+ N->getOperand(2), ISD::SETEQ);
+
+ SplitInteger(Tmp, Lo, Hi);
+ ReplaceValueWith(SDValue(N, 1), Success);
+ ReplaceValueWith(SDValue(N, 2), Tmp.getValue(1));
+ break;
+ }
+
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: ExpandIntRes_Logical(N, Lo, Hi); break;
+
+ case ISD::UMAX:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::SMIN: ExpandIntRes_MINMAX(N, Lo, Hi); break;
+
+ case ISD::ADD:
+ case ISD::SUB: ExpandIntRes_ADDSUB(N, Lo, Hi); break;
+
+ case ISD::ADDC:
+ case ISD::SUBC: ExpandIntRes_ADDSUBC(N, Lo, Hi); break;
+
+ case ISD::ADDE:
+ case ISD::SUBE: ExpandIntRes_ADDSUBE(N, Lo, Hi); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL: ExpandIntRes_Shift(N, Lo, Hi); break;
+
+ case ISD::SADDO:
+ case ISD::SSUBO: ExpandIntRes_SADDSUBO(N, Lo, Hi); break;
+ case ISD::UADDO:
+ case ISD::USUBO: ExpandIntRes_UADDSUBO(N, Lo, Hi); break;
+ case ISD::UMULO:
+ case ISD::SMULO: ExpandIntRes_XMULO(N, Lo, Hi); break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetExpandedInteger(SDValue(N, ResNo), Lo, Hi);
+}
+
+/// Lower an atomic node to the appropriate builtin call.
+std::pair <SDValue, SDValue> DAGTypeLegalizer::ExpandAtomic(SDNode *Node) {
+ unsigned Opc = Node->getOpcode();
+ MVT VT = cast<AtomicSDNode>(Node)->getMemoryVT().getSimpleVT();
+ RTLIB::Libcall LC = RTLIB::getSYNC(Opc, VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected atomic op or value type!");
+
+ return ExpandChainLibCall(LC, Node, false);
+}
+
+/// N is a shift by a value that needs to be expanded,
+/// and the shift amount is a constant 'Amt'. Expand the operation.
+void DAGTypeLegalizer::ExpandShiftByConstant(SDNode *N, const APInt &Amt,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc DL(N);
+ // Expand the incoming operand to be shifted, so that we have its parts
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ // Though Amt shouldn't usually be 0, it's possible. E.g. when legalization
+ // split a vector shift, like this: <op1, op2> SHL <0, 2>.
+ if (!Amt) {
+ Lo = InL;
+ Hi = InH;
+ return;
+ }
+
+ EVT NVT = InL.getValueType();
+ unsigned VTBits = N->getValueType(0).getSizeInBits();
+ unsigned NVTBits = NVT.getSizeInBits();
+ EVT ShTy = N->getOperand(1).getValueType();
+
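+ // Worked example (illustrative, for an i64 shift expanded into two i32
+ // parts, i.e. VTBits = 64 and NVTBits = 32): SHL by 40 gives Lo = 0 and
+ // Hi = InL << 8; SHL by 32 gives Lo = 0 and Hi = InL; SHL by 12 gives
+ // Lo = InL << 12 and Hi = (InH << 12) | (InL >> 20).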
+ if (N->getOpcode() == ISD::SHL) {
+ if (Amt.ugt(VTBits)) {
+ Lo = Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getConstant(0, DL, NVT);
+ Hi = DAG.getNode(ISD::SHL, DL,
+ NVT, InL, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = DAG.getConstant(0, DL, NVT);
+ Hi = InL;
+ } else {
+ Lo = DAG.getNode(ISD::SHL, DL, NVT, InL, DAG.getConstant(Amt, DL, ShTy));
+ Hi = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ }
+ return;
+ }
+
+ if (N->getOpcode() == ISD::SRL) {
+ if (Amt.ugt(VTBits)) {
+ Lo = Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getNode(ISD::SRL, DL,
+ NVT, InH, DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Hi = DAG.getConstant(0, DL, NVT);
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getConstant(0, DL, NVT);
+ } else {
+ Lo = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ Hi = DAG.getNode(ISD::SRL, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ }
+ return;
+ }
+
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ if (Amt.ugt(VTBits)) {
+ Hi = Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else if (Amt.ugt(NVTBits)) {
+ Lo = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(Amt - NVTBits, DL, ShTy));
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else if (Amt == NVTBits) {
+ Lo = InH;
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH,
+ DAG.getConstant(NVTBits - 1, DL, ShTy));
+ } else {
+ Lo = DAG.getNode(ISD::OR, DL, NVT,
+ DAG.getNode(ISD::SRL, DL, NVT, InL,
+ DAG.getConstant(Amt, DL, ShTy)),
+ DAG.getNode(ISD::SHL, DL, NVT, InH,
+ DAG.getConstant(-Amt + NVTBits, DL, ShTy)));
+ Hi = DAG.getNode(ISD::SRA, DL, NVT, InH, DAG.getConstant(Amt, DL, ShTy));
+ }
+}
+
+/// ExpandShiftWithKnownAmountBit - Try to determine whether we can simplify
+/// this shift based on knowledge of the high bit of the shift amount. If we
+/// can tell this, we know that it is >= 32 or < 32, without knowing the actual
+/// shift amount.
+bool DAGTypeLegalizer::
+ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ShTy = Amt.getValueType();
+ unsigned ShBits = ShTy.getScalarType().getSizeInBits();
+ unsigned NVTBits = NVT.getScalarType().getSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ SDLoc dl(N);
+
+ APInt HighBitMask = APInt::getHighBitsSet(ShBits, ShBits - Log2_32(NVTBits));
+ APInt KnownZero, KnownOne;
+ DAG.computeKnownBits(N->getOperand(1), KnownZero, KnownOne);
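+ // E.g. with NVTBits = 32, HighBitMask covers bits 5 and up of the shift
+ // amount: if any of them is known one the amount is >= 32; if all of them
+ // are known zero it is < 32.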
+
+ // If we don't know anything about the high bits, exit.
+ if (((KnownZero|KnownOne) & HighBitMask) == 0)
+ return false;
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ // If we know that any of the high bits of the shift amount are one, then we
+ // can do this as a couple of simple shifts.
+ if (KnownOne.intersects(HighBitMask)) {
+ // Mask out the high bit, which we know is set.
+ Amt = DAG.getNode(ISD::AND, dl, ShTy, Amt,
+ DAG.getConstant(~HighBitMask, dl, ShTy));
+
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL:
+ Lo = DAG.getConstant(0, dl, NVT); // Low part is zero.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt); // High part from Lo part.
+ return true;
+ case ISD::SRL:
+ Hi = DAG.getConstant(0, dl, NVT); // Hi part is zero.
+ Lo = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ case ISD::SRA:
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign extend high part.
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+ Lo = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt); // Lo part from Hi part.
+ return true;
+ }
+ }
+
+ // If we know that all of the high bits of the shift amount are zero, then we
+ // can do this as a couple of simple shifts.
+ if ((KnownZero & HighBitMask) == HighBitMask) {
+ // Calculate 31-x. 31 is used instead of 32 to avoid creating an undefined
+ // shift if x is zero. We can use XOR here because x is known to be smaller
+ // than 32.
+ SDValue Amt2 = DAG.getNode(ISD::XOR, dl, ShTy, Amt,
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+
+ unsigned Op1, Op2;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL: Op1 = ISD::SHL; Op2 = ISD::SRL; break;
+ case ISD::SRL:
+ case ISD::SRA: Op1 = ISD::SRL; Op2 = ISD::SHL; break;
+ }
+
+ // When shifting right the arithmetic for Lo and Hi is swapped.
+ if (N->getOpcode() != ISD::SHL)
+ std::swap(InL, InH);
+
+ // Use a little trick to get the bits that move from Lo to Hi. First
+ // shift by one bit.
+ SDValue Sh1 = DAG.getNode(Op2, dl, NVT, InL, DAG.getConstant(1, dl, ShTy));
+ // Then compute the remaining shift with amount-1.
+ SDValue Sh2 = DAG.getNode(Op2, dl, NVT, Sh1, Amt2);
+
+ Lo = DAG.getNode(N->getOpcode(), dl, NVT, InL, Amt);
+ Hi = DAG.getNode(ISD::OR, dl, NVT, DAG.getNode(Op1, dl, NVT, InH, Amt),Sh2);
+
+ if (N->getOpcode() != ISD::SHL)
+ std::swap(Hi, Lo);
+ return true;
+ }
+
+ return false;
+}
+
+/// ExpandShiftWithUnknownAmountBit - Fully general expansion of integer shift
+/// of any size.
+bool DAGTypeLegalizer::
+ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue Amt = N->getOperand(1);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ShTy = Amt.getValueType();
+ unsigned NVTBits = NVT.getSizeInBits();
+ assert(isPowerOf2_32(NVTBits) &&
+ "Expanded integer type size not a power of two!");
+ SDLoc dl(N);
+
+ // Get the incoming operand to be shifted.
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+
+ SDValue NVBitsNode = DAG.getConstant(NVTBits, dl, ShTy);
+ SDValue AmtExcess = DAG.getNode(ISD::SUB, dl, ShTy, Amt, NVBitsNode);
+ SDValue AmtLack = DAG.getNode(ISD::SUB, dl, ShTy, NVBitsNode, Amt);
+ SDValue isShort = DAG.getSetCC(dl, getSetCCResultType(ShTy),
+ Amt, NVBitsNode, ISD::SETULT);
+ SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(ShTy),
+ Amt, DAG.getConstant(0, dl, ShTy),
+ ISD::SETEQ);
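+ // Both the "short" (Amt < NVTBits) and "long" (Amt >= NVTBits) forms are
+ // computed and selected between. The extra isZero select guards the
+ // Amt == 0 case, where AmtLack would equal NVTBits and the cross shift
+ // would be out of range.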
+
+ SDValue LoS, HiS, LoL, HiL;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unknown shift");
+ case ISD::SHL:
+ // Short: ShAmt < NVTBits
+ LoS = DAG.getNode(ISD::SHL, dl, NVT, InL, Amt);
+ HiS = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SHL, dl, NVT, InH, Amt),
+ DAG.getNode(ISD::SRL, dl, NVT, InL, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ LoL = DAG.getConstant(0, dl, NVT); // Lo part is zero.
+ HiL = DAG.getNode(ISD::SHL, dl, NVT, InL, AmtExcess); // Hi from Lo part.
+
+ Lo = DAG.getSelect(dl, NVT, isShort, LoS, LoL);
+ Hi = DAG.getSelect(dl, NVT, isZero, InH,
+ DAG.getSelect(dl, NVT, isShort, HiS, HiL));
+ return true;
+ case ISD::SRL:
+ // Short: ShAmt < NVTBits
+ HiS = DAG.getNode(ISD::SRL, dl, NVT, InH, Amt);
+ LoS = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+ // FIXME: If Amt is zero, the following shift generates an undefined result
+ // on some architectures.
+ DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ HiL = DAG.getConstant(0, dl, NVT); // Hi part is zero.
+ LoL = DAG.getNode(ISD::SRL, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+ Lo = DAG.getSelect(dl, NVT, isZero, InL,
+ DAG.getSelect(dl, NVT, isShort, LoS, LoL));
+ Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
+ return true;
+ case ISD::SRA:
+ // Short: ShAmt < NVTBits
+ HiS = DAG.getNode(ISD::SRA, dl, NVT, InH, Amt);
+ LoS = DAG.getNode(ISD::OR, dl, NVT,
+ DAG.getNode(ISD::SRL, dl, NVT, InL, Amt),
+ DAG.getNode(ISD::SHL, dl, NVT, InH, AmtLack));
+
+ // Long: ShAmt >= NVTBits
+ HiL = DAG.getNode(ISD::SRA, dl, NVT, InH, // Sign of Hi part.
+ DAG.getConstant(NVTBits - 1, dl, ShTy));
+ LoL = DAG.getNode(ISD::SRA, dl, NVT, InH, AmtExcess); // Lo from Hi part.
+
+ Lo = DAG.getSelect(dl, NVT, isZero, InL,
+ DAG.getSelect(dl, NVT, isShort, LoS, LoL));
+ Hi = DAG.getSelect(dl, NVT, isShort, HiS, HiL);
+ return true;
+ }
+}
+
+static std::pair<ISD::CondCode, ISD::NodeType> getExpandedMinMaxOps(int Op) {
+
+ switch (Op) {
+ default: llvm_unreachable("invalid min/max opcode");
+ case ISD::SMAX:
+ return std::make_pair(ISD::SETGT, ISD::UMAX);
+ case ISD::UMAX:
+ return std::make_pair(ISD::SETUGT, ISD::UMAX);
+ case ISD::SMIN:
+ return std::make_pair(ISD::SETLT, ISD::UMIN);
+ case ISD::UMIN:
+ return std::make_pair(ISD::SETULT, ISD::UMIN);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MINMAX(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc DL(N);
+ ISD::NodeType LoOpc;
+ ISD::CondCode CondC;
+ std::tie(CondC, LoOpc) = getExpandedMinMaxOps(N->getOpcode());
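+ // E.g. for SMAX the Hi parts are compared with SETGT, while the Lo parts
+ // (which only matter when the Hi parts are equal) are combined with the
+ // unsigned UMAX.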
+
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+ // Value types
+ EVT NVT = LHSL.getValueType();
+ EVT CCT = getSetCCResultType(NVT);
+
+ // Hi part is always the same op
+ Hi = DAG.getNode(N->getOpcode(), DL, {NVT, NVT}, {LHSH, RHSH});
+
+ // We need to know whether to select the Lo part that corresponds to the
+ // 'winning' Hi part, or to handle the case where the Hi parts are equal.
+ SDValue IsHiLeft = DAG.getSetCC(DL, CCT, LHSH, RHSH, CondC);
+ SDValue IsHiEq = DAG.getSetCC(DL, CCT, LHSH, RHSH, ISD::SETEQ);
+
+ // Lo part corresponding to the 'winning' Hi part
+ SDValue LoCmp = DAG.getSelect(DL, NVT, IsHiLeft, LHSL, RHSL);
+
+ // Lo part to use if the Hi parts are equal; this uses the unsigned version.
+ SDValue LoMinMax = DAG.getNode(LoOpc, DL, {NVT, NVT}, {LHSL, RHSL});
+
+ Lo = DAG.getSelect(DL, NVT, IsHiEq, LoMinMax, LoCmp);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUB(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+
+ EVT NVT = LHSL.getValueType();
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ // Do not generate ADDC/ADDE or SUBC/SUBE if the target does not support
+ // them. TODO: Teach operation legalization how to expand unsupported
+ // ADDC/ADDE/SUBC/SUBE. The problem is that these operations generate
+ // a carry of type MVT::Glue, but there doesn't seem to be any way to
+ // generate a value of this type in the expanded code sequence.
+ bool hasCarry =
+ TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+ ISD::ADDC : ISD::SUBC,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+
+ if (hasCarry) {
+ SDVTList VTList = DAG.getVTList(NVT, MVT::Glue);
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps);
+ }
+ return;
+ }
+
+ bool hasOVF =
+ TLI.isOperationLegalOrCustom(N->getOpcode() == ISD::ADD ?
+ ISD::UADDO : ISD::USUBO,
+ TLI.getTypeToExpandTo(*DAG.getContext(), NVT));
+ if (hasOVF) {
+ SDVTList VTList = DAG.getVTList(NVT, NVT);
+ TargetLoweringBase::BooleanContent BoolType = TLI.getBooleanContents(NVT);
+ int RevOpc;
+ if (N->getOpcode() == ISD::ADD) {
+ RevOpc = ISD::SUB;
+ Lo = DAG.getNode(ISD::UADDO, dl, VTList, LoOps);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2));
+ } else {
+ RevOpc = ISD::ADD;
+ Lo = DAG.getNode(ISD::USUBO, dl, VTList, LoOps);
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2));
+ }
+ SDValue OVF = Lo.getValue(1);
+
+ switch (BoolType) {
+ case TargetLoweringBase::UndefinedBooleanContent:
+ OVF = DAG.getNode(ISD::AND, dl, NVT, DAG.getConstant(1, dl, NVT), OVF);
+ // Fallthrough
+ case TargetLoweringBase::ZeroOrOneBooleanContent:
+ Hi = DAG.getNode(N->getOpcode(), dl, NVT, Hi, OVF);
+ break;
+ case TargetLoweringBase::ZeroOrNegativeOneBooleanContent:
+ Hi = DAG.getNode(RevOpc, dl, NVT, Hi, OVF);
+ }
+ return;
+ }
+
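+ // Neither ADDC/ADDE nor UADDO/USUBO is available, so compute the carry or
+ // borrow explicitly: for an add, the low result wrapped iff it is
+ // unsigned-less-than either low operand; for a sub, a borrow is needed iff
+ // the first low operand is unsigned-less-than the second.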
+ if (N->getOpcode() == ISD::ADD) {
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, LoOps);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, makeArrayRef(HiOps, 2));
+ SDValue Cmp1 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[0],
+ ISD::SETULT);
+ SDValue Carry1 = DAG.getSelect(dl, NVT, Cmp1,
+ DAG.getConstant(1, dl, NVT),
+ DAG.getConstant(0, dl, NVT));
+ SDValue Cmp2 = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo, LoOps[1],
+ ISD::SETULT);
+ SDValue Carry2 = DAG.getSelect(dl, NVT, Cmp2,
+ DAG.getConstant(1, dl, NVT), Carry1);
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, Hi, Carry2);
+ } else {
+ Lo = DAG.getNode(ISD::SUB, dl, NVT, LoOps);
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, makeArrayRef(HiOps, 2));
+ SDValue Cmp =
+ DAG.getSetCC(dl, getSetCCResultType(LoOps[0].getValueType()),
+ LoOps[0], LoOps[1], ISD::SETULT);
+ SDValue Borrow = DAG.getSelect(dl, NVT, Cmp,
+ DAG.getConstant(1, dl, NVT),
+ DAG.getConstant(0, dl, NVT));
+ Hi = DAG.getNode(ISD::SUB, dl, NVT, Hi, Borrow);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBC(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+ SDValue LoOps[2] = { LHSL, RHSL };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ if (N->getOpcode() == ISD::ADDC) {
+ Lo = DAG.getNode(ISD::ADDC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::ADDE, dl, VTList, HiOps);
+ } else {
+ Lo = DAG.getNode(ISD::SUBC, dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(ISD::SUBE, dl, VTList, HiOps);
+ }
+
+ // Legalize the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ADDSUBE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH, RHSL, RHSH;
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ GetExpandedInteger(N->getOperand(1), RHSL, RHSH);
+ SDVTList VTList = DAG.getVTList(LHSL.getValueType(), MVT::Glue);
+ SDValue LoOps[3] = { LHSL, RHSL, N->getOperand(2) };
+ SDValue HiOps[3] = { LHSH, RHSH };
+
+ Lo = DAG.getNode(N->getOpcode(), dl, VTList, LoOps);
+ HiOps[2] = Lo.getValue(1);
+ Hi = DAG.getNode(N->getOpcode(), dl, VTList, HiOps);
+
+ // Legalize the flag result - switch anything that used the old flag to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ANY_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is any extension of the input (which degenerates to a copy).
+ Lo = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Op);
+ Hi = DAG.getUNDEF(NVT); // The high part is undefined.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertSext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertSext, dl, NVT, Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertSext, dl, NVT, Lo, DAG.getValueType(EVT));
+ // The high part replicates the sign bit of Lo, make it explicit.
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(NVTBits - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_AssertZext(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ unsigned NVTBits = NVT.getSizeInBits();
+ unsigned EVTBits = EVT.getSizeInBits();
+
+ if (NVTBits < EVTBits) {
+ Hi = DAG.getNode(ISD::AssertZext, dl, NVT, Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ EVTBits - NVTBits)));
+ } else {
+ Lo = DAG.getNode(ISD::AssertZext, dl, NVT, Lo, DAG.getValueType(EVT));
+ // The high part must be zero, make it explicit.
+ Hi = DAG.getConstant(0, dl, NVT);
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BITREVERSE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands.
+ Lo = DAG.getNode(ISD::BITREVERSE, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::BITREVERSE, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_BSWAP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Hi, Lo); // Note swapped operands.
+ Lo = DAG.getNode(ISD::BSWAP, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::BSWAP, dl, Hi.getValueType(), Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Constant(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned NBitWidth = NVT.getSizeInBits();
+ auto Constant = cast<ConstantSDNode>(N);
+ const APInt &Cst = Constant->getAPIntValue();
+ bool IsTarget = Constant->isTargetOpcode();
+ bool IsOpaque = Constant->isOpaque();
+ SDLoc dl(N);
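+ // E.g. an i64 constant 0x123456789ABCDEF0 expanded into two i32 halves
+ // yields Lo = 0x9ABCDEF0 and Hi = 0x12345678.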
+ Lo = DAG.getConstant(Cst.trunc(NBitWidth), dl, NVT, IsTarget, IsOpaque);
+ Hi = DAG.getConstant(Cst.lshr(NBitWidth).trunc(NBitWidth), dl, NVT, IsTarget,
+ IsOpaque);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTLZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctlz (HiLo) -> Hi != 0 ? ctlz(Hi) : (ctlz(Lo)+32)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+
+ SDValue HiNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Hi,
+ DAG.getConstant(0, dl, NVT), ISD::SETNE);
+
+ SDValue LoLZ = DAG.getNode(N->getOpcode(), dl, NVT, Lo);
+ SDValue HiLZ = DAG.getNode(ISD::CTLZ_ZERO_UNDEF, dl, NVT, Hi);
+
+ Lo = DAG.getSelect(dl, NVT, HiNotZero, HiLZ,
+ DAG.getNode(ISD::ADD, dl, NVT, LoLZ,
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ NVT)));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTPOP(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // ctpop(HiLo) -> ctpop(Hi)+ctpop(Lo)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, DAG.getNode(ISD::CTPOP, dl, NVT, Lo),
+ DAG.getNode(ISD::CTPOP, dl, NVT, Hi));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_CTTZ(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ // cttz (HiLo) -> Lo != 0 ? cttz(Lo) : (cttz(Hi)+32)
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT NVT = Lo.getValueType();
+
+ SDValue LoNotZero = DAG.getSetCC(dl, getSetCCResultType(NVT), Lo,
+ DAG.getConstant(0, dl, NVT), ISD::SETNE);
+
+ SDValue LoLZ = DAG.getNode(ISD::CTTZ_ZERO_UNDEF, dl, NVT, Lo);
+ SDValue HiLZ = DAG.getNode(N->getOpcode(), dl, NVT, Hi);
+
+ Lo = DAG.getSelect(dl, NVT, LoNotZero, LoLZ,
+ DAG.getNode(ISD::ADD, dl, NVT, HiLZ,
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ NVT)));
+ Hi = DAG.getConstant(0, dl, NVT);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue Op = N->getOperand(0);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
+ Op = GetPromotedFloat(Op);
+
+ RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, true/*irrelevant*/, dl).first,
+ Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue Op = N->getOperand(0);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
+ Op = GetPromotedFloat(Op);
+
+ RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Op, false/*irrelevant*/, dl).first,
+ Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ if (ISD::isNormalLoad(N)) {
+ ExpandRes_NormalLoad(N, Lo, Hi);
+ return;
+ }
+
+ assert(ISD::isUNINDEXEDLoad(N) && "Indexed load during type legalization!");
+
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ unsigned Alignment = N->getAlignment();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDLoc dl(N);
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ EVT MemVT = N->getMemoryVT();
+
+ Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT,
+ Alignment, MMOFlags, AAInfo);
+
+ // Remember the chain.
+ Ch = Lo.getValue(1);
+
+ if (ExtType == ISD::SEXTLOAD) {
+ // The high part is obtained by SRA'ing all but one of the bits of the
+ // lo part.
+ unsigned LoSize = Lo.getValueType().getSizeInBits();
+ Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ } else if (ExtType == ISD::ZEXTLOAD) {
+ // The high part is just a zero.
+ Hi = DAG.getConstant(0, dl, NVT);
+ } else {
+ assert(ExtType == ISD::EXTLOAD && "Unknown extload!");
+ // The high part is undefined.
+ Hi = DAG.getUNDEF(NVT);
+ }
+ } else if (DAG.getDataLayout().isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+ } else {
+ // Big-endian - high bits are at low addresses. Favor aligned loads at
+ // the cost of some bit-fiddling.
+ EVT MemVT = N->getMemoryVT();
+ unsigned EBytes = MemVT.getStoreSize();
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
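+ // Illustrative case: an extending load of an i48 memory type expanded to
+ // two i32 parts has EBytes = 6, IncrementSize = 4 and ExcessBits = 16, so
+ // the first (aligned) load reads the high 32 bits of the value and the
+ // second zero-extends the remaining 16 low bits; the bit-fiddling below
+ // then moves the pieces into their proper halves.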
+
+ // Load both the high bits and maybe some of the low bits.
+ Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
+ EVT::getIntegerVT(*DAG.getContext(),
+ MemVT.getSizeInBits() - ExcessBits),
+ Alignment, MMOFlags, AAInfo);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ // Load the rest of the low bits.
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer low bits from the bottom of Hi to the top of Lo.
+ Lo = DAG.getNode(
+ ISD::OR, dl, NVT, Lo,
+ DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
+ // Move high bits to the right position in Hi.
+ Hi = DAG.getNode(ExtType == ISD::SEXTLOAD ? ISD::SRA : ISD::SRL, dl, NVT,
+ Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ }
+ }
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Ch);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Logical(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+ Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), LH, RH);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_MUL(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ SDValue LL, LH, RL, RH;
+ GetExpandedInteger(N->getOperand(0), LL, LH);
+ GetExpandedInteger(N->getOperand(1), RL, RH);
+
+ if (TLI.expandMUL(N, Lo, Hi, NVT, DAG, LL, LH, RL, RH))
+ return;
+
+ // If nothing else, we can make a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::MUL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::MUL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::MUL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::MUL_I128;
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL) {
+ // We'll expand the multiplication by brute force because we have no other
+ // options. This is a trivially-generalized version of the code from
+ // Hacker's Delight (itself derived from Knuth's Algorithm M from section
+ // 4.3.1).
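+ // Sketch of the decomposition below, assuming NVT = i32 (so Mask = 0xFFFF
+ // and Shift = 16): each 32-bit half is split into 16-bit pieces, the
+ // partial products are accumulated, and the carries out of the low half
+ // are added into the high half.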
+ SDValue Mask =
+ DAG.getConstant(APInt::getLowBitsSet(NVT.getSizeInBits(),
+ NVT.getSizeInBits() >> 1), dl, NVT);
+ SDValue LLL = DAG.getNode(ISD::AND, dl, NVT, LL, Mask);
+ SDValue RLL = DAG.getNode(ISD::AND, dl, NVT, RL, Mask);
+
+ SDValue T = DAG.getNode(ISD::MUL, dl, NVT, LLL, RLL);
+ SDValue TL = DAG.getNode(ISD::AND, dl, NVT, T, Mask);
+
+ SDValue Shift =
+ DAG.getConstant(NVT.getSizeInBits() >> 1, dl,
+ TLI.getShiftAmountTy(NVT, DAG.getDataLayout()));
+ SDValue TH = DAG.getNode(ISD::SRL, dl, NVT, T, Shift);
+ SDValue LLH = DAG.getNode(ISD::SRL, dl, NVT, LL, Shift);
+ SDValue RLH = DAG.getNode(ISD::SRL, dl, NVT, RL, Shift);
+
+ SDValue U = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLH, RLL), TL);
+ SDValue UL = DAG.getNode(ISD::AND, dl, NVT, U, Mask);
+ SDValue UH = DAG.getNode(ISD::SRL, dl, NVT, U, Shift);
+
+ SDValue V = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LLL, RLH), UL);
+ SDValue VH = DAG.getNode(ISD::SRL, dl, NVT, V, Shift);
+
+ SDValue W = DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, LL, RL),
+ DAG.getNode(ISD::ADD, dl, NVT, UH, VH));
+ Lo = DAG.getNode(ISD::ADD, dl, NVT, TH,
+ DAG.getNode(ISD::SHL, dl, NVT, V, Shift));
+
+ Hi = DAG.getNode(ISD::ADD, dl, NVT, W,
+ DAG.getNode(ISD::ADD, dl, NVT,
+ DAG.getNode(ISD::MUL, dl, NVT, RH, LL),
+ DAG.getNode(ISD::MUL, dl, NVT, RL, LH)));
+ return;
+ }
+
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true/*irrelevant*/, dl).first,
+ Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_READCYCLECOUNTER(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc DL(N);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDVTList VTs = DAG.getVTList(NVT, NVT, MVT::Other);
+ SDValue R = DAG.getNode(N->getOpcode(), DL, VTs, N->getOperand(0));
+ Lo = R.getValue(0);
+ Hi = R.getValue(1);
+ ReplaceValueWith(SDValue(N, 1), R.getValue(2));
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SADDSUBO(SDNode *Node,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue LHS = Node->getOperand(0);
+ SDValue RHS = Node->getOperand(1);
+ SDLoc dl(Node);
+
+ // Expand the result by simply replacing it with the equivalent
+ // non-overflow-checking operation.
+ SDValue Sum = DAG.getNode(Node->getOpcode() == ISD::SADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ SplitInteger(Sum, Lo, Hi);
+
+ // Compute the overflow.
+ //
+ // LHSSign -> LHS >= 0
+ // RHSSign -> RHS >= 0
+ // SumSign -> Sum >= 0
+ //
+ // Add:
+ // Overflow -> (LHSSign == RHSSign) && (LHSSign != SumSign)
+ // Sub:
+ // Overflow -> (LHSSign != RHSSign) && (LHSSign != SumSign)
+ //
+ EVT OType = Node->getValueType(1);
+ SDValue Zero = DAG.getConstant(0, dl, LHS.getValueType());
+
+ SDValue LHSSign = DAG.getSetCC(dl, OType, LHS, Zero, ISD::SETGE);
+ SDValue RHSSign = DAG.getSetCC(dl, OType, RHS, Zero, ISD::SETGE);
+ SDValue SignsMatch = DAG.getSetCC(dl, OType, LHSSign, RHSSign,
+ Node->getOpcode() == ISD::SADDO ?
+ ISD::SETEQ : ISD::SETNE);
+
+ SDValue SumSign = DAG.getSetCC(dl, OType, Sum, Zero, ISD::SETGE);
+ SDValue SumSignNE = DAG.getSetCC(dl, OType, LHSSign, SumSign, ISD::SETNE);
+
+ SDValue Cmp = DAG.getNode(ISD::AND, dl, OType, SignsMatch, SumSignNE);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(Node, 1), Cmp);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(0), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::SDIV_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SDIV!");
+
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_Shift(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // If we can emit an efficient shift operation, do so now. Check to see if
+ // the RHS is a constant.
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N->getOperand(1)))
+ return ExpandShiftByConstant(N, CN->getAPIntValue(), Lo, Hi);
+
+ // If we can determine that the high bit of the shift is zero or one, even if
+ // the low bits are variable, emit this shift in an optimized form.
+ if (ExpandShiftWithKnownAmountBit(N, Lo, Hi))
+ return;
+
+ // If this target supports shift_PARTS, use it. First, map to the _PARTS opc.
+ unsigned PartsOpc;
+ if (N->getOpcode() == ISD::SHL) {
+ PartsOpc = ISD::SHL_PARTS;
+ } else if (N->getOpcode() == ISD::SRL) {
+ PartsOpc = ISD::SRL_PARTS;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ PartsOpc = ISD::SRA_PARTS;
+ }
+
+ // Next check to see if the target supports this SHL_PARTS operation or if it
+ // will custom expand it.
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ TargetLowering::LegalizeAction Action = TLI.getOperationAction(PartsOpc, NVT);
+ if ((Action == TargetLowering::Legal && TLI.isTypeLegal(NVT)) ||
+ Action == TargetLowering::Custom) {
+ // Expand the subcomponents.
+ SDValue LHSL, LHSH;
+ GetExpandedInteger(N->getOperand(0), LHSL, LHSH);
+ EVT VT = LHSL.getValueType();
+
+ // If the shift amount operand is coming from a vector legalization it may
+ // have an illegal type. Fix that first by casting the operand, otherwise
+ // the new SHL_PARTS operation would need further legalization.
+ SDValue ShiftOp = N->getOperand(1);
+ EVT ShiftTy = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+ assert(ShiftTy.getScalarType().getSizeInBits() >=
+ Log2_32_Ceil(VT.getScalarType().getSizeInBits()) &&
+ "ShiftAmountTy is too small to cover the range of this type!");
+ if (ShiftOp.getValueType() != ShiftTy)
+ ShiftOp = DAG.getZExtOrTrunc(ShiftOp, dl, ShiftTy);
+
+ SDValue Ops[] = { LHSL, LHSH, ShiftOp };
+ Lo = DAG.getNode(PartsOpc, dl, DAG.getVTList(VT, VT), Ops);
+ Hi = Lo.getValue(1);
+ return;
+ }
+
+ // Otherwise, emit a libcall.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ bool isSigned;
+ if (N->getOpcode() == ISD::SHL) {
+ isSigned = false; /*sign irrelevant*/
+ if (VT == MVT::i16)
+ LC = RTLIB::SHL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SHL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SHL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SHL_I128;
+ } else if (N->getOpcode() == ISD::SRL) {
+ isSigned = false;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRL_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRL_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRL_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRL_I128;
+ } else {
+ assert(N->getOpcode() == ISD::SRA && "Unknown shift!");
+ isSigned = true;
+ if (VT == MVT::i16)
+ LC = RTLIB::SRA_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SRA_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SRA_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SRA_I128;
+ }
+
+ if (LC != RTLIB::UNKNOWN_LIBCALL && TLI.getLibcallName(LC)) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, isSigned, dl).first, Lo, Hi);
+ return;
+ }
+
+ if (!ExpandShiftWithUnknownAmountBit(N, Lo, Hi))
+ llvm_unreachable("Unsupported shift!");
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SIGN_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is sign extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::SIGN_EXTEND, dl, NVT, N->getOperand(0));
+ // The high part is obtained by SRA'ing all but one of the bits of the low part.
+ unsigned LoSize = NVT.getSizeInBits();
+ Hi = DAG.getNode(
+ ISD::SRA, dl, NVT, Lo,
+ DAG.getConstant(LoSize - 1, dl, TLI.getPointerTy(DAG.getDataLayout())));
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits =
+ Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::
+ExpandIntRes_SIGN_EXTEND_INREG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ EVT EVT = cast<VTSDNode>(N->getOperand(1))->getVT();
+
+ if (EVT.bitsLE(Lo.getValueType())) {
+ // sext_inreg the low part if needed.
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Lo.getValueType(), Lo,
+ N->getOperand(1));
+
+ // The high part gets the sign extension from the lo-part. This handles
+ // things like sextinreg V:i64 from i8.
+ Hi = DAG.getNode(ISD::SRA, dl, Hi.getValueType(), Lo,
+ DAG.getConstant(Hi.getValueType().getSizeInBits() - 1, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ } else {
+ // For example, extension of an i48 to an i64. Leave the low part alone,
+ // sext_inreg the high part.
+ unsigned ExcessBits =
+ EVT.getSizeInBits() - Lo.getValueType().getSizeInBits();
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Hi.getValueType(), Hi,
+ DAG.getValueType(EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits)));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_SREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::SDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::SDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(1), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::SREM_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::SREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::SREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::SREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported SREM!");
+
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, true, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_TRUNCATE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
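+ // E.g. a truncate to an i64 result that must itself be expanded into two
+ // i32 parts: Lo is the plain truncate of the operand and Hi is the
+ // truncate of the operand shifted right by 32.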
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, NVT, N->getOperand(0));
+ Hi = DAG.getNode(ISD::SRL, dl, N->getOperand(0).getValueType(),
+ N->getOperand(0),
+ DAG.getConstant(NVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, NVT, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDLoc dl(N);
+
+ // Expand the result by simply replacing it with the equivalent
+ // non-overflow-checking operation.
+ SDValue Sum = DAG.getNode(N->getOpcode() == ISD::UADDO ?
+ ISD::ADD : ISD::SUB, dl, LHS.getValueType(),
+ LHS, RHS);
+ SplitInteger(Sum, Lo, Hi);
+
+ // Calculate the overflow: addition overflows iff a + b < a, and subtraction
+ // overflows iff a - b > a.
+ SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Sum, LHS,
+ N->getOpcode () == ISD::UADDO ?
+ ISD::SETULT : ISD::SETUGT);
+
+ // Use the calculated overflow everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // A divide for UMULO should be faster than a function call.
+ if (N->getOpcode() == ISD::UMULO) {
+ SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
+
+ SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
+ SplitInteger(MUL, Lo, Hi);
+
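+ // In effect the check below computes: overflow iff (LHS * RHS) / RHS !=
+ // LHS, with the RHS == 0 case handled separately since multiplying by
+ // zero never overflows.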
+ // Select a non-zero divisor so the UDIV below is well defined even when
+ // RHS is zero.
+ SDValue isZero = DAG.getSetCC(dl, getSetCCResultType(VT),
+ RHS, DAG.getConstant(0, dl, VT), ISD::SETEQ);
+ SDValue NotZero = DAG.getSelect(dl, VT, isZero,
+ DAG.getConstant(1, dl, VT), RHS);
+ SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
+ SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
+ ISD::SETNE);
+ Overflow = DAG.getSelect(dl, N->getValueType(1), isZero,
+ DAG.getConstant(0, dl, N->getValueType(1)),
+ Overflow);
+ ReplaceValueWith(SDValue(N, 1), Overflow);
+ return;
+ }
+
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ EVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+
+ // Replace this with a libcall that will check overflow.
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i32)
+ LC = RTLIB::MULO_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::MULO_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::MULO_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported XMULO!");
+
+ SDValue Temp = DAG.CreateStackTemporary(PtrVT);
+ // Temporary for the overflow value, default it to zero.
+ SDValue Chain =
+ DAG.getStore(DAG.getEntryNode(), dl, DAG.getConstant(0, dl, PtrVT), Temp,
+ MachinePointerInfo());
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (const SDValue &Op : N->op_values()) {
+ EVT ArgVT = Op.getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Op;
+ Entry.Ty = ArgTy;
+ Entry.isSExt = true;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+ }
+
+ // Also pass the address of the overflow check.
+ Entry.Node = Temp;
+ Entry.Ty = PtrTy->getPointerTo();
+ Entry.isSExt = true;
+ Entry.isZExt = false;
+ Args.push_back(Entry);
+
+ SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(Chain)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
+ .setSExtResult();
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ SplitInteger(CallInfo.first, Lo, Hi);
+ SDValue Temp2 =
+ DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, MachinePointerInfo());
+ SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2,
+ DAG.getConstant(0, dl, PtrVT),
+ ISD::SETNE);
+ // Use the overflow from the libcall everywhere.
+ ReplaceValueWith(SDValue(N, 1), Ofl);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UDIV(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(0), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::UDIV_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::UDIV_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UDIV_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UDIV_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UDIV!");
+
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_UREM(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+
+ if (TLI.getOperationAction(ISD::UDIVREM, VT) == TargetLowering::Custom) {
+ SDValue Res = DAG.getNode(ISD::UDIVREM, dl, DAG.getVTList(VT, VT), Ops);
+ SplitInteger(Res.getValue(1), Lo, Hi);
+ return;
+ }
+
+ RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
+ if (VT == MVT::i16)
+ LC = RTLIB::UREM_I16;
+ else if (VT == MVT::i32)
+ LC = RTLIB::UREM_I32;
+ else if (VT == MVT::i64)
+ LC = RTLIB::UREM_I64;
+ else if (VT == MVT::i128)
+ LC = RTLIB::UREM_I128;
+ assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported UREM!");
+
+ SplitInteger(TLI.makeLibCall(DAG, LC, VT, Ops, false, dl).first, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ZERO_EXTEND(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ SDValue Op = N->getOperand(0);
+ if (Op.getValueType().bitsLE(NVT)) {
+ // The low part is zero extension of the input (degenerates to a copy).
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N->getOperand(0));
+ Hi = DAG.getConstant(0, dl, NVT); // The high part is just a zero.
+ } else {
+ // For example, extension of an i48 to an i64. The operand type necessarily
+ // promotes to the result type, so will end up being expanded too.
+ assert(getTypeAction(Op.getValueType()) ==
+ TargetLowering::TypePromoteInteger &&
+ "Only know how to promote this result!");
+ SDValue Res = GetPromotedInteger(Op);
+ assert(Res.getValueType() == N->getValueType(0) &&
+ "Operand over promoted?");
+ // Split the promoted operand. This will simplify when it is expanded.
+ SplitInteger(Res, Lo, Hi);
+ unsigned ExcessBits =
+ Op.getValueType().getSizeInBits() - NVT.getSizeInBits();
+ Hi = DAG.getZeroExtendInReg(Hi, dl,
+ EVT::getIntegerVT(*DAG.getContext(),
+ ExcessBits));
+ }
+}
+
+void DAGTypeLegalizer::ExpandIntRes_ATOMIC_LOAD(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(N);
+ EVT VT = cast<AtomicSDNode>(N)->getMemoryVT();
+ SDVTList VTs = DAG.getVTList(VT, MVT::i1, MVT::Other);
+ SDValue Zero = DAG.getConstant(0, dl, VT);
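+ // Lower the atomic load as a compare-and-swap of Zero against Zero: memory
+ // is never modified and the current value comes back as result 0.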
+ SDValue Swap = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(), VTs, N->getOperand(0),
+ N->getOperand(1), Zero, Zero, cast<AtomicSDNode>(N)->getMemOperand(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getSynchScope());
+
+ ReplaceValueWith(SDValue(N, 0), Swap.getValue(0));
+ ReplaceValueWith(SDValue(N, 1), Swap.getValue(2));
+}
+
+//===----------------------------------------------------------------------===//
+// Integer Operand Expansion
+//===----------------------------------------------------------------------===//
+
+/// ExpandIntegerOperand - This method is called when the specified operand of
+/// the specified node is found to need expansion. At this point, all of the
+/// result types of the node are known to be legal, but other operands of the
+/// node may need promotion or expansion as well as the specified one.
+bool DAGTypeLegalizer::ExpandIntegerOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Expand integer operand: "; N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "ExpandIntegerOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ llvm_unreachable("Do not know how to expand this operator's operand!");
+
+ case ISD::BITCAST: Res = ExpandOp_BITCAST(N); break;
+ case ISD::BR_CC: Res = ExpandIntOp_BR_CC(N); break;
+ case ISD::BUILD_VECTOR: Res = ExpandOp_BUILD_VECTOR(N); break;
+ case ISD::EXTRACT_ELEMENT: Res = ExpandOp_EXTRACT_ELEMENT(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = ExpandOp_INSERT_VECTOR_ELT(N); break;
+ case ISD::SCALAR_TO_VECTOR: Res = ExpandOp_SCALAR_TO_VECTOR(N); break;
+ case ISD::SELECT_CC: Res = ExpandIntOp_SELECT_CC(N); break;
+ case ISD::SETCC: Res = ExpandIntOp_SETCC(N); break;
+ case ISD::SETCCE: Res = ExpandIntOp_SETCCE(N); break;
+ case ISD::SINT_TO_FP: Res = ExpandIntOp_SINT_TO_FP(N); break;
+ case ISD::STORE: Res = ExpandIntOp_STORE(cast<StoreSDNode>(N), OpNo); break;
+ case ISD::TRUNCATE: Res = ExpandIntOp_TRUNCATE(N); break;
+ case ISD::UINT_TO_FP: Res = ExpandIntOp_UINT_TO_FP(N); break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR: Res = ExpandIntOp_Shift(N); break;
+ case ISD::RETURNADDR:
+ case ISD::FRAMEADDR: Res = ExpandIntOp_RETURNADDR(N); break;
+
+ case ISD::ATOMIC_STORE: Res = ExpandIntOp_ATOMIC_STORE(N); break;
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// IntegerExpandSetCCOperands - Expand the operands of a comparison. This code
+/// is shared among BR_CC, SELECT_CC, and SETCC handlers.
+void DAGTypeLegalizer::IntegerExpandSetCCOperands(SDValue &NewLHS,
+ SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl) {
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(NewLHS, LHSLo, LHSHi);
+ GetExpandedInteger(NewRHS, RHSLo, RHSHi);
+
+ if (CCCode == ISD::SETEQ || CCCode == ISD::SETNE) {
+ if (RHSLo == RHSHi) {
+ if (ConstantSDNode *RHSCST = dyn_cast<ConstantSDNode>(RHSLo)) {
+ if (RHSCST->isAllOnesValue()) {
+ // Equality comparison to -1.
+ NewLHS = DAG.getNode(ISD::AND, dl,
+ LHSLo.getValueType(), LHSLo, LHSHi);
+ NewRHS = RHSLo;
+ return;
+ }
+ }
+ }
+
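+ // Otherwise lower the wide equality test as
+ // ((LHSLo ^ RHSLo) | (LHSHi ^ RHSHi)) compared against zero.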
+ NewLHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSLo, RHSLo);
+ NewRHS = DAG.getNode(ISD::XOR, dl, LHSLo.getValueType(), LHSHi, RHSHi);
+ NewLHS = DAG.getNode(ISD::OR, dl, NewLHS.getValueType(), NewLHS, NewRHS);
+ NewRHS = DAG.getConstant(0, dl, NewLHS.getValueType());
+ return;
+ }
+
+ // If this is a comparison of the sign bit, just look at the top part.
+ // X > -1, X < 0
+ if (ConstantSDNode *CST = dyn_cast<ConstantSDNode>(NewRHS))
+ if ((CCCode == ISD::SETLT && CST->isNullValue()) || // X < 0
+ (CCCode == ISD::SETGT && CST->isAllOnesValue())) { // X > -1
+ NewLHS = LHSHi;
+ NewRHS = RHSHi;
+ return;
+ }
+
+ // FIXME: This generated code sucks.
+ ISD::CondCode LowCC;
+ switch (CCCode) {
+ default: llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETLT:
+ case ISD::SETULT: LowCC = ISD::SETULT; break;
+ case ISD::SETGT:
+ case ISD::SETUGT: LowCC = ISD::SETUGT; break;
+ case ISD::SETLE:
+ case ISD::SETULE: LowCC = ISD::SETULE; break;
+ case ISD::SETGE:
+ case ISD::SETUGE: LowCC = ISD::SETUGE; break;
+ }
+
+ // Tmp1 = lo(op1) < lo(op2) // Always unsigned comparison
+ // Tmp2 = hi(op1) < hi(op2) // Signedness depends on operands
+ // dest = hi(op1) == hi(op2) ? Tmp1 : Tmp2;
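+ // The low halves act as unsigned digits of the wide value; the sign lives
+ // in the high half, so signedness only affects the high-part comparison.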
+
+ // NOTE: on targets without efficient SELECT of bools, we can always use
+ // this identity: (B1 ? B2 : B3) --> (B1 & B2)|(!B1&B3)
+ TargetLowering::DAGCombinerInfo DagCombineInfo(DAG, AfterLegalizeTypes, true,
+ nullptr);
+ SDValue Tmp1, Tmp2;
+ if (TLI.isTypeLegal(LHSLo.getValueType()) &&
+ TLI.isTypeLegal(RHSLo.getValueType()))
+ Tmp1 = TLI.SimplifySetCC(getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, LowCC, false, DagCombineInfo, dl);
+ if (!Tmp1.getNode())
+ Tmp1 = DAG.getSetCC(dl, getSetCCResultType(LHSLo.getValueType()),
+ LHSLo, RHSLo, LowCC);
+ if (TLI.isTypeLegal(LHSHi.getValueType()) &&
+ TLI.isTypeLegal(RHSHi.getValueType()))
+ Tmp2 = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, CCCode, false, DagCombineInfo, dl);
+ if (!Tmp2.getNode())
+ Tmp2 = DAG.getNode(ISD::SETCC, dl,
+ getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, DAG.getCondCode(CCCode));
+
+ ConstantSDNode *Tmp1C = dyn_cast<ConstantSDNode>(Tmp1.getNode());
+ ConstantSDNode *Tmp2C = dyn_cast<ConstantSDNode>(Tmp2.getNode());
+ if ((Tmp1C && Tmp1C->isNullValue()) ||
+ (Tmp2C && Tmp2C->isNullValue() &&
+ (CCCode == ISD::SETLE || CCCode == ISD::SETGE ||
+ CCCode == ISD::SETUGE || CCCode == ISD::SETULE)) ||
+ (Tmp2C && Tmp2C->getAPIntValue() == 1 &&
+ (CCCode == ISD::SETLT || CCCode == ISD::SETGT ||
+ CCCode == ISD::SETUGT || CCCode == ISD::SETULT))) {
+ // If the low part is known false, return the high part.
+ // For LE / GE, if the high part is known false, ignore the low part.
+ // For LT / GT, if the high part is known true, ignore the low part.
+ NewLHS = Tmp2;
+ NewRHS = SDValue();
+ return;
+ }
+
+ if (LHSHi == RHSHi) {
+ // Comparing the low bits is enough.
+ NewLHS = Tmp1;
+ NewRHS = SDValue();
+ return;
+ }
+
+ // Lower with SETCCE if the target supports it.
+ // FIXME: Make all targets support this, then remove the other lowering.
+ if (TLI.getOperationAction(
+ ISD::SETCCE,
+ TLI.getTypeToExpandTo(*DAG.getContext(), LHSLo.getValueType())) ==
+ TargetLowering::Custom) {
+ // SETCCE can detect < and >= directly. For > and <=, flip operands and
+ // condition code.
+ bool FlipOperands = false;
+ switch (CCCode) {
+ case ISD::SETGT: CCCode = ISD::SETLT; FlipOperands = true; break;
+ case ISD::SETUGT: CCCode = ISD::SETULT; FlipOperands = true; break;
+ case ISD::SETLE: CCCode = ISD::SETGE; FlipOperands = true; break;
+ case ISD::SETULE: CCCode = ISD::SETUGE; FlipOperands = true; break;
+ default: break;
+ }
+ if (FlipOperands) {
+ std::swap(LHSLo, RHSLo);
+ std::swap(LHSHi, RHSHi);
+ }
+ // Perform a wide subtraction, feeding the carry from the low part into
+ // SETCCE. The SETCCE operation is essentially looking at the high part of
+ // the result of LHS - RHS. It is negative iff LHS < RHS. It is zero or
+ // positive iff LHS >= RHS.
+ SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue);
+ SDValue LowCmp = DAG.getNode(ISD::SUBC, dl, VTList, LHSLo, RHSLo);
+ SDValue Res =
+ DAG.getNode(ISD::SETCCE, dl, getSetCCResultType(LHSLo.getValueType()),
+ LHSHi, RHSHi, LowCmp.getValue(1), DAG.getCondCode(CCCode));
+ NewLHS = Res;
+ NewRHS = SDValue();
+ return;
+ }
+
+ NewLHS = TLI.SimplifySetCC(getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETEQ, false,
+ DagCombineInfo, dl);
+ if (!NewLHS.getNode())
+ NewLHS = DAG.getSetCC(dl, getSetCCResultType(LHSHi.getValueType()),
+ LHSHi, RHSHi, ISD::SETEQ);
+ NewLHS = DAG.getSelect(dl, Tmp1.getValueType(),
+ NewLHS, Tmp1, Tmp2);
+ NewRHS = SDValue();
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_BR_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(2), NewRHS = N->getOperand(3);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(1))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If IntegerExpandSetCCOperands returned a scalar, we need to compare the
+ // result against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0),
+ DAG.getCondCode(CCCode), NewLHS, NewRHS,
+ N->getOperand(4)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SELECT_CC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(4))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If IntegerExpandSetCCOperands returned a scalar, we need to compare the
+ // result against zero to select between true and false values.
+ if (!NewRHS.getNode()) {
+ NewRHS = DAG.getConstant(0, SDLoc(N), NewLHS.getValueType());
+ CCCode = ISD::SETNE;
+ }
+
+ // Update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ N->getOperand(2), N->getOperand(3),
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCC(SDNode *N) {
+ SDValue NewLHS = N->getOperand(0), NewRHS = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ IntegerExpandSetCCOperands(NewLHS, NewRHS, CCCode, SDLoc(N));
+
+ // If IntegerExpandSetCCOperands returned a scalar, use it.
+ if (!NewRHS.getNode()) {
+ assert(NewLHS.getValueType() == N->getValueType(0) &&
+ "Unexpected setcc expansion!");
+ return NewLHS;
+ }
+
+ // Otherwise, update N to have the operands specified.
+ return SDValue(DAG.UpdateNodeOperands(N, NewLHS, NewRHS,
+ DAG.getCondCode(CCCode)), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SETCCE(SDNode *N) {
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ SDValue Carry = N->getOperand(2);
+ SDValue Cond = N->getOperand(3);
+ SDLoc dl = SDLoc(N);
+
+ SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+ GetExpandedInteger(LHS, LHSLo, LHSHi);
+ GetExpandedInteger(RHS, RHSLo, RHSHi);
+
+ // Expand to a SUBE for the low part and a smaller SETCCE for the high.
+ SDVTList VTList = DAG.getVTList(LHSLo.getValueType(), MVT::Glue);
+ SDValue LowCmp = DAG.getNode(ISD::SUBE, dl, VTList, LHSLo, RHSLo, Carry);
+ return DAG.getNode(ISD::SETCCE, dl, N->getValueType(0), LHSHi, RHSHi,
+ LowCmp.getValue(1), Cond);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_Shift(SDNode *N) {
+ // The value being shifted is legal, but the shift amount is too big.
+ // It follows that either the result of the shift is undefined, or the
+ // upper half of the shift amount is zero. Just use the lower half.
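+ // For example, if an i64 shift amount is expanded on a 32-bit target, a
+ // non-zero high half implies an amount of at least 2^32, far larger than
+ // the bit width of any legal shifted type, so the shift result would be
+ // undefined in that case anyway.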
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(1), Lo, Hi);
+ return SDValue(DAG.UpdateNodeOperands(N, N->getOperand(0), Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_RETURNADDR(SDNode *N) {
+ // The argument of the RETURNADDR / FRAMEADDR builtin is a 32-bit constant.
+ // This causes problems on 8/16-bit targets, so just truncate the constant
+ // to a valid type.
+ SDValue Lo, Hi;
+ GetExpandedInteger(N->getOperand(0), Lo, Hi);
+ return SDValue(DAG.UpdateNodeOperands(N, Lo), 0);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_SINT_TO_FP(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ RTLIB::Libcall LC = RTLIB::getSINTTOFP(Op.getValueType(), DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this SINT_TO_FP!");
+ return TLI.makeLibCall(DAG, LC, DstVT, Op, true, SDLoc(N)).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ if (ISD::isNormalStore(N))
+ return ExpandOp_NormalStore(N, OpNo);
+
+ assert(ISD::isUNINDEXEDStore(N) && "Indexed store during type legalization!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+
+ EVT VT = N->getOperand(1).getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ unsigned Alignment = N->getAlignment();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDLoc dl(N);
+ SDValue Lo, Hi;
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ if (N->getMemoryVT().bitsLE(NVT)) {
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+ return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+ N->getMemoryVT(), Alignment, MMOFlags, AAInfo);
+ }
+
+ if (DAG.getDataLayout().isLittleEndian()) {
+ // Little-endian - low bits are at low addresses.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
+
+ unsigned ExcessBits =
+ N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
+ EVT NEVT = EVT::getIntegerVT(*DAG.getContext(), ExcessBits);
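+ // The second store covers only the ExcessBits that did not fit in the first
+ // NVT-sized store; a truncating store of Hi writes just its low ExcessBits.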
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getTruncStore(
+ Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+ }
+
+ // Big-endian - high bits are at low addresses. Favor aligned stores at
+ // the cost of some bit-fiddling.
+ GetExpandedInteger(N->getValue(), Lo, Hi);
+
+ EVT ExtVT = N->getMemoryVT();
+ unsigned EBytes = ExtVT.getStoreSize();
+ unsigned IncrementSize = NVT.getSizeInBits()/8;
+ unsigned ExcessBits = (EBytes - IncrementSize)*8;
+ EVT HiVT = EVT::getIntegerVT(*DAG.getContext(),
+ ExtVT.getSizeInBits() - ExcessBits);
+
+ if (ExcessBits < NVT.getSizeInBits()) {
+ // Transfer high bits from the top of Lo to the bottom of Hi.
+ Hi = DAG.getNode(ISD::SHL, dl, NVT, Hi,
+ DAG.getConstant(NVT.getSizeInBits() - ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(
+ ISD::OR, dl, NVT, Hi,
+ DAG.getNode(ISD::SRL, dl, NVT, Lo,
+ DAG.getConstant(ExcessBits, dl,
+ TLI.getPointerTy(DAG.getDataLayout()))));
+ }
+
+ // Store both the high bits and maybe some of the low bits.
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, Alignment,
+ MMOFlags, AAInfo);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ // Store the lowest ExcessBits bits in the second half.
+ Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
+ MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_TRUNCATE(SDNode *N) {
+ SDValue InL, InH;
+ GetExpandedInteger(N->getOperand(0), InL, InH);
+ // Just truncate the low part of the source.
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), N->getValueType(0), InL);
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ EVT SrcVT = Op.getValueType();
+ EVT DstVT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // The following optimization is valid only if every value in SrcVT (when
+ // treated as signed) is representable in DstVT. Check that the mantissa
+ // size of DstVT is at least the number of bits in SrcVT minus 1.
+ const fltSemantics &sem = DAG.EVTToAPFloatSemantics(DstVT);
+ if (APFloat::semanticsPrecision(sem) >= SrcVT.getSizeInBits()-1 &&
+ TLI.getOperationAction(ISD::SINT_TO_FP, SrcVT) == TargetLowering::Custom){
+ // Do a signed conversion then adjust the result.
+ SDValue SignedConv = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Op);
+ SignedConv = TLI.LowerOperation(SignedConv, DAG);
+
+ // The result of the signed conversion needs adjusting if the 'sign bit' of
+ // the incoming integer was set. To handle this, we dynamically test to see
+ // if it is set, and, if so, add a fudge factor.
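+ // For example, an i64 source with its sign bit set is interpreted by the
+ // signed conversion as a value 2^64 too small; adding 2^64 as the fudge
+ // factor produces the intended unsigned result.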
+
+ const uint64_t F32TwoE32 = 0x4F800000ULL;
+ const uint64_t F32TwoE64 = 0x5F800000ULL;
+ const uint64_t F32TwoE128 = 0x7F800000ULL;
+
+ APInt FF(32, 0);
+ if (SrcVT == MVT::i32)
+ FF = APInt(32, F32TwoE32);
+ else if (SrcVT == MVT::i64)
+ FF = APInt(32, F32TwoE64);
+ else if (SrcVT == MVT::i128)
+ FF = APInt(32, F32TwoE128);
+ else
+ llvm_unreachable("Unsupported UINT_TO_FP!");
+
+ // Check whether the sign bit is set.
+ SDValue Lo, Hi;
+ GetExpandedInteger(Op, Lo, Hi);
+ SDValue SignSet = DAG.getSetCC(dl,
+ getSetCCResultType(Hi.getValueType()),
+ Hi,
+ DAG.getConstant(0, dl, Hi.getValueType()),
+ ISD::SETLT);
+
+ // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ SDValue FudgePtr =
+ DAG.getConstantPool(ConstantInt::get(*DAG.getContext(), FF.zext(64)),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
+ SDValue Zero = DAG.getIntPtrConstant(0, dl);
+ SDValue Four = DAG.getIntPtrConstant(4, dl);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet,
+ Zero, Four);
+ unsigned Alignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlignment();
+ FudgePtr = DAG.getNode(ISD::ADD, dl, FudgePtr.getValueType(),
+ FudgePtr, Offset);
+ Alignment = std::min(Alignment, 4u);
+
+ // Load the value out, extending it from f32 to the destination float type.
+ // FIXME: Avoid the extend by constructing the right constant pool?
+ SDValue Fudge = DAG.getExtLoad(
+ ISD::EXTLOAD, dl, DstVT, DAG.getEntryNode(), FudgePtr,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
+ Alignment);
+ return DAG.getNode(ISD::FADD, dl, DstVT, SignedConv, Fudge);
+ }
+
+ // Otherwise, use a libcall.
+ RTLIB::Libcall LC = RTLIB::getUINTTOFP(SrcVT, DstVT);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Don't know how to expand this UINT_TO_FP!");
+ return TLI.makeLibCall(DAG, LC, DstVT, Op, true, dl).first;
+}
+
+SDValue DAGTypeLegalizer::ExpandIntOp_ATOMIC_STORE(SDNode *N) {
+ SDLoc dl(N);
+ SDValue Swap = DAG.getAtomic(ISD::ATOMIC_SWAP, dl,
+ cast<AtomicSDNode>(N)->getMemoryVT(),
+ N->getOperand(0),
+ N->getOperand(1), N->getOperand(2),
+ cast<AtomicSDNode>(N)->getMemOperand(),
+ cast<AtomicSDNode>(N)->getOrdering(),
+ cast<AtomicSDNode>(N)->getSynchScope());
+ return Swap.getValue(1);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDValue InOp0 = N->getOperand(0);
+ EVT InVT = InOp0.getValueType();
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ unsigned OutNumElems = OutVT.getVectorNumElements();
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDLoc dl(N);
+ SDValue BaseIdx = N->getOperand(1);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.reserve(OutNumElems);
+ for (unsigned i = 0; i != OutNumElems; ++i) {
+
+ // Extract the element from the original vector.
+ SDValue Index = DAG.getNode(ISD::ADD, dl, BaseIdx.getValueType(),
+ BaseIdx, DAG.getConstant(i, dl, BaseIdx.getValueType()));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ InVT.getVectorElementType(), N->getOperand(0), Index);
+
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, Ext);
+ // Insert the converted element to the new vector.
+ Ops.push_back(Op);
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_VECTOR_SHUFFLE(SDNode *N) {
+ ShuffleVectorSDNode *SV = cast<ShuffleVectorSDNode>(N);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ ArrayRef<int> NewMask = SV->getMask().slice(0, VT.getVectorNumElements());
+
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = GetPromotedInteger(N->getOperand(1));
+ EVT OutVT = V0.getValueType();
+
+ return DAG.getVectorShuffle(OutVT, dl, V0, V1, NewMask);
+}
+
+
+SDValue DAGTypeLegalizer::PromoteIntRes_BUILD_VECTOR(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ unsigned NumElems = N->getNumOperands();
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDLoc dl(N);
+
+ SmallVector<SDValue, 8> Ops;
+ Ops.reserve(NumElems);
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Op;
+ // BUILD_VECTOR integer operand types are allowed to be larger than the
+ // result's element type. This may still be true after the promotion. For
+ // example, we might be promoting (<v?i1> = BV <i32>, <i32>, ...) to
+ // (v?i16 = BV <i32>, <i32>, ...), and we can't any_extend <i32> to <i16>.
+ if (N->getOperand(i).getValueType().bitsLT(NOutVTElem))
+ Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(i));
+ else
+ Op = N->getOperand(i);
+ Ops.push_back(Op);
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) {
+
+ SDLoc dl(N);
+
+ assert(!N->getOperand(0).getValueType().isVector() &&
+ "Input must be a scalar");
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDValue Op = DAG.getNode(ISD::ANY_EXTEND, dl, NOutVTElem, N->getOperand(0));
+
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NOutVT, Op);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
+ SDLoc dl(N);
+
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ EVT InElemTy = OutVT.getVectorElementType();
+ EVT OutElemTy = NOutVT.getVectorElementType();
+
+ unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements();
+ unsigned NumOutElem = NOutVT.getVectorNumElements();
+ unsigned NumOperands = N->getNumOperands();
+ assert(NumElem * NumOperands == NumOutElem &&
+ "Unexpected number of elements");
+
+ // Take the elements from the first vector.
+ SmallVector<SDValue, 8> Ops(NumOutElem);
+ for (unsigned i = 0; i < NumOperands; ++i) {
+ SDValue Op = N->getOperand(i);
+ for (unsigned j = 0; j < NumElem; ++j) {
+ SDValue Ext = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InElemTy, Op,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext);
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ assert(NOutVT.isVector() && "This type must be promoted to a vector type");
+
+ EVT NOutVTElem = NOutVT.getVectorElementType();
+
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+
+ SDValue ConvElem = DAG.getNode(ISD::ANY_EXTEND, dl,
+ NOutVTElem, N->getOperand(1));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NOutVT,
+ V0, ConvElem, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ SDValue V1 = DAG.getZExtOrTrunc(N->getOperand(1), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ V0->getValueType(0).getScalarType(), V0, V1);
+
+ // EXTRACT_VECTOR_ELT can return types which are wider than the incoming
+ // element types. If this is the case then we need to expand the outgoing
+ // value and not truncate it.
+ return DAG.getAnyExtOrTrunc(Ext, dl, N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDLoc dl(N);
+ SDValue V0 = GetPromotedInteger(N->getOperand(0));
+ MVT InVT = V0.getValueType().getSimpleVT();
+ MVT OutVT = MVT::getVectorVT(InVT.getVectorElementType(),
+ N->getValueType(0).getVectorNumElements());
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, V0, N->getOperand(1));
+ return DAG.getNode(ISD::TRUNCATE, dl, N->getValueType(0), Ext);
+}
+
+SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
+ SDLoc dl(N);
+ unsigned NumElems = N->getNumOperands();
+
+ EVT RetSclrTy = N->getValueType(0).getVectorElementType();
+
+ SmallVector<SDValue, 8> NewOps;
+ NewOps.reserve(NumElems);
+
+ // For each incoming vector
+ for (unsigned VecIdx = 0; VecIdx != NumElems; ++VecIdx) {
+ SDValue Incoming = GetPromotedInteger(N->getOperand(VecIdx));
+ EVT SclrTy = Incoming->getValueType(0).getVectorElementType();
+ unsigned NumElem = Incoming->getValueType(0).getVectorNumElements();
+
+ for (unsigned i=0; i<NumElem; ++i) {
+ // Extract element from incoming vector
+ SDValue Ex = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Incoming,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
+ NewOps.push_back(Tr);
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, N->getValueType(0), NewOps);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
new file mode 100644
index 000000000000..144bed241ee7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -0,0 +1,1177 @@
+//===-- LegalizeTypes.cpp - Common code for DAG type legalizer ------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeTypes method. It transforms
+// an arbitrary well-formed SelectionDAG to only consist of legal types. This
+// is common code shared among the LegalizeTypes*.cpp files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+static cl::opt<bool>
+EnableExpensiveChecks("enable-legalize-types-checking", cl::Hidden);
+
+/// Do extensive, expensive sanity checking.
+void DAGTypeLegalizer::PerformExpensiveChecks() {
+ // If a node is not processed, then none of its values should be mapped by any
+ // of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+
+ // If a node is processed, then each value with an illegal type must be mapped
+ // by exactly one of PromotedIntegers, ExpandedIntegers, ..., ReplacedValues.
+ // Values with a legal type may be mapped by ReplacedValues, but not by any of
+ // the other maps.
+
+ // Note that these invariants may not hold momentarily when processing a node:
+ // the node being processed may be put in a map before being marked Processed.
+
+ // Note that it is possible to have nodes marked NewNode in the DAG. This can
+ // occur in two ways. Firstly, a node may be created during legalization but
+ // never passed to the legalization core. This is usually due to the implicit
+ // folding that occurs when using the DAG.getNode operators. Secondly, a new
+ // node may be passed to the legalization core, but when analyzed may morph
+ // into a different node, leaving the original node as a NewNode in the DAG.
+ // A node may morph if one of its operands changes during analysis. Whether
+ // it actually morphs or not depends on whether, after updating its operands,
+ // it is equivalent to an existing node: if so, it morphs into that existing
+ // node (CSE). An operand can change during analysis if the operand is a new
+ // node that morphs, or it is a processed value that was mapped to some other
+ // value (as recorded in ReplacedValues) in which case the operand is turned
+ // into that other value. If a node morphs then the node it morphed into will
+ // be used instead of it for legalization, however the original node continues
+ // to live on in the DAG.
+ // The conclusion is that though there may be nodes marked NewNode in the DAG,
+ // all uses of such nodes are also marked NewNode: the result is a fungus of
+ // NewNodes growing on top of the useful nodes, and perhaps using them, but
+ // not used by them.
+
+ // If a value is mapped by ReplacedValues, then it must have no uses, except
+ // by nodes marked NewNode (see above).
+
+ // The final node obtained by mapping by ReplacedValues is not marked NewNode.
+ // Note that ReplacedValues should be applied iteratively.
+
+ // Note that the ReplacedValues map may also map deleted nodes (by iterating
+ // over the DAG we never dereference deleted nodes). This means that it may
+ // also map nodes marked NewNode if the deallocated memory was reallocated as
+ // another node, and that new node was not seen by the LegalizeTypes machinery
+ // (for example because it was created but not used). In general, we cannot
+ // distinguish between new nodes and deleted nodes.
+ SmallVector<SDNode*, 16> NewNodes;
+ for (SDNode &Node : DAG.allnodes()) {
+ // Remember nodes marked NewNode - they are subject to extra checking below.
+ if (Node.getNodeId() == NewNode)
+ NewNodes.push_back(&Node);
+
+ for (unsigned i = 0, e = Node.getNumValues(); i != e; ++i) {
+ SDValue Res(&Node, i);
+ bool Failed = false;
+
+ unsigned Mapped = 0;
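+ // Mapped collects one bit per transformation map that contains this value;
+ // the bit assignments match the names printed in the failure dump below.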
+ if (ReplacedValues.find(Res) != ReplacedValues.end()) {
+ Mapped |= 1;
+ // Check that remapped values are only used by nodes marked NewNode.
+ for (SDNode::use_iterator UI = Node.use_begin(), UE = Node.use_end();
+ UI != UE; ++UI)
+ if (UI.getUse().getResNo() == i)
+ assert(UI->getNodeId() == NewNode &&
+ "Remapped value has non-trivial use!");
+
+ // Check that the final result of applying ReplacedValues is not
+ // marked NewNode.
+ SDValue NewVal = ReplacedValues[Res];
+ DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(NewVal);
+ while (I != ReplacedValues.end()) {
+ NewVal = I->second;
+ I = ReplacedValues.find(NewVal);
+ }
+ assert(NewVal.getNode()->getNodeId() != NewNode &&
+ "ReplacedValues maps to a new node!");
+ }
+ if (PromotedIntegers.find(Res) != PromotedIntegers.end())
+ Mapped |= 2;
+ if (SoftenedFloats.find(Res) != SoftenedFloats.end())
+ Mapped |= 4;
+ if (ScalarizedVectors.find(Res) != ScalarizedVectors.end())
+ Mapped |= 8;
+ if (ExpandedIntegers.find(Res) != ExpandedIntegers.end())
+ Mapped |= 16;
+ if (ExpandedFloats.find(Res) != ExpandedFloats.end())
+ Mapped |= 32;
+ if (SplitVectors.find(Res) != SplitVectors.end())
+ Mapped |= 64;
+ if (WidenedVectors.find(Res) != WidenedVectors.end())
+ Mapped |= 128;
+
+ if (Node.getNodeId() != Processed) {
+ // Since we allow ReplacedValues to map deleted nodes, it may map nodes
+ // marked NewNode too, since a deleted node may have been reallocated as
+ // another node that has not been seen by the LegalizeTypes machinery.
+ if ((Node.getNodeId() == NewNode && Mapped > 1) ||
+ (Node.getNodeId() != NewNode && Mapped != 0)) {
+ dbgs() << "Unprocessed value in a map!";
+ Failed = true;
+ }
+ } else if (isTypeLegal(Res.getValueType()) || IgnoreNodeResults(&Node)) {
+ if (Mapped > 1) {
+ dbgs() << "Value with legal type was transformed!";
+ Failed = true;
+ }
+ } else {
+ if (Mapped == 0) {
+ dbgs() << "Processed value not in any map!";
+ Failed = true;
+ } else if (Mapped & (Mapped - 1)) {
+ dbgs() << "Value in multiple maps!";
+ Failed = true;
+ }
+ }
+
+ if (Failed) {
+ if (Mapped & 1)
+ dbgs() << " ReplacedValues";
+ if (Mapped & 2)
+ dbgs() << " PromotedIntegers";
+ if (Mapped & 4)
+ dbgs() << " SoftenedFloats";
+ if (Mapped & 8)
+ dbgs() << " ScalarizedVectors";
+ if (Mapped & 16)
+ dbgs() << " ExpandedIntegers";
+ if (Mapped & 32)
+ dbgs() << " ExpandedFloats";
+ if (Mapped & 64)
+ dbgs() << " SplitVectors";
+ if (Mapped & 128)
+ dbgs() << " WidenedVectors";
+ dbgs() << "\n";
+ llvm_unreachable(nullptr);
+ }
+ }
+ }
+
+ // Check that NewNodes are only used by other NewNodes.
+ for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
+ SDNode *N = NewNodes[i];
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI)
+ assert(UI->getNodeId() == NewNode && "NewNode used by non-NewNode!");
+ }
+}
+
+/// This is the main entry point for the type legalizer. This does a top-down
+/// traversal of the dag, legalizing types as it goes. Returns "true" if it made
+/// any changes.
+bool DAGTypeLegalizer::run() {
+ bool Changed = false;
+
+ // Create a dummy node (which is not added to allnodes), that adds a reference
+ // to the root node, preventing it from being deleted, and tracking any
+ // changes of the root.
+ HandleSDNode Dummy(DAG.getRoot());
+ Dummy.setNodeId(Unanalyzed);
+
+ // The root of the dag may dangle to deleted nodes until the type legalizer is
+ // done. Set it to null to avoid confusion.
+ DAG.setRoot(SDValue());
+
+ // Walk all nodes in the graph, assigning them a NodeId of 'ReadyToProcess'
+ // (and remembering them) if they are leaves and assigning 'Unanalyzed' if
+ // non-leaves.
+ for (SDNode &Node : DAG.allnodes()) {
+ if (Node.getNumOperands() == 0) {
+ Node.setNodeId(ReadyToProcess);
+ Worklist.push_back(&Node);
+ } else {
+ Node.setNodeId(Unanalyzed);
+ }
+ }
+
+ // Now that we have a set of nodes to process, handle them all.
+ while (!Worklist.empty()) {
+#ifndef EXPENSIVE_CHECKS
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ SDNode *N = Worklist.back();
+ Worklist.pop_back();
+ assert(N->getNodeId() == ReadyToProcess &&
+ "Node should be ready if on worklist!");
+
+ if (IgnoreNodeResults(N))
+ goto ScanOperands;
+
+ // Scan the values produced by the node, checking to see if any result
+ // types are illegal.
+ for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i) {
+ EVT ResultVT = N->getValueType(i);
+ switch (getTypeAction(ResultVT)) {
+ case TargetLowering::TypeLegal:
+ break;
+ // The following calls must take care of *all* of the node's results,
+ // not just the illegal result they were passed (this includes results
+ // with a legal type). Results can be remapped using ReplaceValueWith,
+ // or their promoted/expanded/etc values registered in PromotedIntegers,
+ // ExpandedIntegers etc.
+ case TargetLowering::TypePromoteInteger:
+ PromoteIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeExpandInteger:
+ ExpandIntegerResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeSoftenFloat:
+ Changed = SoftenFloatResult(N, i);
+ if (Changed)
+ goto NodeDone;
+ // If not changed, the result type should already be legal in a register.
+ assert(isLegalInHWReg(ResultVT) &&
+ "Unchanged SoftenFloatResult should be legal in register!");
+ goto ScanOperands;
+ case TargetLowering::TypeExpandFloat:
+ ExpandFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeScalarizeVector:
+ ScalarizeVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeSplitVector:
+ SplitVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypeWidenVector:
+ WidenVectorResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ case TargetLowering::TypePromoteFloat:
+ PromoteFloatResult(N, i);
+ Changed = true;
+ goto NodeDone;
+ }
+ }
+
+ScanOperands:
+ // Scan the operand list for the node, handling any nodes with operands that
+ // are illegal.
+ {
+ unsigned NumOperands = N->getNumOperands();
+ bool NeedsReanalyzing = false;
+ unsigned i;
+ for (i = 0; i != NumOperands; ++i) {
+ if (IgnoreNodeResults(N->getOperand(i).getNode()))
+ continue;
+
+ EVT OpVT = N->getOperand(i).getValueType();
+ switch (getTypeAction(OpVT)) {
+ case TargetLowering::TypeLegal:
+ continue;
+ // The following calls must either replace all of the node's results
+ // using ReplaceValueWith, and return "false"; or update the node's
+ // operands in place, and return "true".
+ case TargetLowering::TypePromoteInteger:
+ NeedsReanalyzing = PromoteIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeExpandInteger:
+ NeedsReanalyzing = ExpandIntegerOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeSoftenFloat:
+ NeedsReanalyzing = SoftenFloatOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeExpandFloat:
+ NeedsReanalyzing = ExpandFloatOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeScalarizeVector:
+ NeedsReanalyzing = ScalarizeVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeSplitVector:
+ NeedsReanalyzing = SplitVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypeWidenVector:
+ NeedsReanalyzing = WidenVectorOperand(N, i);
+ Changed = true;
+ break;
+ case TargetLowering::TypePromoteFloat:
+ NeedsReanalyzing = PromoteFloatOperand(N, i);
+ Changed = true;
+ break;
+ }
+ break;
+ }
+
+ // The sub-method updated N in place. Check to see if any operands are new,
+ // and if so, mark them. If the node needs revisiting, don't add its users
+ // to the worklist yet.
+ if (NeedsReanalyzing) {
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+ N->setNodeId(NewNode);
+ // Recompute the NodeId and correct processed operands, adding the node to
+ // the worklist if ready.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M == N)
+ // The node didn't morph - nothing special to do, it will be revisited.
+ continue;
+
+ // The node morphed - this is equivalent to legalizing by replacing every
+ // value of N with the corresponding value of M. So do that now.
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ // Replacing the value takes care of remapping the new value.
+ ReplaceValueWith(SDValue(N, i), SDValue(M, i));
+ assert(N->getNodeId() == NewNode && "Unexpected node state!");
+ // The node continues to live on as part of the NewNode fungus that
+ // grows on top of the useful nodes. Nothing more needs to be done
+ // with it - move on to the next node.
+ continue;
+ }
+
+ if (i == NumOperands) {
+ DEBUG(dbgs() << "Legally typed node: "; N->dump(&DAG); dbgs() << "\n");
+ }
+ }
+NodeDone:
+
+ // If we reach here, the node was processed, potentially creating new nodes.
+ // Mark it as processed and add its users to the worklist as appropriate.
+ assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+ N->setNodeId(Processed);
+
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI) {
+ SDNode *User = *UI;
+ int NodeId = User->getNodeId();
+
+ // This node has two options: it can either be a new node or its Node ID
+ // may be a count of the number of operands it has that are not ready.
+ if (NodeId > 0) {
+ User->setNodeId(NodeId-1);
+
+ // If this was the last use it was waiting on, add it to the ready list.
+ if (NodeId-1 == ReadyToProcess)
+ Worklist.push_back(User);
+ continue;
+ }
+
+ // If this is an unreachable new node, then ignore it. If it ever becomes
+ // reachable by being used by a newly created node then it will be handled
+ // by AnalyzeNewNode.
+ if (NodeId == NewNode)
+ continue;
+
+ // Otherwise, this node is new: this is the first operand of it that
+ // became ready. Its new NodeId is the number of operands it has minus 1
+ // (as this node is now processed).
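+ // For example, a user with three operands whose first operand just became
+ // ready gets NodeId 2; each further operand that finishes processing
+ // decrements the count, and the user is queued once it reaches
+ // ReadyToProcess.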
+ assert(NodeId == Unanalyzed && "Unknown node ID!");
+ User->setNodeId(User->getNumOperands() - 1);
+
+ // If the node only has a single operand, it is now ready.
+ if (User->getNumOperands() == 1)
+ Worklist.push_back(User);
+ }
+ }
+
+#ifndef EXPENSIVE_CHECKS
+ if (EnableExpensiveChecks)
+#endif
+ PerformExpensiveChecks();
+
+ // If the root changed (e.g. it was a dead load) update the root.
+ DAG.setRoot(Dummy.getValue());
+
+ // Remove dead nodes. This is important to do for cleanliness but also before
+ // the checking loop below. Implicit folding by the DAG.getNode operators and
+ // node morphing can leave unreachable nodes in the DAG with their NodeId
+ // still set to NewNode.
+ DAG.RemoveDeadNodes();
+
+ // In a debug build, scan all the nodes to make sure we found them all. This
+ // ensures that there are no cycles and that everything got processed.
+#ifndef NDEBUG
+ for (SDNode &Node : DAG.allnodes()) {
+ bool Failed = false;
+
+ // Check that all result types are legal.
+ // A value type is illegal if its TypeAction is not TypeLegal,
+ // and TLI.RegClassForVT does not have a register class for this type.
+ // For example, on x86_64 f128 is not TypeLegal (so its operators are
+ // softened), but the target has an FR128 register class used to pass and
+ // return f128 values. Hence a legalized node can have f128 type.
+ if (!IgnoreNodeResults(&Node))
+ for (unsigned i = 0, NumVals = Node.getNumValues(); i < NumVals; ++i)
+ if (!isTypeLegal(Node.getValueType(i)) &&
+ !TLI.isTypeLegal(Node.getValueType(i))) {
+ dbgs() << "Result type " << i << " illegal: ";
+ Node.dump();
+ Failed = true;
+ }
+
+ // Check that all operand types are legal.
+ for (unsigned i = 0, NumOps = Node.getNumOperands(); i < NumOps; ++i)
+ if (!IgnoreNodeResults(Node.getOperand(i).getNode()) &&
+ !isTypeLegal(Node.getOperand(i).getValueType()) &&
+ !TLI.isTypeLegal(Node.getOperand(i).getValueType())) {
+ dbgs() << "Operand type " << i << " illegal: ";
+ Node.getOperand(i).dump();
+ Failed = true;
+ }
+
+ if (Node.getNodeId() != Processed) {
+ if (Node.getNodeId() == NewNode)
+ dbgs() << "New node not analyzed?\n";
+ else if (Node.getNodeId() == Unanalyzed)
+ dbgs() << "Unanalyzed node not noticed?\n";
+ else if (Node.getNodeId() > 0)
+ dbgs() << "Operand not processed?\n";
+ else if (Node.getNodeId() == ReadyToProcess)
+ dbgs() << "Not added to worklist?\n";
+ Failed = true;
+ }
+
+ if (Failed) {
+ Node.dump(&DAG); dbgs() << "\n";
+ llvm_unreachable(nullptr);
+ }
+ }
+#endif
+
+ return Changed;
+}
+
+/// The specified node is the root of a subtree of potentially new nodes.
+/// Correct any processed operands (this may change the node) and calculate the
+/// NodeId. If the node itself changes to a processed node, it is not remapped -
+/// the caller needs to take care of this. Returns the potentially changed node.
+SDNode *DAGTypeLegalizer::AnalyzeNewNode(SDNode *N) {
+ // If this was an existing node that is already done, we're done.
+ if (N->getNodeId() != NewNode && N->getNodeId() != Unanalyzed)
+ return N;
+
+ // Remove any stale map entries.
+ ExpungeNode(N);
+
+ // Okay, we know that this node is new. Recursively walk all of its operands
+ // to see if they are new also. The depth of this walk is bounded by the size
+ // of the new tree that was constructed (usually 2-3 nodes), so we don't worry
+ // about revisiting nodes.
+ //
+ // As we walk the operands, keep track of the number of nodes that are
+ // processed. If non-zero, this will become the new nodeid of this node.
+ // Operands may morph when they are analyzed. If so, the node will be
+ // updated after all operands have been analyzed. Since this is rare,
+ // the code tries to minimize overhead in the non-morphing case.
+
+ SmallVector<SDValue, 8> NewOps;
+ unsigned NumProcessed = 0;
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDValue OrigOp = N->getOperand(i);
+ SDValue Op = OrigOp;
+
+ AnalyzeNewValue(Op); // Op may morph.
+
+ if (Op.getNode()->getNodeId() == Processed)
+ ++NumProcessed;
+
+ if (!NewOps.empty()) {
+ // Some previous operand changed. Add this one to the list.
+ NewOps.push_back(Op);
+ } else if (Op != OrigOp) {
+ // This is the first operand to change - add all operands so far.
+ NewOps.append(N->op_begin(), N->op_begin() + i);
+ NewOps.push_back(Op);
+ }
+ }
+
+ // Some operands changed - update the node.
+ if (!NewOps.empty()) {
+ SDNode *M = DAG.UpdateNodeOperands(N, NewOps);
+ if (M != N) {
+ // The node morphed into a different node. Normally for this to happen
+ // the original node would have to be marked NewNode. However this can
+ // in theory momentarily not be the case while ReplaceValueWith is running.
+ // Mark the original node NewNode to help sanity checking.
+ N->setNodeId(NewNode);
+ if (M->getNodeId() != NewNode && M->getNodeId() != Unanalyzed)
+ // It morphed into a previously analyzed node - nothing more to do.
+ return M;
+
+ // It morphed into a different new node. Do the equivalent of passing
+ // it to AnalyzeNewNode: expunge it and calculate the NodeId. No need
+ // to remap the operands, since they are the same as the operands we
+ // remapped above.
+ N = M;
+ ExpungeNode(N);
+ }
+ }
+
+ // Calculate the NodeId.
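+ // The NodeId is the number of operands that have not yet been processed;
+ // when none remain the node is ready and is added to the worklist below.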
+ N->setNodeId(N->getNumOperands() - NumProcessed);
+ if (N->getNodeId() == ReadyToProcess)
+ Worklist.push_back(N);
+
+ return N;
+}
+
+/// Call AnalyzeNewNode, updating the node in Val if needed.
+/// If the node changes to a processed node, then remap it.
+void DAGTypeLegalizer::AnalyzeNewValue(SDValue &Val) {
+ Val.setNode(AnalyzeNewNode(Val.getNode()));
+ if (Val.getNode()->getNodeId() == Processed)
+ // We were passed a processed node, or it morphed into one - remap it.
+ RemapValue(Val);
+}
+
+/// If N has a bogus mapping in ReplacedValues, eliminate it.
+/// This can occur when a node is deleted then reallocated as a new node -
+/// the mapping in ReplacedValues applies to the deleted node, not the new
+/// one.
+/// The only map that can have a deleted node as a source is ReplacedValues.
+/// Other maps can have deleted nodes as targets, but since their looked-up
+/// values are always immediately remapped using RemapValue, resulting in a
+/// not-deleted node, this is harmless as long as ReplacedValues/RemapValue
+/// always performs correct mappings. In order to keep the mapping correct,
+/// ExpungeNode should be called on any new nodes *before* adding them as
+/// either source or target to ReplacedValues (which typically means calling
+/// Expunge when a new node is first seen, since it may no longer be marked
+/// NewNode by the time it is added to ReplacedValues).
+void DAGTypeLegalizer::ExpungeNode(SDNode *N) {
+ if (N->getNodeId() != NewNode)
+ return;
+
+ // If N is not remapped by ReplacedValues then there is nothing to do.
+ unsigned i, e;
+ for (i = 0, e = N->getNumValues(); i != e; ++i)
+ if (ReplacedValues.find(SDValue(N, i)) != ReplacedValues.end())
+ break;
+
+ if (i == e)
+ return;
+
+ // Remove N from all maps - this is expensive but rare.
+
+ for (DenseMap<SDValue, SDValue>::iterator I = PromotedIntegers.begin(),
+ E = PromotedIntegers.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = SoftenedFloats.begin(),
+ E = SoftenedFloats.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = ScalarizedVectors.begin(),
+ E = ScalarizedVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = WidenedVectors.begin(),
+ E = WidenedVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = ExpandedIntegers.begin(), E = ExpandedIntegers.end(); I != E; ++I){
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = ExpandedFloats.begin(), E = ExpandedFloats.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, std::pair<SDValue, SDValue> >::iterator
+ I = SplitVectors.begin(), E = SplitVectors.end(); I != E; ++I) {
+ assert(I->first.getNode() != N);
+ RemapValue(I->second.first);
+ RemapValue(I->second.second);
+ }
+
+ for (DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.begin(),
+ E = ReplacedValues.end(); I != E; ++I)
+ RemapValue(I->second);
+
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ ReplacedValues.erase(SDValue(N, i));
+}
+
+/// If the specified value was already legalized to another value,
+/// replace it by that value.
+void DAGTypeLegalizer::RemapValue(SDValue &N) {
+ DenseMap<SDValue, SDValue>::iterator I = ReplacedValues.find(N);
+ if (I != ReplacedValues.end()) {
+ // Use path compression to speed up future lookups if values are replaced
+ // multiple times with other values.
+ RemapValue(I->second);
+ N = I->second;
+
+ // Note that it is possible to have N.getNode()->getNodeId() == NewNode at
+ // this point because it is possible for a node to be put in the map before
+ // being processed.
+ }
+}
+
+namespace {
+ /// This class is a DAGUpdateListener that listens for updates to nodes and
+ /// recomputes their ready state.
+ class NodeUpdateListener : public SelectionDAG::DAGUpdateListener {
+ DAGTypeLegalizer &DTL;
+ SmallSetVector<SDNode*, 16> &NodesToAnalyze;
+ public:
+ explicit NodeUpdateListener(DAGTypeLegalizer &dtl,
+ SmallSetVector<SDNode*, 16> &nta)
+ : SelectionDAG::DAGUpdateListener(dtl.getDAG()),
+ DTL(dtl), NodesToAnalyze(nta) {}
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+ N->getNodeId() != DAGTypeLegalizer::Processed &&
+ "Invalid node ID for RAUW deletion!");
+ // It is possible, though rare, for the deleted node N to occur as a
+ // target in a map, so note the replacement N -> E in ReplacedValues.
+ assert(E && "Node not replaced?");
+ DTL.NoteDeletion(N, E);
+
+ // In theory the deleted node could also have been scheduled for analysis.
+ // So remove it from the set of nodes which will be analyzed.
+ NodesToAnalyze.remove(N);
+
+ // In general nothing needs to be done for E, since it didn't change but
+ // only gained new uses. However N -> E was just added to ReplacedValues,
+ // and the result of a ReplacedValues mapping is not allowed to be marked
+ // NewNode. So if E is marked NewNode, then it needs to be analyzed.
+ if (E->getNodeId() == DAGTypeLegalizer::NewNode)
+ NodesToAnalyze.insert(E);
+ }
+
+ void NodeUpdated(SDNode *N) override {
+ // Node updates can mean pretty much anything. It is possible that an
+ // operand was set to something already processed, for example, in which case
+ // this node could become ready. Recompute its flags.
+ assert(N->getNodeId() != DAGTypeLegalizer::ReadyToProcess &&
+ N->getNodeId() != DAGTypeLegalizer::Processed &&
+ "Invalid node ID for RAUW deletion!");
+ N->setNodeId(DAGTypeLegalizer::NewNode);
+ NodesToAnalyze.insert(N);
+ }
+ };
+}
+
+
+/// The specified value was legalized to the specified other value.
+/// Update the DAG and NodeIds replacing any uses of From to use To instead.
+void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
+ assert(From.getNode() != To.getNode() && "Potential legalization loop!");
+
+ // If expansion produced new nodes, make sure they are properly marked.
+ ExpungeNode(From.getNode());
+ AnalyzeNewValue(To); // Expunges To.
+
+ // Anything that used the old node should now use the new one. Note that this
+ // can potentially cause recursive merging.
+ SmallSetVector<SDNode*, 16> NodesToAnalyze;
+ NodeUpdateListener NUL(*this, NodesToAnalyze);
+ do {
+ DAG.ReplaceAllUsesOfValueWith(From, To);
+
+ // The old node may still be present in a map like ExpandedIntegers or
+ // PromotedIntegers. Inform maps about the replacement.
+ ReplacedValues[From] = To;
+
+ // Process the list of nodes that need to be reanalyzed.
+ while (!NodesToAnalyze.empty()) {
+ SDNode *N = NodesToAnalyze.back();
+ NodesToAnalyze.pop_back();
+ if (N->getNodeId() != DAGTypeLegalizer::NewNode)
+ // The node was analyzed while reanalyzing an earlier node - it is safe
+ // to skip. Note that this is not a morphing node - otherwise it would
+ // still be marked NewNode.
+ continue;
+
+ // Analyze the node's operands and recalculate the node ID.
+ SDNode *M = AnalyzeNewNode(N);
+ if (M != N) {
+ // The node morphed into a different node. Make everyone use the new
+ // node instead.
+ assert(M->getNodeId() != NewNode && "Analysis resulted in NewNode!");
+ assert(N->getNumValues() == M->getNumValues() &&
+ "Node morphing changed the number of results!");
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ SDValue OldVal(N, i);
+ SDValue NewVal(M, i);
+ if (M->getNodeId() == Processed)
+ RemapValue(NewVal);
+ DAG.ReplaceAllUsesOfValueWith(OldVal, NewVal);
+ // OldVal may be a target of the ReplacedValues map which was marked
+ // NewNode to force reanalysis because it was updated. Ensure that
+ // anything that ReplacedValues mapped to OldVal will now be mapped
+ // all the way to NewVal.
+ ReplacedValues[OldVal] = NewVal;
+ }
+ // The original node continues to exist in the DAG, marked NewNode.
+ }
+ }
+ // When recursively updating nodes with new nodes, it is possible for From
+ // to gain new uses due to CSE. If this happens, replace those new uses of
+ // From with To as well.
+ } while (!From.use_empty());
+}
+
+void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for promoted integer");
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = PromotedIntegers[Op];
+ assert(!OpEntry.getNode() && "Node is already promoted!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetSoftenedFloat(SDValue Op, SDValue Result) {
+ // On x86_64, f128 can be kept in SSE registers,
+ // but is sometimes softened to i128.
+ assert((Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) ||
+ Op.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
+ "Invalid type for softened float");
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = SoftenedFloats[Op];
+ // Allow repeated calls for f128 nodes, or for any node whose type
+ // transforms to itself, since many operations on these types are not
+ // softened.
+ assert((!OpEntry.getNode()||
+ Op.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType())) &&
+ "Node is already converted to integer!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for promoted float");
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = PromotedFloats[Op];
+ assert(!OpEntry.getNode() && "Node is already promoted!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
+ // Note that in some cases vector operation operands may be wider than the
+ // vector element type. For example, a BUILD_VECTOR of type <1 x i1> may
+ // have a constant i8 operand.
+ assert(Result.getValueType().getSizeInBits() >=
+ Op.getValueType().getVectorElementType().getSizeInBits() &&
+ "Invalid type for scalarized vector");
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = ScalarizedVectors[Op];
+ assert(!OpEntry.getNode() && "Node is already scalarized!");
+ OpEntry = Result;
+}
+
+void DAGTypeLegalizer::GetExpandedInteger(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't expanded");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedInteger(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for expanded integer");
+ // Lo/Hi may have been newly allocated; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = ExpandedIntegers[Op];
+ assert(!Entry.first.getNode() && "Node already expanded");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetExpandedFloat(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't expanded");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetExpandedFloat(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for expanded float");
+ // Lo/Hi may have been newly allocated; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = ExpandedFloats[Op];
+ assert(!Entry.first.getNode() && "Node already expanded");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
+ SDValue &Hi) {
+ std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+ RemapValue(Entry.first);
+ RemapValue(Entry.second);
+ assert(Entry.first.getNode() && "Operand isn't split");
+ Lo = Entry.first;
+ Hi = Entry.second;
+}
+
+void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
+ SDValue Hi) {
+ assert(Lo.getValueType().getVectorElementType() ==
+ Op.getValueType().getVectorElementType() &&
+ 2*Lo.getValueType().getVectorNumElements() ==
+ Op.getValueType().getVectorNumElements() &&
+ Hi.getValueType() == Lo.getValueType() &&
+ "Invalid type for split vector");
+ // Lo/Hi may have been newly allocated; if so, assign NodeIds as needed.
+ AnalyzeNewValue(Lo);
+ AnalyzeNewValue(Hi);
+
+ // Remember that this is the result of the node.
+ std::pair<SDValue, SDValue> &Entry = SplitVectors[Op];
+ assert(!Entry.first.getNode() && "Node already split");
+ Entry.first = Lo;
+ Entry.second = Hi;
+}
+
+void DAGTypeLegalizer::SetWidenedVector(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() ==
+ TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType()) &&
+ "Invalid type for widened vector");
+ AnalyzeNewValue(Result);
+
+ SDValue &OpEntry = WidenedVectors[Op];
+ assert(!OpEntry.getNode() && "Node already widened!");
+ OpEntry = Result;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Utilities.
+//===----------------------------------------------------------------------===//
+
+/// Convert to an integer of the same size.
+SDValue DAGTypeLegalizer::BitConvertToInteger(SDValue Op) {
+ unsigned BitWidth = Op.getValueType().getSizeInBits();
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op),
+ EVT::getIntegerVT(*DAG.getContext(), BitWidth), Op);
+}
+
+/// Convert to a vector of integers of the same size.
+SDValue DAGTypeLegalizer::BitConvertVectorToIntegerVector(SDValue Op) {
+ assert(Op.getValueType().isVector() && "Only applies to vectors!");
+ unsigned EltWidth = Op.getValueType().getVectorElementType().getSizeInBits();
+ EVT EltNVT = EVT::getIntegerVT(*DAG.getContext(), EltWidth);
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ return DAG.getNode(ISD::BITCAST, SDLoc(Op),
+ EVT::getVectorVT(*DAG.getContext(), EltNVT, NumElts), Op);
+}
+
+SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
+ EVT DestVT) {
+ SDLoc dl(Op);
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and destination types.
+ SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+ // Emit a store to the stack slot.
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
+ // Result is a load from the stack slot.
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
+}
+
+/// Replace the node's results with custom code provided by the target and
+/// return "true", or do nothing and return "false".
+/// If LegalizeResult is false, the node has legal result types but an illegal
+/// operand, and VT is the type of that illegal operand.
+/// If LegalizeResult is true, the node has illegal result types, and VT is
+/// the type of the illegal result.
+bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
+ // See if the target wants to custom lower this node.
+ if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+ return false;
+
+ SmallVector<SDValue, 8> Results;
+ if (LegalizeResult)
+ TLI.ReplaceNodeResults(N, Results, DAG);
+ else
+ TLI.LowerOperationWrapper(N, Results, DAG);
+
+ if (Results.empty())
+ // The target didn't want to custom lower it after all.
+ return false;
+
+ // When called from DAGTypeLegalizer::ExpandIntegerResult, we might need to
+ // provide the same kind of custom splitting behavior.
+ if (Results.size() == N->getNumValues() + 1 && LegalizeResult) {
+ // We've legalized a return type by splitting it. If there is a chain,
+ // replace that too.
+ SetExpandedInteger(SDValue(N, 0), Results[0], Results[1]);
+ if (N->getNumValues() > 1)
+ ReplaceValueWith(SDValue(N, 1), Results[2]);
+ return true;
+ }
+
+ // Make everything that once used N's values now use those in Results instead.
+ assert(Results.size() == N->getNumValues() &&
+ "Custom lowering returned the wrong number of results!");
+ for (unsigned i = 0, e = Results.size(); i != e; ++i) {
+ ReplaceValueWith(SDValue(N, i), Results[i]);
+ }
+ return true;
+}
+
+
+/// Widen the node's results with custom code provided by the target and return
+/// "true", or do nothing and return "false".
+bool DAGTypeLegalizer::CustomWidenLowerNode(SDNode *N, EVT VT) {
+ // See if the target wants to custom lower this node.
+ if (TLI.getOperationAction(N->getOpcode(), VT) != TargetLowering::Custom)
+ return false;
+
+ SmallVector<SDValue, 8> Results;
+ TLI.ReplaceNodeResults(N, Results, DAG);
+
+ if (Results.empty())
+ // The target didn't want to custom widen its result after all.
+ return false;
+
+ // Update the widening map.
+ assert(Results.size() == N->getNumValues() &&
+ "Custom lowering returned the wrong number of results!");
+ for (unsigned i = 0, e = Results.size(); i != e; ++i)
+ SetWidenedVector(SDValue(N, i), Results[i]);
+ return true;
+}
+
+SDValue DAGTypeLegalizer::DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo) {
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i)
+ if (i != ResNo)
+ ReplaceValueWith(SDValue(N, i), SDValue(N->getOperand(i)));
+ return SDValue(N->getOperand(ResNo));
+}
+
+/// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+/// given value.
+void DAGTypeLegalizer::GetPairElements(SDValue Pair,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(Pair);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Pair.getValueType());
+ Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+ DAG.getIntPtrConstant(0, dl));
+ Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, NVT, Pair,
+ DAG.getIntPtrConstant(1, dl));
+}
+
+SDValue DAGTypeLegalizer::GetVectorElementPointer(SDValue VecPtr, EVT EltVT,
+ SDValue Index) {
+ SDLoc dl(Index);
+ // Make sure the index type is big enough to compute in.
+ Index = DAG.getZExtOrTrunc(Index, dl, TLI.getPointerTy(DAG.getDataLayout()));
+
+ // Calculate the element offset and add it to the pointer.
+ unsigned EltSize = EltVT.getSizeInBits() / 8; // FIXME: should be ABI size.
+ assert(EltSize * 8 == EltVT.getSizeInBits() &&
+ "Converting bits to bytes lost precision");
+
+ Index = DAG.getNode(ISD::MUL, dl, Index.getValueType(), Index,
+ DAG.getConstant(EltSize, dl, Index.getValueType()));
+ return DAG.getNode(ISD::ADD, dl, Index.getValueType(), Index, VecPtr);
+}
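+
+// The computation above is ordinary byte arithmetic: scale the index by the
+// element size and add it to the base pointer. A hedged standalone sketch,
+// not built; the helper name is ours, and a real implementation should use
+// the ABI store size rather than the bit width divided by eight:
+#if 0
+#include <cstdint>
+
+static char *VectorElementPointer(char *VecPtr, uint64_t EltSizeInBytes,
+ uint64_t Index) {
+ // Byte offset of element Index from the start of the vector, added to the
+ // base address.
+ return VecPtr + Index * EltSizeInBytes;
+}
+#endif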
+
+/// Build an integer with low bits Lo and high bits Hi.
+SDValue DAGTypeLegalizer::JoinIntegers(SDValue Lo, SDValue Hi) {
+ // Arbitrarily use dlHi for result SDLoc
+ SDLoc dlHi(Hi);
+ SDLoc dlLo(Lo);
+ EVT LVT = Lo.getValueType();
+ EVT HVT = Hi.getValueType();
+ EVT NVT = EVT::getIntegerVT(*DAG.getContext(),
+ LVT.getSizeInBits() + HVT.getSizeInBits());
+
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, dlLo, NVT, Lo);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, dlHi, NVT, Hi);
+ Hi = DAG.getNode(ISD::SHL, dlHi, NVT, Hi,
+ DAG.getConstant(LVT.getSizeInBits(), dlHi,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::OR, dlHi, NVT, Lo, Hi);
+}
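+
+// At the bit level the sequence above is: zero-extend Lo, any-extend Hi,
+// shift Hi left by Lo's width, then OR the two. A standalone sketch for two
+// i32 halves (illustration only; the helper name is ours and the DAG version
+// handles arbitrary integer widths):
+#if 0
+#include <cstdint>
+
+static uint64_t JoinIntegers32(uint32_t Lo, uint32_t Hi) {
+ return (uint64_t)Lo | ((uint64_t)Hi << 32); // Hi occupies the upper bits
+}
+#endif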
+
+/// Convert the node into a libcall with the same prototype.
+SDValue DAGTypeLegalizer::LibCallify(RTLIB::Libcall LC, SDNode *N,
+ bool isSigned) {
+ unsigned NumOps = N->getNumOperands();
+ SDLoc dl(N);
+ if (NumOps == 0) {
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), None, isSigned,
+ dl).first;
+ } else if (NumOps == 1) {
+ SDValue Op = N->getOperand(0);
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Op, isSigned,
+ dl).first;
+ } else if (NumOps == 2) {
+ SDValue Ops[2] = { N->getOperand(0), N->getOperand(1) };
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned,
+ dl).first;
+ }
+ SmallVector<SDValue, 8> Ops(NumOps);
+ for (unsigned i = 0; i < NumOps; ++i)
+ Ops[i] = N->getOperand(i);
+
+ return TLI.makeLibCall(DAG, LC, N->getValueType(0), Ops, isSigned, dl).first;
+}
+
+/// Expand a node into a call to a libcall. Similar to ExpandLibCall except that
+/// the first operand is the in-chain.
+std::pair<SDValue, SDValue>
+DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
+ bool isSigned) {
+ SDValue InChain = Node->getOperand(0);
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ for (unsigned i = 1, e = Node->getNumOperands(); i != e; ++i) {
+ EVT ArgVT = Node->getOperand(i).getValueType();
+ Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
+ Entry.Node = Node->getOperand(i);
+ Entry.Ty = ArgTy;
+ Entry.isSExt = isSigned;
+ Entry.isZExt = !isSigned;
+ Args.push_back(Entry);
+ }
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
+ .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setSExtResult(isSigned).setZExtResult(!isSigned);
+
+ std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
+
+ return CallInfo;
+}
+
+/// Promote the given target boolean to a target boolean of the given type.
+/// A target boolean is an integer value, not necessarily of type i1, the bits
+/// of which conform to getBooleanContents.
+///
+/// ValVT is the type of values that produced the boolean.
+SDValue DAGTypeLegalizer::PromoteTargetBoolean(SDValue Bool, EVT ValVT) {
+ SDLoc dl(Bool);
+ EVT BoolVT = getSetCCResultType(ValVT);
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(ValVT));
+ return DAG.getNode(ExtendCode, dl, BoolVT, Bool);
+}
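+
+// Which extension is used depends on how the target encodes booleans: 0/1
+// booleans are zero-extended, 0/-1 booleans are sign-extended, and targets
+// with undefined upper bits may use any-extend. A standalone sketch for an
+// i8 boolean promoted to i32 (the enum and helper name are ours):
+#if 0
+#include <cstdint>
+
+enum class BoolContents { ZeroOrOne, ZeroOrNegativeOne };
+
+static int32_t PromoteBool(int8_t B, BoolContents C) {
+ return C == BoolContents::ZeroOrOne
+ ? (int32_t)(uint8_t)B // zero-extend: 0x01 -> 0x00000001
+ : (int32_t)B; // sign-extend: 0xFF -> 0xFFFFFFFF
+}
+#endif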
+
+/// Widen the given target boolean to a target boolean of the given type.
+/// The boolean vector is widened and then promoted to match the target boolean
+/// type of the given ValVT.
+SDValue DAGTypeLegalizer::WidenTargetBoolean(SDValue Bool, EVT ValVT,
+ bool WithZeroes) {
+ SDLoc dl(Bool);
+ EVT BoolVT = Bool.getValueType();
+
+ assert(ValVT.getVectorNumElements() > BoolVT.getVectorNumElements() &&
+ TLI.isTypeLegal(ValVT) &&
+ "Unexpected types in WidenTargetBoolean");
+ EVT WideVT = EVT::getVectorVT(*DAG.getContext(), BoolVT.getScalarType(),
+ ValVT.getVectorNumElements());
+ Bool = ModifyToType(Bool, WideVT, WithZeroes);
+ return PromoteTargetBoolean(Bool, ValVT);
+}
+
+/// Return the lower LoVT bits of Op in Lo and the upper HiVT bits in Hi.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ EVT LoVT, EVT HiVT,
+ SDValue &Lo, SDValue &Hi) {
+ SDLoc dl(Op);
+ assert(LoVT.getSizeInBits() + HiVT.getSizeInBits() ==
+ Op.getValueType().getSizeInBits() && "Invalid integer splitting!");
+ Lo = DAG.getNode(ISD::TRUNCATE, dl, LoVT, Op);
+ Hi = DAG.getNode(ISD::SRL, dl, Op.getValueType(), Op,
+ DAG.getConstant(LoVT.getSizeInBits(), dl,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(ISD::TRUNCATE, dl, HiVT, Hi);
+}
+
+/// Return the lower and upper halves of Op's bits in a value type half the
+/// size of Op's.
+void DAGTypeLegalizer::SplitInteger(SDValue Op,
+ SDValue &Lo, SDValue &Hi) {
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(),
+ Op.getValueType().getSizeInBits()/2);
+ SplitInteger(Op, HalfVT, HalfVT, Lo, Hi);
+}
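+
+// Splitting is the inverse of JoinIntegers: truncate for the low half, shift
+// right by the low type's width and truncate for the high half. A standalone
+// sketch splitting an i64 into two i32's (the helper name is ours):
+#if 0
+#include <cstdint>
+
+static void SplitInteger64(uint64_t Op, uint32_t &Lo, uint32_t &Hi) {
+ Lo = (uint32_t)Op; // TRUNCATE
+ Hi = (uint32_t)(Op >> 32); // SRL by 32, then TRUNCATE
+}
+#endif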
+
+
+//===----------------------------------------------------------------------===//
+// Entry Point
+//===----------------------------------------------------------------------===//
+
+/// This transforms the SelectionDAG into a SelectionDAG that only uses types
+/// natively supported by the target. Returns "true" if it made any changes.
+///
+/// Note that this is an involved process that may invalidate pointers into
+/// the graph.
+bool SelectionDAG::LegalizeTypes() {
+ return DAGTypeLegalizer(*this).run();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
new file mode 100644
index 000000000000..84ad8f83d906
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -0,0 +1,859 @@
+//===-- LegalizeTypes.h - DAG Type Legalizer class definition ---*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the DAGTypeLegalizer class. This is a private interface
+// shared by the pieces of code that implement the SelectionDAG::LegalizeTypes
+// method.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_LEGALIZETYPES_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetLowering.h"
+
+namespace llvm {
+
+//===----------------------------------------------------------------------===//
+/// This takes an arbitrary SelectionDAG as input and hacks on it until only
+/// value types the target machine can handle are left. This involves promoting
+/// small sizes to large sizes or splitting up large values into small values.
+///
+class LLVM_LIBRARY_VISIBILITY DAGTypeLegalizer {
+ const TargetLowering &TLI;
+ SelectionDAG &DAG;
+public:
+ /// This pass uses the NodeId on the SDNodes to hold information about the
+ /// state of each node. This enum lists the possible states.
+ enum NodeIdFlags {
+ /// All operands have been processed, so this node is ready to be handled.
+ ReadyToProcess = 0,
+
+ /// This is a new node, not before seen, that was created in the process of
+ /// legalizing some other node.
+ NewNode = -1,
+
+ /// This node's ID needs to be set to the number of its unprocessed
+ /// operands.
+ Unanalyzed = -2,
+
+ /// This is a node that has already been processed.
+ Processed = -3
+
+ // 1+ - This is a node which has this many unprocessed operands.
+ };
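+
+ // A sketch of how these ids drive the traversal, for intuition only (the
+ // actual bookkeeping in run() and AnalyzeNewNode is more involved); the
+ // helper name below is ours, not part of this class:
+#if 0
+ void NoteOperandProcessed(SDNode *User, SmallVectorImpl<SDNode *> &Worklist) {
+ int Id = User->getNodeId() - 1; // one fewer unprocessed operand
+ User->setNodeId(Id);
+ if (Id == ReadyToProcess) // all operands done: ready to be handled
+ Worklist.push_back(User);
+ }
+#endif
+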
+private:
+
+ /// This is a bitvector that contains two bits for each simple value type,
+ /// where the two bits correspond to the LegalizeAction enum from
+ /// TargetLowering. This can be queried with "getTypeAction(VT)".
+ TargetLowering::ValueTypeActionImpl ValueTypeActions;
+
+ /// Return how we should legalize values of this type.
+ TargetLowering::LegalizeTypeAction getTypeAction(EVT VT) const {
+ return TLI.getTypeAction(*DAG.getContext(), VT);
+ }
+
+ /// Return true if this type is legal on this target.
+ bool isTypeLegal(EVT VT) const {
+ return TLI.getTypeAction(*DAG.getContext(), VT) == TargetLowering::TypeLegal;
+ }
+
+ /// Return true if this is a simple legal type.
+ bool isSimpleLegalType(EVT VT) const {
+ return VT.isSimple() && TLI.isTypeLegal(VT);
+ }
+
+ /// Return true if this type can be passed in registers.
+ /// For example, x86_64's f128 should legally stay in registers, with only
+ /// some operations converted to library calls or integer bitwise
+ /// operations.
+ bool isLegalInHWReg(EVT VT) const {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ return VT == NVT && isSimpleLegalType(VT);
+ }
+
+ EVT getSetCCResultType(EVT VT) const {
+ return TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ }
+
+ /// Pretend all of this node's results are legal.
+ bool IgnoreNodeResults(SDNode *N) const {
+ return N->getOpcode() == ISD::TargetConstant;
+ }
+
+ /// For integer nodes that are below legal width, this map indicates what
+ /// promoted value to use.
+ SmallDenseMap<SDValue, SDValue, 8> PromotedIntegers;
+
+ /// For integer nodes that need to be expanded this map indicates which
+ /// operands are the expanded version of the input.
+ SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedIntegers;
+
+ /// For floating-point nodes converted to integers of the same size, this map
+ /// indicates the converted value to use.
+ SmallDenseMap<SDValue, SDValue, 8> SoftenedFloats;
+
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates what promoted value to use.
+ SmallDenseMap<SDValue, SDValue, 8> PromotedFloats;
+
+ /// For float nodes that need to be expanded this map indicates which operands
+ /// are the expanded version of the input.
+ SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> ExpandedFloats;
+
+ /// For nodes that are <1 x ty>, this map indicates the scalar value of type
+ /// 'ty' to use.
+ SmallDenseMap<SDValue, SDValue, 8> ScalarizedVectors;
+
+ /// For nodes that need to be split this map indicates which operands are the
+ /// expanded version of the input.
+ SmallDenseMap<SDValue, std::pair<SDValue, SDValue>, 8> SplitVectors;
+
+ /// For vector nodes that need to be widened, indicates the widened value to
+ /// use.
+ SmallDenseMap<SDValue, SDValue, 8> WidenedVectors;
+
+ /// For values that have been replaced with another, indicates the replacement
+ /// value to use.
+ SmallDenseMap<SDValue, SDValue, 8> ReplacedValues;
+
+ /// This defines a worklist of nodes to process. In order to be pushed onto
+ /// this worklist, all operands of a node must have already been processed.
+ SmallVector<SDNode*, 128> Worklist;
+
+public:
+ explicit DAGTypeLegalizer(SelectionDAG &dag)
+ : TLI(dag.getTargetLoweringInfo()), DAG(dag),
+ ValueTypeActions(TLI.getValueTypeActions()) {
+ static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
+ "Too many value types for ValueTypeActions to hold!");
+ }
+
+ /// This is the main entry point for the type legalizer. This does a
+ /// top-down traversal of the dag, legalizing types as it goes. Returns
+ /// "true" if it made any changes.
+ bool run();
+
+ void NoteDeletion(SDNode *Old, SDNode *New) {
+ ExpungeNode(Old);
+ ExpungeNode(New);
+ for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i)
+ ReplacedValues[SDValue(Old, i)] = SDValue(New, i);
+ }
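+
+ // Entries in ReplacedValues can chain (A replaced by B, B later by C), so a
+ // lookup follows the chain to the final value. A standalone sketch of that
+ // remapping idea using plain std::map (illustration only, not the real
+ // RemapValue; needs <map>):
+#if 0
+ static int Remap(const std::map<int, int> &Replaced, int V) {
+ // Follow replacements until reaching a value that was never replaced.
+ for (auto It = Replaced.find(V); It != Replaced.end(); It = Replaced.find(V))
+ V = It->second;
+ return V;
+ }
+#endif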
+
+ SelectionDAG &getDAG() const { return DAG; }
+
+private:
+ SDNode *AnalyzeNewNode(SDNode *N);
+ void AnalyzeNewValue(SDValue &Val);
+ void ExpungeNode(SDNode *N);
+ void PerformExpensiveChecks();
+ void RemapValue(SDValue &N);
+
+ // Common routines.
+ SDValue BitConvertToInteger(SDValue Op);
+ SDValue BitConvertVectorToIntegerVector(SDValue Op);
+ SDValue CreateStackStoreLoad(SDValue Op, EVT DestVT);
+ bool CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult);
+ bool CustomWidenLowerNode(SDNode *N, EVT VT);
+
+ /// Replace each result of the given MERGE_VALUES node with the corresponding
+ /// input operand, except for the result 'ResNo', for which the corresponding
+ /// input operand is returned.
+ SDValue DisintegrateMERGE_VALUES(SDNode *N, unsigned ResNo);
+
+ SDValue GetVectorElementPointer(SDValue VecPtr, EVT EltVT, SDValue Index);
+ SDValue JoinIntegers(SDValue Lo, SDValue Hi);
+ SDValue LibCallify(RTLIB::Libcall LC, SDNode *N, bool isSigned);
+
+ std::pair<SDValue, SDValue> ExpandChainLibCall(RTLIB::Libcall LC,
+ SDNode *Node, bool isSigned);
+ std::pair<SDValue, SDValue> ExpandAtomic(SDNode *Node);
+
+ SDValue PromoteTargetBoolean(SDValue Bool, EVT ValVT);
+
+ /// Modify the given bit vector to match the SetCC result type of ValVT.
+ /// The bit vector is widened with zeroes when WithZeroes is true.
+ SDValue WidenTargetBoolean(SDValue Bool, EVT ValVT, bool WithZeroes = false);
+
+ void ReplaceValueWith(SDValue From, SDValue To);
+ void SplitInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
+ SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Promotion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was promoted to a larger integer type,
+ /// this returns the promoted value. The low bits of the promoted value
+ /// corresponding to the original type are exactly equal to Op.
+ /// The extra bits contain rubbish, so the promoted value may need to be zero-
+ /// or sign-extended from the original type before it is usable (the helpers
+ /// SExtPromotedInteger and ZExtPromotedInteger can do this for you).
+ /// For example, if Op is an i16 and was promoted to an i32, then this method
+ /// returns an i32, the lower 16 bits of which coincide with Op, and the upper
+ /// 16 bits of which contain rubbish.
+ SDValue GetPromotedInteger(SDValue Op) {
+ SDValue &PromotedOp = PromotedIntegers[Op];
+ RemapValue(PromotedOp);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetPromotedInteger(SDValue Op, SDValue Result);
+
+ /// Get a promoted operand and sign extend it to the final size.
+ SDValue SExtPromotedInteger(SDValue Op) {
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ Op = GetPromotedInteger(Op);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, Op.getValueType(), Op,
+ DAG.getValueType(OldVT));
+ }
+
+ /// Get a promoted operand and zero extend it to the final size.
+ SDValue ZExtPromotedInteger(SDValue Op) {
+ EVT OldVT = Op.getValueType();
+ SDLoc dl(Op);
+ Op = GetPromotedInteger(Op);
+ return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
+ }
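+
+ // Both helpers exist to clean up the "rubbish" upper bits of a promoted
+ // value: one by replicating the original sign bit, the other by masking.
+ // A standalone sketch for an i16 value living in a 32-bit register (the
+ // helper names are ours):
+#if 0
+ static uint32_t SExtInReg16(uint32_t Promoted) {
+ uint32_t Bits = Promoted & 0xFFFFu; // keep the original 16 bits
+ return (Bits ^ 0x8000u) - 0x8000u; // copy bit 15 into the upper bits
+ }
+ static uint32_t ZExtInReg16(uint32_t Promoted) {
+ return Promoted & 0xFFFFu; // clear the upper bits
+ }
+#endif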
+
+ // Integer Result Promotion.
+ void PromoteIntegerResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_AssertSext(SDNode *N);
+ SDValue PromoteIntRes_AssertZext(SDNode *N);
+ SDValue PromoteIntRes_Atomic0(AtomicSDNode *N);
+ SDValue PromoteIntRes_Atomic1(AtomicSDNode *N);
+ SDValue PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntRes_VECTOR_SHUFFLE(SDNode *N);
+ SDValue PromoteIntRes_BUILD_VECTOR(SDNode *N);
+ SDValue PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue PromoteIntRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_CONCAT_VECTORS(SDNode *N);
+ SDValue PromoteIntRes_BITCAST(SDNode *N);
+ SDValue PromoteIntRes_BSWAP(SDNode *N);
+ SDValue PromoteIntRes_BITREVERSE(SDNode *N);
+ SDValue PromoteIntRes_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntRes_Constant(SDNode *N);
+ SDValue PromoteIntRes_CONVERT_RNDSAT(SDNode *N);
+ SDValue PromoteIntRes_CTLZ(SDNode *N);
+ SDValue PromoteIntRes_CTPOP(SDNode *N);
+ SDValue PromoteIntRes_CTTZ(SDNode *N);
+ SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
+ SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
+ SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
+ SDValue PromoteIntRes_LOAD(LoadSDNode *N);
+ SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
+ SDValue PromoteIntRes_MGATHER(MaskedGatherSDNode *N);
+ SDValue PromoteIntRes_Overflow(SDNode *N);
+ SDValue PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_SELECT(SDNode *N);
+ SDValue PromoteIntRes_VSELECT(SDNode *N);
+ SDValue PromoteIntRes_SELECT_CC(SDNode *N);
+ SDValue PromoteIntRes_SETCC(SDNode *N);
+ SDValue PromoteIntRes_SHL(SDNode *N);
+ SDValue PromoteIntRes_SimpleIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_ZExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SExtIntBinOp(SDNode *N);
+ SDValue PromoteIntRes_SIGN_EXTEND_INREG(SDNode *N);
+ SDValue PromoteIntRes_SRA(SDNode *N);
+ SDValue PromoteIntRes_SRL(SDNode *N);
+ SDValue PromoteIntRes_TRUNCATE(SDNode *N);
+ SDValue PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo);
+ SDValue PromoteIntRes_UNDEF(SDNode *N);
+ SDValue PromoteIntRes_VAARG(SDNode *N);
+ SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
+
+ // Integer Operand Promotion.
+ bool PromoteIntegerOperand(SDNode *N, unsigned OperandNo);
+ SDValue PromoteIntOp_ANY_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_ATOMIC_STORE(AtomicSDNode *N);
+ SDValue PromoteIntOp_BITCAST(SDNode *N);
+ SDValue PromoteIntOp_BUILD_PAIR(SDNode *N);
+ SDValue PromoteIntOp_BR_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BRCOND(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_BUILD_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_CONVERT_RNDSAT(SDNode *N);
+ SDValue PromoteIntOp_INSERT_VECTOR_ELT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteIntOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue PromoteIntOp_CONCAT_VECTORS(SDNode *N);
+ SDValue PromoteIntOp_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue PromoteIntOp_SELECT(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_SETCC(SDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_Shift(SDNode *N);
+ SDValue PromoteIntOp_SIGN_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_SINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_TRUNCATE(SDNode *N);
+ SDValue PromoteIntOp_UINT_TO_FP(SDNode *N);
+ SDValue PromoteIntOp_ZERO_EXTEND(SDNode *N);
+ SDValue PromoteIntOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MLOAD(MaskedLoadSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
+ SDValue PromoteIntOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+
+ void PromoteSetCCOperands(SDValue &LHS,SDValue &RHS, ISD::CondCode Code);
+
+ //===--------------------------------------------------------------------===//
+ // Integer Expansion Support: LegalizeIntegerTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was expanded into two integers of half
+ /// the size, this returns the two halves. The low bits of Op are exactly
+ /// equal to the bits of Lo; the high bits exactly equal Hi.
+ /// For example, if Op is an i64 which was expanded into two i32's, then this
+ /// method returns the two i32's, with Lo being equal to the lower 32 bits of
+ /// Op, and Hi being equal to the upper 32 bits.
+ void GetExpandedInteger(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedInteger(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Integer Result Expansion.
+ void ExpandIntegerResult(SDNode *N, unsigned ResNo);
+ void ExpandIntRes_ANY_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertSext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_AssertZext (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Constant (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTLZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTPOP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_CTTZ (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_LOAD (LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_READCYCLECOUNTER (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SIGN_EXTEND_INREG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_TRUNCATE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ZERO_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_SINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_FP_TO_UINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_Logical (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_ADDSUBE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_BITREVERSE (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_BSWAP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_MUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_SREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_Shift (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_MINMAX (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_SADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_UADDSUBO (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandIntRes_XMULO (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandIntRes_ATOMIC_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void ExpandShiftByConstant(SDNode *N, const APInt &Amt,
+ SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithKnownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+ bool ExpandShiftWithUnknownAmountBit(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Integer Operand Expansion.
+ bool ExpandIntegerOperand(SDNode *N, unsigned OperandNo);
+ SDValue ExpandIntOp_BR_CC(SDNode *N);
+ SDValue ExpandIntOp_SELECT_CC(SDNode *N);
+ SDValue ExpandIntOp_SETCC(SDNode *N);
+ SDValue ExpandIntOp_SETCCE(SDNode *N);
+ SDValue ExpandIntOp_Shift(SDNode *N);
+ SDValue ExpandIntOp_SINT_TO_FP(SDNode *N);
+ SDValue ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue ExpandIntOp_TRUNCATE(SDNode *N);
+ SDValue ExpandIntOp_UINT_TO_FP(SDNode *N);
+ SDValue ExpandIntOp_RETURNADDR(SDNode *N);
+ SDValue ExpandIntOp_ATOMIC_STORE(SDNode *N);
+
+ void IntegerExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, const SDLoc &dl);
+
+ //===--------------------------------------------------------------------===//
+ // Float to Integer Conversion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given an operand Op of float type, returns its softened integer form if
+ /// Op is not supported by the target HW and was converted to an integer.
+ /// The integer contains exactly the same bits as Op - only the type changed.
+ /// For example, if Op is an f32 which was softened to an i32, this method
+ /// returns an i32 whose bits coincide with those of Op.
+ /// If Op can be supported efficiently by the target HW, or the operand must
+ /// stay in a register, Op is not converted to an integer; in that case the
+ /// given Op is returned unchanged.
+ SDValue GetSoftenedFloat(SDValue Op) {
+ SDValue &SoftenedOp = SoftenedFloats[Op];
+ if (!SoftenedOp.getNode() &&
+ isSimpleLegalType(Op.getValueType()))
+ return Op;
+ RemapValue(SoftenedOp);
+ assert(SoftenedOp.getNode() && "Operand wasn't converted to integer?");
+ return SoftenedOp;
+ }
+ void SetSoftenedFloat(SDValue Op, SDValue Result);
+
+ // Call ReplaceValueWith(SDValue(N, ResNo), Res) if necessary.
+ void ReplaceSoftenFloatResult(SDNode *N, unsigned ResNo, SDValue &NewRes) {
+ // When the result type can be kept in HW registers, the converted NewRes
+ // node could have the same type. By calling ReplaceValueWith here to update
+ // all users, we avoid having to clone every user of N in SoftenFloatOperand
+ // or other legalization functions.
+ if (NewRes.getNode() != N && isLegalInHWReg(N->getValueType(ResNo)))
+ ReplaceValueWith(SDValue(N, ResNo), NewRes);
+ }
+
+ // Convert Float Results to Integer for Non-HW-supported Operations.
+ bool SoftenFloatResult(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_BITCAST(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_BUILD_PAIR(SDNode *N);
+ SDValue SoftenFloatRes_ConstantFP(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftenFloatRes_FABS(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FMINNUM(SDNode *N);
+ SDValue SoftenFloatRes_FMAXNUM(SDNode *N);
+ SDValue SoftenFloatRes_FADD(SDNode *N);
+ SDValue SoftenFloatRes_FCEIL(SDNode *N);
+ SDValue SoftenFloatRes_FCOPYSIGN(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FCOS(SDNode *N);
+ SDValue SoftenFloatRes_FDIV(SDNode *N);
+ SDValue SoftenFloatRes_FEXP(SDNode *N);
+ SDValue SoftenFloatRes_FEXP2(SDNode *N);
+ SDValue SoftenFloatRes_FFLOOR(SDNode *N);
+ SDValue SoftenFloatRes_FLOG(SDNode *N);
+ SDValue SoftenFloatRes_FLOG2(SDNode *N);
+ SDValue SoftenFloatRes_FLOG10(SDNode *N);
+ SDValue SoftenFloatRes_FMA(SDNode *N);
+ SDValue SoftenFloatRes_FMUL(SDNode *N);
+ SDValue SoftenFloatRes_FNEARBYINT(SDNode *N);
+ SDValue SoftenFloatRes_FNEG(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_FP_EXTEND(SDNode *N);
+ SDValue SoftenFloatRes_FP16_TO_FP(SDNode *N);
+ SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatRes_FPOW(SDNode *N);
+ SDValue SoftenFloatRes_FPOWI(SDNode *N);
+ SDValue SoftenFloatRes_FREM(SDNode *N);
+ SDValue SoftenFloatRes_FRINT(SDNode *N);
+ SDValue SoftenFloatRes_FROUND(SDNode *N);
+ SDValue SoftenFloatRes_FSIN(SDNode *N);
+ SDValue SoftenFloatRes_FSQRT(SDNode *N);
+ SDValue SoftenFloatRes_FSUB(SDNode *N);
+ SDValue SoftenFloatRes_FTRUNC(SDNode *N);
+ SDValue SoftenFloatRes_LOAD(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_SELECT(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_SELECT_CC(SDNode *N, unsigned ResNo);
+ SDValue SoftenFloatRes_UNDEF(SDNode *N);
+ SDValue SoftenFloatRes_VAARG(SDNode *N);
+ SDValue SoftenFloatRes_XINT_TO_FP(SDNode *N);
+
+ // Return true if we can skip softening the given operand or SDNode because
+ // it was already softened by SoftenFloatResult and references to the operand
+ // were replaced by ReplaceValueWith.
+ bool CanSkipSoftenFloatOperand(SDNode *N, unsigned OpNo);
+
+ // Convert Float Operand to Integer for Non-HW-supported Operations.
+ bool SoftenFloatOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftenFloatOp_BITCAST(SDNode *N);
+ SDValue SoftenFloatOp_BR_CC(SDNode *N);
+ SDValue SoftenFloatOp_FP_EXTEND(SDNode *N);
+ SDValue SoftenFloatOp_FP_ROUND(SDNode *N);
+ SDValue SoftenFloatOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftenFloatOp_SELECT_CC(SDNode *N);
+ SDValue SoftenFloatOp_SETCC(SDNode *N);
+ SDValue SoftenFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Float Expansion Support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed operand Op which was expanded into two floating-point
+ /// values of half the size, this returns the two halves.
+ /// The low bits of Op are exactly equal to the bits of Lo; the high bits
+ /// exactly equal Hi. For example, if Op is a ppcf128 which was expanded
+ /// into two f64's, then this method returns the two f64's, with Lo being
+ /// equal to the lower 64 bits of Op, and Hi to the upper 64 bits.
+ void GetExpandedFloat(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetExpandedFloat(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Float Result Expansion.
+ void ExpandFloatResult(SDNode *N, unsigned ResNo);
+ void ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FABS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FADD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCEIL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCOPYSIGN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FCOS (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FDIV (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FEXP2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FFLOOR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG2 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FLOG10 (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMA (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FMUL (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEARBYINT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FNEG (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FROUND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FTRUNC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_LOAD (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_XINT_TO_FP(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Float Operand Expansion.
+ bool ExpandFloatOperand(SDNode *N, unsigned OperandNo);
+ SDValue ExpandFloatOp_BR_CC(SDNode *N);
+ SDValue ExpandFloatOp_FCOPYSIGN(SDNode *N);
+ SDValue ExpandFloatOp_FP_ROUND(SDNode *N);
+ SDValue ExpandFloatOp_FP_TO_SINT(SDNode *N);
+ SDValue ExpandFloatOp_FP_TO_UINT(SDNode *N);
+ SDValue ExpandFloatOp_SELECT_CC(SDNode *N);
+ SDValue ExpandFloatOp_SETCC(SDNode *N);
+ SDValue ExpandFloatOp_STORE(SDNode *N, unsigned OpNo);
+
+ void FloatExpandSetCCOperands(SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode, const SDLoc &dl);
+
+ //===--------------------------------------------------------------------===//
+ // Float promotion support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ SDValue GetPromotedFloat(SDValue Op) {
+ SDValue &PromotedOp = PromotedFloats[Op];
+ RemapValue(PromotedOp);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetPromotedFloat(SDValue Op, SDValue Result);
+
+ void PromoteFloatResult(SDNode *N, unsigned ResNo);
+ SDValue PromoteFloatRes_BITCAST(SDNode *N);
+ SDValue PromoteFloatRes_BinOp(SDNode *N);
+ SDValue PromoteFloatRes_ConstantFP(SDNode *N);
+ SDValue PromoteFloatRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue PromoteFloatRes_FCOPYSIGN(SDNode *N);
+ SDValue PromoteFloatRes_FMAD(SDNode *N);
+ SDValue PromoteFloatRes_FPOWI(SDNode *N);
+ SDValue PromoteFloatRes_FP_ROUND(SDNode *N);
+ SDValue PromoteFloatRes_LOAD(SDNode *N);
+ SDValue PromoteFloatRes_SELECT(SDNode *N);
+ SDValue PromoteFloatRes_SELECT_CC(SDNode *N);
+ SDValue PromoteFloatRes_UnaryOp(SDNode *N);
+ SDValue PromoteFloatRes_UNDEF(SDNode *N);
+ SDValue PromoteFloatRes_XINT_TO_FP(SDNode *N);
+
+ bool PromoteFloatOperand(SDNode *N, unsigned ResNo);
+ SDValue PromoteFloatOp_BITCAST(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FP_EXTEND(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_FP_TO_XINT(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_STORE(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Scalarization Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed one-element vector Op which was scalarized to its
+ /// element type, this returns the element. For example, if Op is a v1i32,
+ /// Op = < i32 val >, this method returns val, an i32.
+ SDValue GetScalarizedVector(SDValue Op) {
+ SDValue &ScalarizedOp = ScalarizedVectors[Op];
+ RemapValue(ScalarizedOp);
+ assert(ScalarizedOp.getNode() && "Operand wasn't scalarized?");
+ return ScalarizedOp;
+ }
+ void SetScalarizedVector(SDValue Op, SDValue Result);
+
+ // Vector Result Scalarization: <1 x ty> -> ty.
+ void ScalarizeVectorResult(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo);
+ SDValue ScalarizeVecRes_BinOp(SDNode *N);
+ SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
+ SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
+ SDValue ScalarizeVecRes_InregOp(SDNode *N);
+
+ SDValue ScalarizeVecRes_BITCAST(SDNode *N);
+ SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N);
+ SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_FP_ROUND(SDNode *N);
+ SDValue ScalarizeVecRes_FPOWI(SDNode *N);
+ SDValue ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecRes_LOAD(LoadSDNode *N);
+ SDValue ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N);
+ SDValue ScalarizeVecRes_VSELECT(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT(SDNode *N);
+ SDValue ScalarizeVecRes_SELECT_CC(SDNode *N);
+ SDValue ScalarizeVecRes_SETCC(SDNode *N);
+ SDValue ScalarizeVecRes_UNDEF(SDNode *N);
+ SDValue ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N);
+ SDValue ScalarizeVecRes_VSETCC(SDNode *N);
+
+ // Vector Operand Scalarization: <1 x ty> -> ty.
+ bool ScalarizeVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_BITCAST(SDNode *N);
+ SDValue ScalarizeVecOp_UnaryOp(SDNode *N);
+ SDValue ScalarizeVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue ScalarizeVecOp_VSELECT(SDNode *N);
+ SDValue ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Splitting Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed vector Op which was split into vectors of half the size,
+ /// this method returns the halves. The first elements of Op coincide with the
+ /// elements of Lo; the remaining elements of Op coincide with the elements of
+ /// Hi: Op is what you would get by concatenating Lo and Hi.
+ /// For example, if Op is a v8i32 that was split into two v4i32's, then this
+ /// method returns the two v4i32's, with Lo corresponding to the first 4
+ /// elements of Op, and Hi to the last 4 elements.
+ void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
+ void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+
+ // Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
+ void SplitVectorResult(SDNode *N, unsigned OpNo);
+ void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_InregOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ void SplitVecRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FPOWI(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_LOAD(LoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_MLOAD(MaskedLoadSDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_MGATHER(MaskedGatherSDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N, SDValue &Lo,
+ SDValue &Hi);
+
+ // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>.
+ bool SplitVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_UnaryOp(SDNode *N);
+ SDValue SplitVecOp_TruncateHelper(SDNode *N);
+
+ SDValue SplitVecOp_BITCAST(SDNode *N);
+ SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_MGATHER(MaskedGatherSDNode *N, unsigned OpNo);
+ SDValue SplitVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue SplitVecOp_VSETCC(SDNode *N);
+ SDValue SplitVecOp_FP_ROUND(SDNode *N);
+ SDValue SplitVecOp_FCOPYSIGN(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Given a processed vector Op which was widened into a larger vector, this
+ /// method returns the larger vector. The elements of the returned vector
+ /// consist of the elements of Op followed by elements containing rubbish.
+ /// For example, if Op is a v2i32 that was widened to a v4i32, then this
+ /// method returns a v4i32 for which the first two elements are the same as
+ /// those of Op, while the last two elements contain rubbish.
+ SDValue GetWidenedVector(SDValue Op) {
+ SDValue &WidenedOp = WidenedVectors[Op];
+ RemapValue(WidenedOp);
+ assert(WidenedOp.getNode() && "Operand wasn't widened?");
+ return WidenedOp;
+ }
+ void SetWidenedVector(SDValue Op, SDValue Result);
+
+ // Widen Vector Result Promotion.
+ void WidenVectorResult(SDNode *N, unsigned ResNo);
+ SDValue WidenVecRes_MERGE_VALUES(SDNode* N, unsigned ResNo);
+ SDValue WidenVecRes_BITCAST(SDNode* N);
+ SDValue WidenVecRes_BUILD_VECTOR(SDNode* N);
+ SDValue WidenVecRes_CONCAT_VECTORS(SDNode* N);
+ SDValue WidenVecRes_CONVERT_RNDSAT(SDNode* N);
+ SDValue WidenVecRes_EXTEND_VECTOR_INREG(SDNode* N);
+ SDValue WidenVecRes_EXTRACT_SUBVECTOR(SDNode* N);
+ SDValue WidenVecRes_INSERT_VECTOR_ELT(SDNode* N);
+ SDValue WidenVecRes_LOAD(SDNode* N);
+ SDValue WidenVecRes_MLOAD(MaskedLoadSDNode* N);
+ SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
+ SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
+ SDValue WidenVecRes_SELECT(SDNode* N);
+ SDValue WidenVecRes_SELECT_CC(SDNode* N);
+ SDValue WidenVecRes_SETCC(SDNode* N);
+ SDValue WidenVecRes_UNDEF(SDNode *N);
+ SDValue WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N);
+ SDValue WidenVecRes_VSETCC(SDNode* N);
+
+ SDValue WidenVecRes_Ternary(SDNode *N);
+ SDValue WidenVecRes_Binary(SDNode *N);
+ SDValue WidenVecRes_BinaryCanTrap(SDNode *N);
+ SDValue WidenVecRes_Convert(SDNode *N);
+ SDValue WidenVecRes_FCOPYSIGN(SDNode *N);
+ SDValue WidenVecRes_POWI(SDNode *N);
+ SDValue WidenVecRes_Shift(SDNode *N);
+ SDValue WidenVecRes_Unary(SDNode *N);
+ SDValue WidenVecRes_InregOp(SDNode *N);
+
+ // Widen Vector Operand.
+ bool WidenVectorOperand(SDNode *N, unsigned OpNo);
+ SDValue WidenVecOp_BITCAST(SDNode *N);
+ SDValue WidenVecOp_CONCAT_VECTORS(SDNode *N);
+ SDValue WidenVecOp_EXTEND(SDNode *N);
+ SDValue WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N);
+ SDValue WidenVecOp_STORE(SDNode* N);
+ SDValue WidenVecOp_MSTORE(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_MSCATTER(SDNode* N, unsigned OpNo);
+ SDValue WidenVecOp_SETCC(SDNode* N);
+
+ SDValue WidenVecOp_Convert(SDNode *N);
+ SDValue WidenVecOp_FCOPYSIGN(SDNode *N);
+
+ //===--------------------------------------------------------------------===//
+ // Vector Widening Utilities Support: LegalizeVectorTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ /// Helper function to generate a set of loads to load a vector with a
+ /// resulting wider type. It takes:
+ /// LdChain: list of chains for the load to be generated.
+ /// Ld: load to widen
+ SDValue GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD);
+
+ /// Helper function to generate a set of extension loads to load a vector with
+ /// a resulting wider type. It takes:
+ /// LdChain: list of chains for the load to be generated.
+ /// Ld: load to widen
+ /// ExtType: extension element type
+ SDValue GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD, ISD::LoadExtType ExtType);
+
+ /// Helper function to generate a set of stores to store a widened vector
+ /// into non-widened memory.
+ /// StChain: list of chains for the stores we have generated
+ /// ST: store of a widened value
+ void GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain, StoreSDNode *ST);
+
+ /// Helper function to generate a set of truncating stores to store a widened
+ /// vector into non-widened memory.
+ /// StChain: list of chains for the stores we have generated
+ /// ST: store of a widened value
+ void GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
+ StoreSDNode *ST);
+
+ /// Modifies a vector input (widens or narrows it) to a vector of NVT. The
+ /// input vector must have the same element type as NVT.
+ /// When FillWithZeroes is "on" the vector will be widened with zeroes.
+ /// By default, the vector will be widened with undefined values.
+ SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
+
+ //===--------------------------------------------------------------------===//
+ // Generic Splitting: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+ // Legalization methods which rely only on the fact that the illegal type is
+ // split into two not necessarily identical types. As such, they can be used
+ // for splitting vectors as well as for expanding integers and floats.
+
+ void GetSplitOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isVector())
+ GetSplitVector(Op, Lo, Hi);
+ else if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+ /// Use ISD::EXTRACT_ELEMENT nodes to extract the low and high parts of the
+ /// given value.
+ void GetPairElements(SDValue Pair, SDValue &Lo, SDValue &Hi);
+
+ // Generic Result Splitting.
+ void SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
+ void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ //===--------------------------------------------------------------------===//
+ // Generic Expansion: LegalizeTypesGeneric.cpp
+ //===--------------------------------------------------------------------===//
+
+ // Legalization methods which rely only on the fact that the illegal type is
+ // split into two identical types of half the size, and that the Lo/Hi part
+ // is stored first in memory on little/big-endian machines, followed by the
+ // Hi/Lo part. As such, they can be used for expanding integers and floats.
+
+ void GetExpandedOp(SDValue Op, SDValue &Lo, SDValue &Hi) {
+ if (Op.getValueType().isInteger())
+ GetExpandedInteger(Op, Lo, Hi);
+ else
+ GetExpandedFloat(Op, Lo, Hi);
+ }
+
+
+ /// This function will split the integer \p Op into \p NumElements
+ /// operations of type \p EltVT and store them in \p Ops.
+ void IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops, EVT EltVT);
+
+ // Generic Result Expansion.
+ void ExpandRes_MERGE_VALUES (SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi);
+ void ExpandRes_BITCAST (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_BUILD_PAIR (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_ELEMENT (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_NormalLoad (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandRes_VAARG (SDNode *N, SDValue &Lo, SDValue &Hi);
+
+ // Generic Operand Expansion.
+ SDValue ExpandOp_BITCAST (SDNode *N);
+ SDValue ExpandOp_BUILD_VECTOR (SDNode *N);
+ SDValue ExpandOp_EXTRACT_ELEMENT (SDNode *N);
+ SDValue ExpandOp_INSERT_VECTOR_ELT(SDNode *N);
+ SDValue ExpandOp_SCALAR_TO_VECTOR (SDNode *N);
+ SDValue ExpandOp_NormalStore (SDNode *N, unsigned OpNo);
+};
+
+} // end namespace llvm.
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
new file mode 100644
index 000000000000..665180e119b7
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -0,0 +1,555 @@
+//===-------- LegalizeTypesGeneric.cpp - Generic type legalization --------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements generic type expansion and splitting for LegalizeTypes.
+// The routines here perform legalization when the details of the type (such as
+// whether it is an integer or a float) do not matter.
+// Expansion is the act of changing a computation in an illegal type to be a
+// computation in two identical registers of a smaller type. The Lo/Hi part
+// is required to be stored first in memory on little/big-endian machines.
+// Splitting is the act of changing a computation in an illegal type to be a
+// computation in two not necessarily identical registers of a smaller type.
+// There are no requirements on how the type is represented in memory.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DataLayout.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Generic Result Expansion.
+//===----------------------------------------------------------------------===//
+
+// These routines assume that the Lo/Hi part is stored first in memory on
+// little/big-endian machines, followed by the Hi/Lo part. This means that
+// they cannot be used as is on vectors, for which Lo is always stored first.
+void DAGTypeLegalizer::ExpandRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetExpandedOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT OutVT = N->getValueType(0);
+ EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ SDLoc dl(N);
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ break;
+ case TargetLowering::TypePromoteFloat:
+ llvm_unreachable("Bitcast of a promotion-needing float should never need "
+ "expansion");
+ case TargetLowering::TypeSoftenFloat: {
+ // Expand the floating point operand only if it was converted to integers.
+ // Otherwise, it is a legal type like f128 that can be saved in a register.
+ auto SoftenedOp = GetSoftenedFloat(InOp);
+ if (SoftenedOp == InOp)
+ break;
+ SplitInteger(SoftenedOp, Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ }
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat: {
+ auto &DL = DAG.getDataLayout();
+ // Convert the expanded pieces of the input.
+ GetExpandedOp(InOp, Lo, Hi);
+ if (TLI.hasBigEndianPartOrdering(InVT, DL) !=
+ TLI.hasBigEndianPartOrdering(OutVT, DL))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ }
+ case TargetLowering::TypeSplitVector:
+ GetSplitVector(InOp, Lo, Hi);
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ case TargetLowering::TypeScalarizeVector:
+ // Convert the element instead.
+ SplitInteger(BitConvertToInteger(GetScalarizedVector(InOp)), Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ case TargetLowering::TypeWidenVector: {
+ assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
+ InOp = GetWidenedVector(InOp);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(InVT);
+ std::tie(Lo, Hi) = DAG.SplitVector(InOp, dl, LoVT, HiVT);
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
+ return;
+ }
+ }
+
+ if (InVT.isVector() && OutVT.isInteger()) {
+ // Handle cases like i64 = BITCAST v1i64 on x86, where the operand
+ // is legal but the result is not.
+ unsigned NumElems = 2;
+ EVT ElemVT = NOutVT;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+
+ // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
+ while (!isTypeLegal(NVT)) {
+ unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
+ // If the element size is smaller than byte, bail.
+ if (NewSizeInBits < 8)
+ break;
+ NumElems *= 2;
+ ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
+ NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+ }
+
+ if (isTypeLegal(NVT)) {
+ SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Vals.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ElemVT, CastInOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+
+ // Build Lo, Hi pair by pairing extracted elements if needed.
+ unsigned Slot = 0;
+ for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
+ // Each iteration will BUILD_PAIR two nodes and append the result until
+ // there are only two nodes left, i.e. Lo and Hi.
+ SDValue LHS = Vals[Slot];
+ SDValue RHS = Vals[Slot + 1];
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LHS, RHS);
+
+ Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
+ EVT::getIntegerVT(
+ *DAG.getContext(),
+ LHS.getValueType().getSizeInBits() << 1),
+ LHS, RHS));
+ }
+ Lo = Vals[Slot++];
+ Hi = Vals[Slot++];
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ return;
+ }
+ }
+
+ // Lower the bit-convert to a store/load from the stack.
+ assert(NOutVT.isByteSized() && "Expanded type not byte sized!");
+
+ // Create the stack frame object. Make sure it is aligned for both
+ // the source and expanded destination types.
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(
+ NOutVT.getTypeForEVT(*DAG.getContext()));
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ // Emit a store to the stack slot.
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
+
+ // Load the first half from the stack slot.
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(IncrementSize, dl,
+ StackPtr.getValueType()));
+
+ // Load the second half from the stack slot.
+ Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
+ PtrInfo.getWithOffset(IncrementSize),
+ MinAlign(Alignment, IncrementSize));
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+}
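+
+// The fallback above is expansion through memory: spill the operand once,
+// then read it back as two half-sized pieces at byte offsets 0 and Size/2,
+// swapping the pieces on big-endian targets. A standalone sketch for an f64
+// expanded into two i32's, kept out of the build (the helper name is ours
+// and an 8-byte double is assumed):
+#if 0
+#include <cstdint>
+#include <cstring>
+#include <utility>
+
+static void ExpandThroughMemory(double In, uint32_t &Lo, uint32_t &Hi,
+ bool BigEndian) {
+ unsigned char Slot[sizeof(double)]; // the stack temporary
+ std::memcpy(Slot, &In, sizeof(In)); // one store of the whole value
+ std::memcpy(&Lo, Slot, sizeof(Lo)); // load the first half
+ std::memcpy(&Hi, Slot + sizeof(Lo), sizeof(Hi)); // load the second half
+ if (BigEndian) // the high part was stored first
+ std::swap(Lo, Hi);
+}
+#endif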
+
+void DAGTypeLegalizer::ExpandRes_BUILD_PAIR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Return the operands.
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_ELEMENT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ SDValue Part = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ?
+ Hi : Lo;
+
+ assert(Part.getValueType() == N->getValueType(0) &&
+ "Type twice as big as expanded type not itself expanded!");
+
+ GetPairElements(Part, Lo, Hi);
+}
+
+void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue OldVec = N->getOperand(0);
+ unsigned OldElts = OldVec.getValueType().getVectorNumElements();
+ EVT OldEltVT = OldVec.getValueType().getVectorElementType();
+ SDLoc dl(N);
+
+ // Convert to a vector of the expanded element type, for example
+ // <3 x i64> -> <6 x i32>.
+ EVT OldVT = N->getValueType(0);
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+
+ if (OldVT != OldEltVT) {
+ // The result of EXTRACT_VECTOR_ELT may be larger than the element type of
+ // the input vector. If so, extend the elements of the input vector to the
+ // same bitwidth as the result before expanding.
+ assert(OldEltVT.bitsLT(OldVT) && "Result type smaller than element type!");
+ EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts);
+ OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0));
+ }
+
+ SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
+ EVT::getVectorVT(*DAG.getContext(),
+ NewVT, 2*OldElts),
+ OldVec);
+
+ // Extract the elements at 2 * Idx and 2 * Idx + 1 from the new vector.
+ SDValue Idx = N->getOperand(1);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx,
+ DAG.getConstant(1, dl, Idx.getValueType()));
+ Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, NewVec, Idx);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+}
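+
+// Worked example for the routine above (illustrative, assuming i64 expands to
+// two i32 parts): extracting element Idx from a v2i64 bitcasts the vector to
+// v4i32 and then takes
+//   Lo = extract_vector_elt v4i32, 2*Idx
+//   Hi = extract_vector_elt v4i32, 2*Idx + 1
+// with Lo and Hi swapped on big-endian targets.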
+
+void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isNormalLoad(N) && "This routine only for normal loads!");
+ SDLoc dl(N);
+
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ EVT ValueVT = LD->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ unsigned Alignment = LD->getAlignment();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+
+ Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), Alignment,
+ LD->getMemOperand()->getFlags(), AAInfo);
+
+ // Increment the pointer to the other half.
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
+}
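+
+// Worked example for the routine above (illustrative, assuming an i64 load
+// expanded into i32 parts): the single load becomes
+//   Lo = load i32 from Ptr
+//   Hi = load i32 from Ptr + 4   ; alignment MinAlign(Alignment, 4)
+// joined by a TokenFactor of the two load chains, with Lo and Hi swapped when
+// the target uses big-endian part ordering.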
+
+void DAGTypeLegalizer::ExpandRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT OVT = N->getValueType(0);
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
+ SDValue Chain = N->getOperand(0);
+ SDValue Ptr = N->getOperand(1);
+ SDLoc dl(N);
+ const unsigned Align = N->getConstantOperandVal(3);
+
+ Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, N->getOperand(2), Align);
+ Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, N->getOperand(2), 0);
+
+ // Handle endianness of the load.
+ if (TLI.hasBigEndianPartOrdering(OVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), Hi.getValue(1));
+}
+
+
+//===--------------------------------------------------------------------===//
+// Generic Operand Expansion.
+//===--------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::IntegerToVector(SDValue Op, unsigned NumElements,
+ SmallVectorImpl<SDValue> &Ops,
+ EVT EltVT) {
+ assert(Op.getValueType().isInteger());
+ SDLoc DL(Op);
+ SDValue Parts[2];
+
+ if (NumElements > 1) {
+ NumElements >>= 1;
+ SplitInteger(Op, Parts[0], Parts[1]);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Parts[0], Parts[1]);
+ IntegerToVector(Parts[0], NumElements, Ops, EltVT);
+ IntegerToVector(Parts[1], NumElements, Ops, EltVT);
+ } else {
+ Ops.push_back(DAG.getNode(ISD::BITCAST, DL, EltVT, Op));
+ }
+}
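+
+// Worked example for IntegerToVector above (illustrative): called on an i64
+// with NumElements = 2 and EltVT = i32, SplitInteger produces the low and high
+// i32 halves, which are appended in memory order (low half first on
+// little-endian). With NumElements = 4 and EltVT = i16, each i32 half is split
+// once more, yielding four i16 pieces.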
+
+SDValue DAGTypeLegalizer::ExpandOp_BITCAST(SDNode *N) {
+ SDLoc dl(N);
+ if (N->getValueType(0).isVector()) {
+ // An illegal expanding type is being converted to a legal vector type.
+ // Make a two element vector out of the expanded parts and convert that
+ // instead, but only if the new vector type is legal (otherwise there
+ // is no point, and it might create expansion loops). For example, on
+ // x86 this turns v1i64 = BITCAST i64 into v1i64 = BITCAST v2i32.
+ //
+ // FIXME: I'm not sure why we are first trying to split the input into
+ // a 2 element vector, so I'm leaving it here to maintain the current
+ // behavior.
+ unsigned NumElts = 2;
+ EVT OVT = N->getOperand(0).getValueType();
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(),
+ TLI.getTypeToTransformTo(*DAG.getContext(), OVT),
+ NumElts);
+ if (!isTypeLegal(NVT)) {
+ // If we can't find a legal type by splitting the integer in half,
+ // then we can use the node's value type.
+ NumElts = N->getValueType(0).getVectorNumElements();
+ NVT = N->getValueType(0);
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ IntegerToVector(N->getOperand(0), NumElts, Ops, NVT.getVectorElementType());
+
+ SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT,
+ makeArrayRef(Ops.data(), NumElts));
+ return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), Vec);
+ }
+
+ // Otherwise, store to a temporary and load out again as the new type.
+ return CreateStackStoreLoad(N->getOperand(0), N->getValueType(0));
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_BUILD_VECTOR(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ EVT OldVT = N->getOperand(0).getValueType();
+ EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT);
+ SDLoc dl(N);
+
+ assert(OldVT == VecVT.getVectorElementType() &&
+ "BUILD_VECTOR operand type doesn't match vector element type!");
+
+ // Build a vector of twice the length out of the expanded elements.
+ // For example <3 x i64> -> <6 x i32>.
+ std::vector<SDValue> NewElts;
+ NewElts.reserve(NumElts*2);
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(i), Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ NewElts.push_back(Lo);
+ NewElts.push_back(Hi);
+ }
+
+ SDValue NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ EVT::getVectorVT(*DAG.getContext(),
+ NewVT, NewElts.size()),
+ NewElts);
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_EXTRACT_ELEMENT(SDNode *N) {
+ SDValue Lo, Hi;
+ GetExpandedOp(N->getOperand(0), Lo, Hi);
+ return cast<ConstantSDNode>(N->getOperand(1))->getZExtValue() ? Hi : Lo;
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_INSERT_VECTOR_ELT(SDNode *N) {
+ // The vector type is legal but the element type needs expansion.
+ EVT VecVT = N->getValueType(0);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ SDLoc dl(N);
+
+ SDValue Val = N->getOperand(1);
+ EVT OldEVT = Val.getValueType();
+ EVT NewEVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldEVT);
+
+ assert(OldEVT == VecVT.getVectorElementType() &&
+ "Inserted element type doesn't match vector element type!");
+
+ // Bitconvert to a vector of twice the length with elements of the expanded
+ // type, insert the expanded vector elements, and then convert back.
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewEVT, NumElts*2);
+ SDValue NewVec = DAG.getNode(ISD::BITCAST, dl,
+ NewVecVT, N->getOperand(0));
+
+ SDValue Lo, Hi;
+ GetExpandedOp(Val, Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ SDValue Idx = N->getOperand(2);
+ Idx = DAG.getNode(ISD::ADD, dl, Idx.getValueType(), Idx, Idx);
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Lo, Idx);
+ Idx = DAG.getNode(ISD::ADD, dl,
+ Idx.getValueType(), Idx,
+ DAG.getConstant(1, dl, Idx.getValueType()));
+ NewVec = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, NewVec, Hi, Idx);
+
+ // Convert the new vector to the old vector type.
+ return DAG.getNode(ISD::BITCAST, dl, VecVT, NewVec);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_SCALAR_TO_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+ assert(VT.getVectorElementType() == N->getOperand(0).getValueType() &&
+ "SCALAR_TO_VECTOR operand type doesn't match vector element type!");
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ Ops[0] = N->getOperand(0);
+ SDValue UndefVal = DAG.getUNDEF(Ops[0].getValueType());
+ for (unsigned i = 1; i < NumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
+SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
+ assert(ISD::isNormalStore(N) && "This routine only for normal stores!");
+ assert(OpNo == 1 && "Can only expand the stored value so far");
+ SDLoc dl(N);
+
+ StoreSDNode *St = cast<StoreSDNode>(N);
+ EVT ValueVT = St->getValue().getValueType();
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
+ SDValue Chain = St->getChain();
+ SDValue Ptr = St->getBasePtr();
+ unsigned Alignment = St->getAlignment();
+ AAMDNodes AAInfo = St->getAAInfo();
+
+ assert(NVT.isByteSized() && "Expanded type not byte sized!");
+ unsigned IncrementSize = NVT.getSizeInBits() / 8;
+
+ SDValue Lo, Hi;
+ GetExpandedOp(St->getValue(), Lo, Hi);
+
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), Alignment,
+ St->getMemOperand()->getFlags(), AAInfo);
+
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getStore(Chain, dl, Hi, Ptr,
+ St->getPointerInfo().getWithOffset(IncrementSize),
+ MinAlign(Alignment, IncrementSize),
+ St->getMemOperand()->getFlags(), AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
+}
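+
+// Worked example for the routine above (illustrative, assuming an expanded i64
+// value stored on a 32-bit target): the store becomes
+//   store i32 Lo to Ptr
+//   store i32 Hi to Ptr + 4   ; alignment MinAlign(Alignment, 4)
+// with Lo and Hi swapped beforehand on targets with big-endian part ordering,
+// so the most-significant half lands at the lower address there. The two store
+// chains are joined with a TokenFactor.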
+
+
+//===--------------------------------------------------------------------===//
+// Generic Result Splitting.
+//===--------------------------------------------------------------------===//
+
+// Be careful to make no assumptions about which of Lo/Hi is stored first in
+// memory (for vectors it is always Lo first followed by Hi in the following
+// bytes; for integers and floats it is Lo first if and only if the machine is
+// little-endian).
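+//
+// For instance, splitting a vector result always places Lo in the
+// lower-addressed half of the original value's storage, whereas an expanded
+// integer places Lo at the lower address only on little-endian targets.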
+
+void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
+ SDValue &Lo, SDValue &Hi) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ GetSplitOp(Op, Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LL, LH, RL, RH, CL, CH;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(1), LL, LH);
+ GetSplitOp(N->getOperand(2), RL, RH);
+
+ SDValue Cond = N->getOperand(0);
+ CL = CH = Cond;
+ if (Cond.getValueType().isVector()) {
+ // Check if there are already split versions of the vector available and
+ // use those instead of splitting the mask operand again.
+ if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Cond, CL, CH);
+ else
+ std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
+ }
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LL.getValueType(), CL, LL, RL);
+ Hi = DAG.getNode(N->getOpcode(), dl, LH.getValueType(), CH, LH, RH);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT_CC(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LL, LH, RL, RH;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(2), LL, LH);
+ GetSplitOp(N->getOperand(3), RL, RH);
+
+ Lo = DAG.getNode(ISD::SELECT_CC, dl, LL.getValueType(), N->getOperand(0),
+ N->getOperand(1), LL, RL, N->getOperand(4));
+ Hi = DAG.getNode(ISD::SELECT_CC, dl, LH.getValueType(), N->getOperand(0),
+ N->getOperand(1), LH, RH, N->getOperand(4));
+}
+
+void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getUNDEF(LoVT);
+ Hi = DAG.getUNDEF(HiVT);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
new file mode 100644
index 000000000000..3c9cb17b58b2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -0,0 +1,1068 @@
+//===-- LegalizeVectorOps.cpp - Implement SelectionDAG::LegalizeVectors ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the SelectionDAG::LegalizeVectors method.
+//
+// The vector legalizer looks for vector operations which might need to be
+// scalarized and legalizes them. This is a separate step from Legalize because
+// scalarizing can introduce illegal types. For example, suppose we have an
+// ISD::SDIV of type v2i64 on x86-32. The type is legal (for example, addition
+// on a v2i64 is legal), but ISD::SDIV isn't legal, so we have to unroll the
+// operation, which introduces nodes with the illegal type i64 which must be
+// expanded. Similarly, suppose we have an ISD::SRA of type v16i8 on PowerPC;
+// the operation must be unrolled, which introduces nodes with the illegal
+// type i8 which must be promoted.
+//
+// This does not legalize vector manipulations like ISD::BUILD_VECTOR,
+// or operations that happen to take a vector which are custom-lowered;
+// the legalization for such operations never produces nodes
+// with illegal types, so it's okay to put off legalizing them until
+// SelectionDAG::Legalize runs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/Target/TargetLowering.h"
+using namespace llvm;
+
+namespace {
+class VectorLegalizer {
+ SelectionDAG& DAG;
+ const TargetLowering &TLI;
+ bool Changed; // Keep track of whether anything changed
+
+ /// For nodes that are of legal width, and that have more than one use, this
+ /// map indicates what regularized operand to use. This allows us to avoid
+ /// legalizing the same thing more than once.
+ SmallDenseMap<SDValue, SDValue, 64> LegalizedNodes;
+
+ /// \brief Adds a node to the translation cache.
+ void AddLegalizedOperand(SDValue From, SDValue To) {
+ LegalizedNodes.insert(std::make_pair(From, To));
+ // If someone requests legalization of the new node, return itself.
+ if (From != To)
+ LegalizedNodes.insert(std::make_pair(To, To));
+ }
+
+ /// \brief Legalizes the given node.
+ SDValue LegalizeOp(SDValue Op);
+
+ /// \brief Assuming the node is legal, "legalize" the results.
+ SDValue TranslateLegalizeResults(SDValue Op, SDValue Result);
+
+ /// \brief Implements unrolling a VSETCC.
+ SDValue UnrollVSETCC(SDValue Op);
+
+ /// \brief Implement expand-based legalization of vector operations.
+ ///
+ /// This is just a high-level routine to dispatch to specific code paths for
+ /// operations to legalize them.
+ SDValue Expand(SDValue Op);
+
+ /// \brief Implements expansion for UINT_TO_FLOAT; falls back to
+ /// UnrollVectorOp if SINT_TO_FP and SRL on vectors aren't legal.
+ ///
+ /// The FNEG expansion, which falls back to UnrollVectorOp if FSUB isn't
+ /// legal, is handled by ExpandFNEG below.
+ SDValue ExpandUINT_TO_FLOAT(SDValue Op);
+
+ /// \brief Implement expansion for SIGN_EXTEND_INREG using SRL and SRA.
+ SDValue ExpandSEXTINREG(SDValue Op);
+
+ /// \brief Implement expansion for ANY_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and bitcasts to the proper
+ /// type. The contents of the bits in the extended part of each element are
+ /// undef.
+ SDValue ExpandANY_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Implement expansion for SIGN_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place, bitcasts to the proper
+ /// type, then shifts left and arithmetic shifts right to introduce a sign
+ /// extension.
+ SDValue ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Implement expansion for ZERO_EXTEND_VECTOR_INREG.
+ ///
+ /// Shuffles the low lanes of the operand into place and blends zeros into
+ /// the remaining lanes, finally bitcasting to the proper type.
+ SDValue ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op);
+
+ /// \brief Expand bswap of vectors into a shuffle if legal.
+ SDValue ExpandBSWAP(SDValue Op);
+
+ /// \brief Implement vselect in terms of XOR, AND, OR when blend is not
+ /// supported by the target.
+ SDValue ExpandVSELECT(SDValue Op);
+ SDValue ExpandSELECT(SDValue Op);
+ SDValue ExpandLoad(SDValue Op);
+ SDValue ExpandStore(SDValue Op);
+ SDValue ExpandFNEG(SDValue Op);
+ SDValue ExpandBITREVERSE(SDValue Op);
+ SDValue ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op);
+
+ /// \brief Implements vector promotion.
+ ///
+ /// This is essentially just bitcasting the operands to a different type and
+ /// bitcasting the result back to the original type.
+ SDValue Promote(SDValue Op);
+
+ /// \brief Implements [SU]INT_TO_FP vector promotion.
+ ///
+ /// This is a [zs]ext of the input operand to the next size up.
+ SDValue PromoteINT_TO_FP(SDValue Op);
+
+ /// \brief Implements FP_TO_[SU]INT vector promotion of the result type.
+ ///
+ /// It is promoted to the next size up integer type. The result is then
+ /// truncated back to the original type.
+ SDValue PromoteFP_TO_INT(SDValue Op, bool isSigned);
+
+public:
+ /// \brief Begin legalizing the vector operations in the DAG.
+ bool Run();
+ VectorLegalizer(SelectionDAG& dag) :
+ DAG(dag), TLI(dag.getTargetLoweringInfo()), Changed(false) {}
+};
+
+bool VectorLegalizer::Run() {
+ // Before we start legalizing vector nodes, check if there are any vectors.
+ bool HasVectors = false;
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I) {
+ // Check if the values of the nodes contain vectors. We don't need to check
+ // the operands because we are going to check their values at some point.
+ for (SDNode::value_iterator J = I->value_begin(), E = I->value_end();
+ J != E; ++J)
+ HasVectors |= J->isVector();
+
+ // If we found a vector node we can start the legalization.
+ if (HasVectors)
+ break;
+ }
+
+ // If this basic block has no vectors then no need to legalize vectors.
+ if (!HasVectors)
+ return false;
+
+ // The legalize process is inherently a bottom-up recursive process (users
+ // legalize their uses before themselves). Given infinite stack space, we
+ // could just start legalizing on the root and traverse the whole graph. In
+ // practice however, this causes us to run out of stack space on large basic
+ // blocks. To avoid this problem, compute an ordering of the nodes where each
+ // node is only legalized after all of its operands are legalized.
+ DAG.AssignTopologicalOrder();
+ for (SelectionDAG::allnodes_iterator I = DAG.allnodes_begin(),
+ E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I)
+ LegalizeOp(SDValue(&*I, 0));
+
+ // Finally, it's possible the root changed. Get the new root.
+ SDValue OldRoot = DAG.getRoot();
+ assert(LegalizedNodes.count(OldRoot) && "Root didn't get legalized?");
+ DAG.setRoot(LegalizedNodes[OldRoot]);
+
+ LegalizedNodes.clear();
+
+ // Remove dead nodes now.
+ DAG.RemoveDeadNodes();
+
+ return Changed;
+}
+
+SDValue VectorLegalizer::TranslateLegalizeResults(SDValue Op, SDValue Result) {
+ // Generic legalization: just pass the operand through.
+ for (unsigned i = 0, e = Op.getNode()->getNumValues(); i != e; ++i)
+ AddLegalizedOperand(Op.getValue(i), Result.getValue(i));
+ return Result.getValue(Op.getResNo());
+}
+
+SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we always must cache transformed nodes.
+ DenseMap<SDValue, SDValue>::iterator I = LegalizedNodes.find(Op);
+ if (I != LegalizedNodes.end()) return I->second;
+
+ SDNode* Node = Op.getNode();
+
+ // Legalize the operands
+ SmallVector<SDValue, 8> Ops;
+ for (const SDValue &Op : Node->op_values())
+ Ops.push_back(LegalizeOp(Op));
+
+ SDValue Result = SDValue(DAG.UpdateNodeOperands(Op.getNode(), Ops), 0);
+
+ bool HasVectorValue = false;
+ if (Op.getOpcode() == ISD::LOAD) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ if (LD->getMemoryVT().isVector() && ExtType != ISD::NON_EXTLOAD)
+ switch (TLI.getLoadExtAction(LD->getExtensionType(), LD->getValueType(0),
+ LD->getMemoryVT())) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal:
+ return TranslateLegalizeResults(Op, Result);
+ case TargetLowering::Custom:
+ if (SDValue Lowered = TLI.LowerOperation(Result, DAG)) {
+ if (Lowered == Result)
+ return TranslateLegalizeResults(Op, Lowered);
+ Changed = true;
+ if (Lowered->getNumValues() != Op->getNumValues()) {
+ // This expanded to something other than the load. Assume the
+ // lowering code took care of any chain values, and just handle the
+ // returned value.
+ assert(Result.getValue(1).use_empty() &&
+ "There are still live users of the old chain!");
+ return LegalizeOp(Lowered);
+ }
+ return TranslateLegalizeResults(Op, Lowered);
+ }
+ case TargetLowering::Expand:
+ Changed = true;
+ return LegalizeOp(ExpandLoad(Op));
+ }
+ } else if (Op.getOpcode() == ISD::STORE) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+ EVT StVT = ST->getMemoryVT();
+ MVT ValVT = ST->getValue().getSimpleValueType();
+ if (StVT.isVector() && ST->isTruncatingStore())
+ switch (TLI.getTruncStoreAction(ValVT, StVT)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Legal:
+ return TranslateLegalizeResults(Op, Result);
+ case TargetLowering::Custom: {
+ SDValue Lowered = TLI.LowerOperation(Result, DAG);
+ Changed = Lowered != Result;
+ return TranslateLegalizeResults(Op, Lowered);
+ }
+ case TargetLowering::Expand:
+ Changed = true;
+ return LegalizeOp(ExpandStore(Op));
+ }
+ } else if (Op.getOpcode() == ISD::MSCATTER || Op.getOpcode() == ISD::MSTORE)
+ HasVectorValue = true;
+
+ for (SDNode::value_iterator J = Node->value_begin(), E = Node->value_end();
+ J != E;
+ ++J)
+ HasVectorValue |= J->isVector();
+ if (!HasVectorValue)
+ return TranslateLegalizeResults(Op, Result);
+
+ EVT QueryType;
+ switch (Op.getOpcode()) {
+ default:
+ return TranslateLegalizeResults(Op, Result);
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ case ISD::SDIVREM:
+ case ISD::UDIVREM:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ case ISD::BSWAP:
+ case ISD::BITREVERSE:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::SELECT:
+ case ISD::VSELECT:
+ case ISD::SELECT_CC:
+ case ISD::SETCC:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::SIGN_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+ case ISD::FCOPYSIGN:
+ case ISD::FSQRT:
+ case ISD::FSIN:
+ case ISD::FCOS:
+ case ISD::FPOWI:
+ case ISD::FPOW:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FRINT:
+ case ISD::FNEARBYINT:
+ case ISD::FROUND:
+ case ISD::FFLOOR:
+ case ISD::FP_ROUND:
+ case ISD::FP_EXTEND:
+ case ISD::FMA:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ QueryType = Node->getValueType(0);
+ break;
+ case ISD::FP_ROUND_INREG:
+ QueryType = cast<VTSDNode>(Node->getOperand(1))->getVT();
+ break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ QueryType = Node->getOperand(0).getValueType();
+ break;
+ case ISD::MSCATTER:
+ QueryType = cast<MaskedScatterSDNode>(Node)->getValue().getValueType();
+ break;
+ case ISD::MSTORE:
+ QueryType = cast<MaskedStoreSDNode>(Node)->getValue().getValueType();
+ break;
+ }
+
+ switch (TLI.getOperationAction(Node->getOpcode(), QueryType)) {
+ default: llvm_unreachable("This action is not supported yet!");
+ case TargetLowering::Promote:
+ Result = Promote(Op);
+ Changed = true;
+ break;
+ case TargetLowering::Legal:
+ break;
+ case TargetLowering::Custom: {
+ if (SDValue Tmp1 = TLI.LowerOperation(Op, DAG)) {
+ Result = Tmp1;
+ break;
+ }
+ // FALL THROUGH
+ }
+ case TargetLowering::Expand:
+ Result = Expand(Op);
+ }
+
+ // Make sure that the generated code is itself legal.
+ if (Result != Op) {
+ Result = LegalizeOp(Result);
+ Changed = true;
+ }
+
+ // Note that LegalizeOp may be reentered even from single-use nodes, which
+ // means that we always must cache transformed nodes.
+ AddLegalizedOperand(Op, Result);
+ return Result;
+}
+
+SDValue VectorLegalizer::Promote(SDValue Op) {
+ // For a few operations there is a specific concept for promotion based on
+ // the operand's type.
+ switch (Op.getOpcode()) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ // "Promote" the operation by extending the operand.
+ return PromoteINT_TO_FP(Op);
+ case ISD::FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ // Promote the operation by extending the operand.
+ return PromoteFP_TO_INT(Op, Op->getOpcode() == ISD::FP_TO_SINT);
+ }
+
+ // There are currently two cases of vector promotion:
+ // 1) Bitcasting a vector of integers to a different vector type with the
+ //    same overall bit width. For example, x86 promotes ISD::AND on v2i32
+ //    to v1i64.
+ // 2) Extending a vector of floats to a vector of the same number of larger
+ // floats. For example, AArch64 promotes ISD::FADD on v4f16 to v4f32.
+ MVT VT = Op.getSimpleValueType();
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+ MVT NVT = TLI.getTypeToPromoteTo(Op.getOpcode(), VT);
+ SDLoc dl(Op);
+ SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+ for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+ if (Op.getOperand(j).getValueType().isVector())
+ if (Op.getOperand(j)
+ .getValueType()
+ .getVectorElementType()
+ .isFloatingPoint() &&
+ NVT.isVector() && NVT.getVectorElementType().isFloatingPoint())
+ Operands[j] = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Op.getOperand(j));
+ else
+ Operands[j] = DAG.getNode(ISD::BITCAST, dl, NVT, Op.getOperand(j));
+ else
+ Operands[j] = Op.getOperand(j);
+ }
+
+ Op = DAG.getNode(Op.getOpcode(), dl, NVT, Operands, Op.getNode()->getFlags());
+ if ((VT.isFloatingPoint() && NVT.isFloatingPoint()) ||
+ (VT.isVector() && VT.getVectorElementType().isFloatingPoint() &&
+ NVT.isVector() && NVT.getVectorElementType().isFloatingPoint()))
+ return DAG.getNode(ISD::FP_ROUND, dl, VT, Op, DAG.getIntPtrConstant(0, dl));
+ else
+ return DAG.getNode(ISD::BITCAST, dl, VT, Op);
+}
+
+SDValue VectorLegalizer::PromoteINT_TO_FP(SDValue Op) {
+ // INT_TO_FP operations may require the input operand be promoted even
+ // when the type is otherwise legal.
+ EVT VT = Op.getOperand(0).getValueType();
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+
+ // Normal getTypeToPromoteTo() doesn't work here, as that will promote
+ // by widening the vector w/ the same element width and twice the number
+ // of elements. We want the other way around, the same number of elements,
+ // each twice the width.
+ //
+ // Increase the bitwidth of the element to the next pow-of-two
+ // (which is greater than 8 bits).
+
+ EVT NVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+ assert(NVT.isSimple() && "Promoting to a non-simple vector type!");
+ SDLoc dl(Op);
+ SmallVector<SDValue, 4> Operands(Op.getNumOperands());
+
+ unsigned Opc = Op.getOpcode() == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND :
+ ISD::SIGN_EXTEND;
+ for (unsigned j = 0; j != Op.getNumOperands(); ++j) {
+ if (Op.getOperand(j).getValueType().isVector())
+ Operands[j] = DAG.getNode(Opc, dl, NVT, Op.getOperand(j));
+ else
+ Operands[j] = Op.getOperand(j);
+ }
+
+ return DAG.getNode(Op.getOpcode(), dl, Op.getValueType(), Operands);
+}
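+
+// Worked example for PromoteINT_TO_FP above (illustrative): for a UINT_TO_FP
+// whose v4i16 operand needs promotion, the operand is zero-extended to v4i32
+// and the UINT_TO_FP is re-emitted on the v4i32 operand; the result type is
+// left unchanged. A SINT_TO_FP would use SIGN_EXTEND instead.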
+
+// For FP_TO_INT we promote the result type to a vector type with wider
+// elements and then truncate the result. This is different from the default
+// PromoteVector, which uses bitcast to promote, thus assuming that the
+// promoted vector type has the same overall size.
+SDValue VectorLegalizer::PromoteFP_TO_INT(SDValue Op, bool isSigned) {
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "Can't promote a vector with multiple results!");
+ EVT VT = Op.getValueType();
+
+ EVT NewVT;
+ unsigned NewOpc;
+ while (1) {
+ NewVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+ assert(NewVT.isSimple() && "Promoting to a non-simple vector type!");
+ if (TLI.isOperationLegalOrCustom(ISD::FP_TO_SINT, NewVT)) {
+ NewOpc = ISD::FP_TO_SINT;
+ break;
+ }
+ if (!isSigned && TLI.isOperationLegalOrCustom(ISD::FP_TO_UINT, NewVT)) {
+ NewOpc = ISD::FP_TO_UINT;
+ break;
+ }
+ }
+
+ SDLoc loc(Op);
+ SDValue promoted = DAG.getNode(NewOpc, SDLoc(Op), NewVT, Op.getOperand(0));
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(Op), VT, promoted);
+}
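+
+// Worked example for PromoteFP_TO_INT above (illustrative): an FP_TO_SINT
+// producing v4i8, on a target that only supports FP_TO_SINT to v4i16, is
+// emitted as FP_TO_SINT v4f32 -> v4i16 followed by a TRUNCATE back to v4i8.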
+
+
+SDValue VectorLegalizer::ExpandLoad(SDValue Op) {
+ LoadSDNode *LD = cast<LoadSDNode>(Op.getNode());
+
+ EVT SrcVT = LD->getMemoryVT();
+ EVT SrcEltVT = SrcVT.getScalarType();
+ unsigned NumElem = SrcVT.getVectorNumElements();
+
+
+ SDValue NewChain;
+ SDValue Value;
+ if (SrcVT.getVectorNumElements() > 1 && !SrcEltVT.isByteSized()) {
+ SDLoc dl(Op);
+
+ SmallVector<SDValue, 8> Vals;
+ SmallVector<SDValue, 8> LoadChains;
+
+ EVT DstEltVT = LD->getValueType(0).getScalarType();
+ SDValue Chain = LD->getChain();
+ SDValue BasePTR = LD->getBasePtr();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ // When the elements in a vector are not byte-sized, we cannot load each
+ // element directly by advancing the pointer, which can only address whole
+ // bytes. Instead, we load all of the significant words, mask the relevant
+ // bits off, and concatenate them to form each element. Finally, the elements
+ // are extended to the destination scalar type to build the destination
+ // vector.
+ EVT WideVT = TLI.getPointerTy(DAG.getDataLayout());
+
+ assert(WideVT.isRound() &&
+ "Could not handle the sophisticated case when the widest integer is"
+ " not power of 2.");
+ assert(WideVT.bitsGE(SrcEltVT) &&
+ "Type is not legalized?");
+
+ unsigned WideBytes = WideVT.getStoreSize();
+ unsigned Offset = 0;
+ unsigned RemainingBytes = SrcVT.getStoreSize();
+ SmallVector<SDValue, 8> LoadVals;
+
+ while (RemainingBytes > 0) {
+ SDValue ScalarLoad;
+ unsigned LoadBytes = WideBytes;
+
+ if (RemainingBytes >= LoadBytes) {
+ ScalarLoad =
+ DAG.getLoad(WideVT, dl, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ } else {
+ EVT LoadVT = WideVT;
+ while (RemainingBytes < LoadBytes) {
+ LoadBytes >>= 1; // Reduce the load size by half.
+ LoadVT = EVT::getIntegerVT(*DAG.getContext(), LoadBytes << 3);
+ }
+ ScalarLoad =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Offset), LoadVT,
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ }
+
+ RemainingBytes -= LoadBytes;
+ Offset += LoadBytes;
+ BasePTR = DAG.getNode(ISD::ADD, dl, BasePTR.getValueType(), BasePTR,
+ DAG.getConstant(LoadBytes, dl,
+ BasePTR.getValueType()));
+
+ LoadVals.push_back(ScalarLoad.getValue(0));
+ LoadChains.push_back(ScalarLoad.getValue(1));
+ }
+
+ // Extract bits, pack and extend/trunc them into destination type.
+ unsigned SrcEltBits = SrcEltVT.getSizeInBits();
+ SDValue SrcEltBitMask = DAG.getConstant((1U << SrcEltBits) - 1, dl, WideVT);
+
+ unsigned BitOffset = 0;
+ unsigned WideIdx = 0;
+ unsigned WideBits = WideVT.getSizeInBits();
+
+ for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
+ SDValue Lo, Hi, ShAmt;
+
+ if (BitOffset < WideBits) {
+ ShAmt = DAG.getConstant(
+ BitOffset, dl, TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
+ Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt);
+ Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
+ }
+
+ BitOffset += SrcEltBits;
+ if (BitOffset >= WideBits) {
+ WideIdx++;
+ BitOffset -= WideBits;
+ if (BitOffset > 0) {
+ ShAmt = DAG.getConstant(
+ SrcEltBits - BitOffset, dl,
+ TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
+ Hi = DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
+ Hi = DAG.getNode(ISD::AND, dl, WideVT, Hi, SrcEltBitMask);
+ }
+ }
+
+ if (Hi.getNode())
+ Lo = DAG.getNode(ISD::OR, dl, WideVT, Lo, Hi);
+
+ switch (ExtType) {
+ default: llvm_unreachable("Unknown extended-load op!");
+ case ISD::EXTLOAD:
+ Lo = DAG.getAnyExtOrTrunc(Lo, dl, DstEltVT);
+ break;
+ case ISD::ZEXTLOAD:
+ Lo = DAG.getZExtOrTrunc(Lo, dl, DstEltVT);
+ break;
+ case ISD::SEXTLOAD:
+ ShAmt =
+ DAG.getConstant(WideBits - SrcEltBits, dl,
+ TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
+ Lo = DAG.getNode(ISD::SHL, dl, WideVT, Lo, ShAmt);
+ Lo = DAG.getNode(ISD::SRA, dl, WideVT, Lo, ShAmt);
+ Lo = DAG.getSExtOrTrunc(Lo, dl, DstEltVT);
+ break;
+ }
+ Vals.push_back(Lo);
+ }
+
+ NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ Value = DAG.getNode(ISD::BUILD_VECTOR, dl,
+ Op.getNode()->getValueType(0), Vals);
+ } else {
+ SDValue Scalarized = TLI.scalarizeVectorLoad(LD, DAG);
+
+ NewChain = Scalarized.getValue(1);
+ Value = Scalarized.getValue(0);
+ }
+
+ AddLegalizedOperand(Op.getValue(0), Value);
+ AddLegalizedOperand(Op.getValue(1), NewChain);
+
+ return (Op.getResNo() ? NewChain : Value);
+}
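+
+// Worked example for the non-byte-sized path above (illustrative, assuming a
+// zextload of v2i4): one byte is loaded into the wide integer type, and each
+// element is recovered roughly as
+//   Elt[i] = (LoadedWord >> (4 * i)) & 0xF
+// before being zero-extended (or any-/sign-extended, depending on the
+// extension type) to the destination element type.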
+
+SDValue VectorLegalizer::ExpandStore(SDValue Op) {
+ StoreSDNode *ST = cast<StoreSDNode>(Op.getNode());
+
+ EVT StVT = ST->getMemoryVT();
+ EVT MemSclVT = StVT.getScalarType();
+ unsigned ScalarSize = MemSclVT.getSizeInBits();
+
+ // Round odd types to the next pow of two.
+ if (!isPowerOf2_32(ScalarSize)) {
+ // FIXME: This is completely broken and inconsistent with ExpandLoad
+ // handling.
+
+ // For sub-byte element sizes, this ends up with a stride of 0 between
+ // elements, so the same element just gets rewritten to the same location.
+ // There seem to be tests that explicitly rely on this broken behavior.
+
+ LLVMContext &Ctx = *DAG.getContext();
+
+ EVT NewMemVT
+ = EVT::getVectorVT(Ctx,
+ MemSclVT.getIntegerVT(Ctx, NextPowerOf2(ScalarSize)),
+ StVT.getVectorNumElements());
+
+ SDValue NewVectorStore = DAG.getTruncStore(
+ ST->getChain(), SDLoc(Op), ST->getValue(), ST->getBasePtr(),
+ ST->getPointerInfo(), NewMemVT, ST->getAlignment(),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+ ST = cast<StoreSDNode>(NewVectorStore.getNode());
+ }
+
+ SDValue TF = TLI.scalarizeVectorStore(ST, DAG);
+ AddLegalizedOperand(Op, TF);
+ return TF;
+}
+
+SDValue VectorLegalizer::Expand(SDValue Op) {
+ switch (Op->getOpcode()) {
+ case ISD::SIGN_EXTEND_INREG:
+ return ExpandSEXTINREG(Op);
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return ExpandANY_EXTEND_VECTOR_INREG(Op);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return ExpandSIGN_EXTEND_VECTOR_INREG(Op);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return ExpandZERO_EXTEND_VECTOR_INREG(Op);
+ case ISD::BSWAP:
+ return ExpandBSWAP(Op);
+ case ISD::VSELECT:
+ return ExpandVSELECT(Op);
+ case ISD::SELECT:
+ return ExpandSELECT(Op);
+ case ISD::UINT_TO_FP:
+ return ExpandUINT_TO_FLOAT(Op);
+ case ISD::FNEG:
+ return ExpandFNEG(Op);
+ case ISD::SETCC:
+ return UnrollVSETCC(Op);
+ case ISD::BITREVERSE:
+ return ExpandBITREVERSE(Op);
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return ExpandCTLZ_CTTZ_ZERO_UNDEF(Op);
+ default:
+ return DAG.UnrollVectorOp(Op.getNode());
+ }
+}
+
+SDValue VectorLegalizer::ExpandSELECT(SDValue Op) {
+ // Lower a select instruction where the condition is a scalar and the
+ // operands are vectors. Lower this select to VSELECT and implement it
+ // using XOR, AND and OR. The selector bit is broadcast into every lane.
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+
+ SDValue Mask = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ assert(VT.isVector() && !Mask.getValueType().isVector()
+ && Op1.getValueType() == Op2.getValueType() && "Invalid type");
+
+ unsigned NumElem = VT.getVectorNumElements();
+
+ // If we can't even use the basic vector operations of AND, OR and XOR, we
+ // will have to scalarize the op.
+ // Notice that the operation may be 'promoted', which means that it is
+ // 'bitcasted' to another type which is handled.
+ // Also, we need to be able to construct a splat vector using BUILD_VECTOR.
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::BUILD_VECTOR, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // Generate a mask operand.
+ EVT MaskTy = VT.changeVectorElementTypeToInteger();
+
+ // What is the size of each element in the vector mask.
+ EVT BitTy = MaskTy.getScalarType();
+
+ Mask = DAG.getSelect(DL, BitTy, Mask,
+ DAG.getConstant(APInt::getAllOnesValue(BitTy.getSizeInBits()), DL,
+ BitTy),
+ DAG.getConstant(0, DL, BitTy));
+
+ // Broadcast the mask so that the entire vector is all-ones or all-zeros.
+ SmallVector<SDValue, 8> Ops(NumElem, Mask);
+ Mask = DAG.getNode(ISD::BUILD_VECTOR, DL, MaskTy, Ops);
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, MaskTy, Op2);
+
+ SDValue AllOnes = DAG.getConstant(
+ APInt::getAllOnesValue(BitTy.getSizeInBits()), DL, MaskTy);
+ SDValue NotMask = DAG.getNode(ISD::XOR, DL, MaskTy, Mask, AllOnes);
+
+ Op1 = DAG.getNode(ISD::AND, DL, MaskTy, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, MaskTy, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, MaskTy, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
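+
+// The blend built above is the classic bit-select idiom, applied lane-wise:
+//   Result = (Op1 & Mask) | (Op2 & ~Mask)
+// where Mask is all-ones in every lane when the scalar condition is true and
+// all-zeros otherwise.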
+
+SDValue VectorLegalizer::ExpandSEXTINREG(SDValue Op) {
+ EVT VT = Op.getValueType();
+
+ // Make sure that the SRA and SHL instructions are available.
+ if (TLI.getOperationAction(ISD::SRA, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::SHL, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ SDLoc DL(Op);
+ EVT OrigTy = cast<VTSDNode>(Op->getOperand(1))->getVT();
+
+ unsigned BW = VT.getScalarType().getSizeInBits();
+ unsigned OrigBW = OrigTy.getScalarType().getSizeInBits();
+ SDValue ShiftSz = DAG.getConstant(BW - OrigBW, DL, VT);
+
+ Op = Op.getOperand(0);
+ Op = DAG.getNode(ISD::SHL, DL, VT, Op, ShiftSz);
+ return DAG.getNode(ISD::SRA, DL, VT, Op, ShiftSz);
+}
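+
+// For example (illustrative): SIGN_EXTEND_INREG from i8 on v4i32 lanes uses a
+// per-lane shift pair
+//   tmp = shl x, 24
+//   res = sra tmp, 24   // the arithmetic shift replicates the sign bit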
+
+// Generically expand a vector anyext in register to a shuffle of the relevant
+// lanes into the appropriate locations, with other lanes left undef.
+SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // Build a base mask of undef shuffles.
+ SmallVector<int, 16> ShuffleMask;
+ ShuffleMask.resize(NumSrcElements, -1);
+
+ // Place the extended lanes into the correct locations.
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = i;
+
+ return DAG.getNode(
+ ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Src, DAG.getUNDEF(SrcVT), ShuffleMask));
+}
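+
+// For example (illustrative): any-extending v8i16 source lanes into a v4i32
+// result on a little-endian target uses the shuffle mask
+//   <0, -1, 1, -1, 2, -1, 3, -1>
+// so each source lane lands in the low half of its i32 lane and the rest is
+// undef; on a big-endian target the offsets shift by one, giving
+//   <-1, 0, -1, 1, -1, 2, -1, 3>.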
+
+SDValue VectorLegalizer::ExpandSIGN_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ // First build an any-extend node which can be legalized above when we
+ // recurse through it.
+ Op = DAG.getAnyExtendVectorInReg(Src, DL, VT);
+
+ // Now we need the sign extension. Do this by shifting the elements. Even if these
+ // aren't legal operations, they have a better chance of being legalized
+ // without full scalarization than the sign extension does.
+ unsigned EltWidth = VT.getVectorElementType().getSizeInBits();
+ unsigned SrcEltWidth = SrcVT.getVectorElementType().getSizeInBits();
+ SDValue ShiftAmount = DAG.getConstant(EltWidth - SrcEltWidth, DL, VT);
+ return DAG.getNode(ISD::SRA, DL, VT,
+ DAG.getNode(ISD::SHL, DL, VT, Op, ShiftAmount),
+ ShiftAmount);
+}
+
+// Generically expand a vector zext in register to a shuffle of the relevant
+// lanes into the appropriate locations, a blend of zero into the high bits,
+// and a bitcast to the wider element type.
+SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDValue Op) {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ int NumElements = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ int NumSrcElements = SrcVT.getVectorNumElements();
+
+ // Build up a zero vector to blend into this one.
+ SDValue Zero = DAG.getConstant(0, DL, SrcVT);
+
+ // Shuffle the incoming lanes into the correct position, and pull all other
+ // lanes from the zero vector.
+ SmallVector<int, 16> ShuffleMask;
+ ShuffleMask.reserve(NumSrcElements);
+ for (int i = 0; i < NumSrcElements; ++i)
+ ShuffleMask.push_back(i);
+
+ int ExtLaneScale = NumSrcElements / NumElements;
+ int EndianOffset = DAG.getDataLayout().isBigEndian() ? ExtLaneScale - 1 : 0;
+ for (int i = 0; i < NumElements; ++i)
+ ShuffleMask[i * ExtLaneScale + EndianOffset] = NumSrcElements + i;
+
+ return DAG.getNode(ISD::BITCAST, DL, VT,
+ DAG.getVectorShuffle(SrcVT, DL, Zero, Src, ShuffleMask));
+}
+
+static void createBSWAPShuffleMask(EVT VT, SmallVectorImpl<int> &ShuffleMask) {
+ int ScalarSizeInBytes = VT.getScalarSizeInBits() / 8;
+ for (int I = 0, E = VT.getVectorNumElements(); I != E; ++I)
+ for (int J = ScalarSizeInBytes - 1; J >= 0; --J)
+ ShuffleMask.push_back((I * ScalarSizeInBytes) + J);
+}
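+
+// For example (illustrative): for VT = v2i32 this produces the byte shuffle
+// mask <3, 2, 1, 0, 7, 6, 5, 4>, i.e. the bytes of each 32-bit element are
+// reversed in place.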
+
+SDValue VectorLegalizer::ExpandBSWAP(SDValue Op) {
+ EVT VT = Op.getValueType();
+
+ // Generate a byte wise shuffle mask for the BSWAP.
+ SmallVector<int, 16> ShuffleMask;
+ createBSWAPShuffleMask(VT, ShuffleMask);
+ EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, ShuffleMask.size());
+
+ // Only emit a shuffle if the mask is legal.
+ if (!TLI.isShuffleMaskLegal(ShuffleMask, ByteVT))
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ SDLoc DL(Op);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT), ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+}
+
+SDValue VectorLegalizer::ExpandBITREVERSE(SDValue Op) {
+ EVT VT = Op.getValueType();
+
+ // If we have the scalar operation, it's probably cheaper to unroll it.
+ if (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, VT.getScalarType()))
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // If the vector element width is a whole number of bytes, test if it's legal
+ // to BSWAP shuffle the bytes and then perform the BITREVERSE on the byte
+ // vector. This greatly reduces the number of bit shifts necessary.
+ unsigned ScalarSizeInBits = VT.getScalarSizeInBits();
+ if (ScalarSizeInBits > 8 && (ScalarSizeInBits % 8) == 0) {
+ SmallVector<int, 16> BSWAPMask;
+ createBSWAPShuffleMask(VT, BSWAPMask);
+
+ EVT ByteVT = EVT::getVectorVT(*DAG.getContext(), MVT::i8, BSWAPMask.size());
+ if (TLI.isShuffleMaskLegal(BSWAPMask, ByteVT) &&
+ (TLI.isOperationLegalOrCustom(ISD::BITREVERSE, ByteVT) ||
+ (TLI.isOperationLegalOrCustom(ISD::SHL, ByteVT) &&
+ TLI.isOperationLegalOrCustom(ISD::SRL, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::AND, ByteVT) &&
+ TLI.isOperationLegalOrCustomOrPromote(ISD::OR, ByteVT)))) {
+ SDLoc DL(Op);
+ Op = DAG.getNode(ISD::BITCAST, DL, ByteVT, Op.getOperand(0));
+ Op = DAG.getVectorShuffle(ByteVT, DL, Op, DAG.getUNDEF(ByteVT),
+ BSWAPMask);
+ Op = DAG.getNode(ISD::BITREVERSE, DL, ByteVT, Op);
+ return DAG.getNode(ISD::BITCAST, DL, VT, Op);
+ }
+ }
+
+ // If we have the appropriate vector bit operations, it is better to use them
+ // than unrolling and expanding each component.
+ if (!TLI.isOperationLegalOrCustom(ISD::SHL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::SRL, VT) ||
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::AND, VT) ||
+ !TLI.isOperationLegalOrCustomOrPromote(ISD::OR, VT))
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // Let LegalizeDAG handle this later.
+ return Op;
+}
+
+SDValue VectorLegalizer::ExpandVSELECT(SDValue Op) {
+ // Implement VSELECT in terms of XOR, AND, OR
+ // on platforms which do not support blend natively.
+ SDLoc DL(Op);
+
+ SDValue Mask = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op2 = Op.getOperand(2);
+
+ EVT VT = Mask.getValueType();
+
+ // If we can't even use the basic vector operations of AND, OR and XOR, we
+ // will have to scalarize the op.
+ // Notice that the operation may be 'promoted', which means that it is
+ // 'bitcasted' to another type which is handled.
+ // This operation also isn't safe with AND, OR and XOR when the boolean type
+ // is 0/1, as we need an all-ones vector constant to mask with.
+ // FIXME: Sign extend 1 to all ones if that's legal on the target.
+ if (TLI.getOperationAction(ISD::AND, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::XOR, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::OR, VT) == TargetLowering::Expand ||
+ TLI.getBooleanContents(Op1.getValueType()) !=
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // If the mask and the type are different sizes, unroll the vector op. This
+ // can occur when getSetCCResultType returns something that is different in
+ // size from the operand types. For example, v4i8 = select v4i32, v4i8, v4i8.
+ if (VT.getSizeInBits() != Op1.getValueType().getSizeInBits())
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ // Bitcast the operands to be the same type as the mask.
+ // This is needed when we select between FP types because
+ // the mask is a vector of integers.
+ Op1 = DAG.getNode(ISD::BITCAST, DL, VT, Op1);
+ Op2 = DAG.getNode(ISD::BITCAST, DL, VT, Op2);
+
+ SDValue AllOnes = DAG.getConstant(
+ APInt::getAllOnesValue(VT.getScalarType().getSizeInBits()), DL, VT);
+ SDValue NotMask = DAG.getNode(ISD::XOR, DL, VT, Mask, AllOnes);
+
+ Op1 = DAG.getNode(ISD::AND, DL, VT, Op1, Mask);
+ Op2 = DAG.getNode(ISD::AND, DL, VT, Op2, NotMask);
+ SDValue Val = DAG.getNode(ISD::OR, DL, VT, Op1, Op2);
+ return DAG.getNode(ISD::BITCAST, DL, Op.getValueType(), Val);
+}
+
+SDValue VectorLegalizer::ExpandUINT_TO_FLOAT(SDValue Op) {
+ EVT VT = Op.getOperand(0).getValueType();
+ SDLoc DL(Op);
+
+ // Make sure that the SINT_TO_FP and SRL instructions are available.
+ if (TLI.getOperationAction(ISD::SINT_TO_FP, VT) == TargetLowering::Expand ||
+ TLI.getOperationAction(ISD::SRL, VT) == TargetLowering::Expand)
+ return DAG.UnrollVectorOp(Op.getNode());
+
+ EVT SVT = VT.getScalarType();
+ assert((SVT.getSizeInBits() == 64 || SVT.getSizeInBits() == 32) &&
+ "Elements in vector-UINT_TO_FP must be 32 or 64 bits wide");
+
+ unsigned BW = SVT.getSizeInBits();
+ SDValue HalfWord = DAG.getConstant(BW/2, DL, VT);
+
+ // Constants to clear the upper part of the word.
+ // Notice that we can also use SHL+SHR, but using a constant is slightly
+ // faster on x86.
+ uint64_t HWMask = (SVT.getSizeInBits() == 64) ? 0x00000000FFFFFFFF : 0x0000FFFF;
+ SDValue HalfWordMask = DAG.getConstant(HWMask, DL, VT);
+
+ // Two to the power of half-word-size.
+ SDValue TWOHW = DAG.getConstantFP(1 << (BW/2), DL, Op.getValueType());
+
+ // Clear upper part of LO, lower HI
+ SDValue HI = DAG.getNode(ISD::SRL, DL, VT, Op.getOperand(0), HalfWord);
+ SDValue LO = DAG.getNode(ISD::AND, DL, VT, Op.getOperand(0), HalfWordMask);
+
+ // Convert hi and lo to floats
+ // Convert the hi part back to the upper values
+ // TODO: Can any fast-math-flags be set on these nodes?
+ SDValue fHI = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), HI);
+ fHI = DAG.getNode(ISD::FMUL, DL, Op.getValueType(), fHI, TWOHW);
+ SDValue fLO = DAG.getNode(ISD::SINT_TO_FP, DL, Op.getValueType(), LO);
+
+ // Add the two halves
+ return DAG.getNode(ISD::FADD, DL, Op.getValueType(), fHI, fLO);
+}
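+
+// Worked example for the expansion above (illustrative, for 32-bit elements):
+// HalfWord = 16, HalfWordMask = 0xFFFF and TWOHW = 65536.0, so per lane
+//   result = sitofp(x >> 16) * 65536.0 + sitofp(x & 0xFFFF)
+// mirroring the identity x = (x >> 16) * 2^16 + (x & 0xFFFF); both halves are
+// small enough to stay non-negative as signed integers.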
+
+
+SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
+ if (TLI.isOperationLegalOrCustom(ISD::FSUB, Op.getValueType())) {
+ SDLoc DL(Op);
+ SDValue Zero = DAG.getConstantFP(-0.0, DL, Op.getValueType());
+ // TODO: If FNEG had fast-math-flags, they'd get propagated to this FSUB.
+ return DAG.getNode(ISD::FSUB, DL, Op.getValueType(),
+ Zero, Op.getOperand(0));
+ }
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::ExpandCTLZ_CTTZ_ZERO_UNDEF(SDValue Op) {
+ // If the non-ZERO_UNDEF version is supported we can use that instead.
+ unsigned Opc = Op.getOpcode() == ISD::CTLZ_ZERO_UNDEF ? ISD::CTLZ : ISD::CTTZ;
+ if (TLI.isOperationLegalOrCustom(Opc, Op.getValueType())) {
+ SDLoc DL(Op);
+ return DAG.getNode(Opc, DL, Op.getValueType(), Op.getOperand(0));
+ }
+
+ // Otherwise go ahead and unroll.
+ return DAG.UnrollVectorOp(Op.getNode());
+}
+
+SDValue VectorLegalizer::UnrollVSETCC(SDValue Op) {
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1), CC = Op.getOperand(2);
+ EVT TmpEltVT = LHS.getValueType().getVectorElementType();
+ SDLoc dl(Op);
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ SDValue LHSElem = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue RHSElem = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Ops[i] = DAG.getNode(ISD::SETCC, dl,
+ TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TmpEltVT),
+ LHSElem, RHSElem, CC);
+ Ops[i] = DAG.getSelect(dl, EltVT, Ops[i],
+ DAG.getConstant(APInt::getAllOnesValue
+ (EltVT.getSizeInBits()), dl, EltVT),
+ DAG.getConstant(0, dl, EltVT));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
+}
+
+bool SelectionDAG::LegalizeVectors() {
+ return VectorLegalizer(*this).Run();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
new file mode 100644
index 000000000000..f3adca49ccfe
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -0,0 +1,3846 @@
+//===------- LegalizeVectorTypes.cpp - Legalization of vector types -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file performs vector type splitting and scalarization for LegalizeTypes.
+// Scalarization is the act of changing a computation in an illegal one-element
+// vector type to be a computation in its scalar element type. For example,
+// implementing <1 x f32> arithmetic in a scalar f32 register. This is needed
+// as a base case when scalarizing vector arithmetic like <4 x f32>, which
+// eventually decomposes to scalars if the target doesn't support v4f32 or v2f32
+// types.
+// Splitting is the act of changing a computation in an invalid vector type to
+// be a computation in two vectors of half the size. For example, implementing
+// <128 x f32> operations in terms of two <64 x f32> operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "LegalizeTypes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "legalize-types"
+
+//===----------------------------------------------------------------------===//
+// Result Vector Scalarization: <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Scalarize node result " << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue R = SDValue();
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ScalarizeVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to scalarize the result of this "
+ "operator!\n");
+
+ case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break;
+ case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break;
+ case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break;
+ case ISD::FP_ROUND_INREG: R = ScalarizeVecRes_InregOp(N); break;
+ case ISD::FPOWI: R = ScalarizeVecRes_FPOWI(N); break;
+ case ISD::INSERT_VECTOR_ELT: R = ScalarizeVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: R = ScalarizeVecRes_LOAD(cast<LoadSDNode>(N));break;
+ case ISD::SCALAR_TO_VECTOR: R = ScalarizeVecRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SIGN_EXTEND_INREG: R = ScalarizeVecRes_InregOp(N); break;
+ case ISD::VSELECT: R = ScalarizeVecRes_VSELECT(N); break;
+ case ISD::SELECT: R = ScalarizeVecRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = ScalarizeVecRes_SELECT_CC(N); break;
+ case ISD::SETCC: R = ScalarizeVecRes_SETCC(N); break;
+ case ISD::UNDEF: R = ScalarizeVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE: R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+ case ISD::ANY_EXTEND:
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::ZERO_EXTEND:
+ R = ScalarizeVecRes_UnaryOp(N);
+ break;
+
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::FADD:
+ case ISD::FCOPYSIGN:
+ case ISD::FDIV:
+ case ISD::FMUL:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB:
+ case ISD::MUL:
+ case ISD::OR:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::SUB:
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ R = ScalarizeVecRes_BinOp(N);
+ break;
+ case ISD::FMA:
+ R = ScalarizeVecRes_TernaryOp(N);
+ break;
+ }
+
+ // If R is null, the sub-method took care of registering the result.
+ if (R.getNode())
+ SetScalarizedVector(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BinOp(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ LHS.getValueType(), LHS, RHS, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ SDValue Op1 = GetScalarizedVector(N->getOperand(1));
+ SDValue Op2 = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ Op0.getValueType(), Op0, Op1, Op2);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_MERGE_VALUES(SDNode *N,
+ unsigned ResNo) {
+ SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetScalarizedVector(Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) {
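+ // The <1 x ty> result is scalarized to ty and the bitcast source has the
+ // same bit width, so bitcast the source directly to the element type.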
+ EVT NewVT = N->getValueType(0).getVectorElementType();
+ return DAG.getNode(ISD::BITCAST, SDLoc(N),
+ NewVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) {
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ // The BUILD_VECTOR operands may be of wider element types and
+ // we may need to truncate them back to the requested return type.
+ if (EltVT.isInteger())
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
+ return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) {
+ EVT NewVT = N->getValueType(0).getVectorElementType();
+ SDValue Op0 = GetScalarizedVector(N->getOperand(0));
+ return DAG.getConvertRndSat(NewVT, SDLoc(N),
+ Op0, DAG.getValueType(NewVT),
+ DAG.getValueType(Op0.getValueType()),
+ N->getOperand(3),
+ N->getOperand(4),
+ cast<CvtRndSatSDNode>(N)->getCvtCode());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
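+ // Extracting a one-element subvector is the same as extracting the single
+ // element at the subvector's start index.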
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ N->getValueType(0).getVectorElementType(),
+ N->getOperand(0), N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FP_ROUND(SDNode *N) {
+ EVT NewVT = N->getValueType(0).getVectorElementType();
+ SDValue Op = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::FP_ROUND, SDLoc(N),
+ NewVT, Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_FPOWI(SDNode *N) {
+ SDValue Op = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::FPOWI, SDLoc(N),
+ Op.getValueType(), Op, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ // The value to insert may have a wider type than the vector element type,
+ // so be sure to truncate it to the element type if necessary.
+ SDValue Op = N->getOperand(1);
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ if (Op.getValueType() != EltVT)
+ // FIXME: Can this happen for floating point types?
+ Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, Op);
+ return Op;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
+ assert(N->isUnindexed() && "Indexed vector load?");
+
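+ // Load just the single element, using the scalar element value and memory
+ // types and preserving the original alignment, flags and AA info.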
+ SDValue Result = DAG.getLoad(
+ ISD::UNINDEXED, N->getExtensionType(),
+ N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
+ N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
+ N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
+ N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Result.getValue(1));
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
+ // Get the dest type - it doesn't always match the input type, e.g. int_to_fp.
+ EVT DestVT = N->getValueType(0).getVectorElementType();
+ SDValue Op = N->getOperand(0);
+ EVT OpVT = Op.getValueType();
+ SDLoc DL(N);
+ // The result needs scalarizing, but it's not a given that the source does.
+ // This is a workaround for targets where it's impossible to scalarize the
+ // result of a conversion, because the source type is legal.
+ // For instance, this happens on AArch64: v1i1 is illegal but v1i{8,16,32}
+ // are widened to v8i8, v4i16, and v2i32, which is legal, because v1i64 is
+ // legal and was not scalarized.
+ // See the similar logic in ScalarizeVecRes_VSETCC
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ Op = GetScalarizedVector(Op);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ Op = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ EVT ExtVT = cast<VTSDNode>(N->getOperand(1))->getVT().getVectorElementType();
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), EltVT,
+ LHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+ // If the operand is wider than the vector element type then it is implicitly
+ // truncated. Make that explicit here.
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ SDValue InOp = N->getOperand(0);
+ if (InOp.getValueType() != EltVT)
+ return DAG.getNode(ISD::TRUNCATE, SDLoc(N), EltVT, InOp);
+ return InOp;
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
+ SDValue Cond = GetScalarizedVector(N->getOperand(0));
+ SDValue LHS = GetScalarizedVector(N->getOperand(1));
+ TargetLowering::BooleanContent ScalarBool =
+ TLI.getBooleanContents(false, false);
+ TargetLowering::BooleanContent VecBool = TLI.getBooleanContents(true, false);
+
+ // If integer and float booleans have different contents then we can't
+ // reliably optimize in all cases. There is a full explanation for this in
+ // DAGCombiner::visitSELECT() where the same issue affects folding
+ // (select C, 0, 1) to (xor C, 1).
+ if (TLI.getBooleanContents(false, false) !=
+ TLI.getBooleanContents(false, true)) {
+ // At least try the common case where the boolean is generated by a
+ // comparison.
+ if (Cond->getOpcode() == ISD::SETCC) {
+ EVT OpVT = Cond->getOperand(0)->getValueType(0);
+ ScalarBool = TLI.getBooleanContents(OpVT.getScalarType());
+ VecBool = TLI.getBooleanContents(OpVT);
+ } else
+ ScalarBool = TargetLowering::UndefinedBooleanContent;
+ }
+
+ if (ScalarBool != VecBool) {
+ EVT CondVT = Cond.getValueType();
+ switch (ScalarBool) {
+ case TargetLowering::UndefinedBooleanContent:
+ break;
+ case TargetLowering::ZeroOrOneBooleanContent:
+ assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+ VecBool == TargetLowering::ZeroOrNegativeOneBooleanContent);
+ // The vector reads from all ones, but the scalar expects a single 1, so mask.
+ Cond = DAG.getNode(ISD::AND, SDLoc(N), CondVT,
+ Cond, DAG.getConstant(1, SDLoc(N), CondVT));
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ assert(VecBool == TargetLowering::UndefinedBooleanContent ||
+ VecBool == TargetLowering::ZeroOrOneBooleanContent);
+ // The vector reads from a one, but the scalar expects all ones, so sign extend.
+ Cond = DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), CondVT,
+ Cond, DAG.getValueType(MVT::i1));
+ break;
+ }
+ }
+
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), Cond, LHS,
+ GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(1));
+ return DAG.getSelect(SDLoc(N),
+ LHS.getValueType(), N->getOperand(0), LHS,
+ GetScalarizedVector(N->getOperand(2)));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SELECT_CC(SDNode *N) {
+ SDValue LHS = GetScalarizedVector(N->getOperand(2));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), LHS.getValueType(),
+ N->getOperand(0), N->getOperand(1),
+ LHS, GetScalarizedVector(N->getOperand(3)),
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() ==
+ N->getOperand(0).getValueType().isVector() &&
+ "Scalar/Vector type mismatch");
+
+ if (N->getValueType(0).isVector()) return ScalarizeVecRes_VSETCC(N);
+
+ SDValue LHS = GetScalarizedVector(N->getOperand(0));
+ SDValue RHS = GetScalarizedVector(N->getOperand(1));
+ SDLoc DL(N);
+
+ // Turn it into a scalar SETCC.
+ return DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VECTOR_SHUFFLE(SDNode *N) {
+ // Figure out if the scalar is the LHS or RHS and return it.
+ SDValue Arg = N->getOperand(2).getOperand(0);
+ if (Arg.isUndef())
+ return DAG.getUNDEF(N->getValueType(0).getVectorElementType());
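+ // A zero mask element selects the (scalarized) LHS, any other value the RHS.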
+ unsigned Op = !cast<ConstantSDNode>(Arg)->isNullValue();
+ return GetScalarizedVector(N->getOperand(Op));
+}
+
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+ EVT OpVT = LHS.getValueType();
+ EVT NVT = N->getValueType(0).getVectorElementType();
+ SDLoc DL(N);
+
+ // The result needs scalarizing, but it's not a given that the source does.
+ if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
+ LHS = GetScalarizedVector(LHS);
+ RHS = GetScalarizedVector(RHS);
+ } else {
+ EVT VT = OpVT.getVectorElementType();
+ LHS = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ RHS = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ // Turn it into a scalar SETCC.
+ SDValue Res = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS,
+ N->getOperand(2));
+ // Vectors may have different boolean contents than scalars. Promote the
+ // value appropriately.
+ ISD::NodeType ExtendCode =
+ TargetLowering::getExtendForContent(TLI.getBooleanContents(OpVT));
+ return DAG.getNode(ExtendCode, DL, NVT, Res);
+}
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Scalarization <1 x ty> -> ty.
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::ScalarizeVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Scalarize node operand " << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (!Res.getNode()) {
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "ScalarizeVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to scalarize this operator's operand!");
+ case ISD::BITCAST:
+ Res = ScalarizeVecOp_BITCAST(N);
+ break;
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::TRUNCATE:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ Res = ScalarizeVecOp_UnaryOp(N);
+ break;
+ case ISD::CONCAT_VECTORS:
+ Res = ScalarizeVecOp_CONCAT_VECTORS(N);
+ break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ Res = ScalarizeVecOp_EXTRACT_VECTOR_ELT(N);
+ break;
+ case ISD::VSELECT:
+ Res = ScalarizeVecOp_VSELECT(N);
+ break;
+ case ISD::STORE:
+ Res = ScalarizeVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+ break;
+ case ISD::FP_ROUND:
+ Res = ScalarizeVecOp_FP_ROUND(N, OpNo);
+ break;
+ }
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+/// If the value to convert is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_BITCAST(SDNode *N) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ return DAG.getNode(ISD::BITCAST, SDLoc(N),
+ N->getValueType(0), Elt);
+}
+
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>.
+/// Do the operation on the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_UnaryOp(SDNode *N) {
+ assert(N->getValueType(0).getVectorNumElements() == 1 &&
+ "Unexpected vector type!");
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ SDValue Op = DAG.getNode(N->getOpcode(), SDLoc(N),
+ N->getValueType(0).getScalarType(), Elt);
+ // Revectorize the result so the types line up with what the uses of this
+ // expression expect.
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Op);
+}
+
+/// The vectors to concatenate have length one - use a BUILD_VECTOR instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_CONCAT_VECTORS(SDNode *N) {
+ SmallVector<SDValue, 8> Ops(N->getNumOperands());
+ for (unsigned i = 0, e = N->getNumOperands(); i < e; ++i)
+ Ops[i] = GetScalarizedVector(N->getOperand(i));
+ return DAG.getNode(ISD::BUILD_VECTOR, SDLoc(N), N->getValueType(0), Ops);
+}
+
+/// If the input is a vector that needs to be scalarized, it must be <1 x ty>,
+/// so just return the element, ignoring the index.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue Res = GetScalarizedVector(N->getOperand(0));
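+ // EXTRACT_VECTOR_ELT may be defined to return a type wider than the vector
+ // element, so any-extend the scalarized element if the types differ.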
+ if (Res.getValueType() != N->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), N->getValueType(0),
+ Res);
+ return Res;
+}
+
+/// If the input condition is a vector that needs to be scalarized, it must be
+/// <1 x i1>, so just convert to a normal ISD::SELECT
+/// (still with vector output type since that was acceptable if we got here).
+SDValue DAGTypeLegalizer::ScalarizeVecOp_VSELECT(SDNode *N) {
+ SDValue ScalarCond = GetScalarizedVector(N->getOperand(0));
+ EVT VT = N->getValueType(0);
+
+ return DAG.getNode(ISD::SELECT, SDLoc(N), VT, ScalarCond, N->getOperand(1),
+ N->getOperand(2));
+}
+
+/// If the value to store is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Just store the element.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed store of one-element vector?");
+ assert(OpNo == 1 && "Do not know how to scalarize this operand!");
+ SDLoc dl(N);
+
+ if (N->isTruncatingStore())
+ return DAG.getTruncStore(
+ N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->getMemoryVT().getVectorElementType(), N->getAlignment(),
+ N->getMemOperand()->getFlags(), N->getAAInfo());
+
+ return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
+ N->getBasePtr(), N->getPointerInfo(),
+ N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
+ N->getAAInfo());
+}
+
+/// If the value to round is a vector that needs to be scalarized, it must be
+/// <1 x ty>. Convert the element instead.
+SDValue DAGTypeLegalizer::ScalarizeVecOp_FP_ROUND(SDNode *N, unsigned OpNo) {
+ SDValue Elt = GetScalarizedVector(N->getOperand(0));
+ SDValue Res = DAG.getNode(ISD::FP_ROUND, SDLoc(N),
+ N->getValueType(0).getVectorElementType(), Elt,
+ N->getOperand(1));
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N), N->getValueType(0), Res);
+}
+
+//===----------------------------------------------------------------------===//
+// Result Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// This method is called when the specified result of the specified node is
+/// found to need vector splitting. At this point, the node may also have
+/// invalid operands or may have other results that need legalization; we just
+/// know that (at least) one result needs vector splitting.
+void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Split node result: ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Lo, Hi;
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true))
+ return;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SplitVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to split the result of this "
+ "operator!\n");
+
+ case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break;
+ case ISD::VSELECT:
+ case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
+ case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
+ case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::BITCAST: SplitVecRes_BITCAST(N, Lo, Hi); break;
+ case ISD::BUILD_VECTOR: SplitVecRes_BUILD_VECTOR(N, Lo, Hi); break;
+ case ISD::CONCAT_VECTORS: SplitVecRes_CONCAT_VECTORS(N, Lo, Hi); break;
+ case ISD::EXTRACT_SUBVECTOR: SplitVecRes_EXTRACT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::INSERT_SUBVECTOR: SplitVecRes_INSERT_SUBVECTOR(N, Lo, Hi); break;
+ case ISD::FP_ROUND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::FPOWI: SplitVecRes_FPOWI(N, Lo, Hi); break;
+ case ISD::FCOPYSIGN: SplitVecRes_FCOPYSIGN(N, Lo, Hi); break;
+ case ISD::INSERT_VECTOR_ELT: SplitVecRes_INSERT_VECTOR_ELT(N, Lo, Hi); break;
+ case ISD::SCALAR_TO_VECTOR: SplitVecRes_SCALAR_TO_VECTOR(N, Lo, Hi); break;
+ case ISD::SIGN_EXTEND_INREG: SplitVecRes_InregOp(N, Lo, Hi); break;
+ case ISD::LOAD:
+ SplitVecRes_LOAD(cast<LoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::MLOAD:
+ SplitVecRes_MLOAD(cast<MaskedLoadSDNode>(N), Lo, Hi);
+ break;
+ case ISD::MGATHER:
+ SplitVecRes_MGATHER(cast<MaskedGatherSDNode>(N), Lo, Hi);
+ break;
+ case ISD::SETCC:
+ SplitVecRes_SETCC(N, Lo, Hi);
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ SplitVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N), Lo, Hi);
+ break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ SplitVecRes_ExtVecInRegOp(N, Lo, Hi);
+ break;
+
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CONVERT_RNDSAT:
+ case ISD::CTLZ:
+ case ISD::CTTZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ SplitVecRes_UnaryOp(N, Lo, Hi);
+ break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ SplitVecRes_ExtendOp(N, Lo, Hi);
+ break;
+
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::FDIV:
+ case ISD::FPOW:
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::FREM:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ SplitVecRes_BinOp(N, Lo, Hi);
+ break;
+ case ISD::FMA:
+ SplitVecRes_TernaryOp(N, Lo, Hi);
+ break;
+ }
+
+ // If Lo/Hi is null, the sub-method took care of registering results etc.
+ if (Lo.getNode())
+ SetSplitVector(SDValue(N, ResNo), Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
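+ // Split both operands and apply the operation to each half, propagating the
+ // node's flags.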
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDValue RHSLo, RHSHi;
+ GetSplitVector(N->getOperand(1), RHSLo, RHSHi);
+ SDLoc dl(N);
+
+ const SDNodeFlags *Flags = N->getFlags();
+ unsigned Opcode = N->getOpcode();
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Flags);
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Flags);
+}
+
+void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Op0Lo, Op0Hi;
+ GetSplitVector(N->getOperand(0), Op0Lo, Op0Hi);
+ SDValue Op1Lo, Op1Hi;
+ GetSplitVector(N->getOperand(1), Op1Lo, Op1Hi);
+ SDValue Op2Lo, Op2Hi;
+ GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
+ SDLoc dl(N);
+
+ Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
+ Op0Lo, Op1Lo, Op2Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(),
+ Op0Hi, Op1Hi, Op2Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // We know the result is a vector. The input may be either a vector or a
+ // scalar value.
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+
+ // Handle some special cases efficiently.
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ case TargetLowering::TypePromoteInteger:
+ case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftenFloat:
+ case TargetLowering::TypeScalarizeVector:
+ case TargetLowering::TypeWidenVector:
+ break;
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ // A scalar to vector conversion, where the scalar needs expansion.
+ // If the vector is being split in two then we can just convert the
+ // expanded pieces.
+ if (LoVT == HiVT) {
+ GetExpandedOp(InOp, Lo, Hi);
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ return;
+ }
+ break;
+ case TargetLowering::TypeSplitVector:
+ // If the input is a vector that needs to be split, convert each split
+ // piece of the input now.
+ GetSplitVector(InOp, Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+ return;
+ }
+
+ // In the general case, convert the input to an integer and split it by hand.
+ EVT LoIntVT = EVT::getIntegerVT(*DAG.getContext(), LoVT.getSizeInBits());
+ EVT HiIntVT = EVT::getIntegerVT(*DAG.getContext(), HiVT.getSizeInBits());
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(LoIntVT, HiIntVT);
+
+ SplitInteger(BitConvertToInteger(InOp), LoIntVT, HiIntVT, Lo, Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
+ Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_BUILD_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ unsigned LoNumElts = LoVT.getVectorNumElements();
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+LoNumElts);
+ Lo = DAG.getNode(ISD::BUILD_VECTOR, dl, LoVT, LoOps);
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+LoNumElts, N->op_end());
+ Hi = DAG.getNode(ISD::BUILD_VECTOR, dl, HiVT, HiOps);
+}
+
+void DAGTypeLegalizer::SplitVecRes_CONCAT_VECTORS(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ assert(!(N->getNumOperands() & 1) && "Unsupported CONCAT_VECTORS");
+ SDLoc dl(N);
+ unsigned NumSubvectors = N->getNumOperands() / 2;
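+ // With exactly two operands, the two halves are simply the operands.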
+ if (NumSubvectors == 1) {
+ Lo = N->getOperand(0);
+ Hi = N->getOperand(1);
+ return;
+ }
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ SmallVector<SDValue, 8> LoOps(N->op_begin(), N->op_begin()+NumSubvectors);
+ Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, LoVT, LoOps);
+
+ SmallVector<SDValue, 8> HiOps(N->op_begin()+NumSubvectors, N->op_end());
+ Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HiVT, HiOps);
+}
+
+void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
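+ // The high half starts LoVT.getVectorNumElements() elements past the
+ // original index.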
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
+ DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue SubVec = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ GetSplitVector(Vec, Lo, Hi);
+
+ EVT VecVT = Vec.getValueType();
+ EVT VecElemVT = VecVT.getVectorElementType();
+ unsigned VecElems = VecVT.getVectorNumElements();
+ unsigned SubElems = SubVec.getValueType().getVectorNumElements();
+
+ // If we know the index is 0, and we know the subvector doesn't cross the
+ // boundary between the halves, we can avoid spilling the vector, and insert
+ // into the lower half of the split vector directly.
+ // TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
+ // the index is constant and there is no boundary crossing. But those cases
+ // don't seem to get hit in practice.
+ if (ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = ConstIdx->getZExtValue();
+ if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
+ return;
+ }
+ }
+
+ // Spill the vector to the stack.
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
+
+ // Store the new subvector into the specified index.
+ SDValue SubVecPtr = GetVectorElementPointer(StackPtr, VecElemVT, Idx);
+ Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
+ Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo());
+
+ // Load the Lo part from the stack slot.
+ Lo =
+ DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
+
+ // Increment the pointer to the other part.
+ unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+ StackPtr =
+ DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(IncrementSize, dl, StackPtr.getValueType()));
+
+ // Load the Hi part from the stack slot.
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+ MinAlign(Alignment, IncrementSize));
+}
+
+void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FPOWI, dl, Lo.getValueType(), Lo, N->getOperand(1));
+ Hi = DAG.getNode(ISD::FPOWI, dl, Hi.getValueType(), Hi, N->getOperand(1));
+}
+
+void DAGTypeLegalizer::SplitVecRes_FCOPYSIGN(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDLoc DL(N);
+
+ SDValue RHSLo, RHSHi;
+ SDValue RHS = N->getOperand(1);
+ EVT RHSVT = RHS.getValueType();
+ if (getTypeAction(RHSVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(RHS, RHSLo, RHSHi);
+ else
+ std::tie(RHSLo, RHSHi) = DAG.SplitVector(RHS, SDLoc(RHS));
+
+ Lo = DAG.getNode(ISD::FCOPYSIGN, DL, LHSLo.getValueType(), LHSLo, RHSLo);
+ Hi = DAG.getNode(ISD::FCOPYSIGN, DL, LHSHi.getValueType(), LHSHi, RHSHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_InregOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue LHSLo, LHSHi;
+ GetSplitVector(N->getOperand(0), LHSLo, LHSHi);
+ SDLoc dl(N);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) =
+ DAG.GetSplitDestVTs(cast<VTSDNode>(N->getOperand(1))->getVT());
+
+ Lo = DAG.getNode(N->getOpcode(), dl, LHSLo.getValueType(), LHSLo,
+ DAG.getValueType(LoVT));
+ Hi = DAG.getNode(N->getOpcode(), dl, LHSHi.getValueType(), LHSHi,
+ DAG.getValueType(HiVT));
+}
+
+void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ unsigned Opcode = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+
+ SDLoc dl(N);
+ SDValue InLo, InHi;
+ GetSplitVector(N0, InLo, InHi);
+ EVT InLoVT = InLo.getValueType();
+ unsigned InNumElements = InLoVT.getVectorNumElements();
+
+ EVT OutLoVT, OutHiVT;
+ std::tie(OutLoVT, OutHiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ unsigned OutNumElements = OutLoVT.getVectorNumElements();
+ assert((2 * OutNumElements) <= InNumElements &&
+ "Illegal extend vector in reg split");
+
+ // *_EXTEND_VECTOR_INREG instructions extend the lowest elements of the
+ // input vector (i.e. we only use InLo):
+ // OutLo will extend the first OutNumElements from InLo.
+ // OutHi will extend the next OutNumElements from InLo.
+
+ // Shuffle the elements from InLo for OutHi into the bottom elements to
+ // create a 'fake' InHi.
+ SmallVector<int, 8> SplitHi(InNumElements, -1);
+ for (unsigned i = 0; i != OutNumElements; ++i)
+ SplitHi[i] = i + OutNumElements;
+ InHi = DAG.getVectorShuffle(InLoVT, dl, InLo, DAG.getUNDEF(InLoVT), SplitHi);
+
+ Lo = DAG.getNode(Opcode, dl, OutLoVT, InLo);
+ Hi = DAG.getNode(Opcode, dl, OutHiVT, InHi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Elt = N->getOperand(1);
+ SDValue Idx = N->getOperand(2);
+ SDLoc dl(N);
+ GetSplitVector(Vec, Lo, Hi);
+
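+ // If the index is a constant, insert directly into the half that contains it.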
+ if (ConstantSDNode *CIdx = dyn_cast<ConstantSDNode>(Idx)) {
+ unsigned IdxVal = CIdx->getZExtValue();
+ unsigned LoNumElts = Lo.getValueType().getVectorNumElements();
+ if (IdxVal < LoNumElts)
+ Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
+ Lo.getValueType(), Lo, Elt, Idx);
+ else
+ Hi =
+ DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+ DAG.getConstant(IdxVal - LoNumElts, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return;
+ }
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(0), true))
+ return;
+
+ // Spill the vector to the stack.
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
+
+ // Store the new element. This may be larger than the vector element type,
+ // so use a truncating store.
+ SDValue EltPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+ Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
+ unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
+ Store =
+ DAG.getTruncStore(Store, dl, Elt, EltPtr, MachinePointerInfo(), EltVT);
+
+ // Load the Lo part from the stack slot.
+ Lo =
+ DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
+
+ // Increment the pointer to the other part.
+ unsigned IncrementSize = Lo.getValueType().getSizeInBits() / 8;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
+ DAG.getConstant(IncrementSize, dl,
+ StackPtr.getValueType()));
+
+ // Load the Hi part from the stack slot.
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
+ MinAlign(Alignment, IncrementSize));
+}
+
+void DAGTypeLegalizer::SplitVecRes_SCALAR_TO_VECTOR(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
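+ // The scalar lands in element 0 of the low half; the high half is undef.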
+ Lo = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, LoVT, N->getOperand(0));
+ Hi = DAG.getUNDEF(HiVT);
+}
+
+void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
+ SDValue &Hi) {
+ assert(ISD::isUNINDEXEDLoad(LD) && "Indexed load during type legalization!");
+ EVT LoVT, HiVT;
+ SDLoc dl(LD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(LD->getValueType(0));
+
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+ SDValue Ch = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
+ EVT MemoryVT = LD->getMemoryVT();
+ unsigned Alignment = LD->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
+ LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo);
+
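+ // Advance the pointer by the byte size of the low half and load the high
+ // half from the adjusted address.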
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
+ LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT,
+ Alignment, MMOFlags, AAInfo);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(LD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
+ SDValue &Lo, SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(MLD);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MLD->getValueType(0));
+
+ SDValue Ch = MLD->getChain();
+ SDValue Ptr = MLD->getBasePtr();
+ SDValue Mask = MLD->getMask();
+ SDValue Src0 = MLD->getSrc0();
+ unsigned Alignment = MLD->getOriginalAlignment();
+ ISD::LoadExtType ExtType = MLD->getExtensionType();
+
+ // If the alignment is equal to the vector size, use half of it for the
+ // second half.
+ unsigned SecondHalfAlignment =
+ (Alignment == MLD->getValueType(0).getSizeInBits()/8) ?
+ Alignment/2 : Alignment;
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+ EVT MemoryVT = MLD->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue Src0Lo, Src0Hi;
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MLD->getPointerInfo(),
+ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MLD->getAAInfo(), MLD->getRanges());
+
+ Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, MaskLo, Src0Lo, LoMemVT, MMO,
+ ExtType);
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MLD->getPointerInfo(),
+ MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
+ SecondHalfAlignment, MLD->getAAInfo(), MLD->getRanges());
+
+ Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, MaskHi, Src0Hi, HiMemVT, MMO,
+ ExtType);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(MLD, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
+ SDValue &Lo, SDValue &Hi) {
+ EVT LoVT, HiVT;
+ SDLoc dl(MGT);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
+
+ SDValue Ch = MGT->getChain();
+ SDValue Ptr = MGT->getBasePtr();
+ SDValue Mask = MGT->getMask();
+ SDValue Src0 = MGT->getValue();
+ SDValue Index = MGT->getIndex();
+ unsigned Alignment = MGT->getOriginalAlignment();
+
+ // Split Mask operand
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+ EVT MemoryVT = MGT->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ // Split MemoryVT
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue Src0Lo, Src0Hi;
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+ SDValue IndexHi, IndexLo;
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MGT->getPointerInfo(),
+ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MGT->getAAInfo(), MGT->getRanges());
+
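+ // The gather operands are the chain, the Src0 merge value, the mask, the
+ // base pointer and the index.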
+ SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
+ Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
+ MMO);
+
+ SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
+ Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl, OpsHi,
+ MMO);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(MGT, 1), Ch);
+}
+
+void DAGTypeLegalizer::SplitVecRes_SETCC(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+
+ EVT LoVT, HiVT;
+ SDLoc DL(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // Split the input.
+ SDValue LL, LH, RL, RH;
+ std::tie(LL, LH) = DAG.SplitVectorOperand(N, 0);
+ std::tie(RL, RH) = DAG.SplitVectorOperand(N, 1);
+
+ Lo = DAG.getNode(N->getOpcode(), DL, LoVT, LL, RL, N->getOperand(2));
+ Hi = DAG.getNode(N->getOpcode(), DL, HiVT, LH, RH, N->getOperand(2));
+}
+
+void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ // Get the dest types - they may not match the input types, e.g. int_to_fp.
+ EVT LoVT, HiVT;
+ SDLoc dl(N);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+ // If the input also splits, handle it directly for a compile time speedup.
+ // Otherwise split it by hand.
+ EVT InVT = N->getOperand(0).getValueType();
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector)
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ else
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+
+ if (N->getOpcode() == ISD::FP_ROUND) {
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1));
+ } else if (N->getOpcode() == ISD::CONVERT_RNDSAT) {
+ SDValue DTyOpLo = DAG.getValueType(LoVT);
+ SDValue DTyOpHi = DAG.getValueType(HiVT);
+ SDValue STyOpLo = DAG.getValueType(Lo.getValueType());
+ SDValue STyOpHi = DAG.getValueType(Hi.getValueType());
+ SDValue RndOp = N->getOperand(3);
+ SDValue SatOp = N->getOperand(4);
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+ Lo = DAG.getConvertRndSat(LoVT, dl, Lo, DTyOpLo, STyOpLo, RndOp, SatOp,
+ CvtCode);
+ Hi = DAG.getConvertRndSat(HiVT, dl, Hi, DTyOpHi, STyOpHi, RndOp, SatOp,
+ CvtCode);
+ } else {
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+ }
+}
+
+void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
+ SDValue &Hi) {
+ SDLoc dl(N);
+ EVT SrcVT = N->getOperand(0).getValueType();
+ EVT DestVT = N->getValueType(0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(DestVT);
+
+ // We can do better than a generic split operation if the extend is doing
+ // more than just doubling the width of the elements and the following are
+ // true:
+ // - The number of vector elements is even,
+ // - the source type is legal,
+ // - the type of a split source is illegal,
+ // - the type of an extended (by doubling element size) source is legal, and
+ // - the type of that extended source when split is legal.
+ //
+ // This won't necessarily completely legalize the operation, but it will
+ // more effectively move in the right direction and prevent falling down
+ // to scalarization in many cases due to the input vector being split too
+ // far.
+ unsigned NumElements = SrcVT.getVectorNumElements();
+ if ((NumElements & 1) == 0 &&
+ SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
+ LLVMContext &Ctx = *DAG.getContext();
+ EVT NewSrcVT = EVT::getVectorVT(
+ Ctx, EVT::getIntegerVT(
+ Ctx, SrcVT.getVectorElementType().getSizeInBits() * 2),
+ NumElements);
+ EVT SplitSrcVT =
+ EVT::getVectorVT(Ctx, SrcVT.getVectorElementType(), NumElements / 2);
+ EVT SplitLoVT, SplitHiVT;
+ std::tie(SplitLoVT, SplitHiVT) = DAG.GetSplitDestVTs(NewSrcVT);
+ if (TLI.isTypeLegal(SrcVT) && !TLI.isTypeLegal(SplitSrcVT) &&
+ TLI.isTypeLegal(NewSrcVT) && TLI.isTypeLegal(SplitLoVT)) {
+ DEBUG(dbgs() << "Split vector extend via incremental extend:";
+ N->dump(&DAG); dbgs() << "\n");
+ // Extend the source vector by one step.
+ SDValue NewSrc =
+ DAG.getNode(N->getOpcode(), dl, NewSrcVT, N->getOperand(0));
+ // Get the low and high halves of the new, extended one step, vector.
+ std::tie(Lo, Hi) = DAG.SplitVector(NewSrc, dl);
+ // Extend those vector halves the rest of the way.
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+ return;
+ }
+ }
+ // Fall back to the generic unary operator splitting otherwise.
+ SplitVecRes_UnaryOp(N, Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ // The low and high parts of the original input give four input vectors.
+ SDValue Inputs[4];
+ SDLoc dl(N);
+ GetSplitVector(N->getOperand(0), Inputs[0], Inputs[1]);
+ GetSplitVector(N->getOperand(1), Inputs[2], Inputs[3]);
+ EVT NewVT = Inputs[0].getValueType();
+ unsigned NewElts = NewVT.getVectorNumElements();
+
+ // If Lo or Hi uses elements from at most two of the four input vectors, then
+ // express it as a vector shuffle of those two inputs. Otherwise extract the
+ // input elements by hand and construct the Lo/Hi output using a BUILD_VECTOR.
+ SmallVector<int, 16> Ops;
+ for (unsigned High = 0; High < 2; ++High) {
+ SDValue &Output = High ? Hi : Lo;
+
+ // Build a shuffle mask for the output, discovering on the fly which
+ // input vectors to use as shuffle operands (recorded in InputUsed).
+ // If building a suitable shuffle vector proves too hard, then bail
+ // out with useBuildVector set.
+ unsigned InputUsed[2] = { -1U, -1U }; // Not yet discovered.
+ unsigned FirstMaskIdx = High * NewElts;
+ bool useBuildVector = false;
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= array_lengthof(Inputs)) {
+ // The mask element does not index into any input vector.
+ Ops.push_back(-1);
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Find or create a shuffle vector operand to hold this input.
+ unsigned OpNo;
+ for (OpNo = 0; OpNo < array_lengthof(InputUsed); ++OpNo) {
+ if (InputUsed[OpNo] == Input) {
+ // This input vector is already an operand.
+ break;
+ } else if (InputUsed[OpNo] == -1U) {
+ // Create a new operand for this input vector.
+ InputUsed[OpNo] = Input;
+ break;
+ }
+ }
+
+ if (OpNo >= array_lengthof(InputUsed)) {
+ // More than two input vectors used! Give up on trying to create a
+ // shuffle vector. Insert all elements into a BUILD_VECTOR instead.
+ useBuildVector = true;
+ break;
+ }
+
+ // Add the mask index for the new shuffle vector.
+ Ops.push_back(Idx + OpNo * NewElts);
+ }
+
+ if (useBuildVector) {
+ EVT EltVT = NewVT.getVectorElementType();
+ SmallVector<SDValue, 16> SVOps;
+
+ // Extract the input elements by hand.
+ for (unsigned MaskOffset = 0; MaskOffset < NewElts; ++MaskOffset) {
+ // The mask element. This indexes into the input.
+ int Idx = N->getMaskElt(FirstMaskIdx + MaskOffset);
+
+ // The input vector this mask element indexes into.
+ unsigned Input = (unsigned)Idx / NewElts;
+
+ if (Input >= array_lengthof(Inputs)) {
+ // The mask element is "undef" or indexes off the end of the input.
+ SVOps.push_back(DAG.getUNDEF(EltVT));
+ continue;
+ }
+
+ // Turn the index into an offset from the start of the input vector.
+ Idx -= Input * NewElts;
+
+ // Extract the vector element by hand.
+ SVOps.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input],
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ }
+
+ // Construct the Lo/Hi output using a BUILD_VECTOR.
+ Output = DAG.getNode(ISD::BUILD_VECTOR, dl, NewVT, SVOps);
+ } else if (InputUsed[0] == -1U) {
+ // No input vectors were used! The result is undefined.
+ Output = DAG.getUNDEF(NewVT);
+ } else {
+ SDValue Op0 = Inputs[InputUsed[0]];
+ // If only one input was used, use an undefined vector for the other.
+ SDValue Op1 = InputUsed[1] == -1U ?
+ DAG.getUNDEF(NewVT) : Inputs[InputUsed[1]];
+ // At least one input vector was used. Create a new shuffle vector.
+ Output = DAG.getVectorShuffle(NewVT, dl, Op0, Op1, Ops);
+ }
+
+ Ops.clear();
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Operand Vector Splitting
+//===----------------------------------------------------------------------===//
+
+/// This method is called when the specified operand of the specified node is
+/// found to need vector splitting. At this point, all of the result types of
+/// the node are known to be legal, but other operands of the node may need
+/// legalization as well as the specified one.
+bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Split node operand: ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom split this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ if (!Res.getNode()) {
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SplitVectorOperand Op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ report_fatal_error("Do not know how to split this operator's "
+ "operand!\n");
+
+ case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break;
+ case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:Res = SplitVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::CONCAT_VECTORS: Res = SplitVecOp_CONCAT_VECTORS(N); break;
+ case ISD::TRUNCATE:
+ Res = SplitVecOp_TruncateHelper(N);
+ break;
+ case ISD::FP_ROUND: Res = SplitVecOp_FP_ROUND(N); break;
+ case ISD::FCOPYSIGN: Res = SplitVecOp_FCOPYSIGN(N); break;
+ case ISD::STORE:
+ Res = SplitVecOp_STORE(cast<StoreSDNode>(N), OpNo);
+ break;
+ case ISD::MSTORE:
+ Res = SplitVecOp_MSTORE(cast<MaskedStoreSDNode>(N), OpNo);
+ break;
+ case ISD::MSCATTER:
+ Res = SplitVecOp_MSCATTER(cast<MaskedScatterSDNode>(N), OpNo);
+ break;
+ case ISD::MGATHER:
+ Res = SplitVecOp_MGATHER(cast<MaskedGatherSDNode>(N), OpNo);
+ break;
+ case ISD::VSELECT:
+ Res = SplitVecOp_VSELECT(N, OpNo);
+ break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
+ Res = SplitVecOp_TruncateHelper(N);
+ else
+ Res = SplitVecOp_UnaryOp(N);
+ break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ if (N->getValueType(0).bitsLT(N->getOperand(0)->getValueType(0)))
+ Res = SplitVecOp_TruncateHelper(N);
+ else
+ Res = SplitVecOp_UnaryOp(N);
+ break;
+ case ISD::CTTZ:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::FP_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::ANY_EXTEND:
+ case ISD::FTRUNC:
+ Res = SplitVecOp_UnaryOp(N);
+ break;
+ }
+ }
+
+ // If the result is null, the sub-method took care of registering results etc.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) {
+ // The only possibility for an illegal operand is the mask, since result type
+ // legalization would otherwise have handled this node already.
+ assert(OpNo == 0 && "Illegal operand must be mask");
+
+ SDValue Mask = N->getOperand(0);
+ SDValue Src0 = N->getOperand(1);
+ SDValue Src1 = N->getOperand(2);
+ EVT Src0VT = Src0.getValueType();
+ SDLoc DL(N);
+ assert(Mask.getValueType().isVector() && "VSELECT without a vector mask?");
+
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ assert(Lo.getValueType() == Hi.getValueType() &&
+ "Lo and Hi have differing types");
+
+ EVT LoOpVT, HiOpVT;
+ std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(Src0VT);
+ assert(LoOpVT == HiOpVT && "Asymmetric vector split?");
+
+ SDValue LoOp0, HiOp0, LoOp1, HiOp1, LoMask, HiMask;
+ std::tie(LoOp0, HiOp0) = DAG.SplitVector(Src0, DL);
+ std::tie(LoOp1, HiOp1) = DAG.SplitVector(Src1, DL);
+ std::tie(LoMask, HiMask) = DAG.SplitVector(Mask, DL);
+
+ SDValue LoSelect =
+ DAG.getNode(ISD::VSELECT, DL, LoOpVT, LoMask, LoOp0, LoOp1);
+ SDValue HiSelect =
+ DAG.getNode(ISD::VSELECT, DL, HiOpVT, HiMask, HiOp0, HiOp1);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) {
+ // The result has a legal vector type, but the input needs splitting.
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc dl(N);
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ EVT InVT = Lo.getValueType();
+
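+ // Each half produces the legal result element type with half the element
+ // count; the halves are concatenated back together below.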
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+ InVT.getVectorNumElements());
+
+ Lo = DAG.getNode(N->getOpcode(), dl, OutVT, Lo);
+ Hi = DAG.getNode(N->getOpcode(), dl, OutVT, Hi);
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_BITCAST(SDNode *N) {
+ // For example, i64 = BITCAST v4i16 on Alpha. Typically the vector will
+ // end up being split all the way down to individual components. Convert the
+ // split pieces into integers and reassemble.
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ Lo = BitConvertToInteger(Lo);
+ Hi = BitConvertToInteger(Hi);
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0),
+ JoinIntegers(Lo, Hi));
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ // We know that the extracted result type is legal.
+ EVT SubVT = N->getValueType(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ if (IdxVal < LoElts) {
+ assert(IdxVal + SubVT.getVectorNumElements() <= LoElts &&
+ "Extracted subvector crosses vector split!");
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
+ } else {
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
+ DAG.getConstant(IdxVal - LoElts, dl,
+ Idx.getValueType()));
+ }
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue Vec = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ EVT VecVT = Vec.getValueType();
+
+ if (isa<ConstantSDNode>(Idx)) {
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ assert(IdxVal < VecVT.getVectorNumElements() && "Invalid vector index!");
+
+ SDValue Lo, Hi;
+ GetSplitVector(Vec, Lo, Hi);
+
+ uint64_t LoElts = Lo.getValueType().getVectorNumElements();
+
+ if (IdxVal < LoElts)
+ return SDValue(DAG.UpdateNodeOperands(N, Lo, Idx), 0);
+ return SDValue(DAG.UpdateNodeOperands(N, Hi,
+ DAG.getConstant(IdxVal - LoElts, SDLoc(N),
+ Idx.getValueType())), 0);
+ }
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(0), true))
+ return SDValue();
+
+ // Make the vector elements byte-addressable if they aren't already.
+ SDLoc dl(N);
+ EVT EltVT = VecVT.getVectorElementType();
+ if (EltVT.getSizeInBits() < 8) {
+ SmallVector<SDValue, 4> ElementOps;
+ for (unsigned i = 0; i < VecVT.getVectorNumElements(); ++i) {
+ ElementOps.push_back(DAG.getAnyExtOrTrunc(
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vec,
+ DAG.getConstant(i, dl, MVT::i8)),
+ dl, MVT::i8));
+ }
+
+ EltVT = MVT::i8;
+ VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VecVT.getVectorNumElements());
+ Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, ElementOps);
+ }
+
+ // Store the vector to the stack.
+ SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ SDValue Store =
+ DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
+
+ // Load back the required element.
+ StackPtr = GetVectorElementPointer(StackPtr, EltVT, Idx);
+ return DAG.getExtLoad(ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
+ MachinePointerInfo(), EltVT);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
+ unsigned OpNo) {
+ EVT LoVT, HiVT;
+ SDLoc dl(MGT);
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(MGT->getValueType(0));
+
+ SDValue Ch = MGT->getChain();
+ SDValue Ptr = MGT->getBasePtr();
+ SDValue Index = MGT->getIndex();
+ SDValue Mask = MGT->getMask();
+ SDValue Src0 = MGT->getValue();
+ unsigned Alignment = MGT->getOriginalAlignment();
+
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
+
+ EVT MemoryVT = MGT->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue Src0Lo, Src0Hi;
+ if (getTypeAction(Src0.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Src0, Src0Lo, Src0Hi);
+ else
+ std::tie(Src0Lo, Src0Hi) = DAG.SplitVector(Src0, dl);
+
+ SDValue IndexHi, IndexLo;
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MGT->getPointerInfo(),
+ MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MGT->getAAInfo(), MGT->getRanges());
+
+ SDValue OpsLo[] = {Ch, Src0Lo, MaskLo, Ptr, IndexLo};
+ SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
+ OpsLo, MMO);
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MGT->getPointerInfo(),
+ MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
+ Alignment, MGT->getAAInfo(),
+ MGT->getRanges());
+
+ SDValue OpsHi[] = {Ch, Src0Hi, MaskHi, Ptr, IndexHi};
+ SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
+ OpsHi, MMO);
+
+ // Build a factor node to remember that this load is independent of the
+ // other one.
+ Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(MGT, 1), Ch);
+
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, MGT->getValueType(0), Lo,
+ Hi);
+ ReplaceValueWith(SDValue(MGT, 0), Res);
+ return SDValue();
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
+ unsigned OpNo) {
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ SDValue Mask = N->getMask();
+ SDValue Data = N->getValue();
+ EVT MemoryVT = N->getMemoryVT();
+ unsigned Alignment = N->getOriginalAlignment();
+ SDLoc DL(N);
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+
+ MaskLo = PromoteTargetBoolean(MaskLo, DataLo.getValueType());
+ MaskHi = PromoteTargetBoolean(MaskHi, DataHi.getValueType());
+
+ // If the alignment is equal to the vector size, use half of it for the
+ // second half.
+ unsigned SecondHalfAlignment =
+ (Alignment == Data->getValueType(0).getSizeInBits()/8) ?
+ Alignment/2 : Alignment;
+
+ SDValue Lo, Hi;
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(N->getPointerInfo(),
+ MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+ Alignment, N->getAAInfo(), N->getRanges());
+
+ Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, MaskLo, LoMemVT, MMO,
+ N->isTruncatingStore());
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(N->getPointerInfo(),
+ MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
+ SecondHalfAlignment, N->getAAInfo(), N->getRanges());
+
+ Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, MaskHi, HiMemVT, MMO,
+ N->isTruncatingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
+ unsigned OpNo) {
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ SDValue Mask = N->getMask();
+ SDValue Index = N->getIndex();
+ SDValue Data = N->getValue();
+ EVT MemoryVT = N->getMemoryVT();
+ unsigned Alignment = N->getOriginalAlignment();
+ SDLoc DL(N);
+
+ // Split all operands
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ SDValue DataLo, DataHi;
+ if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Data operand
+ GetSplitVector(Data, DataLo, DataHi);
+ else
+ std::tie(DataLo, DataHi) = DAG.SplitVector(Data, DL);
+
+ SDValue MaskLo, MaskHi;
+ if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
+ // Split Mask operand
+ GetSplitVector(Mask, MaskLo, MaskHi);
+ else
+ std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
+
+  SDValue IndexLo, IndexHi;
+ if (getTypeAction(Index.getValueType()) == TargetLowering::TypeSplitVector)
+ GetSplitVector(Index, IndexLo, IndexHi);
+ else
+ std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
+
+ SDValue Lo, Hi;
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(N->getPointerInfo(),
+ MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+ Alignment, N->getAAInfo(), N->getRanges());
+
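+  // Scatter operands are {chain, data, mask, base pointer, index}.  Both
+  // halves reuse the same base pointer; the per-element addresses come from
+  // the split index vectors.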
+ SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo};
+ Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
+ DL, OpsLo, MMO);
+
+ MMO = DAG.getMachineFunction().
+ getMachineMemOperand(N->getPointerInfo(),
+ MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
+ Alignment, N->getAAInfo(), N->getRanges());
+
+ SDValue OpsHi[] = {Ch, DataHi, MaskHi, Ptr, IndexHi};
+ Hi = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataHi.getValueType(),
+ DL, OpsHi, MMO);
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
+ assert(N->isUnindexed() && "Indexed store of vector?");
+ assert(OpNo == 1 && "Can only split the stored value");
+ SDLoc DL(N);
+
+ bool isTruncating = N->isTruncatingStore();
+ SDValue Ch = N->getChain();
+ SDValue Ptr = N->getBasePtr();
+ EVT MemoryVT = N->getMemoryVT();
+ unsigned Alignment = N->getOriginalAlignment();
+ MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = N->getAAInfo();
+ SDValue Lo, Hi;
+ GetSplitVector(N->getOperand(1), Lo, Hi);
+
+ EVT LoMemVT, HiMemVT;
+ std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+
+ unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
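+  // E.g. for a non-truncating v8i32 store, the low v4i32 half is stored at
+  // the original address and the high half at a 16-byte offset.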
+
+ if (isTruncating)
+ Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
+ Alignment, MMOFlags, AAInfo);
+ else
+ Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
+ AAInfo);
+
+ // Increment the pointer to the other half.
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, DL, Ptr.getValueType()));
+
+ if (isTruncating)
+ Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ HiMemVT, Alignment, MMOFlags, AAInfo);
+ else
+ Hi = DAG.getStore(Ch, DL, Hi, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ Alignment, MMOFlags, AAInfo);
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
+ SDLoc DL(N);
+
+ // The input operands all must have the same type, and we know the result
+ // type is valid. Convert this to a buildvector which extracts all the
+ // input elements.
+ // TODO: If the input elements are power-two vectors, we could convert this to
+ // a new CONCAT_VECTORS node with elements that are half-wide.
+ SmallVector<SDValue, 32> Elts;
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ for (const SDValue &Op : N->op_values()) {
+ for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
+ i != e; ++i) {
+ Elts.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ }
+ }
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, N->getValueType(0), Elts);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
+ // The result type is legal, but the input type is illegal. If splitting
+ // ends up with the result type of each half still being legal, just
+ // do that. If, however, that would result in an illegal result type,
+ // we can try to get more clever with power-two vectors. Specifically,
+ // split the input type, but also widen the result element size, then
+ // concatenate the halves and truncate again. For example, consider a target
+ // where v8i8 is legal and v8i32 is not (ARM, which doesn't have 256-bit
+ // vectors). To perform a "%res = v8i8 trunc v8i32 %in" we do:
+ // %inlo = v4i32 extract_subvector %in, 0
+ // %inhi = v4i32 extract_subvector %in, 4
+ // %lo16 = v4i16 trunc v4i32 %inlo
+ // %hi16 = v4i16 trunc v4i32 %inhi
+ // %in16 = v8i16 concat_vectors v4i16 %lo16, v4i16 %hi16
+ // %res = v8i8 trunc v8i16 %in16
+ //
+ // Without this transform, the original truncate would end up being
+ // scalarized, which is pretty much always a last resort.
+ SDValue InVec = N->getOperand(0);
+ EVT InVT = InVec->getValueType(0);
+ EVT OutVT = N->getValueType(0);
+ unsigned NumElements = OutVT.getVectorNumElements();
+ bool IsFloat = OutVT.isFloatingPoint();
+
+ // Widening should have already made sure this is a power-two vector
+ // if we're trying to split it at all. assert() that's true, just in case.
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+
+ unsigned InElementSize = InVT.getVectorElementType().getSizeInBits();
+ unsigned OutElementSize = OutVT.getVectorElementType().getSizeInBits();
+
+ // If the input elements are only 1/2 the width of the result elements,
+  // just use the normal splitting. Our trick only works if there's room
+ // to split more than once.
+ if (InElementSize <= OutElementSize * 2)
+ return SplitVecOp_UnaryOp(N);
+ SDLoc DL(N);
+
+ // Extract the halves of the input via extract_subvector.
+ SDValue InLoVec, InHiVec;
+ std::tie(InLoVec, InHiVec) = DAG.SplitVector(InVec, DL);
+ // Truncate them to 1/2 the element size.
+ EVT HalfElementVT = IsFloat ?
+ EVT::getFloatingPointVT(InElementSize/2) :
+ EVT::getIntegerVT(*DAG.getContext(), InElementSize/2);
+ EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT,
+ NumElements/2);
+ SDValue HalfLo = DAG.getNode(N->getOpcode(), DL, HalfVT, InLoVec);
+ SDValue HalfHi = DAG.getNode(N->getOpcode(), DL, HalfVT, InHiVec);
+ // Concatenate them to get the full intermediate truncation result.
+ EVT InterVT = EVT::getVectorVT(*DAG.getContext(), HalfElementVT, NumElements);
+ SDValue InterVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InterVT, HalfLo,
+ HalfHi);
+ // Now finish up by truncating all the way down to the original result
+ // type. This should normally be something that ends up being legal directly,
+ // but in theory if a target has very wide vectors and an annoyingly
+ // restricted set of legal types, this split can chain to build things up.
+ return IsFloat
+ ? DAG.getNode(ISD::FP_ROUND, DL, OutVT, InterVec,
+ DAG.getTargetConstant(
+ 0, DL, TLI.getPointerTy(DAG.getDataLayout())))
+ : DAG.getNode(ISD::TRUNCATE, DL, OutVT, InterVec);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operand types must be vectors");
+ // The result has a legal vector type, but the input needs splitting.
+ SDValue Lo0, Hi0, Lo1, Hi1, LoRes, HiRes;
+ SDLoc DL(N);
+ GetSplitVector(N->getOperand(0), Lo0, Hi0);
+ GetSplitVector(N->getOperand(1), Lo1, Hi1);
+ unsigned PartElements = Lo0.getValueType().getVectorNumElements();
+ EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements);
+ EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements);
+
+ LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
+ HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
+ SDValue Con = DAG.getNode(ISD::CONCAT_VECTORS, DL, WideResVT, LoRes, HiRes);
+ return PromoteTargetBoolean(Con, N->getValueType(0));
+}
+
+
+SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) {
+ // The result has a legal vector type, but the input needs splitting.
+ EVT ResVT = N->getValueType(0);
+ SDValue Lo, Hi;
+ SDLoc DL(N);
+ GetSplitVector(N->getOperand(0), Lo, Hi);
+ EVT InVT = Lo.getValueType();
+
+ EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(),
+ InVT.getVectorNumElements());
+
+ Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1));
+ Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1));
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi);
+}
+
+SDValue DAGTypeLegalizer::SplitVecOp_FCOPYSIGN(SDNode *N) {
+ // The result (and the first input) has a legal vector type, but the second
+ // input needs splitting.
+ return DAG.UnrollVectorOp(N, N->getValueType(0).getVectorNumElements());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Result Vector Widening
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
+ DEBUG(dbgs() << "Widen node result " << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+
+ // See if the target wants to custom widen this node.
+ if (CustomWidenLowerNode(N, N->getValueType(ResNo)))
+ return;
+
+ SDValue Res = SDValue();
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "WidenVectorResult #" << ResNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to widen the result of this operator!");
+
+ case ISD::MERGE_VALUES: Res = WidenVecRes_MERGE_VALUES(N, ResNo); break;
+ case ISD::BITCAST: Res = WidenVecRes_BITCAST(N); break;
+ case ISD::BUILD_VECTOR: Res = WidenVecRes_BUILD_VECTOR(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecRes_CONCAT_VECTORS(N); break;
+ case ISD::CONVERT_RNDSAT: Res = WidenVecRes_CONVERT_RNDSAT(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecRes_EXTRACT_SUBVECTOR(N); break;
+ case ISD::FP_ROUND_INREG: Res = WidenVecRes_InregOp(N); break;
+ case ISD::INSERT_VECTOR_ELT: Res = WidenVecRes_INSERT_VECTOR_ELT(N); break;
+ case ISD::LOAD: Res = WidenVecRes_LOAD(N); break;
+ case ISD::SCALAR_TO_VECTOR: Res = WidenVecRes_SCALAR_TO_VECTOR(N); break;
+ case ISD::SIGN_EXTEND_INREG: Res = WidenVecRes_InregOp(N); break;
+ case ISD::VSELECT:
+ case ISD::SELECT: Res = WidenVecRes_SELECT(N); break;
+ case ISD::SELECT_CC: Res = WidenVecRes_SELECT_CC(N); break;
+ case ISD::SETCC: Res = WidenVecRes_SETCC(N); break;
+ case ISD::UNDEF: Res = WidenVecRes_UNDEF(N); break;
+ case ISD::VECTOR_SHUFFLE:
+ Res = WidenVecRes_VECTOR_SHUFFLE(cast<ShuffleVectorSDNode>(N));
+ break;
+ case ISD::MLOAD:
+ Res = WidenVecRes_MLOAD(cast<MaskedLoadSDNode>(N));
+ break;
+ case ISD::MGATHER:
+ Res = WidenVecRes_MGATHER(cast<MaskedGatherSDNode>(N));
+ break;
+
+ case ISD::ADD:
+ case ISD::AND:
+ case ISD::MUL:
+ case ISD::MULHS:
+ case ISD::MULHU:
+ case ISD::OR:
+ case ISD::SUB:
+ case ISD::XOR:
+ case ISD::FMINNUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNAN:
+ case ISD::FMAXNAN:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ Res = WidenVecRes_Binary(N);
+ break;
+
+ case ISD::FADD:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FSUB:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ Res = WidenVecRes_BinaryCanTrap(N);
+ break;
+
+ case ISD::FCOPYSIGN:
+ Res = WidenVecRes_FCOPYSIGN(N);
+ break;
+
+ case ISD::FPOWI:
+ Res = WidenVecRes_POWI(N);
+ break;
+
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ Res = WidenVecRes_Shift(N);
+ break;
+
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Res = WidenVecRes_EXTEND_VECTOR_INREG(N);
+ break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::FP_EXTEND:
+ case ISD::FP_ROUND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SIGN_EXTEND:
+ case ISD::SINT_TO_FP:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::ZERO_EXTEND:
+ Res = WidenVecRes_Convert(N);
+ break;
+
+ case ISD::BITREVERSE:
+ case ISD::BSWAP:
+ case ISD::CTLZ:
+ case ISD::CTPOP:
+ case ISD::CTTZ:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG10:
+ case ISD::FLOG2:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ Res = WidenVecRes_Unary(N);
+ break;
+ case ISD::FMA:
+ Res = WidenVecRes_Ternary(N);
+ break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (Res.getNode())
+ SetWidenedVector(SDValue(N, ResNo), Res);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Ternary(SDNode *N) {
+ // Ternary op widening.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp3 = GetWidenedVector(N->getOperand(2));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, InOp3);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Binary(SDNode *N) {
+ // Binary op widening.
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, N->getFlags());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
+ // Binary op widening for operations that can trap.
+ unsigned Opcode = N->getOpcode();
+ SDLoc dl(N);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+ EVT VT = WidenVT;
+ unsigned NumElts = VT.getVectorNumElements();
+ const SDNodeFlags *Flags = N->getFlags();
+ while (!TLI.isTypeLegal(VT) && NumElts != 1) {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ }
+
+ if (NumElts != 1 && !TLI.canOpTrap(N->getOpcode(), VT)) {
+ // Operation doesn't trap so just widen as normal.
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(N->getOpcode(), dl, WidenVT, InOp1, InOp2, Flags);
+ }
+
+ // No legal vector version so unroll the vector operation and then widen.
+ if (NumElts == 1)
+ return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+
+ // Since the operation can trap, apply operation on the original vector.
+ EVT MaxVT = VT;
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ unsigned CurNumElts = N->getValueType(0).getVectorNumElements();
+
+ SmallVector<SDValue, 16> ConcatOps(CurNumElts);
+ unsigned ConcatEnd = 0; // Current ConcatOps index.
+ int Idx = 0; // Current Idx into input vectors.
+
+ // NumElts := greatest legal vector size (at most WidenVT)
+ // while (orig. vector has unhandled elements) {
+  //   take chunks of size NumElts from the beginning and add them to ConcatOps
+ // NumElts := next smaller supported vector size or 1
+ // }
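+  // For example, widening a v3f64 FDIV to v4f64 when only v2f64 is legal
+  // emits one v2f64 FDIV for elements 0-1 and a scalar FDIV for element 2;
+  // the pieces are reassembled into the widened vector below.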
+ while (CurNumElts != 0) {
+ while (CurNumElts >= NumElts) {
+ SDValue EOp1 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp2 = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
+ Idx += NumElts;
+ CurNumElts -= NumElts;
+ }
+ do {
+ NumElts = NumElts / 2;
+ VT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NumElts);
+ } while (!TLI.isTypeLegal(VT) && NumElts != 1);
+
+ if (NumElts == 1) {
+ for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
+ SDValue EOp1 = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp2 = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
+ EOp1, EOp2, Flags);
+ }
+ CurNumElts = 0;
+ }
+ }
+
+ // Check to see if we have a single operation with the widen type.
+ if (ConcatEnd == 1) {
+ VT = ConcatOps[0].getValueType();
+ if (VT == WidenVT)
+ return ConcatOps[0];
+ }
+
+ // while (Some element of ConcatOps is not of type MaxVT) {
+ // From the end of ConcatOps, collect elements of the same type and put
+ // them into an op of the next larger supported type
+ // }
+ while (ConcatOps[ConcatEnd-1].getValueType() != MaxVT) {
+ Idx = ConcatEnd - 1;
+ VT = ConcatOps[Idx--].getValueType();
+ while (Idx >= 0 && ConcatOps[Idx].getValueType() == VT)
+ Idx--;
+
+ int NextSize = VT.isVector() ? VT.getVectorNumElements() : 1;
+ EVT NextVT;
+ do {
+ NextSize *= 2;
+ NextVT = EVT::getVectorVT(*DAG.getContext(), WidenEltVT, NextSize);
+ } while (!TLI.isTypeLegal(NextVT));
+
+ if (!VT.isVector()) {
+ // Scalar type, create an INSERT_VECTOR_ELEMENT of type NextVT
+ SDValue VecOp = DAG.getUNDEF(NextVT);
+ unsigned NumToInsert = ConcatEnd - Idx - 1;
+ for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
+ VecOp = DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx],
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ ConcatOps[Idx+1] = VecOp;
+ ConcatEnd = Idx + 2;
+ } else {
+ // Vector type, create a CONCAT_VECTORS of type NextVT
+ SDValue undefVec = DAG.getUNDEF(VT);
+ unsigned OpsToConcat = NextSize/VT.getVectorNumElements();
+ SmallVector<SDValue, 16> SubConcatOps(OpsToConcat);
+ unsigned RealVals = ConcatEnd - Idx - 1;
+ unsigned SubConcatEnd = 0;
+ unsigned SubConcatIdx = Idx + 1;
+ while (SubConcatEnd < RealVals)
+ SubConcatOps[SubConcatEnd++] = ConcatOps[++Idx];
+ while (SubConcatEnd < OpsToConcat)
+ SubConcatOps[SubConcatEnd++] = undefVec;
+ ConcatOps[SubConcatIdx] = DAG.getNode(ISD::CONCAT_VECTORS, dl,
+ NextVT, SubConcatOps);
+ ConcatEnd = SubConcatIdx + 1;
+ }
+ }
+
+ // Check to see if we have a single operation with the widen type.
+ if (ConcatEnd == 1) {
+ VT = ConcatOps[0].getValueType();
+ if (VT == WidenVT)
+ return ConcatOps[0];
+ }
+
+ // add undefs of size MaxVT until ConcatOps grows to length of WidenVT
+ unsigned NumOps = WidenVT.getVectorNumElements()/MaxVT.getVectorNumElements();
+  if (NumOps != ConcatEnd) {
+ SDValue UndefVal = DAG.getUNDEF(MaxVT);
+ for (unsigned j = ConcatEnd; j < NumOps; ++j)
+ ConcatOps[j] = UndefVal;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+ makeArrayRef(ConcatOps.data(), NumOps));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ SDLoc DL(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InEltVT = InVT.getVectorElementType();
+ EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
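+  // E.g. for a v2i64-to-v2i32 truncate whose result widens to v4i32,
+  // InWidenVT is v4i64: the input element type at the widened element count.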
+
+ unsigned Opcode = N->getOpcode();
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+ const SDNodeFlags *Flags = N->getFlags();
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(N->getOperand(0));
+ InVT = InOp.getValueType();
+ InVTNumElts = InVT.getVectorNumElements();
+ if (InVTNumElts == WidenNumElts) {
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InOp);
+ return DAG.getNode(Opcode, DL, WidenVT, InOp, N->getOperand(1), Flags);
+ }
+ }
+
+ if (TLI.isTypeLegal(InWidenVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ if (WidenNumElts % InVTNumElts == 0) {
+ // Widen the input and call convert on the widened input vector.
+ unsigned NumConcat = WidenNumElts/InVTNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ Ops[0] = InOp;
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+ SDValue InVec = DAG.getNode(ISD::CONCAT_VECTORS, DL, InWidenVT, Ops);
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InVec);
+ return DAG.getNode(Opcode, DL, WidenVT, InVec, N->getOperand(1), Flags);
+ }
+
+ if (InVTNumElts % WidenNumElts == 0) {
+ SDValue InVal = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+      // Extract the input and convert the shortened input vector.
+ if (N->getNumOperands() == 1)
+ return DAG.getNode(Opcode, DL, WidenVT, InVal);
+ return DAG.getNode(Opcode, DL, WidenVT, InVal, N->getOperand(1), Flags);
+ }
+ }
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ EVT EltVT = WidenVT.getVectorElementType();
+ unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+ unsigned i;
+ for (i=0; i < MinElts; ++i) {
+ SDValue Val = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ if (N->getNumOperands() == 1)
+ Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
+ else
+ Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val, N->getOperand(1), Flags);
+ }
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
+ unsigned Opcode = N->getOpcode();
+ SDValue InOp = N->getOperand(0);
+ SDLoc DL(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT WidenSVT = WidenVT.getVectorElementType();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InSVT = InVT.getVectorElementType();
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() == WidenVT.getSizeInBits()) {
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ return DAG.getAnyExtendVectorInReg(InOp, DL, WidenVT);
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ return DAG.getSignExtendVectorInReg(InOp, DL, WidenVT);
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ return DAG.getZeroExtendVectorInReg(InOp, DL, WidenVT);
+ }
+ }
+ }
+
+ // Unroll, extend the scalars and rebuild the vector.
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ switch (Opcode) {
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::SIGN_EXTEND, DL, WidenSVT, Val);
+ break;
+ case ISD::ZERO_EXTEND_VECTOR_INREG:
+ Val = DAG.getNode(ISD::ZERO_EXTEND, DL, WidenSVT, Val);
+ break;
+ default:
+ llvm_unreachable("A *_EXTEND_VECTOR_INREG node was expected");
+ }
+ Ops.push_back(Val);
+ }
+
+ while (Ops.size() != WidenNumElts)
+ Ops.push_back(DAG.getUNDEF(WidenSVT));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, WidenVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_FCOPYSIGN(SDNode *N) {
+ // If this is an FCOPYSIGN with same input types, we can treat it as a
+ // normal (can trap) binary op.
+ if (N->getOperand(0).getValueType() == N->getOperand(1).getValueType())
+ return WidenVecRes_BinaryCanTrap(N);
+
+ // If the types are different, fall back to unrolling.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.UnrollVectorOp(N, WidenVT.getVectorNumElements());
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_POWI(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ SDValue ShOp = N->getOperand(1);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Shift(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ SDValue ShOp = N->getOperand(1);
+
+ EVT ShVT = ShOp.getValueType();
+ if (getTypeAction(ShVT) == TargetLowering::TypeWidenVector) {
+ ShOp = GetWidenedVector(ShOp);
+ ShVT = ShOp.getValueType();
+ }
+ EVT ShWidenVT = EVT::getVectorVT(*DAG.getContext(),
+ ShVT.getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ if (ShVT != ShWidenVT)
+ ShOp = ModifyToType(ShOp, ShWidenVT);
+
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp, ShOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_Unary(SDNode *N) {
+ // Unary op widening.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N), WidenVT, InOp);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_InregOp(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ cast<VTSDNode>(N->getOperand(1))->getVT()
+ .getVectorElementType(),
+ WidenVT.getVectorNumElements());
+ SDValue WidenLHS = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ WidenVT, WidenLHS, DAG.getValueType(ExtVT));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MERGE_VALUES(SDNode *N, unsigned ResNo) {
+ SDValue WidenVec = DisintegrateMERGE_VALUES(N, ResNo);
+ return GetWidenedVector(WidenVec);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
+ SDValue InOp = N->getOperand(0);
+ EVT InVT = InOp.getValueType();
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ SDLoc dl(N);
+
+ switch (getTypeAction(InVT)) {
+ case TargetLowering::TypeLegal:
+ break;
+ case TargetLowering::TypePromoteInteger:
+ // If the incoming type is a vector that is being promoted, then
+ // we know that the elements are arranged differently and that we
+ // must perform the conversion using a stack slot.
+ if (InVT.isVector())
+ break;
+
+ // If the InOp is promoted to the same size, convert it. Otherwise,
+ // fall out of the switch and widen the promoted input.
+ InOp = GetPromotedInteger(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.bitsEq(InVT))
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
+ break;
+ case TargetLowering::TypeSoftenFloat:
+ case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeExpandInteger:
+ case TargetLowering::TypeExpandFloat:
+ case TargetLowering::TypeScalarizeVector:
+ case TargetLowering::TypeSplitVector:
+ break;
+ case TargetLowering::TypeWidenVector:
+ // If the InOp is widened to the same size, convert it. Otherwise, fall
+ // out of the switch and widen the widened input.
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ if (WidenVT.bitsEq(InVT))
+      // The input widens to the same size.  Convert to the widened value.
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, InOp);
+ break;
+ }
+
+ unsigned WidenSize = WidenVT.getSizeInBits();
+ unsigned InSize = InVT.getSizeInBits();
+ // x86mmx is not an acceptable vector element type, so don't try.
+ if (WidenSize % InSize == 0 && InVT != MVT::x86mmx) {
+    // Determine the new input vector type.  It has the same total size as the
+    // type to widen to and uses the input's element type (if the input is a
+    // vector) or the input type itself as its element type.
+ EVT NewInVT;
+ unsigned NewNumElts = WidenSize / InSize;
+ if (InVT.isVector()) {
+ EVT InEltVT = InVT.getVectorElementType();
+ NewInVT = EVT::getVectorVT(*DAG.getContext(), InEltVT,
+ WidenSize / InEltVT.getSizeInBits());
+ } else {
+ NewInVT = EVT::getVectorVT(*DAG.getContext(), InVT, NewNumElts);
+ }
+
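+    // E.g. bitcasting an i64 to a v2i32 that widens to v4i32: NewInVT is
+    // v2i64, the i64 becomes element 0 of a BUILD_VECTOR padded with undef,
+    // and that v2i64 is bitcast to v4i32.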
+ if (TLI.isTypeLegal(NewInVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ SmallVector<SDValue, 16> Ops(NewNumElts);
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ Ops[0] = InOp;
+ for (unsigned i = 1; i < NewNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ SDValue NewVec;
+ if (InVT.isVector())
+ NewVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewInVT, Ops);
+ else
+ NewVec = DAG.getNode(ISD::BUILD_VECTOR, dl, NewInVT, Ops);
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, NewVec);
+ }
+ }
+
+ return CreateStackStoreLoad(InOp, WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_BUILD_VECTOR(SDNode *N) {
+ SDLoc dl(N);
+  // Build a widened vector, padding the new elements with undef values.
+ EVT VT = N->getValueType(0);
+
+ // Integer BUILD_VECTOR operands may be larger than the node's vector element
+ // type. The UNDEFs need to have the same type as the existing operands.
+ EVT EltVT = N->getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> NewOps(N->op_begin(), N->op_end());
+ assert(WidenNumElts >= NumElts && "Shrinking vector instead of widening!");
+ NewOps.append(WidenNumElts - NumElts, DAG.getUNDEF(EltVT));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, NewOps);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
+ EVT InVT = N->getOperand(0).getValueType();
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ unsigned NumInElts = InVT.getVectorNumElements();
+ unsigned NumOperands = N->getNumOperands();
+
+ bool InputWidened = false; // Indicates we need to widen the input.
+ if (getTypeAction(InVT) != TargetLowering::TypeWidenVector) {
+ if (WidenVT.getVectorNumElements() % InVT.getVectorNumElements() == 0) {
+ // Add undef vectors to widen to correct length.
+ unsigned NumConcat = WidenVT.getVectorNumElements() /
+ InVT.getVectorNumElements();
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ for (unsigned i=0; i < NumOperands; ++i)
+ Ops[i] = N->getOperand(i);
+ for (unsigned i = NumOperands; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, Ops);
+ }
+ } else {
+ InputWidened = true;
+ if (WidenVT == TLI.getTypeToTransformTo(*DAG.getContext(), InVT)) {
+      // The inputs and the result are widened to the same type.
+ unsigned i;
+ for (i=1; i < NumOperands; ++i)
+ if (!N->getOperand(i).isUndef())
+ break;
+
+ if (i == NumOperands)
+ // Everything but the first operand is an UNDEF so just return the
+ // widened first operand.
+ return GetWidenedVector(N->getOperand(0));
+
+ if (NumOperands == 2) {
+ // Replace concat of two operands with a shuffle.
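+        // The mask takes the NumInElts low lanes of each widened operand and
+        // leaves the remaining lanes undefined.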
+ SmallVector<int, 16> MaskOps(WidenNumElts, -1);
+ for (unsigned i = 0; i < NumInElts; ++i) {
+ MaskOps[i] = i;
+ MaskOps[i + NumInElts] = i + WidenNumElts;
+ }
+ return DAG.getVectorShuffle(WidenVT, dl,
+ GetWidenedVector(N->getOperand(0)),
+ GetWidenedVector(N->getOperand(1)),
+ MaskOps);
+ }
+ }
+ }
+
+ // Fall back to use extracts and build vector.
+ EVT EltVT = WidenVT.getVectorElementType();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Idx = 0;
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ if (InputWidened)
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j=0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) {
+ SDLoc dl(N);
+ SDValue InOp = N->getOperand(0);
+ SDValue RndOp = N->getOperand(3);
+ SDValue SatOp = N->getOperand(4);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ EVT InVT = InOp.getValueType();
+ EVT InEltVT = InVT.getVectorElementType();
+ EVT InWidenVT = EVT::getVectorVT(*DAG.getContext(), InEltVT, WidenNumElts);
+
+ SDValue DTyOp = DAG.getValueType(WidenVT);
+ SDValue STyOp = DAG.getValueType(InWidenVT);
+ ISD::CvtCode CvtCode = cast<CvtRndSatSDNode>(N)->getCvtCode();
+
+ unsigned InVTNumElts = InVT.getVectorNumElements();
+ if (getTypeAction(InVT) == TargetLowering::TypeWidenVector) {
+ InOp = GetWidenedVector(InOp);
+ InVT = InOp.getValueType();
+ InVTNumElts = InVT.getVectorNumElements();
+ if (InVTNumElts == WidenNumElts)
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ if (TLI.isTypeLegal(InWidenVT)) {
+ // Because the result and the input are different vector types, widening
+ // the result could create a legal type but widening the input might make
+ // it an illegal type that might lead to repeatedly splitting the input
+ // and then widening it. To avoid this, we widen the input only if
+ // it results in a legal type.
+ if (WidenNumElts % InVTNumElts == 0) {
+ // Widen the input and call convert on the widened input vector.
+ unsigned NumConcat = WidenNumElts/InVTNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ Ops[0] = InOp;
+ SDValue UndefVal = DAG.getUNDEF(InVT);
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = UndefVal;
+
+ InOp = DAG.getNode(ISD::CONCAT_VECTORS, dl, InWidenVT, Ops);
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ if (InVTNumElts % WidenNumElts == 0) {
+      // Extract the input and convert the shortened input vector.
+ InOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+ }
+
+ // Otherwise unroll into some nasty scalar code and rebuild the vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ EVT EltVT = WidenVT.getVectorElementType();
+ DTyOp = DAG.getValueType(EltVT);
+ STyOp = DAG.getValueType(InEltVT);
+
+ unsigned MinElts = std::min(InVTNumElts, WidenNumElts);
+ unsigned i;
+ for (i=0; i < MinElts; ++i) {
+ SDValue ExtVal = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp,
+ SatOp, CvtCode);
+ }
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SDValue InOp = N->getOperand(0);
+ SDValue Idx = N->getOperand(1);
+ SDLoc dl(N);
+
+ if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+ InOp = GetWidenedVector(InOp);
+
+ EVT InVT = InOp.getValueType();
+
+ // Check if we can just return the input vector after widening.
+ uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if (IdxVal == 0 && InVT == WidenVT)
+ return InOp;
+
+ // Check if we can extract from the vector.
+ unsigned InNumElts = InVT.getVectorNumElements();
+ if (IdxVal % WidenNumElts == 0 && IdxVal + WidenNumElts < InNumElts)
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, WidenVT, InOp, Idx);
+
+ // We could try widening the input to the right length but for now, extract
+ // the original elements, fill the rest with undefs and build a vector.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned i;
+ for (i=0; i < NumElts; ++i)
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(IdxVal + i, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i < WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_INSERT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(N),
+ InOp.getValueType(), InOp,
+ N->getOperand(1), N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
+ LoadSDNode *LD = cast<LoadSDNode>(N);
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ SDValue Result;
+ SmallVector<SDValue, 16> LdChain; // Chain for the series of load
+ if (ExtType != ISD::NON_EXTLOAD)
+ Result = GenWidenVectorExtLoads(LdChain, LD, ExtType);
+ else
+ Result = GenWidenVectorLoads(LdChain, LD);
+
+ // If we generate a single load, we can use that for the chain. Otherwise,
+ // build a factor node to remember the multiple loads are independent and
+ // chain to that.
+ SDValue NewChain;
+ if (LdChain.size() == 1)
+ NewChain = LdChain[0];
+ else
+ NewChain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other, LdChain);
+
+ // Modified the chain - switch anything that used the old chain to use
+ // the new one.
+ ReplaceValueWith(SDValue(N, 1), NewChain);
+
+ return Result;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MLOAD(MaskedLoadSDNode *N) {
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),N->getValueType(0));
+ SDValue Mask = N->getMask();
+ EVT MaskVT = Mask.getValueType();
+ SDValue Src0 = GetWidenedVector(N->getSrc0());
+ ISD::LoadExtType ExtType = N->getExtensionType();
+ SDLoc dl(N);
+
+ if (getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+ Mask = GetWidenedVector(Mask);
+ else {
+ EVT BoolVT = getSetCCResultType(WidenVT);
+
+    // We can't use ModifyToType() because the widened mask must be padded
+    // with zeroes, so that the extra lanes are never actually loaded.
+ unsigned WidenNumElts = BoolVT.getVectorNumElements();
+ unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+ }
+
+ SDValue Res = DAG.getMaskedLoad(WidenVT, dl, N->getChain(), N->getBasePtr(),
+ Mask, Src0, N->getMemoryVT(),
+ N->getMemOperand(), ExtType);
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_MGATHER(MaskedGatherSDNode *N) {
+
+ EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Mask = N->getMask();
+ SDValue Src0 = GetWidenedVector(N->getValue());
+ unsigned NumElts = WideVT.getVectorNumElements();
+ SDLoc dl(N);
+
+ // The mask should be widened as well
+ Mask = WidenTargetBoolean(Mask, WideVT, true);
+
+ // Widen the Index operand
+ SDValue Index = N->getIndex();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ Index.getValueType().getScalarType(),
+ NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(WideVT, MVT::Other),
+ N->getMemoryVT(), dl, Ops,
+ N->getMemOperand());
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(N),
+ WidenVT, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue Cond1 = N->getOperand(0);
+ EVT CondVT = Cond1.getValueType();
+ if (CondVT.isVector()) {
+ EVT CondEltVT = CondVT.getVectorElementType();
+ EVT CondWidenVT = EVT::getVectorVT(*DAG.getContext(),
+ CondEltVT, WidenNumElts);
+ if (getTypeAction(CondVT) == TargetLowering::TypeWidenVector)
+ Cond1 = GetWidenedVector(Cond1);
+
+ // If we have to split the condition there is no point in widening the
+    // select. This would result in a cycle of widening the select ->
+ // widening the condition operand -> splitting the condition operand ->
+ // splitting the select -> widening the select. Instead split this select
+ // further and widen the resulting type.
+ if (getTypeAction(CondVT) == TargetLowering::TypeSplitVector) {
+ SDValue SplitSelect = SplitVecOp_VSELECT(N, 0);
+ SDValue Res = ModifyToType(SplitSelect, WidenVT);
+ return Res;
+ }
+
+ if (Cond1.getValueType() != CondWidenVT)
+ Cond1 = ModifyToType(Cond1, CondWidenVT);
+ }
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(2));
+ assert(InOp1.getValueType() == WidenVT && InOp2.getValueType() == WidenVT);
+ return DAG.getNode(N->getOpcode(), SDLoc(N),
+ WidenVT, Cond1, InOp1, InOp2);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SELECT_CC(SDNode *N) {
+ SDValue InOp1 = GetWidenedVector(N->getOperand(2));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N),
+ InOp1.getValueType(), N->getOperand(0),
+ N->getOperand(1), InOp1, InOp2, N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_SETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() ==
+ N->getOperand(0).getValueType().isVector() &&
+ "Scalar/Vector type mismatch");
+ if (N->getValueType(0).isVector()) return WidenVecRes_VSETCC(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+ return DAG.getNode(ISD::SETCC, SDLoc(N), WidenVT,
+ InOp1, InOp2, N->getOperand(2));
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_UNDEF(SDNode *N) {
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getUNDEF(WidenVT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue InOp1 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+ // Adjust mask based on new input vector length.
+ SmallVector<int, 16> NewMask;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int Idx = N->getMaskElt(i);
+ if (Idx < (int)NumElts)
+ NewMask.push_back(Idx);
+ else
+ NewMask.push_back(Idx - NumElts + WidenNumElts);
+ }
+ for (unsigned i = NumElts; i != WidenNumElts; ++i)
+ NewMask.push_back(-1);
+ return DAG.getVectorShuffle(WidenVT, dl, InOp1, InOp2, NewMask);
+}
+
+SDValue DAGTypeLegalizer::WidenVecRes_VSETCC(SDNode *N) {
+ assert(N->getValueType(0).isVector() &&
+ N->getOperand(0).getValueType().isVector() &&
+ "Operands must be vectors");
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+
+ SDValue InOp1 = N->getOperand(0);
+ EVT InVT = InOp1.getValueType();
+ assert(InVT.isVector() && "can not widen non-vector type");
+ EVT WidenInVT = EVT::getVectorVT(*DAG.getContext(),
+ InVT.getVectorElementType(), WidenNumElts);
+
+ // The input and output types often differ here, and it could be that while
+ // we'd prefer to widen the result type, the input operands have been split.
+ // In this case, we also need to split the result of this node as well.
+ if (getTypeAction(InVT) == TargetLowering::TypeSplitVector) {
+ SDValue SplitVSetCC = SplitVecOp_VSETCC(N);
+ SDValue Res = ModifyToType(SplitVSetCC, WidenVT);
+ return Res;
+ }
+
+ InOp1 = GetWidenedVector(InOp1);
+ SDValue InOp2 = GetWidenedVector(N->getOperand(1));
+
+  // Assume that the input and output will be widened appropriately.  If not,
+ // we will have to unroll it at some point.
+ assert(InOp1.getValueType() == WidenInVT &&
+ InOp2.getValueType() == WidenInVT &&
+ "Input not widened to expected type!");
+ (void)WidenInVT;
+ return DAG.getNode(ISD::SETCC, SDLoc(N),
+ WidenVT, InOp1, InOp2, N->getOperand(2));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Widen Vector Operand
+//===----------------------------------------------------------------------===//
+bool DAGTypeLegalizer::WidenVectorOperand(SDNode *N, unsigned OpNo) {
+ DEBUG(dbgs() << "Widen node operand " << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ // See if the target wants to custom widen this node.
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false))
+ return false;
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "WidenVectorOperand op #" << OpNo << ": ";
+ N->dump(&DAG);
+ dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to widen this operator's operand!");
+
+ case ISD::BITCAST: Res = WidenVecOp_BITCAST(N); break;
+ case ISD::CONCAT_VECTORS: Res = WidenVecOp_CONCAT_VECTORS(N); break;
+ case ISD::EXTRACT_SUBVECTOR: Res = WidenVecOp_EXTRACT_SUBVECTOR(N); break;
+ case ISD::EXTRACT_VECTOR_ELT: Res = WidenVecOp_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::STORE: Res = WidenVecOp_STORE(N); break;
+ case ISD::MSTORE: Res = WidenVecOp_MSTORE(N, OpNo); break;
+ case ISD::MSCATTER: Res = WidenVecOp_MSCATTER(N, OpNo); break;
+ case ISD::SETCC: Res = WidenVecOp_SETCC(N); break;
+ case ISD::FCOPYSIGN: Res = WidenVecOp_FCOPYSIGN(N); break;
+
+ case ISD::ANY_EXTEND:
+ case ISD::SIGN_EXTEND:
+ case ISD::ZERO_EXTEND:
+ Res = WidenVecOp_EXTEND(N);
+ break;
+
+ case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::TRUNCATE:
+ Res = WidenVecOp_Convert(N);
+ break;
+ }
+
+ // If Res is null, the sub-method took care of registering the result.
+ if (!Res.getNode()) return false;
+
+ // If the result is N, the sub-method updated N in place. Tell the legalizer
+ // core about this.
+ if (Res.getNode() == N)
+ return true;
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue InOp = N->getOperand(0);
+ // If some legalization strategy other than widening is used on the operand,
+ // we can't safely assume that just extending the low lanes is the correct
+ // transformation.
+ if (getTypeAction(InOp.getValueType()) != TargetLowering::TypeWidenVector)
+ return WidenVecOp_Convert(N);
+ InOp = GetWidenedVector(InOp);
+ assert(VT.getVectorNumElements() <
+ InOp.getValueType().getVectorNumElements() &&
+ "Input wasn't widened!");
+
+ // We may need to further widen the operand until it has the same total
+ // vector size as the result.
+ EVT InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits()) {
+ EVT InEltVT = InVT.getVectorElementType();
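+    // Look for a legal vector type with the same element type and the same
+    // total width as the result; e.g. if the operand was widened to v8i8 but
+    // the result is v4i32, the search finds v16i8 and the v8i8 value is
+    // inserted into its low half.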
+    for (int i = MVT::FIRST_VECTOR_VALUETYPE, e = MVT::LAST_VECTOR_VALUETYPE;
+         i < e; ++i) {
+ EVT FixedVT = (MVT::SimpleValueType)i;
+ EVT FixedEltVT = FixedVT.getVectorElementType();
+ if (TLI.isTypeLegal(FixedVT) &&
+ FixedVT.getSizeInBits() == VT.getSizeInBits() &&
+ FixedEltVT == InEltVT) {
+ assert(FixedVT.getVectorNumElements() >= VT.getVectorNumElements() &&
+ "Not enough elements in the fixed type for the operand!");
+ assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
+ "We can't have the same type as we started with!");
+ if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
+ InOp = DAG.getNode(
+ ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ else
+ InOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ break;
+ }
+ }
+ InVT = InOp.getValueType();
+ if (InVT.getSizeInBits() != VT.getSizeInBits())
+ // We couldn't find a legal vector type that was a widening of the input
+ // and could be extended in-register to the result type, so we have to
+ // scalarize.
+ return WidenVecOp_Convert(N);
+ }
+
+ // Use special DAG nodes to represent the operation of extending the
+ // low lanes.
+ switch (N->getOpcode()) {
+ default:
+    llvm_unreachable("Extend legalization on an extend operation!");
+ case ISD::ANY_EXTEND:
+ return DAG.getAnyExtendVectorInReg(InOp, DL, VT);
+ case ISD::SIGN_EXTEND:
+ return DAG.getSignExtendVectorInReg(InOp, DL, VT);
+ case ISD::ZERO_EXTEND:
+ return DAG.getZeroExtendVectorInReg(InOp, DL, VT);
+ }
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_FCOPYSIGN(SDNode *N) {
+ // The result (and first input) is legal, but the second input is illegal.
+ // We can't do much to fix that, so just unroll and let the extracts off of
+ // the second input be widened as needed later.
+ return DAG.UnrollVectorOp(N);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
+ // Since the result is legal and the input is illegal, it is unlikely that we
+ // can fix the input to a legal type so unroll the convert into some scalar
+ // code and create a nasty build vector.
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue InOp = N->getOperand(0);
+ if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+ InOp = GetWidenedVector(InOp);
+ EVT InVT = InOp.getValueType();
+ EVT InEltVT = InVT.getVectorElementType();
+
+ unsigned Opcode = N->getOpcode();
+ SmallVector<SDValue, 16> Ops(NumElts);
+ for (unsigned i=0; i < NumElts; ++i)
+ Ops[i] = DAG.getNode(
+ Opcode, dl, EltVT,
+ DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
+ EVT VT = N->getValueType(0);
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ EVT InWidenVT = InOp.getValueType();
+ SDLoc dl(N);
+
+ // Check if we can convert between two legal vector types and extract.
+ unsigned InWidenSize = InWidenVT.getSizeInBits();
+ unsigned Size = VT.getSizeInBits();
+ // x86mmx is not an acceptable vector element type, so don't try.
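+  // E.g. for a v2i32 operand widened to v4i32 and bitcast to i64, the v4i32
+  // is bitcast to v2i64 and element 0 is extracted.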
+ if (InWidenSize % Size == 0 && !VT.isVector() && VT != MVT::x86mmx) {
+ unsigned NewNumElts = InWidenSize / Size;
+ EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
+ if (TLI.isTypeLegal(NewVT)) {
+ SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
+ return DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ }
+
+ return CreateStackStoreLoad(InOp, VT);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
+ // If the input vector is not legal, it is likely that we will not find a
+ // legal vector of the same size. Replace the concatenate vector with a
+ // nasty build vector.
+ EVT VT = N->getValueType(0);
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+ unsigned NumElts = VT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(NumElts);
+
+ EVT InVT = N->getOperand(0).getValueType();
+ unsigned NumInElts = InVT.getVectorNumElements();
+
+ unsigned Idx = 0;
+ unsigned NumOperands = N->getNumOperands();
+ for (unsigned i=0; i < NumOperands; ++i) {
+ SDValue InOp = N->getOperand(i);
+ if (getTypeAction(InOp.getValueType()) == TargetLowering::TypeWidenVector)
+ InOp = GetWidenedVector(InOp);
+ for (unsigned j=0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N),
+ N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue InOp = GetWidenedVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ N->getValueType(0), InOp, N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_STORE(SDNode *N) {
+  // We have to widen the value, but we only want to store the original
+  // vector type.
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+
+ SmallVector<SDValue, 16> StChain;
+ if (ST->isTruncatingStore())
+ GenWidenVectorTruncStores(StChain, ST);
+ else
+ GenWidenVectorStores(StChain, ST);
+
+ if (StChain.size() == 1)
+ return StChain[0];
+ else
+ return DAG.getNode(ISD::TokenFactor, SDLoc(ST), MVT::Other, StChain);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MSTORE(SDNode *N, unsigned OpNo) {
+ MaskedStoreSDNode *MST = cast<MaskedStoreSDNode>(N);
+ SDValue Mask = MST->getMask();
+ EVT MaskVT = Mask.getValueType();
+ SDValue StVal = MST->getValue();
+ // Widen the value
+ SDValue WideVal = GetWidenedVector(StVal);
+ SDLoc dl(N);
+
+ if (OpNo == 2 || getTypeAction(MaskVT) == TargetLowering::TypeWidenVector)
+ Mask = GetWidenedVector(Mask);
+ else {
+ // The mask should be widened as well.
+ EVT BoolVT = getSetCCResultType(WideVal.getValueType());
+    // We can't use ModifyToType() because the widened mask must be padded
+    // with zeroes, so that the extra lanes are never actually stored.
+ unsigned WidenNumElts = BoolVT.getVectorNumElements();
+ unsigned MaskNumElts = MaskVT.getVectorNumElements();
+
+ unsigned NumConcat = WidenNumElts / MaskNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue ZeroVal = DAG.getConstant(0, dl, MaskVT);
+ Ops[0] = Mask;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = ZeroVal;
+
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, BoolVT, Ops);
+ }
+ assert(Mask.getValueType().getVectorNumElements() ==
+ WideVal.getValueType().getVectorNumElements() &&
+ "Mask and data vectors should have the same number of elements");
+ return DAG.getMaskedStore(MST->getChain(), dl, WideVal, MST->getBasePtr(),
+ Mask, MST->getMemoryVT(), MST->getMemOperand(),
+ false);
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_MSCATTER(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Can widen only data operand of mscatter");
+ MaskedScatterSDNode *MSC = cast<MaskedScatterSDNode>(N);
+ SDValue DataOp = MSC->getValue();
+ SDValue Mask = MSC->getMask();
+
+ // Widen the value.
+ SDValue WideVal = GetWidenedVector(DataOp);
+ EVT WideVT = WideVal.getValueType();
+ unsigned NumElts = WideVal.getValueType().getVectorNumElements();
+ SDLoc dl(N);
+
+ // The mask should be widened as well.
+ Mask = WidenTargetBoolean(Mask, WideVT, true);
+
+ // Widen index.
+ SDValue Index = MSC->getIndex();
+ EVT WideIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ Index.getValueType().getScalarType(),
+ NumElts);
+ Index = ModifyToType(Index, WideIndexVT);
+
+ SDValue Ops[] = {MSC->getChain(), WideVal, Mask, MSC->getBasePtr(), Index};
+ return DAG.getMaskedScatter(DAG.getVTList(MVT::Other),
+ MSC->getMemoryVT(), dl, Ops,
+ MSC->getMemOperand());
+}
+
+SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
+ SDValue InOp0 = GetWidenedVector(N->getOperand(0));
+ SDValue InOp1 = GetWidenedVector(N->getOperand(1));
+ SDLoc dl(N);
+
+ // WARNING: In this code we widen the compare instruction with garbage.
+ // This garbage may contain denormal floats which may be slow. Is this a
+ // real concern? Should we zero the unused lanes if this is a float compare?
+
+ // Get a new SETCC node to compare the newly widened operands.
+ // Only some of the compared elements are legal.
+ EVT SVT = TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ InOp0.getValueType());
+ SDValue WideSETCC = DAG.getNode(ISD::SETCC, SDLoc(N),
+ SVT, InOp0, InOp1, N->getOperand(2));
+
+ // Extract the needed results from the result vector.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
+ SVT.getVectorElementType(),
+ N->getValueType(0).getVectorNumElements());
+ SDValue CC = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ return PromoteTargetBoolean(CC, N->getValueType(0));
+}
+
+
+//===----------------------------------------------------------------------===//
+// Vector Widening Utilities
+//===----------------------------------------------------------------------===//
+
+ // Utility function to find the type to chop up a widened vector for
+ // load/store.
+ // TLI: Target lowering used to determine legal types.
+ // Width: Width left to load/store.
+ // WidenVT: The widened vector type to load to/store from.
+ // Align: If 0, don't allow use of a wider type.
+ // WidenEx: If Align is not 0, the additional amount we may load/store
+ // beyond Width.
+
+static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
+ unsigned Width, EVT WidenVT,
+ unsigned Align = 0, unsigned WidenEx = 0) {
+ EVT WidenEltVT = WidenVT.getVectorElementType();
+ unsigned WidenWidth = WidenVT.getSizeInBits();
+ unsigned WidenEltWidth = WidenEltVT.getSizeInBits();
+ unsigned AlignInBits = Align*8;
+
+ // If we have one element to load/store, return it.
+ EVT RetVT = WidenEltVT;
+ if (Width == WidenEltWidth)
+ return RetVT;
+
+ // See if there is a larger legal integer type than the element type to
+ // load/store.
+ unsigned VT;
+ for (VT = (unsigned)MVT::LAST_INTEGER_VALUETYPE;
+ VT >= (unsigned)MVT::FIRST_INTEGER_VALUETYPE; --VT) {
+ EVT MemVT((MVT::SimpleValueType) VT);
+ unsigned MemVTWidth = MemVT.getSizeInBits();
+ if (MemVT.getSizeInBits() <= WidenEltWidth)
+ break;
+ auto Action = TLI.getTypeAction(*DAG.getContext(), MemVT);
+ if ((Action == TargetLowering::TypeLegal ||
+ Action == TargetLowering::TypePromoteInteger) &&
+ (WidenWidth % MemVTWidth) == 0 &&
+ isPowerOf2_32(WidenWidth / MemVTWidth) &&
+ (MemVTWidth <= Width ||
+ (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+ RetVT = MemVT;
+ break;
+ }
+ }
+
+ // See if there is a larger vector type to load/store that has the same
+ // vector element type and whose size evenly divides the widened vector's
+ // size.
+ for (VT = (unsigned)MVT::LAST_VECTOR_VALUETYPE;
+ VT >= (unsigned)MVT::FIRST_VECTOR_VALUETYPE; --VT) {
+ EVT MemVT = (MVT::SimpleValueType) VT;
+ unsigned MemVTWidth = MemVT.getSizeInBits();
+ if (TLI.isTypeLegal(MemVT) && WidenEltVT == MemVT.getVectorElementType() &&
+ (WidenWidth % MemVTWidth) == 0 &&
+ isPowerOf2_32(WidenWidth / MemVTWidth) &&
+ (MemVTWidth <= Width ||
+ (Align!=0 && MemVTWidth<=AlignInBits && MemVTWidth<=Width+WidenEx))) {
+ if (RetVT.getSizeInBits() < MemVTWidth || MemVT == WidenVT)
+ return MemVT;
+ }
+ }
+
+ return RetVT;
+}
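
FindMemType is the piece that GenWidenVectorLoads and GenWidenVectorStores call repeatedly to peel off the largest chunk they can still access legally. A minimal standalone sketch of that greedy chopping, assuming a fixed list of candidate widths in place of the target's legal-type query (names and the candidate list are illustrative, not LLVM API):

    #include <cassert>
    #include <vector>

    // Illustrative sketch only: split WidthInBits into chunk sizes, always
    // taking the largest candidate that still fits, the way the widening
    // load/store loops repeatedly call FindMemType on the remaining width.
    std::vector<unsigned> chopWidth(unsigned WidthInBits) {
      assert(WidthInBits % 8 == 0 && "expect byte-sized widths");
      static const unsigned Candidates[] = {256, 128, 64, 32, 16, 8};
      std::vector<unsigned> Chunks;
      while (WidthInBits > 0) {
        for (unsigned C : Candidates) {
          if (C <= WidthInBits) {
            Chunks.push_back(C);
            WidthInBits -= C;
            break;
          }
        }
      }
      return Chunks; // e.g. a 96-bit vector becomes a 64-bit and a 32-bit piece
    }
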
+
+ // Builds a vector type from scalar loads.
+ // VecTy: Resulting vector type.
+ // LdOps: Load operations used to build the vector.
+ // [Start,End): the range of loads to use.
+static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
+ SmallVectorImpl<SDValue> &LdOps,
+ unsigned Start, unsigned End) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc dl(LdOps[Start]);
+ EVT LdTy = LdOps[Start].getValueType();
+ unsigned Width = VecTy.getSizeInBits();
+ unsigned NumElts = Width / LdTy.getSizeInBits();
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), LdTy, NumElts);
+
+ unsigned Idx = 1;
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT,LdOps[Start]);
+
+ for (unsigned i = Start + 1; i != End; ++i) {
+ EVT NewLdTy = LdOps[i].getValueType();
+ if (NewLdTy != LdTy) {
+ NumElts = Width / NewLdTy.getSizeInBits();
+ NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewLdTy, NumElts);
+ VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, VecOp);
+ // Readjust position and vector position based on new load type.
+ Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
+ LdTy = NewLdTy;
+ }
+ VecOp = DAG.getNode(
+ ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+ DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
+}
+
+SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD) {
+ // The strategy assumes that we can efficiently load power-of-two widths.
+ // The routine chops the vector into the largest vector loads with the same
+ // element type, or scalar loads, and then recombines the pieces into the
+ // widened vector type.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+ unsigned WidenWidth = WidenVT.getSizeInBits();
+ EVT LdVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ assert(LdVT.isVector() && WidenVT.isVector());
+ assert(LdVT.getVectorElementType() == WidenVT.getVectorElementType());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Align = LD->getAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ int LdWidth = LdVT.getSizeInBits();
+ int WidthDiff = WidenWidth - LdWidth;
+ unsigned LdAlign = LD->isVolatile() ? 0 : Align; // Allow wider loads.
+
+ // Find the vector type that we can load from.
+ EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+ int NewVTWidth = NewVT.getSizeInBits();
+ SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
+ Align, MMOFlags, AAInfo);
+ LdChain.push_back(LdOp.getValue(1));
+
+ // Check if we can load the element with one instruction.
+ if (LdWidth <= NewVTWidth) {
+ if (!NewVT.isVector()) {
+ unsigned NumElts = WidenWidth / NewVTWidth;
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+ SDValue VecOp = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, NewVecVT, LdOp);
+ return DAG.getNode(ISD::BITCAST, dl, WidenVT, VecOp);
+ }
+ if (NewVT == WidenVT)
+ return LdOp;
+
+ assert(WidenWidth % NewVTWidth == 0);
+ unsigned NumConcat = WidenWidth / NewVTWidth;
+ SmallVector<SDValue, 16> ConcatOps(NumConcat);
+ SDValue UndefVal = DAG.getUNDEF(NewVT);
+ ConcatOps[0] = LdOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ ConcatOps[i] = UndefVal;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, ConcatOps);
+ }
+
+ // Load the vector using multiple loads, from the largest vector type down
+ // to scalars.
+ SmallVector<SDValue, 16> LdOps;
+ LdOps.push_back(LdOp);
+
+ LdWidth -= NewVTWidth;
+ unsigned Offset = 0;
+
+ while (LdWidth > 0) {
+ unsigned Increment = NewVTWidth / 8;
+ Offset += Increment;
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Increment, dl, BasePtr.getValueType()));
+
+ SDValue L;
+ if (LdWidth < NewVTWidth) {
+ // The current type we are using is too large. Find a better size.
+ NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
+ NewVTWidth = NewVT.getSizeInBits();
+ L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Increment), MMOFlags, AAInfo);
+ LdChain.push_back(L.getValue(1));
+ if (L->getValueType(0).isVector()) {
+ SmallVector<SDValue, 16> Loads;
+ Loads.push_back(L);
+ unsigned size = L->getValueSizeInBits(0);
+ while (size < LdOp->getValueSizeInBits(0)) {
+ Loads.push_back(DAG.getUNDEF(L->getValueType(0)));
+ size += L->getValueSizeInBits(0);
+ }
+ L = DAG.getNode(ISD::CONCAT_VECTORS, dl, LdOp->getValueType(0), Loads);
+ }
+ } else {
+ L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
+ LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Increment), MMOFlags, AAInfo);
+ LdChain.push_back(L.getValue(1));
+ }
+
+ LdOps.push_back(L);
+
+ LdWidth -= NewVTWidth;
+ }
+
+ // Build the vector from the load operations.
+ unsigned End = LdOps.size();
+ if (!LdOps[0].getValueType().isVector())
+ // All the loads are scalar loads.
+ return BuildVectorFromScalar(DAG, WidenVT, LdOps, 0, End);
+
+ // If the loads contain vectors, build the result with CONCAT_VECTORS.
+ // All of the vector loads are power-of-two sized, and the scalar loads can
+ // be combined to make a power-of-two vector.
+ SmallVector<SDValue, 16> ConcatOps(End);
+ int i = End - 1;
+ int Idx = End;
+ EVT LdTy = LdOps[i].getValueType();
+ // First, combine the scalar loads to a vector.
+ if (!LdTy.isVector()) {
+ for (--i; i >= 0; --i) {
+ LdTy = LdOps[i].getValueType();
+ if (LdTy.isVector())
+ break;
+ }
+ ConcatOps[--Idx] = BuildVectorFromScalar(DAG, LdTy, LdOps, i + 1, End);
+ }
+ ConcatOps[--Idx] = LdOps[i];
+ for (--i; i >= 0; --i) {
+ EVT NewLdTy = LdOps[i].getValueType();
+ if (NewLdTy != LdTy) {
+ // Create a larger vector.
+ ConcatOps[End-1] = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewLdTy,
+ makeArrayRef(&ConcatOps[Idx], End - Idx));
+ Idx = End - 1;
+ LdTy = NewLdTy;
+ }
+ ConcatOps[--Idx] = LdOps[i];
+ }
+
+ if (WidenWidth == LdTy.getSizeInBits() * (End - Idx))
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT,
+ makeArrayRef(&ConcatOps[Idx], End - Idx));
+
+ // We need to fill the rest with undefs to build the vector.
+ unsigned NumOps = WidenWidth / LdTy.getSizeInBits();
+ SmallVector<SDValue, 16> WidenOps(NumOps);
+ SDValue UndefVal = DAG.getUNDEF(LdTy);
+ {
+ unsigned i = 0;
+ for (; i != End-Idx; ++i)
+ WidenOps[i] = ConcatOps[Idx+i];
+ for (; i != NumOps; ++i)
+ WidenOps[i] = UndefVal;
+ }
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, WidenVT, WidenOps);
+}
+
+SDValue
+DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
+ LoadSDNode *LD,
+ ISD::LoadExtType ExtType) {
+ // For extension loads, it may not be more efficient to chop up the vector
+ // and then extend it. Instead, we unroll the load and build a new vector.
+ EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(),LD->getValueType(0));
+ EVT LdVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ assert(LdVT.isVector() && WidenVT.isVector());
+
+ // Load information
+ SDValue Chain = LD->getChain();
+ SDValue BasePtr = LD->getBasePtr();
+ unsigned Align = LD->getAlignment();
+ MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = LD->getAAInfo();
+
+ EVT EltVT = WidenVT.getVectorElementType();
+ EVT LdEltVT = LdVT.getVectorElementType();
+ unsigned NumElts = LdVT.getVectorNumElements();
+
+ // Load each element and widen.
+ unsigned WidenNumElts = WidenVT.getVectorNumElements();
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ unsigned Increment = LdEltVT.getSizeInBits() / 8;
+ Ops[0] =
+ DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
+ LdEltVT, Align, MMOFlags, AAInfo);
+ LdChain.push_back(Ops[0].getValue(1));
+ unsigned i = 0, Offset = Increment;
+ for (i=1; i < NumElts; ++i, Offset += Increment) {
+ SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr,
+ DAG.getConstant(Offset, dl,
+ BasePtr.getValueType()));
+ Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
+ LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
+ Align, MMOFlags, AAInfo);
+ LdChain.push_back(Ops[i].getValue(1));
+ }
+
+ // Fill the rest with undefs.
+ SDValue UndefVal = DAG.getUNDEF(EltVT);
+ for (; i != WidenNumElts; ++i)
+ Ops[i] = UndefVal;
+
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, WidenVT, Ops);
+}
+
+
+void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
+ StoreSDNode *ST) {
+ // The strategy assumes that we can efficiently store power-of-two widths.
+ // The routine chops the vector into the largest vector stores with the same
+ // element type or scalar stores.
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ unsigned Align = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+ SDValue ValOp = GetWidenedVector(ST->getValue());
+ SDLoc dl(ST);
+
+ EVT StVT = ST->getMemoryVT();
+ unsigned StWidth = StVT.getSizeInBits();
+ EVT ValVT = ValOp.getValueType();
+ unsigned ValWidth = ValVT.getSizeInBits();
+ EVT ValEltVT = ValVT.getVectorElementType();
+ unsigned ValEltWidth = ValEltVT.getSizeInBits();
+ assert(StVT.getVectorElementType() == ValEltVT);
+
+ int Idx = 0; // current index to store
+ unsigned Offset = 0; // offset from base to store
+ while (StWidth != 0) {
+ // Find the largest vector type we can store with.
+ EVT NewVT = FindMemType(DAG, TLI, StWidth, ValVT);
+ unsigned NewVTWidth = NewVT.getSizeInBits();
+ unsigned Increment = NewVTWidth / 8;
+ if (NewVT.isVector()) {
+ unsigned NumVTElts = NewVT.getVectorNumElements();
+ do {
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ StChain.push_back(DAG.getStore(
+ Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Offset), MMOFlags, AAInfo));
+ StWidth -= NewVTWidth;
+ Offset += Increment;
+ Idx += NumVTElts;
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Increment, dl,
+ BasePtr.getValueType()));
+ } while (StWidth != 0 && StWidth >= NewVTWidth);
+ } else {
+ // Cast the vector to the scalar type we can store.
+ unsigned NumElts = ValWidth / NewVTWidth;
+ EVT NewVecVT = EVT::getVectorVT(*DAG.getContext(), NewVT, NumElts);
+ SDValue VecOp = DAG.getNode(ISD::BITCAST, dl, NewVecVT, ValOp);
+ // Readjust index position based on new vector type.
+ Idx = Idx * ValEltWidth / NewVTWidth;
+ do {
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+ DAG.getConstant(Idx++, dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ StChain.push_back(DAG.getStore(
+ Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(Align, Offset), MMOFlags, AAInfo));
+ StWidth -= NewVTWidth;
+ Offset += Increment;
+ BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
+ DAG.getConstant(Increment, dl,
+ BasePtr.getValueType()));
+ } while (StWidth != 0 && StWidth >= NewVTWidth);
+ // Restore index back to be relative to the original widen element type.
+ Idx = Idx * NewVTWidth / ValEltWidth;
+ }
+ }
+}
+
+void
+DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
+ StoreSDNode *ST) {
+ // For truncating stores, it may not be more efficient to truncate the wide
+ // vector and then store it. Instead, we extract each element and then store
+ // it.
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ unsigned Align = ST->getAlignment();
+ MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
+ AAMDNodes AAInfo = ST->getAAInfo();
+ SDValue ValOp = GetWidenedVector(ST->getValue());
+ SDLoc dl(ST);
+
+ EVT StVT = ST->getMemoryVT();
+ EVT ValVT = ValOp.getValueType();
+
+ // It must be the case that the widened vector type is bigger than the type
+ // we need to store.
+ assert(StVT.isVector() && ValOp.getValueType().isVector());
+ assert(StVT.bitsLT(ValOp.getValueType()));
+
+ // For truncating stores, we cannot play the trick of chopping the vector
+ // into legal types and bitcasting to the right type. Instead, we unroll
+ // the store.
+ EVT StEltVT = StVT.getVectorElementType();
+ EVT ValEltVT = ValVT.getVectorElementType();
+ unsigned Increment = ValEltVT.getSizeInBits() / 8;
+ unsigned NumElts = StVT.getVectorNumElements();
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
+ ST->getPointerInfo(), StEltVT, Align,
+ MMOFlags, AAInfo));
+ unsigned Offset = Increment;
+ for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
+ SDValue NewBasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(),
+ BasePtr,
+ DAG.getConstant(Offset, dl,
+ BasePtr.getValueType()));
+ SDValue EOp = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ StChain.push_back(DAG.getTruncStore(
+ Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
+ StEltVT, MinAlign(Align, Offset), MMOFlags, AAInfo));
+ }
+}
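
Both GenWidenVectorExtLoads and GenWidenVectorTruncStores unroll to one memory access per element, stepping the base pointer by the in-memory element size each iteration. The offset arithmetic they share reduces to the following standalone sketch (the function name is illustrative):

    #include <cstdint>
    #include <vector>

    // Illustrative sketch only: the byte offsets touched when a vector of
    // NumElts elements, each MemEltBits wide in memory, is loaded or stored
    // one element at a time.
    std::vector<uint64_t> elementOffsets(unsigned NumElts, unsigned MemEltBits) {
      const uint64_t Increment = MemEltBits / 8; // bytes per element in memory
      std::vector<uint64_t> Offsets(NumElts);
      for (unsigned i = 0; i != NumElts; ++i)
        Offsets[i] = i * Increment;              // element i starts here
      return Offsets;
    }
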
+
+/// Modifies a vector input (widens or narrows it) to a vector of type NVT.
+/// The input vector must have the same element type as NVT.
+/// FillWithZeroes specifies that the vector should be widened with zeroes.
+SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
+ bool FillWithZeroes) {
+ // Note that InOp might have been widened, so it might already have the
+ // right width or it might need to be narrowed.
+ EVT InVT = InOp.getValueType();
+ assert(InVT.getVectorElementType() == NVT.getVectorElementType() &&
+ "input and widen element type must match");
+ SDLoc dl(InOp);
+
+ // Check if InOp already has the right width.
+ if (InVT == NVT)
+ return InOp;
+
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned WidenNumElts = NVT.getVectorNumElements();
+ if (WidenNumElts > InNumElts && WidenNumElts % InNumElts == 0) {
+ unsigned NumConcat = WidenNumElts / InNumElts;
+ SmallVector<SDValue, 16> Ops(NumConcat);
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, InVT) :
+ DAG.getUNDEF(InVT);
+ Ops[0] = InOp;
+ for (unsigned i = 1; i != NumConcat; ++i)
+ Ops[i] = FillVal;
+
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, Ops);
+ }
+
+ if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+ DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ // Fall back to extract and build.
+ SmallVector<SDValue, 16> Ops(WidenNumElts);
+ EVT EltVT = NVT.getVectorElementType();
+ unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
+ unsigned Idx;
+ for (Idx = 0; Idx < MinNumElts; ++Idx)
+ Ops[Idx] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
+ DAG.getUNDEF(EltVT);
+ for ( ; Idx < WidenNumElts; ++Idx)
+ Ops[Idx] = FillVal;
+ return DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, Ops);
+}
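
ModifyToType's contract is easiest to see stripped of the DAG machinery: widening appends filler lanes (zero or undef) and narrowing keeps the leading lanes, with the element type unchanged. A plain-C++ restatement under those assumptions (the int elements and the function name are stand-ins):

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Illustrative sketch only: resize a vector of lanes the way ModifyToType
    // does. Extra lanes get FillVal (zero here, standing in for zero/undef);
    // narrowing keeps only the leading lanes.
    std::vector<int> modifyToSize(const std::vector<int> &In, unsigned NumElts,
                                  int FillVal = 0) {
      std::vector<int> Out(NumElts, FillVal);
      std::size_t N = std::min<std::size_t>(NumElts, In.size());
      std::copy(In.begin(), In.begin() + N, Out.begin());
      return Out;
    }
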
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
new file mode 100644
index 000000000000..1e5c4a73693f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -0,0 +1,639 @@
+//===- ResourcePriorityQueue.cpp - A DFA-oriented priority queue -*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ResourcePriorityQueue class, which is a
+// SchedulingPriorityQueue that prioritizes instructions using DFA state to
+// reduce the length of the critical path through the basic block
+// on VLIW platforms.
+// The scheduler is basically a top-down adaptable list scheduler with DFA
+// resource tracking added to the cost function.
+// DFA is queried as a state machine to model "packets/bundles" during
+// schedule. Currently packets/bundles are discarded at the end of
+// scheduling, affecting only order of instructions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scheduler"
+
+static cl::opt<bool> DisableDFASched("disable-dfa-sched", cl::Hidden,
+ cl::ZeroOrMore, cl::init(false),
+ cl::desc("Disable use of DFA during scheduling"));
+
+static cl::opt<int> RegPressureThreshold(
+ "dfa-sched-reg-pressure-threshold", cl::Hidden, cl::ZeroOrMore, cl::init(5),
+ cl::desc("Track reg pressure and switch priority to in-depth"));
+
+ResourcePriorityQueue::ResourcePriorityQueue(SelectionDAGISel *IS)
+ : Picker(this), InstrItins(IS->MF->getSubtarget().getInstrItineraryData()) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ TRI = STI.getRegisterInfo();
+ TLI = IS->TLI;
+ TII = STI.getInstrInfo();
+ ResourcesModel.reset(TII->CreateTargetScheduleState(STI));
+ // This hard requirement could be relaxed, but for now
+ // do not let it proceed.
+ assert(ResourcesModel && "Unimplemented CreateTargetScheduleState.");
+
+ unsigned NumRC = TRI->getNumRegClasses();
+ RegLimit.resize(NumRC);
+ RegPressure.resize(NumRC);
+ std::fill(RegLimit.begin(), RegLimit.end(), 0);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end();
+ I != E; ++I)
+ RegLimit[(*I)->getID()] = TRI->getRegPressureLimit(*I, *IS->MF);
+
+ ParallelLiveRanges = 0;
+ HorizontalVerticalBalance = 0;
+}
+
+unsigned
+ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) {
+ unsigned NumberDeps = 0;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ continue;
+
+ SUnit *PredSU = I->getSUnit();
+ const SDNode *ScegN = PredSU->getNode();
+
+ if (!ScegN)
+ continue;
+
+ // If value is passed to CopyToReg, it is probably
+ // live outside BB.
+ switch (ScegN->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: break;
+ case ISD::CopyFromReg: NumberDeps++; break;
+ case ISD::CopyToReg: break;
+ case ISD::INLINEASM: break;
+ }
+ if (!ScegN->isMachineOpcode())
+ continue;
+
+ for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+ MVT VT = ScegN->getSimpleValueType(i);
+ if (TLI->isTypeLegal(VT)
+ && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+ NumberDeps++;
+ break;
+ }
+ }
+ }
+ return NumberDeps;
+}
+
+unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU,
+ unsigned RCId) {
+ unsigned NumberDeps = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ continue;
+
+ SUnit *SuccSU = I->getSUnit();
+ const SDNode *ScegN = SuccSU->getNode();
+ if (!ScegN)
+ continue;
+
+ // If value is passed to CopyToReg, it is probably
+ // live outside BB.
+ switch (ScegN->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor: break;
+ case ISD::CopyFromReg: break;
+ case ISD::CopyToReg: NumberDeps++; break;
+ case ISD::INLINEASM: break;
+ }
+ if (!ScegN->isMachineOpcode())
+ continue;
+
+ for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = ScegN->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (TLI->isTypeLegal(VT)
+ && (TLI->getRegClassFor(VT)->getID() == RCId)) {
+ NumberDeps++;
+ break;
+ }
+ }
+ }
+ return NumberDeps;
+}
+
+static unsigned numberCtrlDepsInSU(SUnit *SU) {
+ unsigned NumberDeps = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ if (I->isCtrl())
+ NumberDeps++;
+
+ return NumberDeps;
+}
+
+static unsigned numberCtrlPredInSU(SUnit *SU) {
+ unsigned NumberDeps = 0;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (I->isCtrl())
+ NumberDeps++;
+
+ return NumberDeps;
+}
+
+///
+/// Initialize nodes.
+///
+void ResourcePriorityQueue::initNodes(std::vector<SUnit> &sunits) {
+ SUnits = &sunits;
+ NumNodesSolelyBlocking.resize(SUnits->size(), 0);
+
+ for (unsigned i = 0, e = SUnits->size(); i != e; ++i) {
+ SUnit *SU = &(*SUnits)[i];
+ initNumRegDefsLeft(SU);
+ SU->NodeQueueId = 0;
+ }
+}
+
+/// This heuristic is used if DFA scheduling is not desired
+/// for some VLIW platform.
+bool resource_sort::operator()(const SUnit *LHS, const SUnit *RHS) const {
+ // The isScheduleHigh flag allows nodes with wraparound dependencies that
+ // cannot easily be modeled as edges with latencies to be scheduled as
+ // soon as possible in a top-down schedule.
+ if (LHS->isScheduleHigh && !RHS->isScheduleHigh)
+ return false;
+
+ if (!LHS->isScheduleHigh && RHS->isScheduleHigh)
+ return true;
+
+ unsigned LHSNum = LHS->NodeNum;
+ unsigned RHSNum = RHS->NodeNum;
+
+ // The most important heuristic is scheduling the critical path.
+ unsigned LHSLatency = PQ->getLatency(LHSNum);
+ unsigned RHSLatency = PQ->getLatency(RHSNum);
+ if (LHSLatency < RHSLatency) return true;
+ if (LHSLatency > RHSLatency) return false;
+
+ // After that, if two nodes have identical latencies, look to see if one will
+ // unblock more other nodes than the other.
+ unsigned LHSBlocked = PQ->getNumSolelyBlockNodes(LHSNum);
+ unsigned RHSBlocked = PQ->getNumSolelyBlockNodes(RHSNum);
+ if (LHSBlocked < RHSBlocked) return true;
+ if (LHSBlocked > RHSBlocked) return false;
+
+ // Finally, just to provide a stable ordering, use the node number as a
+ // deciding factor.
+ return LHSNum < RHSNum;
+}
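
The comparison chain above is easier to read restated over a plain struct: schedule-high first, then critical-path latency, then the number of nodes a unit solely blocks, and finally the node number as a stable tie-break. A standalone sketch under those assumptions (the field names are stand-ins for the SUnit queries):

    // Illustrative sketch only: the resource_sort tie-breaking chain over a
    // plain struct instead of SUnit.
    struct NodeKey {
      bool ScheduleHigh;
      unsigned Latency;        // critical-path latency
      unsigned SolelyBlocked;  // nodes this one solely blocks
      unsigned NodeNum;
    };

    // Returns true when L should rank below R, mirroring the comparisons in
    // resource_sort::operator().
    bool ranksBelow(const NodeKey &L, const NodeKey &R) {
      if (L.ScheduleHigh != R.ScheduleHigh)
        return !L.ScheduleHigh;                   // schedule-high nodes win
      if (L.Latency != R.Latency)
        return L.Latency < R.Latency;             // larger latency wins
      if (L.SolelyBlocked != R.SolelyBlocked)
        return L.SolelyBlocked < R.SolelyBlocked; // unblocking more nodes wins
      return L.NodeNum < R.NodeNum;               // stable final tie-break
    }
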
+
+
+/// getSingleUnscheduledPred - If there is exactly one unscheduled predecessor
+/// of SU, return it, otherwise return null.
+SUnit *ResourcePriorityQueue::getSingleUnscheduledPred(SUnit *SU) {
+ SUnit *OnlyAvailablePred = nullptr;
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ SUnit &Pred = *I->getSUnit();
+ if (!Pred.isScheduled) {
+ // We found an available, but not scheduled, predecessor. If it's the
+ // only one we have found, keep track of it... otherwise give up.
+ if (OnlyAvailablePred && OnlyAvailablePred != &Pred)
+ return nullptr;
+ OnlyAvailablePred = &Pred;
+ }
+ }
+ return OnlyAvailablePred;
+}
+
+void ResourcePriorityQueue::push(SUnit *SU) {
+ // Look at all of the successors of this node. Count the number of nodes that
+ // this node is the sole unscheduled node for.
+ unsigned NumNodesBlocking = 0;
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I)
+ if (getSingleUnscheduledPred(I->getSUnit()) == SU)
+ ++NumNodesBlocking;
+
+ NumNodesSolelyBlocking[SU->NodeNum] = NumNodesBlocking;
+ Queue.push_back(SU);
+}
+
+/// Check if scheduling of this SU is possible
+/// in the current packet.
+bool ResourcePriorityQueue::isResourceAvailable(SUnit *SU) {
+ if (!SU || !SU->getNode())
+ return false;
+
+ // If this is a compound instruction,
+ // it is likely to be a call. Do not delay it.
+ if (SU->getNode()->getGluedNode())
+ return true;
+
+ // First see if the pipeline could receive this instruction
+ // in the current cycle.
+ if (SU->getNode()->isMachineOpcode())
+ switch (SU->getNode()->getMachineOpcode()) {
+ default:
+ if (!ResourcesModel->canReserveResources(&TII->get(
+ SU->getNode()->getMachineOpcode())))
+ return false;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ break;
+ }
+
+ // Now make sure there are no dependencies on instructions already
+ // in the packet.
+ for (unsigned i = 0, e = Packet.size(); i != e; ++i)
+ for (SUnit::const_succ_iterator I = Packet[i]->Succs.begin(),
+ E = Packet[i]->Succs.end(); I != E; ++I) {
+ // Since we do not add pseudos to packets, might as well
+ // ignore order deps.
+ if (I->isCtrl())
+ continue;
+
+ if (I->getSUnit() == SU)
+ return false;
+ }
+
+ return true;
+}
+
+/// Keep track of available resources.
+void ResourcePriorityQueue::reserveResources(SUnit *SU) {
+ // If this SU does not fit in the packet
+ // start a new one.
+ if (!isResourceAvailable(SU) || SU->getNode()->getGluedNode()) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+
+ if (SU->getNode() && SU->getNode()->isMachineOpcode()) {
+ switch (SU->getNode()->getMachineOpcode()) {
+ default:
+ ResourcesModel->reserveResources(&TII->get(
+ SU->getNode()->getMachineOpcode()));
+ break;
+ case TargetOpcode::EXTRACT_SUBREG:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ case TargetOpcode::REG_SEQUENCE:
+ case TargetOpcode::IMPLICIT_DEF:
+ break;
+ }
+ Packet.push_back(SU);
+ }
+ // Forcefully end packet for PseudoOps.
+ else {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+
+ // If packet is now full, reset the state so in the next cycle
+ // we start fresh.
+ if (Packet.size() >= InstrItins->SchedModel.IssueWidth) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ }
+}
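
reserveResources models VLIW bundle formation: an instruction either joins the current packet or forces a new one, and a full packet resets the state for the next cycle. The sketch below replaces the DFA with a simple slot counter, an assumption made purely for illustration:

    #include <vector>

    // Illustrative sketch only: track a VLIW packet by instruction count
    // instead of querying a DFA. startNewPacket mirrors the
    // clearResources()/Packet.clear() pairs above.
    class SimplePacketModel {
      unsigned IssueWidth;
      std::vector<int> Packet; // opcodes currently bundled
    public:
      explicit SimplePacketModel(unsigned Width) : IssueWidth(Width) {}

      void startNewPacket() { Packet.clear(); }

      // Returns true if Opcode fit into the current packet, false if a new
      // packet had to be opened first.
      bool reserve(int Opcode) {
        bool Fit = Packet.size() < IssueWidth;
        if (!Fit)
          startNewPacket();
        Packet.push_back(Opcode);
        if (Packet.size() >= IssueWidth)
          startNewPacket(); // full packet: start fresh next cycle
        return Fit;
      }
    };
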
+
+int ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) {
+ int RegBalance = 0;
+
+ if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+ return RegBalance;
+
+ // Gen estimate.
+ for (unsigned i = 0, e = SU->getNode()->getNumValues(); i != e; ++i) {
+ MVT VT = SU->getNode()->getSimpleValueType(i);
+ if (TLI->isTypeLegal(VT)
+ && TLI->getRegClassFor(VT)
+ && TLI->getRegClassFor(VT)->getID() == RCId)
+ RegBalance += numberRCValSuccInSU(SU, RCId);
+ }
+ // Kill estimate.
+ for (unsigned i = 0, e = SU->getNode()->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = SU->getNode()->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (isa<ConstantSDNode>(Op.getNode()))
+ continue;
+
+ if (TLI->isTypeLegal(VT) && TLI->getRegClassFor(VT)
+ && TLI->getRegClassFor(VT)->getID() == RCId)
+ RegBalance -= numberRCValPredInSU(SU, RCId);
+ }
+ return RegBalance;
+}
+
+/// Estimates change in reg pressure from this SU.
+/// It is achieved by trivial tracking of defined
+/// and used vregs in dependent instructions.
+/// The RawPressure flag makes this function ignore existing register file
+/// sizes and report the raw def/use balance.
+int ResourcePriorityQueue::regPressureDelta(SUnit *SU, bool RawPressure) {
+ int RegBalance = 0;
+
+ if (!SU || !SU->getNode() || !SU->getNode()->isMachineOpcode())
+ return RegBalance;
+
+ if (RawPressure) {
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end(); I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
+ RegBalance += rawRegPressureDelta(SU, RC->getID());
+ }
+ }
+ else {
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end(); I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
+ if ((RegPressure[RC->getID()] +
+ rawRegPressureDelta(SU, RC->getID()) > 0) &&
+ (RegPressure[RC->getID()] +
+ rawRegPressureDelta(SU, RC->getID()) >= RegLimit[RC->getID()]))
+ RegBalance += rawRegPressureDelta(SU, RC->getID());
+ }
+ }
+
+ return RegBalance;
+}
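
In the non-raw branch, a register class only contributes its delta when the class would be at or beyond its pressure limit after the change; otherwise the change is treated as free. A standalone restatement of that gating (the per-class arrays are stand-ins for RegPressure/RegLimit):

    #include <cstddef>
    #include <vector>

    // Illustrative sketch only: accumulate per-class deltas, counting only the
    // classes whose pressure would reach or exceed their limit, mirroring the
    // non-raw branch of regPressureDelta.
    int limitedPressureDelta(const std::vector<int> &RawDelta,
                             const std::vector<int> &RegPressure,
                             const std::vector<int> &RegLimit) {
      int Balance = 0;
      for (std::size_t RC = 0; RC != RawDelta.size(); ++RC) {
        int After = RegPressure[RC] + RawDelta[RC];
        if (After > 0 && After >= RegLimit[RC])
          Balance += RawDelta[RC];
      }
      return Balance;
    }
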
+
+// Constants used to denote relative importance of
+// heuristic components for cost computation.
+static const unsigned PriorityOne = 200;
+static const unsigned PriorityTwo = 50;
+static const unsigned PriorityThree = 15;
+static const unsigned PriorityFour = 5;
+static const unsigned ScaleOne = 20;
+static const unsigned ScaleTwo = 10;
+static const unsigned ScaleThree = 5;
+static const unsigned FactorOne = 2;
+
+/// Returns single number reflecting benefit of scheduling SU
+/// in the current cycle.
+int ResourcePriorityQueue::SUSchedulingCost(SUnit *SU) {
+ // Initial trivial priority.
+ int ResCount = 1;
+
+ // Do not waste time on a node that is already scheduled.
+ if (SU->isScheduled)
+ return ResCount;
+
+ // Forced priority is high.
+ if (SU->isScheduleHigh)
+ ResCount += PriorityOne;
+
+ // Adaptable scheduling:
+ // a small but very parallel region, where reg pressure is an issue.
+ if (HorizontalVerticalBalance > RegPressureThreshold) {
+ // Critical path first
+ ResCount += (SU->getHeight() * ScaleTwo);
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+
+ // Consider change to reg pressure from scheduling
+ // this SU.
+ ResCount -= (regPressureDelta(SU,true) * ScaleOne);
+ }
+ // Default heuristic: greedy and critical path driven.
+ else {
+ // Critical path first.
+ ResCount += (SU->getHeight() * ScaleTwo);
+ // Now see how many instructions are blocked by this SU.
+ ResCount += (NumNodesSolelyBlocking[SU->NodeNum] * ScaleTwo);
+ // If resources are available for it, multiply the
+ // chance of scheduling.
+ if (isResourceAvailable(SU))
+ ResCount <<= FactorOne;
+
+ ResCount -= (regPressureDelta(SU) * ScaleTwo);
+ }
+
+ // These are platform-specific things. They will need to go into the back
+ // end and be accessed from here via a hook.
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
+ if (N->isMachineOpcode()) {
+ const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+ if (TID.isCall())
+ ResCount += (PriorityTwo + (ScaleThree*N->getNumValues()));
+ }
+ else
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::TokenFactor:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ ResCount += PriorityFour;
+ break;
+
+ case ISD::INLINEASM:
+ ResCount += PriorityThree;
+ break;
+ }
+ }
+ return ResCount;
+}
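
The default (non pressure-limited) branch of SUSchedulingCost boils down to a weighted sum using the constants defined above, plus a doubling when the unit fits into the current packet; the platform-specific call/copy adjustments are left out here. A standalone restatement under those assumptions (the inputs are stand-ins for the SUnit and DFA queries):

    // Illustrative sketch only: the default-branch cost from SUSchedulingCost
    // over plain integers, omitting the call/copy/inline-asm adjustments.
    int defaultSchedulingCost(int Height, int NumSolelyBlocking,
                              int PressureDelta, bool ResourcesFree,
                              bool ScheduleHigh) {
      const int PriorityOne = 200, ScaleTwo = 10, FactorOne = 2;
      int ResCount = 1;
      if (ScheduleHigh)
        ResCount += PriorityOne;                  // forced-high nodes first
      ResCount += Height * ScaleTwo;              // critical path first
      ResCount += NumSolelyBlocking * ScaleTwo;   // unblocks other nodes
      if (ResourcesFree)
        ResCount <<= FactorOne;                   // fits in the current packet
      ResCount -= PressureDelta * ScaleTwo;       // penalize pressure increase
      return ResCount;
    }
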
+
+
+/// Main resource tracking point.
+void ResourcePriorityQueue::scheduledNode(SUnit *SU) {
+ // Use a null entry as an event marker to reset
+ // the DFA state.
+ if (!SU) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ return;
+ }
+
+ const SDNode *ScegN = SU->getNode();
+ // Update reg pressure tracking.
+ // First update current node.
+ if (ScegN->isMachineOpcode()) {
+ // Estimate generated regs.
+ for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) {
+ MVT VT = ScegN->getSimpleValueType(i);
+
+ if (TLI->isTypeLegal(VT)) {
+ const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+ if (RC)
+ RegPressure[RC->getID()] += numberRCValSuccInSU(SU, RC->getID());
+ }
+ }
+ // Estimate killed regs.
+ for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) {
+ const SDValue &Op = ScegN->getOperand(i);
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+
+ if (TLI->isTypeLegal(VT)) {
+ const TargetRegisterClass *RC = TLI->getRegClassFor(VT);
+ if (RC) {
+ if (RegPressure[RC->getID()] >
+ (numberRCValPredInSU(SU, RC->getID())))
+ RegPressure[RC->getID()] -= numberRCValPredInSU(SU, RC->getID());
+ else RegPressure[RC->getID()] = 0;
+ }
+ }
+ }
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl() || (I->getSUnit()->NumRegDefsLeft == 0))
+ continue;
+ --I->getSUnit()->NumRegDefsLeft;
+ }
+ }
+
+ // Reserve resources for this SU.
+ reserveResources(SU);
+
+ // Adjust number of parallel live ranges.
+ // The heuristic is simple: a node with no data successors reduces the
+ // number of live ranges; all others increase it.
+ unsigned NumberNonControlDeps = 0;
+
+ for (SUnit::const_succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ adjustPriorityOfUnscheduledPreds(I->getSUnit());
+ if (!I->isCtrl())
+ NumberNonControlDeps++;
+ }
+
+ if (!NumberNonControlDeps) {
+ if (ParallelLiveRanges >= SU->NumPreds)
+ ParallelLiveRanges -= SU->NumPreds;
+ else
+ ParallelLiveRanges = 0;
+
+ }
+ else
+ ParallelLiveRanges += SU->NumRegDefsLeft;
+
+ // Track parallel live chains.
+ HorizontalVerticalBalance += (SU->Succs.size() - numberCtrlDepsInSU(SU));
+ HorizontalVerticalBalance -= (SU->Preds.size() - numberCtrlPredInSU(SU));
+}
+
+void ResourcePriorityQueue::initNumRegDefsLeft(SUnit *SU) {
+ unsigned NodeNumDefs = 0;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ if (N->isMachineOpcode()) {
+ const MCInstrDesc &TID = TII->get(N->getMachineOpcode());
+ // No register need be allocated for this.
+ if (N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
+ NodeNumDefs = 0;
+ break;
+ }
+ NodeNumDefs = std::min(N->getNumValues(), TID.getNumDefs());
+ }
+ else
+ switch(N->getOpcode()) {
+ default: break;
+ case ISD::CopyFromReg:
+ NodeNumDefs++;
+ break;
+ case ISD::INLINEASM:
+ NodeNumDefs++;
+ break;
+ }
+
+ SU->NumRegDefsLeft = NodeNumDefs;
+}
+
+/// adjustPriorityOfUnscheduledPreds - One of the predecessors of SU was just
+/// scheduled. If SU is not itself available, then there is at least one
+/// predecessor node that has not been scheduled yet. If SU has exactly ONE
+/// unscheduled predecessor, we want to increase its priority: it getting
+/// scheduled will make this node available, so it is better than some other
+/// node of the same priority that will not make a node available.
+void ResourcePriorityQueue::adjustPriorityOfUnscheduledPreds(SUnit *SU) {
+ if (SU->isAvailable) return; // All preds scheduled.
+
+ SUnit *OnlyAvailablePred = getSingleUnscheduledPred(SU);
+ if (!OnlyAvailablePred || !OnlyAvailablePred->isAvailable)
+ return;
+
+ // Okay, we found a single predecessor that is available, but not scheduled.
+ // Since it is available, it must be in the priority queue. First remove it.
+ remove(OnlyAvailablePred);
+
+ // Reinsert the node into the priority queue, which recomputes its
+ // NumNodesSolelyBlocking value.
+ push(OnlyAvailablePred);
+}
+
+
+/// Main access point - returns the next instruction to be placed in the
+/// scheduling sequence.
+SUnit *ResourcePriorityQueue::pop() {
+ if (empty())
+ return nullptr;
+
+ std::vector<SUnit *>::iterator Best = Queue.begin();
+ if (!DisableDFASched) {
+ int BestCost = SUSchedulingCost(*Best);
+ for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
+ E = Queue.end(); I != E; ++I) {
+
+ if (SUSchedulingCost(*I) > BestCost) {
+ BestCost = SUSchedulingCost(*I);
+ Best = I;
+ }
+ }
+ }
+ // Use default TD scheduling mechanism.
+ else {
+ for (std::vector<SUnit *>::iterator I = std::next(Queue.begin()),
+ E = Queue.end(); I != E; ++I)
+ if (Picker(*Best, *I))
+ Best = I;
+ }
+
+ SUnit *V = *Best;
+ if (Best != std::prev(Queue.end()))
+ std::swap(*Best, Queue.back());
+
+ Queue.pop_back();
+
+ return V;
+}
+
+
+void ResourcePriorityQueue::remove(SUnit *SU) {
+ assert(!Queue.empty() && "Queue is empty!");
+ std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(), SU);
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+
+ Queue.pop_back();
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
new file mode 100644
index 000000000000..237d541b4cb9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SDNodeDbgValue.h
@@ -0,0 +1,127 @@
+//===-- llvm/CodeGen/SDNodeDbgValue.h - SelectionDAG dbg_value --*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the SDDbgValue class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SDNODEDBGVALUE_H
+
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/DataTypes.h"
+#include <utility>
+
+namespace llvm {
+
+class MDNode;
+class SDNode;
+class Value;
+
+/// SDDbgValue - Holds the information from a dbg_value node through SDISel.
+/// We do not use SDValue here to avoid including its header.
+
+class SDDbgValue {
+public:
+ enum DbgValueKind {
+ SDNODE = 0, // value is the result of an expression
+ CONST = 1, // value is a constant
+ FRAMEIX = 2 // value is contents of a stack location
+ };
+private:
+ union {
+ struct {
+ SDNode *Node; // valid for expressions
+ unsigned ResNo; // valid for expressions
+ } s;
+ const Value *Const; // valid for constants
+ unsigned FrameIx; // valid for stack objects
+ } u;
+ MDNode *Var;
+ MDNode *Expr;
+ uint64_t Offset;
+ DebugLoc DL;
+ unsigned Order;
+ enum DbgValueKind kind;
+ bool IsIndirect;
+ bool Invalid = false;
+
+public:
+ // Constructor for non-constants.
+ SDDbgValue(MDNode *Var, MDNode *Expr, SDNode *N, unsigned R, bool indir,
+ uint64_t off, DebugLoc dl, unsigned O)
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(indir) {
+ kind = SDNODE;
+ u.s.Node = N;
+ u.s.ResNo = R;
+ }
+
+ // Constructor for constants.
+ SDDbgValue(MDNode *Var, MDNode *Expr, const Value *C, uint64_t off,
+ DebugLoc dl, unsigned O)
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(false) {
+ kind = CONST;
+ u.Const = C;
+ }
+
+ // Constructor for frame indices.
+ SDDbgValue(MDNode *Var, MDNode *Expr, unsigned FI, uint64_t off, DebugLoc dl,
+ unsigned O)
+ : Var(Var), Expr(Expr), Offset(off), DL(std::move(dl)), Order(O),
+ IsIndirect(false) {
+ kind = FRAMEIX;
+ u.FrameIx = FI;
+ }
+
+ // Returns the kind.
+ DbgValueKind getKind() const { return kind; }
+
+ // Returns the MDNode pointer for the variable.
+ MDNode *getVariable() const { return Var; }
+
+ // Returns the MDNode pointer for the expression.
+ MDNode *getExpression() const { return Expr; }
+
+ // Returns the SDNode* for a register ref
+ SDNode *getSDNode() const { assert (kind==SDNODE); return u.s.Node; }
+
+ // Returns the ResNo for a register ref
+ unsigned getResNo() const { assert (kind==SDNODE); return u.s.ResNo; }
+
+ // Returns the Value* for a constant
+ const Value *getConst() const { assert (kind==CONST); return u.Const; }
+
+ // Returns the FrameIx for a stack object
+ unsigned getFrameIx() const { assert (kind==FRAMEIX); return u.FrameIx; }
+
+ // Returns whether this is an indirect value.
+ bool isIndirect() const { return IsIndirect; }
+
+ // Returns the offset.
+ uint64_t getOffset() const { return Offset; }
+
+ // Returns the DebugLoc.
+ DebugLoc getDebugLoc() const { return DL; }
+
+ // Returns the SDNodeOrder. This is the order of the preceding node in the
+ // input.
+ unsigned getOrder() const { return Order; }
+
+ // setIsInvalidated / isInvalidated - Setter / getter of the "Invalidated"
+ // property. A SDDbgValue is invalid if the SDNode that produces the value is
+ // deleted.
+ void setIsInvalidated() { Invalid = true; }
+ bool isInvalidated() const { return Invalid; }
+};
+
+} // end llvm namespace
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
new file mode 100644
index 000000000000..62e7733ecd2b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -0,0 +1,805 @@
+//===----- ScheduleDAGFast.cpp - Fast poor list scheduler -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a fast scheduler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "InstrEmitter.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical copies");
+
+static RegisterScheduler
+ fastDAGScheduler("fast", "Fast suboptimal list scheduling",
+ createFastDAGScheduler);
+static RegisterScheduler
+ linearizeDAGScheduler("linearize", "Linearize DAG, no scheduling",
+ createDAGLinearizer);
+
+
+namespace {
+ /// FastPriorityQueue - A degenerate priority queue that considers
+ /// all nodes to have the same priority.
+ ///
+ struct FastPriorityQueue {
+ SmallVector<SUnit *, 16> Queue;
+
+ bool empty() const { return Queue.empty(); }
+
+ void push(SUnit *U) {
+ Queue.push_back(U);
+ }
+
+ SUnit *pop() {
+ if (empty()) return nullptr;
+ SUnit *V = Queue.back();
+ Queue.pop_back();
+ return V;
+ }
+ };
+
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGFast - The actual "fast" list scheduler implementation.
+///
+class ScheduleDAGFast : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ FastPriorityQueue AvailableQueue;
+
+ /// LiveRegDefs - A set of physical registers and their definitions that
+ /// are "live". These nodes must be scheduled before any other nodes that
+ /// modify the registers can be scheduled.
+ unsigned NumLiveRegs;
+ std::vector<SUnit*> LiveRegDefs;
+ std::vector<unsigned> LiveRegCycles;
+
+public:
+ ScheduleDAGFast(MachineFunction &mf)
+ : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule() override;
+
+ /// AddPred - adds a predecessor edge to SUnit SU.
+ void AddPred(SUnit *SU, const SDep &D) {
+ SU->addPred(D);
+ }
+
+ /// RemovePred - removes a predecessor edge from SUnit SU.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ SU->removePred(D);
+ }
+
+private:
+ void ReleasePred(SUnit *SU, SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU, unsigned CurCycle);
+ void ScheduleNodeBottomUp(SUnit*, unsigned);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVectorImpl<SUnit*>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
+ void ListScheduleBottomUp();
+
+ /// forceUnitLatencies - The fast scheduler doesn't care about real latencies.
+ bool forceUnitLatencies() const override { return true; }
+};
+} // end anonymous namespace
+
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGFast::Schedule() {
+ DEBUG(dbgs() << "********** List Scheduling **********\n");
+
+ NumLiveRegs = 0;
+ LiveRegDefs.resize(TRI->getNumRegs(), nullptr);
+ LiveRegCycles.resize(TRI->getNumRegs(), 0);
+
+ // Build the scheduling graph.
+ BuildSchedGraph(nullptr);
+
+ DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
+ SUnits[su].dumpAll(this));
+
+ // Execute the actual scheduling loop.
+ ListScheduleBottomUp();
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGFast::ReleasePred(SUnit *SU, SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ PredSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --PredSU->NumSuccsLeft;
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+ AvailableQueue.push(PredSU);
+ }
+}
+
+void ScheduleDAGFast::ReleasePredecessors(SUnit *SU, unsigned CurCycle) {
+ // Bottom up: release predecessors
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ ReleasePred(SU, &*I);
+ if (I->isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ if (!LiveRegDefs[I->getReg()]) {
+ ++NumLiveRegs;
+ LiveRegDefs[I->getReg()] = I->getSUnit();
+ LiveRegCycles[I->getReg()] = CurCycle;
+ }
+ }
+ }
+}
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGFast::ScheduleNodeBottomUp(SUnit *SU, unsigned CurCycle) {
+ DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ DEBUG(SU->dump(this));
+
+ assert(CurCycle >= SU->getHeight() && "Node scheduled below its height!");
+ SU->setHeightToAtLeast(CurCycle);
+ Sequence.push_back(SU);
+
+ ReleasePredecessors(SU, CurCycle);
+
+ // Release all the implicit physical register defs that are live.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ if (LiveRegCycles[I->getReg()] == I->getSUnit()->getHeight()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[I->getReg()] == SU &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = nullptr;
+ LiveRegCycles[I->getReg()] = 0;
+ }
+ }
+ }
+
+ SU->isScheduled = true;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
+ if (SU->getNode()->getGluedNode())
+ return nullptr;
+
+ SDNode *N = SU->getNode();
+ if (!N)
+ return nullptr;
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue)
+ return nullptr;
+ else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (const SDValue &Op : N->op_values()) {
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Glue)
+ return nullptr;
+ }
+
+ if (TryUnfold) {
+ SmallVector<SDNode*, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return nullptr;
+
+ DEBUG(dbgs() << "Unfolding SU # " << SU->NodeNum << "\n");
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+ SDValue(LoadNode, 1));
+
+ SUnit *NewSU = newSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ // LoadNode may already exist. This can happen when there is another
+ // load from the same location that produces the same type of value
+ // but has a different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ isNewLoad = false;
+ } else {
+ LoadSU = newSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+ }
+
+ SDep ChainPred;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainPred = *I;
+ else if (I->getSUnit()->getNode() &&
+ I->getSUnit()->getNode()->isOperandOf(LoadNode))
+ LoadPreds.push_back(*I);
+ else
+ NodePreds.push_back(*I);
+ }
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isCtrl())
+ ChainSuccs.push_back(*I);
+ else
+ NodeSuccs.push_back(*I);
+ }
+
+ if (ChainPred.getSUnit()) {
+ RemovePred(SU, ChainPred);
+ if (isNewLoad)
+ AddPred(LoadSU, ChainPred);
+ }
+ for (unsigned i = 0, e = LoadPreds.size(); i != e; ++i) {
+ const SDep &Pred = LoadPreds[i];
+ RemovePred(SU, Pred);
+ if (isNewLoad) {
+ AddPred(LoadSU, Pred);
+ }
+ }
+ for (unsigned i = 0, e = NodePreds.size(); i != e; ++i) {
+ const SDep &Pred = NodePreds[i];
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (unsigned i = 0, e = NodeSuccs.size(); i != e; ++i) {
+ SDep D = NodeSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ }
+ for (unsigned i = 0, e = ChainSuccs.size(); i != e; ++i) {
+ SDep D = ChainSuccs[i];
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+ if (isNewLoad) {
+ SDep D(LoadSU, SDep::Barrier);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
+ }
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0) {
+ NewSU->isAvailable = true;
+ return NewSU;
+ }
+ SU = NewSU;
+ }
+
+ DEBUG(dbgs() << "Duplicating SU # " << SU->NodeNum << "\n");
+ NewSU = Clone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I)
+ if (!I->isArtificial())
+ AddPred(NewSU, *I);
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(NewSU);
+ AddPred(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.push_back(std::make_pair(SuccSU, D));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+
+ ++NumDups;
+ return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVectorImpl<SUnit*> &Copies) {
+ SUnit *CopyFromSU = newSUnit(static_cast<SDNode *>(nullptr));
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = newSUnit(static_cast<SDNode *>(nullptr));
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ if (I->isArtificial())
+ continue;
+ SUnit *SuccSU = I->getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = *I;
+ D.setSUnit(CopyToSU);
+ AddPred(SuccSU, D);
+ DelDeps.push_back(std::make_pair(SuccSU, *I));
+ }
+ }
+ for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
+ RemovePred(DelDeps[i].first, DelDeps[i].second);
+ }
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
+
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ unsigned NumRes;
+ if (N->getOpcode() == ISD::CopyFromReg) {
+ // CopyFromReg has: "chain, Val, glue" so operand 1 gives the type.
+ NumRes = 1;
+ } else {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+ NumRes = MCID.getNumDefs();
+ for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+ if (Reg == *ImpDef)
+ break;
+ ++NumRes;
+ }
+ }
+ return N->getSimpleValueType(NumRes);
+}
+
+/// CheckForLiveRegDef - Return true and update live register vector if the
+/// specified register def of the specified SUnit clobbers any "live" registers.
+static bool CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+ std::vector<SUnit*> &LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs,
+ const TargetRegisterInfo *TRI) {
+ bool Added = false;
+ for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+ if (LiveRegDefs[*AI] && LiveRegDefs[*AI] != SU) {
+ if (RegAdded.insert(*AI).second) {
+ LRegs.push_back(*AI);
+ Added = true;
+ }
+ }
+ }
+ return Added;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGFast::DelayForLiveRegsBottomUp(SUnit *SU,
+ SmallVectorImpl<unsigned> &LRegs){
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep()) {
+ CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs,
+ RegAdded, LRegs, TRI);
+ }
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+ if (Node->getOpcode() == ISD::INLINEASM) {
+ // Inline asm can clobber physical defs.
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
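+ // Inline asm operands come in groups: a flag word encoding the operand
+ // kind and the number of register operands that follow it.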
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ ++i; // Skip the ID value.
+ if (InlineAsm::isRegDefKind(Flags) ||
+ InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+ InlineAsm::isClobberKind(Flags)) {
+ // Check for def of register or earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ } else
+ i += NumVals;
+ }
+ continue;
+ }
+ if (!Node->isMachineOpcode())
+ continue;
+ const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+ if (!MCID.ImplicitDefs)
+ continue;
+ for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg) {
+ CheckForLiveRegDef(SU, *Reg, LiveRegDefs, RegAdded, LRegs, TRI);
+ }
+ }
+ return !LRegs.empty();
+}
+
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGFast::ListScheduleBottomUp() {
+ unsigned CurCycle = 0;
+
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU, CurCycle);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue.push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready, put it back. Schedule the node.
+ SmallVector<SUnit*, 4> NotReady;
+ DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMap;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue.empty()) {
+ bool Delayed = false;
+ LRegsMap.clear();
+ SUnit *CurSU = AvailableQueue.pop();
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ Delayed = true;
+ LRegsMap.insert(std::make_pair(CurSU, LRegs));
+
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ NotReady.push_back(CurSU);
+ CurSU = AvailableQueue.pop();
+ }
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try code duplication or inserting cross class copies
+ // to resolve it.
+ if (Delayed && !CurSU) {
+ if (!CurSU) {
+ // Try duplicating the nodes that produce these
+ // "expensive to copy" values to break the dependency. In case even
+ // that doesn't work, insert cross class copies.
+ SUnit *TrySU = NotReady[0];
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+ assert(LRegs.size() == 1 && "Can't handle this yet!");
+ unsigned Reg = LRegs[0];
+ SUnit *LRDef = LiveRegDefs[Reg];
+ MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+ // If the cross copy register class is the same as RC, then it must be
+ // possible to copy the value directly. Do not try to duplicate the def.
+ // If the cross copy register class is not the same as RC, then it's
+ // possible to copy the value, but it requires cross register class
+ // copies and is expensive.
+ // If the cross copy register class is null, then it's not possible to
+ // copy the value at all.
+ SUnit *NewDef = nullptr;
+ if (DestRC != RC) {
+ NewDef = CopyAndMoveSuccessors(LRDef);
+ if (!DestRC && !NewDef)
+ report_fatal_error("Can't handle live physical "
+ "register dependency!");
+ }
+ if (!NewDef) {
+ // Issue copies, these can be expensive cross register class copies.
+ SmallVector<SUnit*, 2> Copies;
+ InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+ DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
+ NewDef = Copies.back();
+ }
+
+ DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
+ LiveRegDefs[Reg] = NewDef;
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
+ TrySU->isAvailable = false;
+ CurSU = NewDef;
+ }
+
+ if (!CurSU) {
+ llvm_unreachable("Unable to resolve live physical register dependencies!");
+ }
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = 0, e = NotReady.size(); i != e; ++i) {
+ NotReady[i]->isPending = false;
+ // May no longer be available due to backtracking.
+ if (NotReady[i]->isAvailable)
+ AvailableQueue.push(NotReady[i]);
+ }
+ NotReady.clear();
+
+ if (CurSU)
+ ScheduleNodeBottomUp(CurSU, CurCycle);
+ ++CurCycle;
+ }
+
+ // Reverse the order since it is bottom up.
+ std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
+
+
+namespace {
+//===----------------------------------------------------------------------===//
+ // ScheduleDAGLinearize - No-scheduling scheduler; it simply linearizes the
+ // DAG in topological order.
+ // IMPORTANT: this may not work for targets with physreg dependencies.
+//
+class ScheduleDAGLinearize : public ScheduleDAGSDNodes {
+public:
+ ScheduleDAGLinearize(MachineFunction &mf) : ScheduleDAGSDNodes(mf) {}
+
+ void Schedule() override;
+
+ MachineBasicBlock *
+ EmitSchedule(MachineBasicBlock::iterator &InsertPos) override;
+
+private:
+ std::vector<SDNode*> Sequence;
+ DenseMap<SDNode*, SDNode*> GluedMap; // Cache glue to its user
+
+ void ScheduleNode(SDNode *N);
+};
+} // end anonymous namespace
+
+void ScheduleDAGLinearize::ScheduleNode(SDNode *N) {
+ if (N->getNodeId() != 0)
+ llvm_unreachable(nullptr);
+
+ if (!N->isMachineOpcode() &&
+ (N->getOpcode() == ISD::EntryToken || isPassiveNode(N)))
+ // These nodes do not need to be translated into MIs.
+ return;
+
+ DEBUG(dbgs() << "\n*** Scheduling: ");
+ DEBUG(N->dump(DAG));
+ Sequence.push_back(N);
+
+ unsigned NumOps = N->getNumOperands();
+ if (unsigned NumLeft = NumOps) {
+ SDNode *GluedOpN = nullptr;
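+ // Walk the operands from last to first; a glue operand is always last, so
+ // it is visited first and scheduled immediately above N.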
+ do {
+ const SDValue &Op = N->getOperand(NumLeft-1);
+ SDNode *OpN = Op.getNode();
+
+ if (NumLeft == NumOps && Op.getValueType() == MVT::Glue) {
+ // Schedule glue operand right above N.
+ GluedOpN = OpN;
+ assert(OpN->getNodeId() != 0 && "Glue operand not ready?");
+ OpN->setNodeId(0);
+ ScheduleNode(OpN);
+ continue;
+ }
+
+ if (OpN == GluedOpN)
+ // Glue operand is already scheduled.
+ continue;
+
+ DenseMap<SDNode*, SDNode*>::iterator DI = GluedMap.find(OpN);
+ if (DI != GluedMap.end() && DI->second != N)
+ // Users of glues are counted against the glued users.
+ OpN = DI->second;
+
+ unsigned Degree = OpN->getNodeId();
+ assert(Degree > 0 && "Predecessor over-released!");
+ OpN->setNodeId(--Degree);
+ if (Degree == 0)
+ ScheduleNode(OpN);
+ } while (--NumLeft);
+ }
+}
+
+/// findGluedUser - Find the representative use of a glue value by walking
+/// the use chain.
+static SDNode *findGluedUser(SDNode *N) {
+ while (SDNode *Glued = N->getGluedUser())
+ N = Glued;
+ return N;
+}
+
+void ScheduleDAGLinearize::Schedule() {
+ DEBUG(dbgs() << "********** DAG Linearization **********\n");
+
+ SmallVector<SDNode*, 8> Glues;
+ unsigned DAGSize = 0;
+ for (SDNode &Node : DAG->allnodes()) {
+ SDNode *N = &Node;
+
+ // Use node id to record degree.
+ unsigned Degree = N->use_size();
+ N->setNodeId(Degree);
+ unsigned NumVals = N->getNumValues();
+ if (NumVals && N->getValueType(NumVals-1) == MVT::Glue &&
+ N->hasAnyUseOfValue(NumVals-1)) {
+ SDNode *User = findGluedUser(N);
+ if (User) {
+ Glues.push_back(N);
+ GluedMap.insert(std::make_pair(N, User));
+ }
+ }
+
+ if (N->isMachineOpcode() ||
+ (N->getOpcode() != ISD::EntryToken && !isPassiveNode(N)))
+ ++DAGSize;
+ }
+
+ for (unsigned i = 0, e = Glues.size(); i != e; ++i) {
+ SDNode *Glue = Glues[i];
+ SDNode *GUser = GluedMap[Glue];
+ unsigned Degree = Glue->getNodeId();
+ unsigned UDegree = GUser->getNodeId();
+
+ // The glue user must be scheduled together with the glue operand. So other
+ // users of the glue operand must be treated as users of the glue user.
+ SDNode *ImmGUser = Glue->getGluedUser();
+ for (SDNode::use_iterator ui = Glue->use_begin(), ue = Glue->use_end();
+ ui != ue; ++ui)
+ if (*ui == ImmGUser)
+ --Degree;
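+ // Transfer the glue node's remaining out-degree to its representative user
+ // and pin the glue node's own degree at 1, so it is only released (and
+ // scheduled) together with that user.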
+ GUser->setNodeId(UDegree + Degree);
+ Glue->setNodeId(1);
+ }
+
+ Sequence.reserve(DAGSize);
+ ScheduleNode(DAG->getRoot().getNode());
+}
+
+MachineBasicBlock*
+ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+ InstrEmitter Emitter(BB, InsertPos);
+ DenseMap<SDValue, unsigned> VRBaseMap;
+
+ DEBUG({
+ dbgs() << "\n*** Final schedule ***\n";
+ });
+
+ // FIXME: Handle dbg_values.
+ unsigned NumNodes = Sequence.size();
+ for (unsigned i = 0; i != NumNodes; ++i) {
+ SDNode *N = Sequence[NumNodes-i-1];
+ DEBUG(N->dump(DAG));
+ Emitter.EmitNode(N, false, false, VRBaseMap);
+ }
+
+ DEBUG(dbgs() << '\n');
+
+ InsertPos = Emitter.getInsertPos();
+ return Emitter.getBlock();
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createFastDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGFast(*IS->MF);
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createDAGLinearizer(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGLinearize(*IS->MF);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
new file mode 100644
index 000000000000..802c459a0223
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -0,0 +1,3020 @@
+//===----- ScheduleDAGRRList.cpp - Reg pressure reduction list scheduler --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements bottom-up and top-down register pressure reduction list
+// schedulers, using standard algorithms. The basic approach uses a priority
+// queue of available nodes to schedule. One at a time, nodes are taken from
+// the priority queue (thus in priority order), checked for legality to
+// schedule, and emitted if legal.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <climits>
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumBacktracks, "Number of times scheduler backtracked");
+STATISTIC(NumUnfolds, "Number of nodes unfolded");
+STATISTIC(NumDups, "Number of duplicated nodes");
+STATISTIC(NumPRCopies, "Number of physical register copies");
+
+static RegisterScheduler
+ burrListDAGScheduler("list-burr",
+ "Bottom-up register reduction list scheduling",
+ createBURRListDAGScheduler);
+static RegisterScheduler
+ sourceListDAGScheduler("source",
+ "Similar to list-burr but schedules in source "
+ "order when possible",
+ createSourceListDAGScheduler);
+
+static RegisterScheduler
+ hybridListDAGScheduler("list-hybrid",
+ "Bottom-up register pressure aware list scheduling "
+ "which tries to balance latency and register pressure",
+ createHybridListDAGScheduler);
+
+static RegisterScheduler
+ ILPListDAGScheduler("list-ilp",
+ "Bottom-up register pressure aware list scheduling "
+ "which tries to balance ILP and register pressure",
+ createILPListDAGScheduler);
+
+static cl::opt<bool> DisableSchedCycles(
+ "disable-sched-cycles", cl::Hidden, cl::init(false),
+ cl::desc("Disable cycle-level precision during preRA scheduling"));
+
+// Temporary sched=list-ilp flags until the heuristics are robust.
+// Some options are also available under sched=list-hybrid.
+static cl::opt<bool> DisableSchedRegPressure(
+ "disable-sched-reg-pressure", cl::Hidden, cl::init(false),
+ cl::desc("Disable regpressure priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedLiveUses(
+ "disable-sched-live-uses", cl::Hidden, cl::init(true),
+ cl::desc("Disable live use priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedVRegCycle(
+ "disable-sched-vrcycle", cl::Hidden, cl::init(false),
+ cl::desc("Disable virtual register cycle interference checks"));
+static cl::opt<bool> DisableSchedPhysRegJoin(
+ "disable-sched-physreg-join", cl::Hidden, cl::init(false),
+ cl::desc("Disable physreg def-use affinity"));
+static cl::opt<bool> DisableSchedStalls(
+ "disable-sched-stalls", cl::Hidden, cl::init(true),
+ cl::desc("Disable no-stall priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedCriticalPath(
+ "disable-sched-critical-path", cl::Hidden, cl::init(false),
+ cl::desc("Disable critical path priority in sched=list-ilp"));
+static cl::opt<bool> DisableSchedHeight(
+ "disable-sched-height", cl::Hidden, cl::init(false),
+ cl::desc("Disable scheduled-height priority in sched=list-ilp"));
+static cl::opt<bool> Disable2AddrHack(
+ "disable-2addr-hack", cl::Hidden, cl::init(true),
+ cl::desc("Disable scheduler's two-address hack"));
+
+static cl::opt<int> MaxReorderWindow(
+ "max-sched-reorder", cl::Hidden, cl::init(6),
+ cl::desc("Number of instructions to allow ahead of the critical path "
+ "in sched=list-ilp"));
+
+static cl::opt<unsigned> AvgIPC(
+ "sched-avg-ipc", cl::Hidden, cl::init(1),
+ cl::desc("Average inst/cycle whan no target itinerary exists."));
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGRRList - The actual register reduction list scheduler
+/// implementation. This supports both top-down and bottom-up scheduling.
+///
+class ScheduleDAGRRList : public ScheduleDAGSDNodes {
+private:
+ /// NeedLatency - True if the scheduler will make use of latency information.
+ ///
+ bool NeedLatency;
+
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ SchedulingPriorityQueue *AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+ /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// CurCycle - The current scheduler state corresponds to this cycle.
+ unsigned CurCycle;
+
+ /// MinAvailableCycle - Cycle of the soonest available instruction.
+ unsigned MinAvailableCycle;
+
+ /// IssueCount - Count of instructions issued in this cycle.
+ /// Currently valid only for bottom-up scheduling.
+ unsigned IssueCount;
+
+ /// LiveRegDefs - A set of physical registers and their definitions
+ /// that are "live". These nodes must be scheduled before any other nodes that
+ /// modify the registers can be scheduled.
+ unsigned NumLiveRegs;
+ std::unique_ptr<SUnit*[]> LiveRegDefs;
+ std::unique_ptr<SUnit*[]> LiveRegGens;
+
+ // Collect interferences between physical register use/defs.
+ // Each interference is an SUnit and set of physical registers.
+ SmallVector<SUnit*, 4> Interferences;
+ typedef DenseMap<SUnit*, SmallVector<unsigned, 4> > LRegsMapT;
+ LRegsMapT LRegsMap;
+
+ /// Topo - A topological ordering for SUnits which permits fast IsReachable
+ /// and similar queries.
+ ScheduleDAGTopologicalSort Topo;
+
+ // Hack to keep track of the inverse of FindCallSeqStart without more crazy
+ // DAG crawling.
+ DenseMap<SUnit*, SUnit*> CallSeqEndForStart;
+
+public:
+ ScheduleDAGRRList(MachineFunction &mf, bool needlatency,
+ SchedulingPriorityQueue *availqueue,
+ CodeGenOpt::Level OptLevel)
+ : ScheduleDAGSDNodes(mf),
+ NeedLatency(needlatency), AvailableQueue(availqueue), CurCycle(0),
+ Topo(SUnits, nullptr) {
+
+ const TargetSubtargetInfo &STI = mf.getSubtarget();
+ if (DisableSchedCycles || !NeedLatency)
+ HazardRec = new ScheduleHazardRecognizer();
+ else
+ HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
+ }
+
+ ~ScheduleDAGRRList() override {
+ delete HazardRec;
+ delete AvailableQueue;
+ }
+
+ void Schedule() override;
+
+ ScheduleHazardRecognizer *getHazardRec() { return HazardRec; }
+
+ /// IsReachable - Checks if SU is reachable from TargetSU.
+ bool IsReachable(const SUnit *SU, const SUnit *TargetSU) {
+ return Topo.IsReachable(SU, TargetSU);
+ }
+
+ /// WillCreateCycle - Returns true if adding an edge from SU to TargetSU will
+ /// create a cycle.
+ bool WillCreateCycle(SUnit *SU, SUnit *TargetSU) {
+ return Topo.WillCreateCycle(SU, TargetSU);
+ }
+
+ /// AddPred - Adds a predecessor edge to SUnit SU.
+ /// Updates the topological ordering if required.
+ void AddPred(SUnit *SU, const SDep &D) {
+ Topo.AddPred(SU, D.getSUnit());
+ SU->addPred(D);
+ }
+
+ /// RemovePred - Removes a predecessor edge from SUnit SU.
+ /// Updates the topological ordering if required.
+ void RemovePred(SUnit *SU, const SDep &D) {
+ Topo.RemovePred(SU, D.getSUnit());
+ SU->removePred(D);
+ }
+
+private:
+ bool isReady(SUnit *SU) {
+ return DisableSchedCycles || !AvailableQueue->hasReadyFilter() ||
+ AvailableQueue->isReady(SU);
+ }
+
+ void ReleasePred(SUnit *SU, const SDep *PredEdge);
+ void ReleasePredecessors(SUnit *SU);
+ void ReleasePending();
+ void AdvanceToCycle(unsigned NextCycle);
+ void AdvancePastStalls(SUnit *SU);
+ void EmitNode(SUnit *SU);
+ void ScheduleNodeBottomUp(SUnit*);
+ void CapturePred(SDep *PredEdge);
+ void UnscheduleNodeBottomUp(SUnit*);
+ void RestoreHazardCheckerBottomUp();
+ void BacktrackBottomUp(SUnit*, SUnit*);
+ SUnit *CopyAndMoveSuccessors(SUnit*);
+ void InsertCopiesAndMoveSuccs(SUnit*, unsigned,
+ const TargetRegisterClass*,
+ const TargetRegisterClass*,
+ SmallVectorImpl<SUnit*>&);
+ bool DelayForLiveRegsBottomUp(SUnit*, SmallVectorImpl<unsigned>&);
+
+ void releaseInterferences(unsigned Reg = 0);
+
+ SUnit *PickNodeToScheduleBottomUp();
+ void ListScheduleBottomUp();
+
+ /// CreateNewSUnit - Creates a new SUnit and returns a pointer to it.
+ /// Updates the topological ordering if required.
+ SUnit *CreateNewSUnit(SDNode *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = newSUnit(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.InitDAGTopologicalSorting();
+ return NewNode;
+ }
+
+ /// CreateClone - Creates a new SUnit from an existing one.
+ /// Updates the topological ordering if required.
+ SUnit *CreateClone(SUnit *N) {
+ unsigned NumSUnits = SUnits.size();
+ SUnit *NewNode = Clone(N);
+ // Update the topological ordering.
+ if (NewNode->NodeNum >= NumSUnits)
+ Topo.InitDAGTopologicalSorting();
+ return NewNode;
+ }
+
+ /// forceUnitLatencies - Register-pressure-reducing scheduling doesn't
+ /// need actual latency information but the hybrid scheduler does.
+ bool forceUnitLatencies() const override {
+ return !NeedLatency;
+ }
+};
+} // end anonymous namespace
+
+/// GetCostForDef - Looks up the register class and cost for a given definition.
+/// Typically this just means looking up the representative register class,
+/// but for untyped values (MVT::Untyped) it means inspecting the node's
+/// opcode to determine what register class is being generated.
+static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos,
+ const TargetLowering *TLI,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI,
+ unsigned &RegClass, unsigned &Cost,
+ const MachineFunction &MF) {
+ MVT VT = RegDefPos.GetValue();
+
+ // Special handling for untyped values. These values can only come from
+ // the expansion of custom DAG-to-DAG patterns.
+ if (VT == MVT::Untyped) {
+ const SDNode *Node = RegDefPos.GetNode();
+
+ // Special handling for CopyFromReg of untyped values.
+ if (!Node->isMachineOpcode() && Node->getOpcode() == ISD::CopyFromReg) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ const TargetRegisterClass *RC = MF.getRegInfo().getRegClass(Reg);
+ RegClass = RC->getID();
+ Cost = 1;
+ return;
+ }
+
+ unsigned Opcode = Node->getMachineOpcode();
+ if (Opcode == TargetOpcode::REG_SEQUENCE) {
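+ // REG_SEQUENCE carries its destination register class ID as a constant
+ // first operand.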
+ unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
+ const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
+ RegClass = RC->getID();
+ Cost = 1;
+ return;
+ }
+
+ unsigned Idx = RegDefPos.GetIdx();
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ const TargetRegisterClass *RC = TII->getRegClass(Desc, Idx, TRI, MF);
+ RegClass = RC->getID();
+ // FIXME: Cost arbitrarily set to 1 because there doesn't seem to be a
+ // better way to determine it.
+ Cost = 1;
+ } else {
+ RegClass = TLI->getRepRegClassFor(VT)->getID();
+ Cost = TLI->getRepRegClassCostFor(VT);
+ }
+}
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGRRList::Schedule() {
+ DEBUG(dbgs()
+ << "********** List Scheduling BB#" << BB->getNumber()
+ << " '" << BB->getName() << "' **********\n");
+
+ CurCycle = 0;
+ IssueCount = 0;
+ MinAvailableCycle = DisableSchedCycles ? 0 : UINT_MAX;
+ NumLiveRegs = 0;
+ // Allocate slots for each physical register, plus one for a special register
+ // to track the virtual resource of a calling sequence.
+ LiveRegDefs.reset(new SUnit*[TRI->getNumRegs() + 1]());
+ LiveRegGens.reset(new SUnit*[TRI->getNumRegs() + 1]());
+ CallSeqEndForStart.clear();
+ assert(Interferences.empty() && LRegsMap.empty() && "stale Interferences");
+
+ // Build the scheduling graph.
+ BuildSchedGraph(nullptr);
+
+ DEBUG(for (SUnit &SU : SUnits)
+ SU.dumpAll(this));
+ Topo.InitDAGTopologicalSorting();
+
+ AvailableQueue->initNodes(SUnits);
+
+ HazardRec->Reset();
+
+ // Execute the actual scheduling loop.
+ ListScheduleBottomUp();
+
+ AvailableQueue->releaseState();
+
+ DEBUG({
+ dbgs() << "*** Final schedule ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
+}
+
+//===----------------------------------------------------------------------===//
+// Bottom-Up Scheduling
+//===----------------------------------------------------------------------===//
+
+/// ReleasePred - Decrement the NumSuccsLeft count of a predecessor. Add it to
+/// the AvailableQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGRRList::ReleasePred(SUnit *SU, const SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+
+#ifndef NDEBUG
+ if (PredSU->NumSuccsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ PredSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ --PredSU->NumSuccsLeft;
+
+ if (!forceUnitLatencies()) {
+ // Update the predecessor's height. This is now the cycle when the
+ // predecessor can be scheduled without causing a pipeline stall.
+ PredSU->setHeightToAtLeast(SU->getHeight() + PredEdge->getLatency());
+ }
+
+ // If all the node's successors are scheduled, this node is ready
+ // to be scheduled. Ignore the special EntrySU node.
+ if (PredSU->NumSuccsLeft == 0 && PredSU != &EntrySU) {
+ PredSU->isAvailable = true;
+
+ unsigned Height = PredSU->getHeight();
+ if (Height < MinAvailableCycle)
+ MinAvailableCycle = Height;
+
+ if (isReady(PredSU)) {
+ AvailableQueue->push(PredSU);
+ }
+ // CapturePred and others may have left the node in the pending queue, avoid
+ // adding it twice.
+ else if (!PredSU->isPending) {
+ PredSU->isPending = true;
+ PendingQueue.push_back(PredSU);
+ }
+ }
+}
+
+/// IsChainDependent - Test if Outer is reachable from Inner through
+/// chain dependencies.
+static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
+ unsigned NestLevel,
+ const TargetInstrInfo *TII) {
+ SDNode *N = Outer;
+ for (;;) {
+ if (N == Inner)
+ return true;
+ // For a TokenFactor, examine each operand. There may be multiple ways
+ // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+ // most nesting in order to ensure that we find the corresponding match.
+ if (N->getOpcode() == ISD::TokenFactor) {
+ for (const SDValue &Op : N->op_values())
+ if (IsChainDependent(Op.getNode(), Inner, NestLevel, TII))
+ return true;
+ return false;
+ }
+ // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+ if (N->isMachineOpcode()) {
+ if (N->getMachineOpcode() ==
+ (unsigned)TII->getCallFrameDestroyOpcode()) {
+ ++NestLevel;
+ } else if (N->getMachineOpcode() ==
+ (unsigned)TII->getCallFrameSetupOpcode()) {
+ if (NestLevel == 0)
+ return false;
+ --NestLevel;
+ }
+ }
+ // Otherwise, find the chain and continue climbing.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other) {
+ N = Op.getNode();
+ goto found_chain_operand;
+ }
+ return false;
+ found_chain_operand:;
+ if (N->getOpcode() == ISD::EntryToken)
+ return false;
+ }
+}
+
+/// FindCallSeqStart - Starting from the (lowered) CALLSEQ_END node, locate
+/// the corresponding (lowered) CALLSEQ_BEGIN node.
+///
+ /// NestLevel and MaxNest are used in recursion to indicate the current level
+/// of nesting of CALLSEQ_BEGIN and CALLSEQ_END pairs, as well as the maximum
+/// level seen so far.
+///
+/// TODO: It would be better to give CALLSEQ_END an explicit operand to point
+/// to the corresponding CALLSEQ_BEGIN to avoid needing to search for it.
+static SDNode *
+FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
+ const TargetInstrInfo *TII) {
+ for (;;) {
+ // For a TokenFactor, examine each operand. There may be multiple ways
+ // to get to the CALLSEQ_BEGIN, but we need to find the path with the
+ // most nesting in order to ensure that we find the corresponding match.
+ if (N->getOpcode() == ISD::TokenFactor) {
+ SDNode *Best = nullptr;
+ unsigned BestMaxNest = MaxNest;
+ for (const SDValue &Op : N->op_values()) {
+ unsigned MyNestLevel = NestLevel;
+ unsigned MyMaxNest = MaxNest;
+ if (SDNode *New = FindCallSeqStart(Op.getNode(),
+ MyNestLevel, MyMaxNest, TII))
+ if (!Best || (MyMaxNest > BestMaxNest)) {
+ Best = New;
+ BestMaxNest = MyMaxNest;
+ }
+ }
+ assert(Best);
+ MaxNest = BestMaxNest;
+ return Best;
+ }
+ // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
+ if (N->isMachineOpcode()) {
+ if (N->getMachineOpcode() ==
+ (unsigned)TII->getCallFrameDestroyOpcode()) {
+ ++NestLevel;
+ MaxNest = std::max(MaxNest, NestLevel);
+ } else if (N->getMachineOpcode() ==
+ (unsigned)TII->getCallFrameSetupOpcode()) {
+ assert(NestLevel != 0);
+ --NestLevel;
+ if (NestLevel == 0)
+ return N;
+ }
+ }
+ // Otherwise, find the chain and continue climbing.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other) {
+ N = Op.getNode();
+ goto found_chain_operand;
+ }
+ return nullptr;
+ found_chain_operand:;
+ if (N->getOpcode() == ISD::EntryToken)
+ return nullptr;
+ }
+}
+
+/// Call ReleasePred for each predecessor, then update register live def/gen.
+/// Always update LiveRegDefs for a register dependence even if the current SU
+ /// also defines the register. This effectively creates one large live range
+ /// across a sequence of two-address nodes. This is important because the
+/// entire chain must be scheduled together. Example:
+///
+/// flags = (3) add
+/// flags = (2) addc flags
+/// flags = (1) addc flags
+///
+/// results in
+///
+/// LiveRegDefs[flags] = 3
+/// LiveRegGens[flags] = 1
+///
+/// If (2) addc is unscheduled, then (1) addc must also be unscheduled to avoid
+/// interference on flags.
+void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
+ // Bottom up: release predecessors
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ ReleasePred(SU, &*I);
+ if (I->isAssignedRegDep()) {
+ // This is a physical register dependency and it's impossible or
+ // expensive to copy the register. Make sure nothing that can
+ // clobber the register is scheduled between the predecessor and
+ // this node.
+ SUnit *RegDef = LiveRegDefs[I->getReg()]; (void)RegDef;
+ assert((!RegDef || RegDef == SU || RegDef == I->getSUnit()) &&
+ "interference on register dependence");
+ LiveRegDefs[I->getReg()] = I->getSUnit();
+ if (!LiveRegGens[I->getReg()]) {
+ ++NumLiveRegs;
+ LiveRegGens[I->getReg()] = SU;
+ }
+ }
+ }
+
+ // If we're scheduling a lowered CALLSEQ_END, find the corresponding
+ // CALLSEQ_BEGIN. Inject an artificial physical register dependence between
+ // these nodes, to prevent other calls from being interscheduled with them.
+ unsigned CallResource = TRI->getNumRegs();
+ if (!LiveRegDefs[CallResource])
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
+ if (Node->isMachineOpcode() &&
+ Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ unsigned NestLevel = 0;
+ unsigned MaxNest = 0;
+ SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
+
+ SUnit *Def = &SUnits[N->getNodeId()];
+ CallSeqEndForStart[Def] = SU;
+
+ ++NumLiveRegs;
+ LiveRegDefs[CallResource] = Def;
+ LiveRegGens[CallResource] = SU;
+ break;
+ }
+}
+
+/// Check to see if any of the pending instructions are ready to issue. If
+/// so, add them to the available queue.
+void ScheduleDAGRRList::ReleasePending() {
+ if (DisableSchedCycles) {
+ assert(PendingQueue.empty() && "pending instrs not allowed in this mode");
+ return;
+ }
+
+ // If the available queue is empty, it is safe to reset MinAvailableCycle.
+ if (AvailableQueue->empty())
+ MinAvailableCycle = UINT_MAX;
+
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ unsigned ReadyCycle = PendingQueue[i]->getHeight();
+ if (ReadyCycle < MinAvailableCycle)
+ MinAvailableCycle = ReadyCycle;
+
+ if (PendingQueue[i]->isAvailable) {
+ if (!isReady(PendingQueue[i]))
+ continue;
+ AvailableQueue->push(PendingQueue[i]);
+ }
+ PendingQueue[i]->isPending = false;
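+ // Erase this entry in O(1) by swapping in the last element, then step the
+ // index back so the swapped-in entry is also examined.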
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ }
+}
+
+/// Move the scheduler state forward by the specified number of Cycles.
+void ScheduleDAGRRList::AdvanceToCycle(unsigned NextCycle) {
+ if (NextCycle <= CurCycle)
+ return;
+
+ IssueCount = 0;
+ AvailableQueue->setCurCycle(NextCycle);
+ if (!HazardRec->isEnabled()) {
+ // Bypass lots of virtual calls in case of long latency.
+ CurCycle = NextCycle;
+ }
+ else {
+ for (; CurCycle != NextCycle; ++CurCycle) {
+ HazardRec->RecedeCycle();
+ }
+ }
+ // FIXME: Instead of visiting the pending Q each time, set a dirty flag on the
+ // available Q to release pending nodes at least once before popping.
+ ReleasePending();
+}
+
+/// Move the scheduler state forward until the specified node's dependents are
+/// ready and can be scheduled with no resource conflicts.
+void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) {
+ if (DisableSchedCycles)
+ return;
+
+ // FIXME: Nodes such as CopyFromReg probably should not advance the current
+ // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node
+ // has predecessors the cycle will be advanced when they are scheduled.
+ // But given the crude nature of modeling latency through such nodes, we
+ // currently need to treat these nodes like real instructions.
+ // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return;
+
+ unsigned ReadyCycle = SU->getHeight();
+
+ // Bump CurCycle to account for latency. We assume the latency of other
+ // available instructions may be hidden by the stall (not a full pipe stall).
+ // This updates the hazard recognizer's cycle before reserving resources for
+ // this instruction.
+ AdvanceToCycle(ReadyCycle);
+
+ // Calls are scheduled in their preceding cycle, so don't conflict with
+ // hazards from instructions after the call. EmitNode will reset the
+ // scoreboard state before emitting the call.
+ if (SU->isCall)
+ return;
+
+ // FIXME: For resource conflicts in very long non-pipelined stages, we
+ // should probably skip ahead here to avoid useless scoreboard checks.
+ int Stalls = 0;
+ while (true) {
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(SU, -Stalls);
+
+ if (HT == ScheduleHazardRecognizer::NoHazard)
+ break;
+
+ ++Stalls;
+ }
+ AdvanceToCycle(CurCycle + Stalls);
+}
+
+/// Record this SUnit in the HazardRecognizer.
+/// Does not update CurCycle.
+void ScheduleDAGRRList::EmitNode(SUnit *SU) {
+ if (!HazardRec->isEnabled())
+ return;
+
+ // Check for phys reg copy.
+ if (!SU->getNode())
+ return;
+
+ switch (SU->getNode()->getOpcode()) {
+ default:
+ assert(SU->getNode()->isMachineOpcode() &&
+ "This target-independent node should not be scheduled.");
+ break;
+ case ISD::MERGE_VALUES:
+ case ISD::TokenFactor:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ case ISD::CopyToReg:
+ case ISD::CopyFromReg:
+ case ISD::EH_LABEL:
+ // Noops don't affect the scoreboard state. Copies are likely to be
+ // removed.
+ return;
+ case ISD::INLINEASM:
+ // For inline asm, clear the pipeline state.
+ HazardRec->Reset();
+ return;
+ }
+ if (SU->isCall) {
+ // Calls are scheduled with their preceding instructions. For bottom-up
+ // scheduling, clear the pipeline state before emitting.
+ HazardRec->Reset();
+ }
+
+ HazardRec->EmitInstruction(SU);
+}
+
+static void resetVRegCycle(SUnit *SU);
+
+/// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending
+/// count of its predecessors. If a predecessor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
+ DEBUG(dbgs() << "\n*** Scheduling [" << CurCycle << "]: ");
+ DEBUG(SU->dump(this));
+
+#ifndef NDEBUG
+ if (CurCycle < SU->getHeight())
+ DEBUG(dbgs() << " Height [" << SU->getHeight()
+ << "] pipeline stall!\n");
+#endif
+
+ // FIXME: Do not modify node height. It may interfere with
+ // backtracking. Instead add a "ready cycle" to SUnit. Before scheduling the
+ // node its ready cycle can aid heuristics, and after scheduling it can
+ // indicate the scheduled cycle.
+ SU->setHeightToAtLeast(CurCycle);
+
+ // Reserve resources for the scheduled instruction.
+ EmitNode(SU);
+
+ Sequence.push_back(SU);
+
+ AvailableQueue->scheduledNode(SU);
+
+ // If HazardRec is disabled, and each inst counts as one cycle, then
+ // advance CurCycle before ReleasePredecessors to avoid useless pushes to
+ // PendingQueue for schedulers that implement HasReadyFilter.
+ if (!HazardRec->isEnabled() && AvgIPC < 2)
+ AdvanceToCycle(CurCycle + 1);
+
+ // Update liveness of predecessors before successors to avoid treating a
+ // two-address node as a live range def.
+ ReleasePredecessors(SU);
+
+ // Release all the implicit physical register defs that are live.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ // LiveRegDefs[I->getReg()] != SU when SU is a two-address node.
+ if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] == SU) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = nullptr;
+ LiveRegGens[I->getReg()] = nullptr;
+ releaseInterferences(I->getReg());
+ }
+ }
+ // Release the special call resource dependence, if this is the beginning
+ // of a call.
+ unsigned CallResource = TRI->getNumRegs();
+ if (LiveRegDefs[CallResource] == SU)
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ --NumLiveRegs;
+ LiveRegDefs[CallResource] = nullptr;
+ LiveRegGens[CallResource] = nullptr;
+ releaseInterferences(CallResource);
+ }
+ }
+
+ resetVRegCycle(SU);
+
+ SU->isScheduled = true;
+
+ // Conditions under which the scheduler should eagerly advance the cycle:
+ // (1) No available instructions
+ // (2) All pipelines full, so available instructions must have hazards.
+ //
+ // If HazardRec is disabled, the cycle was pre-advanced before calling
+ // ReleasePredecessors. In that case, IssueCount should remain 0.
+ //
+ // Check AvailableQueue after ReleasePredecessors in case of zero latency.
+ if (HazardRec->isEnabled() || AvgIPC > 1) {
+ if (SU->getNode() && SU->getNode()->isMachineOpcode())
+ ++IssueCount;
+ if ((HazardRec->isEnabled() && HazardRec->atIssueLimit())
+ || (!HazardRec->isEnabled() && IssueCount == AvgIPC))
+ AdvanceToCycle(CurCycle + 1);
+ }
+}
+
+/// CapturePred - This does the opposite of ReleasePred. Since SU is being
+ /// unscheduled, increase the succ left count of its predecessors. Remove
+/// them from AvailableQueue if necessary.
+void ScheduleDAGRRList::CapturePred(SDep *PredEdge) {
+ SUnit *PredSU = PredEdge->getSUnit();
+ if (PredSU->isAvailable) {
+ PredSU->isAvailable = false;
+ if (!PredSU->isPending)
+ AvailableQueue->remove(PredSU);
+ }
+
+ assert(PredSU->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+ ++PredSU->NumSuccsLeft;
+}
+
+ /// UnscheduleNodeBottomUp - Remove the node from the schedule, and update its
+ /// state and the states of its predecessors to reflect the change.
+void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
+ DEBUG(dbgs() << "*** Unscheduling [" << SU->getHeight() << "]: ");
+ DEBUG(SU->dump(this));
+
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ CapturePred(&*I);
+ if (I->isAssignedRegDep() && SU == LiveRegGens[I->getReg()]){
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ assert(LiveRegDefs[I->getReg()] == I->getSUnit() &&
+ "Physical register dependency violated?");
+ --NumLiveRegs;
+ LiveRegDefs[I->getReg()] = nullptr;
+ LiveRegGens[I->getReg()] = nullptr;
+ releaseInterferences(I->getReg());
+ }
+ }
+
+ // Reclaim the special call resource dependence, if this is the beginning
+ // of a call.
+ unsigned CallResource = TRI->getNumRegs();
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+ ++NumLiveRegs;
+ LiveRegDefs[CallResource] = SU;
+ LiveRegGens[CallResource] = CallSeqEndForStart[SU];
+ }
+ }
+
+ // Release the special call resource dependence, if this is the end
+ // of a call.
+ if (LiveRegGens[CallResource] == SU)
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isMachineOpcode() &&
+ SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
+ --NumLiveRegs;
+ LiveRegDefs[CallResource] = nullptr;
+ LiveRegGens[CallResource] = nullptr;
+ releaseInterferences(CallResource);
+ }
+ }
+
+ for (auto &Succ : SU->Succs) {
+ if (Succ.isAssignedRegDep()) {
+ auto Reg = Succ.getReg();
+ if (!LiveRegDefs[Reg])
+ ++NumLiveRegs;
+ // This becomes the nearest def. Note that an earlier def may still be
+ // pending if this is a two-address node.
+ LiveRegDefs[Reg] = SU;
+
+ // Update LiveRegGens only if it was empty before this unscheduling.
+ // This avoids incorrectly overwriting an entry set by a previous run.
+ if (!LiveRegGens[Reg]) {
+ // Find the successor with the lowest height.
+ LiveRegGens[Reg] = Succ.getSUnit();
+ for (auto &Succ2 : SU->Succs) {
+ if (Succ2.isAssignedRegDep() && Succ2.getReg() == Reg &&
+ Succ2.getSUnit()->getHeight() < LiveRegGens[Reg]->getHeight())
+ LiveRegGens[Reg] = Succ2.getSUnit();
+ }
+ }
+ }
+ }
+ if (SU->getHeight() < MinAvailableCycle)
+ MinAvailableCycle = SU->getHeight();
+
+ SU->setHeightDirty();
+ SU->isScheduled = false;
+ SU->isAvailable = true;
+ if (!DisableSchedCycles && AvailableQueue->hasReadyFilter()) {
+ // Don't make available until backtracking is complete.
+ SU->isPending = true;
+ PendingQueue.push_back(SU);
+ }
+ else {
+ AvailableQueue->push(SU);
+ }
+ AvailableQueue->unscheduledNode(SU);
+}
+
+/// After backtracking, the hazard checker needs to be restored to a state
+ /// corresponding to the current cycle.
+void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() {
+ HazardRec->Reset();
+
+ unsigned LookAhead = std::min((unsigned)Sequence.size(),
+ HazardRec->getMaxLookAhead());
+ if (LookAhead == 0)
+ return;
+
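+ // Re-emit the most recently scheduled nodes (up to the recognizer's
+ // lookahead) so the hazard state again reflects the current cycle.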
+ std::vector<SUnit*>::const_iterator I = (Sequence.end() - LookAhead);
+ unsigned HazardCycle = (*I)->getHeight();
+ for (std::vector<SUnit*>::const_iterator E = Sequence.end(); I != E; ++I) {
+ SUnit *SU = *I;
+ for (; SU->getHeight() > HazardCycle; ++HazardCycle) {
+ HazardRec->RecedeCycle();
+ }
+ EmitNode(SU);
+ }
+}
+
+ /// BacktrackBottomUp - Unschedule nodes in bottom-up order back to (and
+ /// including) BtSU, so that SU can be scheduled.
+void ScheduleDAGRRList::BacktrackBottomUp(SUnit *SU, SUnit *BtSU) {
+ SUnit *OldSU = Sequence.back();
+ while (true) {
+ Sequence.pop_back();
+ // FIXME: use ready cycle instead of height
+ CurCycle = OldSU->getHeight();
+ UnscheduleNodeBottomUp(OldSU);
+ AvailableQueue->setCurCycle(CurCycle);
+ if (OldSU == BtSU)
+ break;
+ OldSU = Sequence.back();
+ }
+
+ assert(!SU->isSucc(OldSU) && "Something is wrong!");
+
+ RestoreHazardCheckerBottomUp();
+
+ ReleasePending();
+
+ ++NumBacktracks;
+}
+
+static bool isOperandOf(const SUnit *SU, SDNode *N) {
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->isOperandOf(N))
+ return true;
+ }
+ return false;
+}
+
+/// CopyAndMoveSuccessors - Clone the specified node and move its scheduled
+/// successors to the newly created node.
+SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
+ SDNode *N = SU->getNode();
+ if (!N)
+ return nullptr;
+
+ if (SU->getNode()->getGluedNode())
+ return nullptr;
+
+ SUnit *NewSU;
+ bool TryUnfold = false;
+ for (unsigned i = 0, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue)
+ return nullptr;
+ else if (VT == MVT::Other)
+ TryUnfold = true;
+ }
+ for (const SDValue &Op : N->op_values()) {
+ MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo());
+ if (VT == MVT::Glue)
+ return nullptr;
+ }
+
+ if (TryUnfold) {
+ SmallVector<SDNode*, 2> NewNodes;
+ if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes))
+ return nullptr;
+
+ // Unfolding an x86 DEC64m operation results in a store, a dec, and a load,
+ // which can't be handled here, so bail out.
+ if (NewNodes.size() == 3)
+ return nullptr;
+
+ DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n");
+ assert(NewNodes.size() == 2 && "Expected a load folding node!");
+
+ N = NewNodes[1];
+ SDNode *LoadNode = NewNodes[0];
+ unsigned NumVals = N->getNumValues();
+ unsigned OldNumVals = SU->getNode()->getNumValues();
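+ // Redirect all uses of the folded node's values to the unfolded node, and
+ // hook the old node's chain result up to the new load's chain.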
+ for (unsigned i = 0; i != NumVals; ++i)
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), i), SDValue(N, i));
+ DAG->ReplaceAllUsesOfValueWith(SDValue(SU->getNode(), OldNumVals-1),
+ SDValue(LoadNode, 1));
+
+ // LoadNode may already exist. This can happen when there is another
+ // load from the same location that produces the same type of value
+ // but has different alignment or volatility.
+ bool isNewLoad = true;
+ SUnit *LoadSU;
+ if (LoadNode->getNodeId() != -1) {
+ LoadSU = &SUnits[LoadNode->getNodeId()];
+ isNewLoad = false;
+ } else {
+ LoadSU = CreateNewSUnit(LoadNode);
+ LoadNode->setNodeId(LoadSU->NodeNum);
+
+ InitNumRegDefsLeft(LoadSU);
+ computeLatency(LoadSU);
+ }
+
+ NewSU = CreateNewSUnit(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NewSU->NodeNum);
+
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ NewSU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ NewSU->isCommutable = true;
+
+ InitNumRegDefsLeft(NewSU);
+ computeLatency(NewSU);
+
+ // Record all the edges to and from the old SU, by category.
+ SmallVector<SDep, 4> ChainPreds;
+ SmallVector<SDep, 4> ChainSuccs;
+ SmallVector<SDep, 4> LoadPreds;
+ SmallVector<SDep, 4> NodePreds;
+ SmallVector<SDep, 4> NodeSuccs;
+ for (SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ ChainPreds.push_back(Pred);
+ else if (isOperandOf(Pred.getSUnit(), LoadNode))
+ LoadPreds.push_back(Pred);
+ else
+ NodePreds.push_back(Pred);
+ }
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl())
+ ChainSuccs.push_back(Succ);
+ else
+ NodeSuccs.push_back(Succ);
+ }
+
+ // Now assign edges to the newly-created nodes.
+ for (const SDep &Pred : ChainPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (const SDep &Pred : LoadPreds) {
+ RemovePred(SU, Pred);
+ if (isNewLoad)
+ AddPred(LoadSU, Pred);
+ }
+ for (const SDep &Pred : NodePreds) {
+ RemovePred(SU, Pred);
+ AddPred(NewSU, Pred);
+ }
+ for (SDep D : NodeSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ D.setSUnit(NewSU);
+ AddPred(SuccDep, D);
+ // Balance register pressure.
+ if (AvailableQueue->tracksRegPressure() && SuccDep->isScheduled
+ && !D.isCtrl() && NewSU->NumRegDefsLeft > 0)
+ --NewSU->NumRegDefsLeft;
+ }
+ for (SDep D : ChainSuccs) {
+ SUnit *SuccDep = D.getSUnit();
+ D.setSUnit(SU);
+ RemovePred(SuccDep, D);
+ if (isNewLoad) {
+ D.setSUnit(LoadSU);
+ AddPred(SuccDep, D);
+ }
+ }
+
+ // Add a data dependency to reflect that NewSU reads the value defined
+ // by LoadSU.
+ SDep D(LoadSU, SDep::Data, 0);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
+
+ if (isNewLoad)
+ AvailableQueue->addNode(LoadSU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumUnfolds;
+
+ if (NewSU->NumSuccsLeft == 0) {
+ NewSU->isAvailable = true;
+ return NewSU;
+ }
+ SU = NewSU;
+ }
+
+ DEBUG(dbgs() << " Duplicating SU #" << SU->NodeNum << "\n");
+ NewSU = CreateClone(SU);
+
+ // New SUnit has the exact same predecessors.
+ for (SDep &Pred : SU->Preds)
+ if (!Pred.isArtificial())
+ AddPred(NewSU, Pred);
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(NewSU);
+ AddPred(SuccSU, D);
+ D.setSUnit(SU);
+ DelDeps.push_back(std::make_pair(SuccSU, D));
+ }
+ }
+ for (auto &DelDep : DelDeps)
+ RemovePred(DelDep.first, DelDep.second);
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(NewSU);
+
+ ++NumDups;
+ return NewSU;
+}
+
+/// InsertCopiesAndMoveSuccs - Insert register copies and move all
+/// scheduled successors of the given SUnit to the last copy.
+void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
+ const TargetRegisterClass *DestRC,
+ const TargetRegisterClass *SrcRC,
+ SmallVectorImpl<SUnit*> &Copies) {
+ SUnit *CopyFromSU = CreateNewSUnit(nullptr);
+ CopyFromSU->CopySrcRC = SrcRC;
+ CopyFromSU->CopyDstRC = DestRC;
+
+ SUnit *CopyToSU = CreateNewSUnit(nullptr);
+ CopyToSU->CopySrcRC = DestRC;
+ CopyToSU->CopyDstRC = SrcRC;
+
+ // Only copy scheduled successors. Cut them from old node's successor
+ // list and move them over.
+ SmallVector<std::pair<SUnit *, SDep>, 4> DelDeps;
+ for (SDep &Succ : SU->Succs) {
+ if (Succ.isArtificial())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->isScheduled) {
+ SDep D = Succ;
+ D.setSUnit(CopyToSU);
+ AddPred(SuccSU, D);
+ DelDeps.push_back(std::make_pair(SuccSU, Succ));
+ }
+ else {
+ // Avoid scheduling the def-side copy before other successors. Otherwise
+ // we could introduce another physreg interference on the copy and
+ // continue inserting copies indefinitely.
+ AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
+ }
+ }
+ for (auto &DelDep : DelDeps)
+ RemovePred(DelDep.first, DelDep.second);
+
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
+
+ AvailableQueue->updateNode(SU);
+ AvailableQueue->addNode(CopyFromSU);
+ AvailableQueue->addNode(CopyToSU);
+ Copies.push_back(CopyFromSU);
+ Copies.push_back(CopyToSU);
+
+ ++NumPRCopies;
+}
+
+/// getPhysicalRegisterVT - Returns the ValueType of the physical register
+/// definition of the specified node.
+/// FIXME: Move to SelectionDAG?
+static MVT getPhysicalRegisterVT(SDNode *N, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ unsigned NumRes;
+ if (N->getOpcode() == ISD::CopyFromReg) {
+ // CopyFromReg has: "chain, Val, glue" so result 1 gives the type.
+ NumRes = 1;
+ } else {
+ const MCInstrDesc &MCID = TII->get(N->getMachineOpcode());
+ assert(MCID.ImplicitDefs && "Physical reg def must be in implicit def list!");
+ NumRes = MCID.getNumDefs();
+ for (const MCPhysReg *ImpDef = MCID.getImplicitDefs(); *ImpDef; ++ImpDef) {
+ if (Reg == *ImpDef)
+ break;
+ ++NumRes;
+ }
+ }
+ return N->getSimpleValueType(NumRes);
+}
+
+ /// CheckForLiveRegDef - Update the live register vector (LRegs) if the
+ /// specified register def of the specified SUnit clobbers any "live" registers.
+static void CheckForLiveRegDef(SUnit *SU, unsigned Reg,
+ SUnit **LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs,
+ const TargetRegisterInfo *TRI) {
+ for (MCRegAliasIterator AliasI(Reg, TRI, true); AliasI.isValid(); ++AliasI) {
+
+ // Check if Reg (or an alias of it) is live.
+ if (!LiveRegDefs[*AliasI]) continue;
+
+ // Allow multiple uses of the same def.
+ if (LiveRegDefs[*AliasI] == SU) continue;
+
+ // Add Reg to the set of interfering live regs.
+ if (RegAdded.insert(*AliasI).second) {
+ LRegs.push_back(*AliasI);
+ }
+ }
+}
+
+/// CheckForLiveRegDefMasked - Check for any live physregs that are clobbered
+/// by RegMask, and add them to LRegs.
+static void CheckForLiveRegDefMasked(SUnit *SU, const uint32_t *RegMask,
+ ArrayRef<SUnit*> LiveRegDefs,
+ SmallSet<unsigned, 4> &RegAdded,
+ SmallVectorImpl<unsigned> &LRegs) {
+ // Look at all live registers. Skip Reg0 and the special CallResource.
+ for (unsigned i = 1, e = LiveRegDefs.size()-1; i != e; ++i) {
+ if (!LiveRegDefs[i]) continue;
+ if (LiveRegDefs[i] == SU) continue;
+ if (!MachineOperand::clobbersPhysReg(RegMask, i)) continue;
+ if (RegAdded.insert(i).second)
+ LRegs.push_back(i);
+ }
+}
+
+/// getNodeRegMask - Returns the register mask attached to an SDNode, if any.
+static const uint32_t *getNodeRegMask(const SDNode *N) {
+ for (const SDValue &Op : N->op_values())
+ if (const auto *RegOp = dyn_cast<RegisterMaskSDNode>(Op.getNode()))
+ return RegOp->getRegMask();
+ return nullptr;
+}
+
+/// DelayForLiveRegsBottomUp - Returns true if it is necessary to delay
+/// scheduling of the given node to satisfy live physical register dependencies.
+/// If the specific node is the last one that's available to schedule, do
+/// whatever is necessary (i.e. backtracking or cloning) to make it possible.
+bool ScheduleDAGRRList::
+DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
+ if (NumLiveRegs == 0)
+ return false;
+
+ SmallSet<unsigned, 4> RegAdded;
+ // If this node would clobber any "live" register, then it's not ready.
+ //
+ // If SU is the currently live definition of the same register that it uses,
+ // then we are free to schedule it.
+ for (SUnit::pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isAssignedRegDep() && LiveRegDefs[I->getReg()] != SU)
+ CheckForLiveRegDef(I->getSUnit(), I->getReg(), LiveRegDefs.get(),
+ RegAdded, LRegs, TRI);
+ }
+
+ for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode()) {
+ if (Node->getOpcode() == ISD::INLINEASM) {
+ // Inline asm can clobber physical defs.
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Glue)
+ --NumOps; // Ignore the glue operand.
+
+ for (unsigned i = InlineAsm::Op_FirstOperand; i != NumOps;) {
+ unsigned Flags =
+ cast<ConstantSDNode>(Node->getOperand(i))->getZExtValue();
+ unsigned NumVals = InlineAsm::getNumOperandRegisters(Flags);
+
+ ++i; // Skip the ID value.
+ if (InlineAsm::isRegDefKind(Flags) ||
+ InlineAsm::isRegDefEarlyClobberKind(Flags) ||
+ InlineAsm::isClobberKind(Flags)) {
+ // Check for def of register or earlyclobber register.
+ for (; NumVals; --NumVals, ++i) {
+ unsigned Reg = cast<RegisterSDNode>(Node->getOperand(i))->getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ CheckForLiveRegDef(SU, Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+ } else
+ i += NumVals;
+ }
+ continue;
+ }
+
+ if (!Node->isMachineOpcode())
+ continue;
+ // If we're in the middle of scheduling a call, don't begin scheduling
+ // another call. Also, don't allow any physical registers to be live across
+ // the call.
+ if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+ // Check the special calling-sequence resource.
+ unsigned CallResource = TRI->getNumRegs();
+ if (LiveRegDefs[CallResource]) {
+ SDNode *Gen = LiveRegGens[CallResource]->getNode();
+ while (SDNode *Glued = Gen->getGluedNode())
+ Gen = Glued;
+ if (!IsChainDependent(Gen, Node, 0, TII) &&
+ RegAdded.insert(CallResource).second)
+ LRegs.push_back(CallResource);
+ }
+ }
+ if (const uint32_t *RegMask = getNodeRegMask(Node))
+ CheckForLiveRegDefMasked(SU, RegMask,
+ makeArrayRef(LiveRegDefs.get(), TRI->getNumRegs()),
+ RegAdded, LRegs);
+
+ const MCInstrDesc &MCID = TII->get(Node->getMachineOpcode());
+ if (!MCID.ImplicitDefs)
+ continue;
+ for (const MCPhysReg *Reg = MCID.getImplicitDefs(); *Reg; ++Reg)
+ CheckForLiveRegDef(SU, *Reg, LiveRegDefs.get(), RegAdded, LRegs, TRI);
+ }
+
+ return !LRegs.empty();
+}
+
+void ScheduleDAGRRList::releaseInterferences(unsigned Reg) {
+ // Add the nodes that aren't ready back onto the available list.
+ for (unsigned i = Interferences.size(); i > 0; --i) {
+ SUnit *SU = Interferences[i-1];
+ LRegsMapT::iterator LRegsPos = LRegsMap.find(SU);
+ if (Reg) {
+ SmallVectorImpl<unsigned> &LRegs = LRegsPos->second;
+ if (std::find(LRegs.begin(), LRegs.end(), Reg) == LRegs.end())
+ continue;
+ }
+ SU->isPending = false;
+ // The interfering node may no longer be available due to backtracking.
+ // Furthermore, it may have been made available again, in which case it is
+ // now already in the AvailableQueue.
+ if (SU->isAvailable && !SU->NodeQueueId) {
+ DEBUG(dbgs() << " Repushing SU #" << SU->NodeNum << '\n');
+ AvailableQueue->push(SU);
+ }
+ if (i < Interferences.size())
+ Interferences[i-1] = Interferences.back();
+ Interferences.pop_back();
+ LRegsMap.erase(LRegsPos);
+ }
+}
+
+/// Return a node that can be scheduled in this cycle. Requirements:
+/// (1) Ready: latency has been satisfied
+/// (2) No Hazards: resources are available
+/// (3) No Interferences: may unschedule to break register interferences.
+SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
+ SUnit *CurSU = AvailableQueue->empty() ? nullptr : AvailableQueue->pop();
+ while (CurSU) {
+ SmallVector<unsigned, 4> LRegs;
+ if (!DelayForLiveRegsBottomUp(CurSU, LRegs))
+ break;
+ DEBUG(dbgs() << " Interfering reg " <<
+ (LRegs[0] == TRI->getNumRegs() ? "CallResource"
+ : TRI->getName(LRegs[0]))
+ << " SU #" << CurSU->NodeNum << '\n');
+ std::pair<LRegsMapT::iterator, bool> LRegsPair =
+ LRegsMap.insert(std::make_pair(CurSU, LRegs));
+ if (LRegsPair.second) {
+ CurSU->isPending = true; // This SU is not in AvailableQueue right now.
+ Interferences.push_back(CurSU);
+ }
+ else {
+ assert(CurSU->isPending && "Interferences are pending");
+ // Update the interference with current live regs.
+ LRegsPair.first->second = LRegs;
+ }
+ CurSU = AvailableQueue->pop();
+ }
+ if (CurSU)
+ return CurSU;
+
+ // All candidates are delayed due to live physical reg dependencies.
+ // Try backtracking, code duplication, or inserting cross class copies
+ // to resolve it.
+ for (SUnit *TrySU : Interferences) {
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+
+ // Try unscheduling up to the point where it's safe to schedule
+ // this node.
+ SUnit *BtSU = nullptr;
+ unsigned LiveCycle = UINT_MAX;
+ for (unsigned Reg : LRegs) {
+ if (LiveRegGens[Reg]->getHeight() < LiveCycle) {
+ BtSU = LiveRegGens[Reg];
+ LiveCycle = BtSU->getHeight();
+ }
+ }
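+ // BtSU is the LiveRegGens entry with the smallest height among the
+ // interfering registers; backtrack to it unless the artificial edge added
+ // below would create a cycle.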
+ if (!WillCreateCycle(TrySU, BtSU)) {
+ // BacktrackBottomUp mutates Interferences!
+ BacktrackBottomUp(TrySU, BtSU);
+
+ // Force the current node to be scheduled before the node that
+ // requires the physical reg dep.
+ if (BtSU->isAvailable) {
+ BtSU->isAvailable = false;
+ if (!BtSU->isPending)
+ AvailableQueue->remove(BtSU);
+ }
+ DEBUG(dbgs() << "ARTIFICIAL edge from SU(" << BtSU->NodeNum << ") to SU("
+ << TrySU->NodeNum << ")\n");
+ AddPred(TrySU, SDep(BtSU, SDep::Artificial));
+
+ // If one or more successors has been unscheduled, then the current
+ // node is no longer available.
+ if (!TrySU->isAvailable || !TrySU->NodeQueueId)
+ CurSU = AvailableQueue->pop();
+ else {
+ // Available and in AvailableQueue
+ AvailableQueue->remove(TrySU);
+ CurSU = TrySU;
+ }
+ // Interferences has been mutated. We must break.
+ break;
+ }
+ }
+
+ if (!CurSU) {
+ // Can't backtrack. If it's too expensive to copy the value, then try
+ // duplicating the nodes that produce these "too expensive to copy"
+ // values to break the dependency. In case even that doesn't work,
+ // insert cross class copies.
+ // If it's not too expensive, i.e. cost != -1, issue copies.
+ SUnit *TrySU = Interferences[0];
+ SmallVectorImpl<unsigned> &LRegs = LRegsMap[TrySU];
+ assert(LRegs.size() == 1 && "Can't handle this yet!");
+ unsigned Reg = LRegs[0];
+ SUnit *LRDef = LiveRegDefs[Reg];
+ MVT VT = getPhysicalRegisterVT(LRDef->getNode(), Reg, TII);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, VT);
+ const TargetRegisterClass *DestRC = TRI->getCrossCopyRegClass(RC);
+
+ // If the cross copy register class is the same as RC, then it must be
+ // possible to copy the value directly. Do not try to duplicate the def.
+ // If the cross copy register class is not the same as RC, then it is
+ // possible to copy the value, but it requires cross register class copies
+ // and is expensive.
+ // If the cross copy register class is null, then it's not possible to copy
+ // the value at all.
+ SUnit *NewDef = nullptr;
+ if (DestRC != RC) {
+ NewDef = CopyAndMoveSuccessors(LRDef);
+ if (!DestRC && !NewDef)
+ report_fatal_error("Can't handle live physical register dependency!");
+ }
+ if (!NewDef) {
+ // Issue copies, these can be expensive cross register class copies.
+ SmallVector<SUnit*, 2> Copies;
+ InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
+ DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
+ << " to SU #" << Copies.front()->NodeNum << "\n");
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
+ NewDef = Copies.back();
+ }
+
+ DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
+ << " to SU #" << TrySU->NodeNum << "\n");
+ LiveRegDefs[Reg] = NewDef;
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
+ TrySU->isAvailable = false;
+ CurSU = NewDef;
+ }
+ assert(CurSU && "Unable to resolve live physical register dependencies!");
+ return CurSU;
+}
+
+/// ListScheduleBottomUp - The main loop of list scheduling for bottom-up
+/// schedulers.
+void ScheduleDAGRRList::ListScheduleBottomUp() {
+ // Release any predecessors of the special Exit node.
+ ReleasePredecessors(&ExitSU);
+
+ // Add root to Available queue.
+ if (!SUnits.empty()) {
+ SUnit *RootSU = &SUnits[DAG->getRoot().getNode()->getNodeId()];
+ assert(RootSU->Succs.empty() && "Graph root shouldn't have successors!");
+ RootSU->isAvailable = true;
+ AvailableQueue->push(RootSU);
+ }
+
+ // While Available queue is not empty, grab the node with the highest
+ // priority. If it is not ready, put it back. Schedule the node.
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty() || !Interferences.empty()) {
+ DEBUG(dbgs() << "\nExamining Available:\n";
+ AvailableQueue->dump(this));
+
+ // Pick the best node to schedule taking all constraints into
+ // consideration.
+ SUnit *SU = PickNodeToScheduleBottomUp();
+
+ AdvancePastStalls(SU);
+
+ ScheduleNodeBottomUp(SU);
+
+ while (AvailableQueue->empty() && !PendingQueue.empty()) {
+ // Advance the cycle to free resources. Skip ahead to the next ready SU.
+ assert(MinAvailableCycle < UINT_MAX && "MinAvailableCycle uninitialized");
+ AdvanceToCycle(std::max(CurCycle + 1, MinAvailableCycle));
+ }
+ }
+
+ // Reverse the order if it is bottom up.
+ std::reverse(Sequence.begin(), Sequence.end());
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/true);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// RegReductionPriorityQueue Definition
+//===----------------------------------------------------------------------===//
+//
+// This is a SchedulingPriorityQueue that schedules using Sethi Ullman numbers
+// to reduce register pressure.
+//
+namespace {
+class RegReductionPQBase;
+
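+/// queue_sort - Common base for the priority comparators below; it provides
+/// a default isReady that accepts every node.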
+struct queue_sort : public std::binary_function<SUnit*, SUnit*, bool> {
+ bool isReady(SUnit* SU, unsigned CurCycle) const { return true; }
+};
+
+#ifndef NDEBUG
+template<class SF>
+struct reverse_sort : public queue_sort {
+ SF &SortFunc;
+ reverse_sort(SF &sf) : SortFunc(sf) {}
+
+ bool operator()(SUnit* left, SUnit* right) const {
+ // reverse left/right rather than simply !SortFunc(left, right)
+ // to expose different paths in the comparison logic.
+ return SortFunc(right, left);
+ }
+};
+#endif // NDEBUG
+
+/// bu_ls_rr_sort - Priority function for bottom up register pressure
+/// reduction scheduler.
+struct bu_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+ bu_ls_rr_sort(RegReductionPQBase *spq) : SPQ(spq) {}
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// src_ls_rr_sort - Priority function for source order scheduler.
+struct src_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+ src_ls_rr_sort(RegReductionPQBase *spq)
+ : SPQ(spq) {}
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// hybrid_ls_rr_sort - Priority function for hybrid scheduler.
+struct hybrid_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+ hybrid_ls_rr_sort(RegReductionPQBase *spq)
+ : SPQ(spq) {}
+
+ bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+// ilp_ls_rr_sort - Priority function for ILP (instruction level parallelism)
+// scheduler.
+struct ilp_ls_rr_sort : public queue_sort {
+ enum {
+ IsBottomUp = true,
+ HasReadyFilter = false
+ };
+
+ RegReductionPQBase *SPQ;
+ ilp_ls_rr_sort(RegReductionPQBase *spq)
+ : SPQ(spq) {}
+
+ bool isReady(SUnit *SU, unsigned CurCycle) const;
+
+ bool operator()(SUnit* left, SUnit* right) const;
+};
+
+class RegReductionPQBase : public SchedulingPriorityQueue {
+protected:
+ std::vector<SUnit*> Queue;
+ unsigned CurQueueId;
+ bool TracksRegPressure;
+ bool SrcOrder;
+
+ // SUnits - The SUnits for the current graph.
+ std::vector<SUnit> *SUnits;
+
+ MachineFunction &MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const TargetLowering *TLI;
+ ScheduleDAGRRList *scheduleDAG;
+
+ // SethiUllmanNumbers - The SethiUllman number for each node.
+ std::vector<unsigned> SethiUllmanNumbers;
+
+ /// RegPressure - Tracking current reg pressure per register class.
+ ///
+ std::vector<unsigned> RegPressure;
+
+ /// RegLimit - Tracking the number of allocatable registers per register
+ /// class.
+ std::vector<unsigned> RegLimit;
+
+public:
+ RegReductionPQBase(MachineFunction &mf,
+ bool hasReadyFilter,
+ bool tracksrp,
+ bool srcorder,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ const TargetLowering *tli)
+ : SchedulingPriorityQueue(hasReadyFilter),
+ CurQueueId(0), TracksRegPressure(tracksrp), SrcOrder(srcorder),
+ MF(mf), TII(tii), TRI(tri), TLI(tli), scheduleDAG(nullptr) {
+ if (TracksRegPressure) {
+ unsigned NumRC = TRI->getNumRegClasses();
+ RegLimit.resize(NumRC);
+ RegPressure.resize(NumRC);
+ std::fill(RegLimit.begin(), RegLimit.end(), 0);
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end(); I != E; ++I)
+ RegLimit[(*I)->getID()] = tri->getRegPressureLimit(*I, MF);
+ }
+ }
+
+ void setScheduleDAG(ScheduleDAGRRList *scheduleDag) {
+ scheduleDAG = scheduleDag;
+ }
+
+ ScheduleHazardRecognizer* getHazardRec() {
+ return scheduleDAG->getHazardRec();
+ }
+
+ void initNodes(std::vector<SUnit> &sunits) override;
+
+ void addNode(const SUnit *SU) override;
+
+ void updateNode(const SUnit *SU) override;
+
+ void releaseState() override {
+ SUnits = nullptr;
+ SethiUllmanNumbers.clear();
+ std::fill(RegPressure.begin(), RegPressure.end(), 0);
+ }
+
+ unsigned getNodePriority(const SUnit *SU) const;
+
+ unsigned getNodeOrdering(const SUnit *SU) const {
+ if (!SU->getNode()) return 0;
+
+ return SU->getNode()->getIROrder();
+ }
+
+ bool empty() const override { return Queue.empty(); }
+
+ void push(SUnit *U) override {
+ assert(!U->NodeQueueId && "Node in the queue already");
+ U->NodeQueueId = ++CurQueueId;
+ Queue.push_back(U);
+ }
+
+ void remove(SUnit *SU) override {
+ assert(!Queue.empty() && "Queue is empty!");
+ assert(SU->NodeQueueId != 0 && "Not in queue!");
+ std::vector<SUnit *>::iterator I = std::find(Queue.begin(), Queue.end(),
+ SU);
+ if (I != std::prev(Queue.end()))
+ std::swap(*I, Queue.back());
+ Queue.pop_back();
+ SU->NodeQueueId = 0;
+ }
+
+ bool tracksRegPressure() const override { return TracksRegPressure; }
+
+ void dumpRegPressure() const;
+
+ bool HighRegPressure(const SUnit *SU) const;
+
+ bool MayReduceRegPressure(SUnit *SU) const;
+
+ int RegPressureDiff(SUnit *SU, unsigned &LiveUses) const;
+
+ void scheduledNode(SUnit *SU) override;
+
+ void unscheduledNode(SUnit *SU) override;
+
+protected:
+ bool canClobber(const SUnit *SU, const SUnit *Op);
+ void AddPseudoTwoAddrDeps();
+ void PrescheduleNodesWithMultipleUses();
+ void CalculateSethiUllmanNumbers();
+};
+
+template<class SF>
+static SUnit *popFromQueueImpl(std::vector<SUnit*> &Q, SF &Picker) {
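+ // Linear scan for the best candidate according to Picker, then remove it
+ // from the vector with a swap-and-pop so no elements need to shift.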
+ std::vector<SUnit *>::iterator Best = Q.begin();
+ for (std::vector<SUnit *>::iterator I = std::next(Q.begin()),
+ E = Q.end(); I != E; ++I)
+ if (Picker(*Best, *I))
+ Best = I;
+ SUnit *V = *Best;
+ if (Best != std::prev(Q.end()))
+ std::swap(*Best, Q.back());
+ Q.pop_back();
+ return V;
+}
+
+template<class SF>
+SUnit *popFromQueue(std::vector<SUnit*> &Q, SF &Picker, ScheduleDAG *DAG) {
+#ifndef NDEBUG
+ if (DAG->StressSched) {
+ reverse_sort<SF> RPicker(Picker);
+ return popFromQueueImpl(Q, RPicker);
+ }
+#endif
+ (void)DAG;
+ return popFromQueueImpl(Q, Picker);
+}
+
+template<class SF>
+class RegReductionPriorityQueue : public RegReductionPQBase {
+ SF Picker;
+
+public:
+ RegReductionPriorityQueue(MachineFunction &mf,
+ bool tracksrp,
+ bool srcorder,
+ const TargetInstrInfo *tii,
+ const TargetRegisterInfo *tri,
+ const TargetLowering *tli)
+ : RegReductionPQBase(mf, SF::HasReadyFilter, tracksrp, srcorder,
+ tii, tri, tli),
+ Picker(this) {}
+
+ bool isBottomUp() const override { return SF::IsBottomUp; }
+
+ bool isReady(SUnit *U) const override {
+ return Picker.HasReadyFilter && Picker.isReady(U, getCurCycle());
+ }
+
+ SUnit *pop() override {
+ if (Queue.empty()) return nullptr;
+
+ SUnit *V = popFromQueue(Queue, Picker, scheduleDAG);
+ V->NodeQueueId = 0;
+ return V;
+ }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void dump(ScheduleDAG *DAG) const override {
+ // Emulate pop() without clobbering NodeQueueIds.
+ std::vector<SUnit*> DumpQueue = Queue;
+ SF DumpPicker = Picker;
+ while (!DumpQueue.empty()) {
+ SUnit *SU = popFromQueue(DumpQueue, DumpPicker, scheduleDAG);
+ dbgs() << "Height " << SU->getHeight() << ": ";
+ SU->dump(DAG);
+ }
+ }
+#endif
+};
+
+typedef RegReductionPriorityQueue<bu_ls_rr_sort>
+BURegReductionPriorityQueue;
+
+typedef RegReductionPriorityQueue<src_ls_rr_sort>
+SrcRegReductionPriorityQueue;
+
+typedef RegReductionPriorityQueue<hybrid_ls_rr_sort>
+HybridBURRPriorityQueue;
+
+typedef RegReductionPriorityQueue<ilp_ls_rr_sort>
+ILPBURRPriorityQueue;
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+// Static Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+// Check for special nodes that bypass scheduling heuristics.
+// Currently this pushes TokenFactor nodes down, but may be used for other
+// pseudo-ops as well.
+//
+// Return -1 to schedule right above left, 1 for left above right.
+// Return 0 if no bias exists.
+static int checkSpecialNodes(const SUnit *left, const SUnit *right) {
+ bool LSchedLow = left->isScheduleLow;
+ bool RSchedLow = right->isScheduleLow;
+ if (LSchedLow != RSchedLow)
+ return LSchedLow < RSchedLow ? 1 : -1;
+ return 0;
+}
+
+/// CalcNodeSethiUllmanNumber - Compute Sethi Ullman number.
+/// Smaller number is the higher priority.
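+/// For example, a leaf node gets 1; a node whose data operands have numbers
+/// 2 and 1 keeps 2; a node whose operands both have number 1 gets 2, since
+/// each tie at the running maximum adds one.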
+static unsigned
+CalcNodeSethiUllmanNumber(const SUnit *SU, std::vector<unsigned> &SUNumbers) {
+ unsigned &SethiUllmanNumber = SUNumbers[SU->NodeNum];
+ if (SethiUllmanNumber != 0)
+ return SethiUllmanNumber;
+
+ unsigned Extra = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ unsigned PredSethiUllman = CalcNodeSethiUllmanNumber(PredSU, SUNumbers);
+ if (PredSethiUllman > SethiUllmanNumber) {
+ SethiUllmanNumber = PredSethiUllman;
+ Extra = 0;
+ } else if (PredSethiUllman == SethiUllmanNumber)
+ ++Extra;
+ }
+
+ SethiUllmanNumber += Extra;
+
+ if (SethiUllmanNumber == 0)
+ SethiUllmanNumber = 1;
+
+ return SethiUllmanNumber;
+}
+
+/// CalculateSethiUllmanNumbers - Calculate Sethi-Ullman numbers of all
+/// scheduling units.
+void RegReductionPQBase::CalculateSethiUllmanNumbers() {
+ SethiUllmanNumbers.assign(SUnits->size(), 0);
+
+ for (const SUnit &SU : *SUnits)
+ CalcNodeSethiUllmanNumber(&SU, SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::addNode(const SUnit *SU) {
+ unsigned SUSize = SethiUllmanNumbers.size();
+ if (SUnits->size() > SUSize)
+ SethiUllmanNumbers.resize(SUSize*2, 0);
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+void RegReductionPQBase::updateNode(const SUnit *SU) {
+ SethiUllmanNumbers[SU->NodeNum] = 0;
+ CalcNodeSethiUllmanNumber(SU, SethiUllmanNumbers);
+}
+
+// Lower priority means schedule further down. For bottom-up scheduling, lower
+// priority SUs are scheduled before higher priority SUs.
+unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
+ assert(SU->NodeNum < SethiUllmanNumbers.size());
+ unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+ if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+ // CopyToReg should be close to its uses to facilitate coalescing and
+ // avoid spilling.
+ return 0;
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::INSERT_SUBREG)
+ // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+ // close to their uses to facilitate coalescing.
+ return 0;
+ if (SU->NumSuccs == 0 && SU->NumPreds != 0)
+ // If SU does not have a register use, i.e. it doesn't produce a value
+ // that would be consumed (e.g. store), then it terminates a chain of
+ // computation. Give it a large SethiUllman number so it will be
+ // scheduled right before its predecessors, so that it doesn't lengthen
+ // their live ranges.
+ return 0xffff;
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return 0;
+#if 1
+ return SethiUllmanNumbers[SU->NodeNum];
+#else
+ unsigned Priority = SethiUllmanNumbers[SU->NodeNum];
+ if (SU->isCallOp) {
+ // FIXME: This assumes all of the defs are used as call operands.
+ int NP = (int)Priority - SU->getNode()->getNumValues();
+ return (NP > 0) ? NP : 0;
+ }
+ return Priority;
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Register Pressure Tracking
+//===----------------------------------------------------------------------===//
+
+void RegReductionPQBase::dumpRegPressure() const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
+ E = TRI->regclass_end(); I != E; ++I) {
+ const TargetRegisterClass *RC = *I;
+ unsigned Id = RC->getID();
+ unsigned RP = RegPressure[Id];
+ if (!RP) continue;
+ DEBUG(dbgs() << TRI->getRegClassName(RC) << ": " << RP << " / "
+ << RegLimit[Id] << '\n');
+ }
+#endif
+}
+
+bool RegReductionPQBase::HighRegPressure(const SUnit *SU) const {
+ if (!TLI)
+ return false;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ continue;
+ }
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance()) {
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+
+ if ((RegPressure[RCId] + Cost) >= RegLimit[RCId])
+ return true;
+ }
+ }
+ return false;
+}
+
+bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const {
+ const SDNode *N = SU->getNode();
+
+ if (!N->isMachineOpcode() || !SU->NumSuccs)
+ return false;
+
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ return true;
+ }
+ return false;
+}
+
+// Compute the register pressure contribution by this instruction by counting up
+// for uses that are not live and down for defs. Only count register classes
+// that are already under high pressure. As a side effect, compute the number of
+// uses of registers that are already live.
+//
+// FIXME: This encompasses the logic in HighRegPressure and MayReduceRegPressure
+// so could probably be factored.
+int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const {
+ LiveUses = 0;
+ int PDiff = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ if (PredSU->getNode()->isMachineOpcode())
+ ++LiveUses;
+ continue;
+ }
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance()) {
+ MVT VT = RegDefPos.GetValue();
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ ++PDiff;
+ }
+ }
+ const SDNode *N = SU->getNode();
+
+ if (!N || !N->isMachineOpcode() || !SU->NumSuccs)
+ return PDiff;
+
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] >= RegLimit[RCId])
+ --PDiff;
+ }
+ return PDiff;
+}
+
+void RegReductionPQBase::scheduledNode(SUnit *SU) {
+ if (!TracksRegPressure)
+ return;
+
+ if (!SU->getNode())
+ return;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumRegDefsLeft is zero when enough uses of this node have been scheduled
+ // to cover the number of registers defined (they are all live).
+ if (PredSU->NumRegDefsLeft == 0) {
+ continue;
+ }
+ // FIXME: The ScheduleDAG currently loses information about which of a
+ // node's values is consumed by each dependence. Consequently, if the node
+ // defines multiple register classes, we don't know which to pressurize
+ // here. Instead the following loop consumes the register defs in an
+ // arbitrary order. At least it handles the common case of clustered loads
+ // to the same class. For precise liveness, each SDep needs to indicate the
+ // result number. But that tightly couples the ScheduleDAG with the
+ // SelectionDAG making updates tricky. A simpler hack would be to attach a
+ // value type or register class to SDep.
+ //
+ // The most important aspect of register tracking is balancing the increase
+ // here with the reduction further below. Note that this SU may use multiple
+ // defs in PredSU. They can't be determined here, but we've already
+ // compensated by reducing NumRegDefsLeft in PredSU during
+ // ScheduleDAGSDNodes::AddSchedEdges.
+ --PredSU->NumRegDefsLeft;
+ unsigned SkipRegDefs = PredSU->NumRegDefsLeft;
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+ if (SkipRegDefs)
+ continue;
+
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+ RegPressure[RCId] += Cost;
+ break;
+ }
+ }
+
+ // We should have this assert, but there may be dead SDNodes that never
+ // materialize as SUnits, so they don't appear to generate liveness.
+ //assert(SU->NumRegDefsLeft == 0 && "not all regdefs have scheduled uses");
+ int SkipRegDefs = (int)SU->NumRegDefsLeft;
+ for (ScheduleDAGSDNodes::RegDefIter RegDefPos(SU, scheduleDAG);
+ RegDefPos.IsValid(); RegDefPos.Advance(), --SkipRegDefs) {
+ if (SkipRegDefs > 0)
+ continue;
+ unsigned RCId, Cost;
+ GetCostForDef(RegDefPos, TLI, TII, TRI, RCId, Cost, MF);
+ if (RegPressure[RCId] < Cost) {
+ // Register pressure tracking is imprecise. This can happen. But we try
+ // hard not to let it happen because it likely results in poor scheduling.
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") has too many regdefs\n");
+ RegPressure[RCId] = 0;
+ }
+ else {
+ RegPressure[RCId] -= Cost;
+ }
+ }
+ dumpRegPressure();
+}
+
+void RegReductionPQBase::unscheduledNode(SUnit *SU) {
+ if (!TracksRegPressure)
+ return;
+
+ const SDNode *N = SU->getNode();
+ if (!N) return;
+
+ if (!N->isMachineOpcode()) {
+ if (N->getOpcode() != ISD::CopyToReg)
+ return;
+ } else {
+ unsigned Opc = N->getMachineOpcode();
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::INSERT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::REG_SEQUENCE ||
+ Opc == TargetOpcode::IMPLICIT_DEF)
+ return;
+ }
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl())
+ continue;
+ SUnit *PredSU = Pred.getSUnit();
+ // NumSuccsLeft counts all deps. Don't compare it with NumSuccs which only
+ // counts data deps.
+ if (PredSU->NumSuccsLeft != PredSU->Succs.size())
+ continue;
+ const SDNode *PN = PredSU->getNode();
+ if (!PN->isMachineOpcode()) {
+ if (PN->getOpcode() == ISD::CopyFromReg) {
+ MVT VT = PN->getSimpleValueType(0);
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ }
+ continue;
+ }
+ unsigned POpc = PN->getMachineOpcode();
+ if (POpc == TargetOpcode::IMPLICIT_DEF)
+ continue;
+ if (POpc == TargetOpcode::EXTRACT_SUBREG ||
+ POpc == TargetOpcode::INSERT_SUBREG ||
+ POpc == TargetOpcode::SUBREG_TO_REG) {
+ MVT VT = PN->getSimpleValueType(0);
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ continue;
+ }
+ unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs();
+ for (unsigned i = 0; i != NumDefs; ++i) {
+ MVT VT = PN->getSimpleValueType(i);
+ if (!PN->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ if (RegPressure[RCId] < TLI->getRepRegClassCostFor(VT))
+ // Register pressure tracking is imprecise. This can happen.
+ RegPressure[RCId] = 0;
+ else
+ RegPressure[RCId] -= TLI->getRepRegClassCostFor(VT);
+ }
+ }
+
+ // Check for isMachineOpcode() as PrescheduleNodesWithMultipleUses()
+ // may transfer data dependencies to CopyToReg.
+ if (SU->NumSuccs && N->isMachineOpcode()) {
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue || VT == MVT::Other)
+ continue;
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned RCId = TLI->getRepRegClassFor(VT)->getID();
+ RegPressure[RCId] += TLI->getRepRegClassCostFor(VT);
+ }
+ }
+
+ dumpRegPressure();
+}
+
+//===----------------------------------------------------------------------===//
+// Dynamic Node Priority for Register Pressure Reduction
+//===----------------------------------------------------------------------===//
+
+/// closestSucc - Returns the scheduled cycle of the successor which is
+/// closest to the current cycle.
+static unsigned closestSucc(const SUnit *SU) {
+ unsigned MaxHeight = 0;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue; // ignore chain succs
+ unsigned Height = Succ.getSUnit()->getHeight();
+ // If there are a bunch of CopyToRegs stacked up, they should be considered
+ // to be at the same position.
+ if (Succ.getSUnit()->getNode() &&
+ Succ.getSUnit()->getNode()->getOpcode() == ISD::CopyToReg)
+ Height = closestSucc(Succ.getSUnit())+1;
+ if (Height > MaxHeight)
+ MaxHeight = Height;
+ }
+ return MaxHeight;
+}
+
+/// calcMaxScratches - Returns a cost estimate of the worst-case requirement
+/// for scratch registers, i.e. the number of data dependencies.
+static unsigned calcMaxScratches(const SUnit *SU) {
+ unsigned Scratches = 0;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ Scratches++;
+ }
+ return Scratches;
+}
+
+/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are
+/// CopyFromReg from a virtual register.
+static bool hasOnlyLiveInOpers(const SUnit *SU) {
+ bool RetVal = false;
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ const SUnit *PredSU = Pred.getSUnit();
+ if (PredSU->getNode() &&
+ PredSU->getNode()->getOpcode() == ISD::CopyFromReg) {
+ unsigned Reg =
+ cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ RetVal = true;
+ continue;
+ }
+ }
+ return false;
+ }
+ return RetVal;
+}
+
+/// hasOnlyLiveOutUses - Return true if SU has only value successors that are
+/// CopyToReg to a virtual register. This SU def is probably a liveout and
+/// it has no other use. It should be scheduled closer to the terminator.
+static bool hasOnlyLiveOutUses(const SUnit *SU) {
+ bool RetVal = false;
+ for (const SDep &Succ : SU->Succs) {
+ if (Succ.isCtrl()) continue;
+ const SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU->getNode() && SuccSU->getNode()->getOpcode() == ISD::CopyToReg) {
+ unsigned Reg =
+ cast<RegisterSDNode>(SuccSU->getNode()->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ RetVal = true;
+ continue;
+ }
+ }
+ return false;
+ }
+ return RetVal;
+}
+
+// Set isVRegCycle for a node with only live in opers and live out uses. Also
+// set isVRegCycle for its CopyFromReg operands.
+//
+// This is only relevant for single-block loops, in which case the VRegCycle
+// node is likely an induction variable in which the operand and target virtual
+// registers should be coalesced (e.g. pre/post increment values). Setting the
+// isVRegCycle flag helps the scheduler prioritize other uses of the same
+// CopyFromReg so that this node becomes the virtual register "kill". This
+// avoids interference between the values live in and out of the block and
+// eliminates a copy inside the loop.
+static void initVRegCycle(SUnit *SU) {
+ if (DisableSchedVRegCycle)
+ return;
+
+ if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU))
+ return;
+
+ DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n");
+
+ SU->isVRegCycle = true;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue;
+ Pred.getSUnit()->isVRegCycle = true;
+ }
+}
+
+// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of
+// CopyFromReg operands. We should no longer penalize other uses of this VReg.
+static void resetVRegCycle(SUnit *SU) {
+ if (!SU->isVRegCycle)
+ return;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ SUnit *PredSU = Pred.getSUnit();
+ if (PredSU->isVRegCycle) {
+ assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg &&
+ "VRegCycle def must be CopyFromReg");
+ Pred.getSUnit()->isVRegCycle = false;
+ }
+ }
+}
+
+// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This
+// means a node that defines the VRegCycle has not been scheduled yet.
+static bool hasVRegCycleUse(const SUnit *SU) {
+ // If this SU also defines the VReg, don't hoist it as a "use".
+ if (SU->isVRegCycle)
+ return false;
+
+ for (const SDep &Pred : SU->Preds) {
+ if (Pred.isCtrl()) continue; // ignore chain preds
+ if (Pred.getSUnit()->isVRegCycle &&
+ Pred.getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) {
+ DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n");
+ return true;
+ }
+ }
+ return false;
+}
+
+// Check for either a dependence (latency) or resource (hazard) stall.
+//
+// Note: The ScheduleHazardRecognizer interface requires a non-const SU.
+static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) {
+ if ((int)SPQ->getCurCycle() < Height) return true;
+ if (SPQ->getHazardRec()->getHazardType(SU, 0)
+ != ScheduleHazardRecognizer::NoHazard)
+ return true;
+ return false;
+}
+
+// Return -1 if left has higher priority, 1 if right has higher priority.
+// Return 0 if latency-based priority is equivalent.
+static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref,
+ RegReductionPQBase *SPQ) {
+ // Scheduling an instruction that uses a VReg whose postincrement has not yet
+ // been scheduled will induce a copy. Model this as an extra cycle of latency.
+ int LPenalty = hasVRegCycleUse(left) ? 1 : 0;
+ int RPenalty = hasVRegCycleUse(right) ? 1 : 0;
+ int LHeight = (int)left->getHeight() + LPenalty;
+ int RHeight = (int)right->getHeight() + RPenalty;
+
+ bool LStall = (!checkPref || left->SchedulingPref == Sched::ILP) &&
+ BUHasStall(left, LHeight, SPQ);
+ bool RStall = (!checkPref || right->SchedulingPref == Sched::ILP) &&
+ BUHasStall(right, RHeight, SPQ);
+
+ // If scheduling one of the nodes will cause a pipeline stall, delay it.
+ // If scheduling both nodes will cause pipeline stalls, sort them
+ // according to their height.
+ if (LStall) {
+ if (!RStall)
+ return 1;
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+ } else if (RStall)
+ return -1;
+
+ // If either node is scheduling for latency, sort them by height/depth
+ // and latency.
+ if (!checkPref || (left->SchedulingPref == Sched::ILP ||
+ right->SchedulingPref == Sched::ILP)) {
+ // If neither instruction stalls (!LStall && !RStall) and HazardRecognizer
+ // is enabled, grouping instructions by cycle, then their heights are already
+ // covered so only their depths matter. We also reach this point if both stall
+ // but have the same height.
+ if (!SPQ->getHazardRec()->isEnabled()) {
+ if (LHeight != RHeight)
+ return LHeight > RHeight ? 1 : -1;
+ }
+ int LDepth = left->getDepth() - LPenalty;
+ int RDepth = right->getDepth() - RPenalty;
+ if (LDepth != RDepth) {
+ DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum
+ << ") depth " << LDepth << " vs SU (" << right->NodeNum
+ << ") depth " << RDepth << "\n");
+ return LDepth < RDepth ? 1 : -1;
+ }
+ if (left->Latency != right->Latency)
+ return left->Latency > right->Latency ? 1 : -1;
+ }
+ return 0;
+}
+
+static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) {
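+ // Tie-break cascade, in order: physreg-def proximity, Sethi-Ullman
+ // priority, source order when calls are involved, distance to the closest
+ // successor, scratch register count, latency (or height/depth), and
+ // finally queue order.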
+ // Schedule physical register definitions close to their use. This is
+ // motivated by microarchitectures that can fuse cmp+jump macro-ops. But as
+ // long as shortening physreg live ranges is generally good, we can defer
+ // creating a subtarget hook.
+ if (!DisableSchedPhysRegJoin) {
+ bool LHasPhysReg = left->hasPhysRegDefs;
+ bool RHasPhysReg = right->hasPhysRegDefs;
+ if (LHasPhysReg != RHasPhysReg) {
+ #ifndef NDEBUG
+ static const char *const PhysRegMsg[] = { " has no physreg",
+ " defines a physreg" };
+ #endif
+ DEBUG(dbgs() << " SU (" << left->NodeNum << ") "
+ << PhysRegMsg[LHasPhysReg] << " SU(" << right->NodeNum << ") "
+ << PhysRegMsg[RHasPhysReg] << "\n");
+ return LHasPhysReg < RHasPhysReg;
+ }
+ }
+
+ // Prioritize by Sethi-Ullman number and push CopyToReg nodes down.
+ unsigned LPriority = SPQ->getNodePriority(left);
+ unsigned RPriority = SPQ->getNodePriority(right);
+
+ // Be really careful about hoisting call operands above previous calls.
+ // Only allow it if it would reduce register pressure.
+ if (left->isCall && right->isCallOp) {
+ unsigned RNumVals = right->getNode()->getNumValues();
+ RPriority = (RPriority > RNumVals) ? (RPriority - RNumVals) : 0;
+ }
+ if (right->isCall && left->isCallOp) {
+ unsigned LNumVals = left->getNode()->getNumValues();
+ LPriority = (LPriority > LNumVals) ? (LPriority - LNumVals) : 0;
+ }
+
+ if (LPriority != RPriority)
+ return LPriority > RPriority;
+
+ // If one or both of the nodes are calls and their Sethi-Ullman numbers are
+ // the same, then keep source order.
+ if (left->isCall || right->isCall) {
+ unsigned LOrder = SPQ->getNodeOrdering(left);
+ unsigned ROrder = SPQ->getNodeOrdering(right);
+
+ // Prefer an ordering where the lower the non-zero order number, the higher
+ // the preference.
+ if ((LOrder || ROrder) && LOrder != ROrder)
+ return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+ }
+
+ // Try to schedule def + use closer when Sethi-Ullman numbers are the same.
+ // e.g.
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // and the following instructions are both ready.
+ // t2 = op c3
+ // t4 = op c4
+ //
+ // Then schedule t2 = op first.
+ // i.e.
+ // t4 = op c4
+ // t2 = op c3
+ // t1 = op t2, c1
+ // t3 = op t4, c2
+ //
+ // This creates more short live intervals.
+ unsigned LDist = closestSucc(left);
+ unsigned RDist = closestSucc(right);
+ if (LDist != RDist)
+ return LDist < RDist;
+
+ // How many registers become live when the node is scheduled.
+ unsigned LScratch = calcMaxScratches(left);
+ unsigned RScratch = calcMaxScratches(right);
+ if (LScratch != RScratch)
+ return LScratch > RScratch;
+
+ // Comparing latency against a call makes little sense unless the node
+ // is register pressure-neutral.
+ if ((left->isCall && RPriority > 0) || (right->isCall && LPriority > 0))
+ return (left->NodeQueueId > right->NodeQueueId);
+
+ // Do not compare latencies when one or both of the nodes are calls.
+ if (!DisableSchedCycles &&
+ !(left->isCall || right->isCall)) {
+ int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ);
+ if (result != 0)
+ return result > 0;
+ }
+ else {
+ if (left->getHeight() != right->getHeight())
+ return left->getHeight() > right->getHeight();
+
+ if (left->getDepth() != right->getDepth())
+ return left->getDepth() < right->getDepth();
+ }
+
+ assert(left->NodeQueueId && right->NodeQueueId &&
+ "NodeQueueId cannot be zero");
+ return (left->NodeQueueId > right->NodeQueueId);
+}
+
+// Bottom up
+bool bu_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ return BURRSort(left, right, SPQ);
+}
+
+// Source order, otherwise bottom up.
+bool src_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ unsigned LOrder = SPQ->getNodeOrdering(left);
+ unsigned ROrder = SPQ->getNodeOrdering(right);
+
+ // Prefer an ordering where the lower the non-zero order number, the higher
+ // the preference.
+ if ((LOrder || ROrder) && LOrder != ROrder)
+ return LOrder != 0 && (LOrder < ROrder || ROrder == 0);
+
+ return BURRSort(left, right, SPQ);
+}
+
+// If the time between now and when the instruction will be ready can cover
+// the spill code, then avoid adding it to the ready queue. This gives long
+// stalls highest priority and allows hoisting across calls. It should also
+// speed up processing the available queue.
+bool hybrid_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+ static const unsigned ReadyDelay = 3;
+
+ if (SPQ->MayReduceRegPressure(SU)) return true;
+
+ if (SU->getHeight() > (CurCycle + ReadyDelay)) return false;
+
+ if (SPQ->getHazardRec()->getHazardType(SU, -ReadyDelay)
+ != ScheduleHazardRecognizer::NoHazard)
+ return false;
+
+ return true;
+}
+
+// Return true if right should be scheduled with higher priority than left.
+bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ if (left->isCall || right->isCall)
+ // No way to compute latency of calls.
+ return BURRSort(left, right, SPQ);
+
+ bool LHigh = SPQ->HighRegPressure(left);
+ bool RHigh = SPQ->HighRegPressure(right);
+ // Avoid causing spills. If register pressure is high, schedule for
+ // register pressure reduction.
+ if (LHigh && !RHigh) {
+ DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU("
+ << right->NodeNum << ")\n");
+ return true;
+ }
+ else if (!LHigh && RHigh) {
+ DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU("
+ << left->NodeNum << ")\n");
+ return false;
+ }
+ if (!LHigh && !RHigh) {
+ int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ);
+ if (result != 0)
+ return result > 0;
+ }
+ return BURRSort(left, right, SPQ);
+}
+
+// Schedule as many instructions in each cycle as possible. So don't make an
+// instruction available unless it is ready in the current cycle.
+bool ilp_ls_rr_sort::isReady(SUnit *SU, unsigned CurCycle) const {
+ if (SU->getHeight() > CurCycle) return false;
+
+ if (SPQ->getHazardRec()->getHazardType(SU, 0)
+ != ScheduleHazardRecognizer::NoHazard)
+ return false;
+
+ return true;
+}
+
+static bool canEnableCoalescing(SUnit *SU) {
+ unsigned Opc = SU->getNode() ? SU->getNode()->getOpcode() : 0;
+ if (Opc == ISD::TokenFactor || Opc == ISD::CopyToReg)
+ // CopyToReg should be close to its uses to facilitate coalescing and
+ // avoid spilling.
+ return true;
+
+ if (Opc == TargetOpcode::EXTRACT_SUBREG ||
+ Opc == TargetOpcode::SUBREG_TO_REG ||
+ Opc == TargetOpcode::INSERT_SUBREG)
+ // EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG nodes should be
+ // close to their uses to facilitate coalescing.
+ return true;
+
+ if (SU->NumPreds == 0 && SU->NumSuccs != 0)
+ // If SU does not have a register def, schedule it close to its uses
+ // because it does not lengthen any live ranges.
+ return true;
+
+ return false;
+}
+
+// list-ilp is currently an experimental scheduler that allows various
+// heuristics to be enabled prior to the normal register reduction logic.
+bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const {
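+ // Heuristic order: special nodes, calls, register pressure difference,
+ // coalescing opportunity, live-use count, stalls, critical path (depth),
+ // height, then the normal BURRSort tie-breakers.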
+ if (int res = checkSpecialNodes(left, right))
+ return res > 0;
+
+ if (left->isCall || right->isCall)
+ // No way to compute latency of calls.
+ return BURRSort(left, right, SPQ);
+
+ unsigned LLiveUses = 0, RLiveUses = 0;
+ int LPDiff = 0, RPDiff = 0;
+ if (!DisableSchedRegPressure || !DisableSchedLiveUses) {
+ LPDiff = SPQ->RegPressureDiff(left, LLiveUses);
+ RPDiff = SPQ->RegPressureDiff(right, RLiveUses);
+ }
+ if (!DisableSchedRegPressure && LPDiff != RPDiff) {
+ DEBUG(dbgs() << "RegPressureDiff SU(" << left->NodeNum << "): " << LPDiff
+ << " != SU(" << right->NodeNum << "): " << RPDiff << "\n");
+ return LPDiff > RPDiff;
+ }
+
+ if (!DisableSchedRegPressure && (LPDiff > 0 || RPDiff > 0)) {
+ bool LReduce = canEnableCoalescing(left);
+ bool RReduce = canEnableCoalescing(right);
+ if (LReduce && !RReduce) return false;
+ if (RReduce && !LReduce) return true;
+ }
+
+ if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) {
+ DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses
+ << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n");
+ return LLiveUses < RLiveUses;
+ }
+
+ if (!DisableSchedStalls) {
+ bool LStall = BUHasStall(left, left->getHeight(), SPQ);
+ bool RStall = BUHasStall(right, right->getHeight(), SPQ);
+ if (LStall != RStall)
+ return left->getHeight() > right->getHeight();
+ }
+
+ if (!DisableSchedCriticalPath) {
+ int spread = (int)left->getDepth() - (int)right->getDepth();
+ if (std::abs(spread) > MaxReorderWindow) {
+ DEBUG(dbgs() << "Depth of SU(" << left->NodeNum << "): "
+ << left->getDepth() << " != SU(" << right->NodeNum << "): "
+ << right->getDepth() << "\n");
+ return left->getDepth() < right->getDepth();
+ }
+ }
+
+ if (!DisableSchedHeight && left->getHeight() != right->getHeight()) {
+ int spread = (int)left->getHeight() - (int)right->getHeight();
+ if (std::abs(spread) > MaxReorderWindow)
+ return left->getHeight() > right->getHeight();
+ }
+
+ return BURRSort(left, right, SPQ);
+}
+
+void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) {
+ SUnits = &sunits;
+ // Add pseudo dependency edges for two-address nodes.
+ if (!Disable2AddrHack)
+ AddPseudoTwoAddrDeps();
+ // Reroute edges to nodes with multiple uses.
+ if (!TracksRegPressure && !SrcOrder)
+ PrescheduleNodesWithMultipleUses();
+ // Calculate node priorities.
+ CalculateSethiUllmanNumbers();
+
+ // For single block loops, mark nodes that look like canonical IV increments.
+ if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB))
+ for (SUnit &SU : sunits)
+ initVRegCycle(&SU);
+}
+
+//===----------------------------------------------------------------------===//
+// Preschedule for Register Pressure
+//===----------------------------------------------------------------------===//
+
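+/// canClobber - Return true if SU is a two-address node with a tied
+/// (def&use) operand that is produced by Op (or by the node Op was cloned
+/// from).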
+bool RegReductionPQBase::canClobber(const SUnit *SU, const SUnit *Op) {
+ if (SU->isTwoAddress) {
+ unsigned Opc = SU->getNode()->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ unsigned NumRes = MCID.getNumDefs();
+ unsigned NumOps = MCID.getNumOperands() - NumRes;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ if (MCID.getOperandConstraint(i+NumRes, MCOI::TIED_TO) != -1) {
+ SDNode *DU = SU->getNode()->getOperand(i).getNode();
+ if (DU->getNodeId() != -1 &&
+ Op->OrigNode == &(*SUnits)[DU->getNodeId()])
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// canClobberReachingPhysRegUse - True if SU would clobber one of its
+/// successors' explicit physregs whose definition can reach DepSU.
+/// i.e. DepSU should not be scheduled above SU.
+static bool canClobberReachingPhysRegUse(const SUnit *DepSU, const SUnit *SU,
+ ScheduleDAGRRList *scheduleDAG,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ const MCPhysReg *ImpDefs
+ = TII->get(SU->getNode()->getMachineOpcode()).getImplicitDefs();
+ const uint32_t *RegMask = getNodeRegMask(SU->getNode());
+ if (!ImpDefs && !RegMask)
+ return false;
+
+ for (const SDep &Succ : SU->Succs) {
+ SUnit *SuccSU = Succ.getSUnit();
+ for (const SDep &SuccPred : SuccSU->Preds) {
+ if (!SuccPred.isAssignedRegDep())
+ continue;
+
+ if (RegMask &&
+ MachineOperand::clobbersPhysReg(RegMask, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
+ return true;
+
+ if (ImpDefs)
+ for (const MCPhysReg *ImpDef = ImpDefs; *ImpDef; ++ImpDef)
+ // Return true if SU clobbers this physical register use and the
+ // definition of the register reaches from DepSU. IsReachable queries
+ // a topological forward sort of the DAG (following the successors).
+ if (TRI->regsOverlap(*ImpDef, SuccPred.getReg()) &&
+ scheduleDAG->IsReachable(DepSU, SuccPred.getSUnit()))
+ return true;
+ }
+ }
+ return false;
+}
+
+/// canClobberPhysRegDefs - True if SU would clobber one of SuccSU's
+/// physical register defs.
+static bool canClobberPhysRegDefs(const SUnit *SuccSU, const SUnit *SU,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ SDNode *N = SuccSU->getNode();
+ unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs();
+ const MCPhysReg *ImpDefs = TII->get(N->getMachineOpcode()).getImplicitDefs();
+ assert(ImpDefs && "Caller should check hasPhysRegDefs");
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (!SUNode->isMachineOpcode())
+ continue;
+ const MCPhysReg *SUImpDefs =
+ TII->get(SUNode->getMachineOpcode()).getImplicitDefs();
+ const uint32_t *SURegMask = getNodeRegMask(SUNode);
+ if (!SUImpDefs && !SURegMask)
+ continue;
+ for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) {
+ MVT VT = N->getSimpleValueType(i);
+ if (VT == MVT::Glue || VT == MVT::Other)
+ continue;
+ if (!N->hasAnyUseOfValue(i))
+ continue;
+ unsigned Reg = ImpDefs[i - NumDefs];
+ if (SURegMask && MachineOperand::clobbersPhysReg(SURegMask, Reg))
+ return true;
+ if (!SUImpDefs)
+ continue;
+ for (;*SUImpDefs; ++SUImpDefs) {
+ unsigned SUReg = *SUImpDefs;
+ if (TRI->regsOverlap(Reg, SUReg))
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+/// PrescheduleNodesWithMultipleUses - Nodes with multiple uses
+/// are not handled well by the general register pressure reduction
+/// heuristics. When presented with code like this:
+///
+/// N
+/// / |
+/// / |
+/// U store
+/// |
+/// ...
+///
+/// the heuristics tend to push the store up, but since the
+/// operand of the store has another use (U), this would increase
+/// the length of that other use (the U->N edge).
+///
+/// This function transforms code like the above to route U's
+/// dependence through the store when possible, like this:
+///
+/// N
+/// ||
+/// ||
+/// store
+/// |
+/// U
+/// |
+/// ...
+///
+/// This results in the store being scheduled immediately
+/// after N, which shortens the U->N live range, reducing
+/// register pressure.
+///
+void RegReductionPQBase::PrescheduleNodesWithMultipleUses() {
+ // Visit all the nodes in topological order, working top-down.
+ for (SUnit &SU : *SUnits) {
+ // For now, only look at nodes with no data successors, such as stores.
+ // These are especially important, due to the heuristics in
+ // getNodePriority for nodes with no data successors.
+ if (SU.NumSuccs != 0)
+ continue;
+ // For now, only look at nodes with exactly one data predecessor.
+ if (SU.NumPreds != 1)
+ continue;
+ // Avoid prescheduling copies to virtual registers, which don't behave
+ // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU.getNode())
+ if (N->getOpcode() == ISD::CopyToReg &&
+ TargetRegisterInfo::isVirtualRegister
+ (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ continue;
+
+ // Locate the single data predecessor.
+ SUnit *PredSU = nullptr;
+ for (const SDep &Pred : SU.Preds)
+ if (!Pred.isCtrl()) {
+ PredSU = Pred.getSUnit();
+ break;
+ }
+ assert(PredSU);
+
+ // Don't rewrite edges that carry physregs, because that requires additional
+ // support infrastructure.
+ if (PredSU->hasPhysRegDefs)
+ continue;
+ // Short-circuit the case where SU is PredSU's only data successor.
+ if (PredSU->NumSuccs == 1)
+ continue;
+ // Avoid prescheduling to copies from virtual registers, which don't behave
+ // like other nodes from the perspective of scheduling heuristics.
+ if (SDNode *N = SU.getNode())
+ if (N->getOpcode() == ISD::CopyFromReg &&
+ TargetRegisterInfo::isVirtualRegister
+ (cast<RegisterSDNode>(N->getOperand(1))->getReg()))
+ continue;
+
+ // Perform checks on the successors of PredSU.
+ for (const SDep &PredSucc : PredSU->Succs) {
+ SUnit *PredSuccSU = PredSucc.getSUnit();
+ if (PredSuccSU == &SU) continue;
+ // If PredSU has another successor with no data successors, for
+ // now don't attempt to choose either over the other.
+ if (PredSuccSU->NumSuccs == 0)
+ goto outer_loop_continue;
+ // Don't break physical register dependencies.
+ if (SU.hasPhysRegClobbers && PredSuccSU->hasPhysRegDefs)
+ if (canClobberPhysRegDefs(PredSuccSU, &SU, TII, TRI))
+ goto outer_loop_continue;
+ // Don't introduce graph cycles.
+ if (scheduleDAG->IsReachable(&SU, PredSuccSU))
+ goto outer_loop_continue;
+ }
+
+ // Ok, the transformation is safe and the heuristics suggest it is
+ // profitable. Update the graph.
+ DEBUG(dbgs() << " Prescheduling SU #" << SU.NodeNum
+ << " next to PredSU #" << PredSU->NodeNum
+ << " to guide scheduling in the presence of multiple uses\n");
+ for (unsigned i = 0; i != PredSU->Succs.size(); ++i) {
+ SDep Edge = PredSU->Succs[i];
+ assert(!Edge.isAssignedRegDep());
+ SUnit *SuccSU = Edge.getSUnit();
+ if (SuccSU != &SU) {
+ Edge.setSUnit(PredSU);
+ scheduleDAG->RemovePred(SuccSU, Edge);
+ scheduleDAG->AddPred(&SU, Edge);
+ Edge.setSUnit(&SU);
+ scheduleDAG->AddPred(SuccSU, Edge);
+ --i;
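+ // The removal above changed which edge sits at this index of
+ // PredSU->Succs, so step back and revisit it.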
+ }
+ }
+ outer_loop_continue:;
+ }
+}
+
+/// AddPseudoTwoAddrDeps - If two nodes share an operand and one of them uses
+/// it as a def&use operand, add a pseudo control edge from it to the other
+/// node (if it won't create a cycle) so the two-address one will be scheduled
+/// first (lower in the schedule). If both nodes are two-address, favor the
+/// one that has a CopyToReg use (more likely to be a loop induction update).
+/// If both are two-address, but one is commutable while the other is not
+/// commutable, favor the one that's not commutable.
+void RegReductionPQBase::AddPseudoTwoAddrDeps() {
+ for (SUnit &SU : *SUnits) {
+ if (!SU.isTwoAddress)
+ continue;
+
+ SDNode *Node = SU.getNode();
+ if (!Node || !Node->isMachineOpcode() || SU.getNode()->getGluedNode())
+ continue;
+
+ bool isLiveOut = hasOnlyLiveOutUses(&SU);
+ unsigned Opc = Node->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ unsigned NumRes = MCID.getNumDefs();
+ unsigned NumOps = MCID.getNumOperands() - NumRes;
+ for (unsigned j = 0; j != NumOps; ++j) {
+ if (MCID.getOperandConstraint(j+NumRes, MCOI::TIED_TO) == -1)
+ continue;
+ SDNode *DU = SU.getNode()->getOperand(j).getNode();
+ if (DU->getNodeId() == -1)
+ continue;
+ const SUnit *DUSU = &(*SUnits)[DU->getNodeId()];
+ if (!DUSU)
+ continue;
+ for (const SDep &Succ : DUSU->Succs) {
+ if (Succ.isCtrl())
+ continue;
+ SUnit *SuccSU = Succ.getSUnit();
+ if (SuccSU == &SU)
+ continue;
+ // Be conservative. Ignore if nodes aren't at roughly the same
+ // depth and height.
+ if (SuccSU->getHeight() < SU.getHeight() &&
+ (SU.getHeight() - SuccSU->getHeight()) > 1)
+ continue;
+ // Skip past COPY_TO_REGCLASS nodes, so that the pseudo edge
+ // constrains whatever is using the copy, instead of the copy
+ // itself. In the case that the copy is coalesced, this
+ // preserves the intent of the pseudo two-address heuristics.
+ while (SuccSU->Succs.size() == 1 &&
+ SuccSU->getNode()->isMachineOpcode() &&
+ SuccSU->getNode()->getMachineOpcode() ==
+ TargetOpcode::COPY_TO_REGCLASS)
+ SuccSU = SuccSU->Succs.front().getSUnit();
+ // Don't constrain non-instruction nodes.
+ if (!SuccSU->getNode() || !SuccSU->getNode()->isMachineOpcode())
+ continue;
+ // Don't constrain nodes with physical register defs if the
+ // predecessor can clobber them.
+ if (SuccSU->hasPhysRegDefs && SU.hasPhysRegClobbers) {
+ if (canClobberPhysRegDefs(SuccSU, &SU, TII, TRI))
+ continue;
+ }
+ // Don't constrain EXTRACT_SUBREG, INSERT_SUBREG, and SUBREG_TO_REG;
+ // these may be coalesced away. We want them close to their uses.
+ unsigned SuccOpc = SuccSU->getNode()->getMachineOpcode();
+ if (SuccOpc == TargetOpcode::EXTRACT_SUBREG ||
+ SuccOpc == TargetOpcode::INSERT_SUBREG ||
+ SuccOpc == TargetOpcode::SUBREG_TO_REG)
+ continue;
+ if (!canClobberReachingPhysRegUse(SuccSU, &SU, scheduleDAG, TII, TRI) &&
+ (!canClobber(SuccSU, DUSU) ||
+ (isLiveOut && !hasOnlyLiveOutUses(SuccSU)) ||
+ (!SU.isCommutable && SuccSU->isCommutable)) &&
+ !scheduleDAG->IsReachable(SuccSU, &SU)) {
+ DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
+ << SU.NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
+ scheduleDAG->AddPred(&SU, SDep(SuccSU, SDep::Artificial));
+ }
+ }
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+llvm::ScheduleDAGSDNodes *
+llvm::createBURRListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
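+ // Plain bottom-up register reduction: no register pressure tracking and no
+ // source-order preference, so no TargetLowering is needed.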
+ BURegReductionPriorityQueue *PQ =
+ new BURegReductionPriorityQueue(*IS->MF, false, false, TII, TRI, nullptr);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createSourceListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ SrcRegReductionPriorityQueue *PQ =
+ new SrcRegReductionPriorityQueue(*IS->MF, false, true, TII, TRI, nullptr);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, false, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createHybridListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetLowering *TLI = IS->TLI;
+
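+ // Hybrid scheduling: track register pressure (hence the TargetLowering) and
+ // fall back to latency-oriented heuristics when pressure is low.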
+ HybridBURRPriorityQueue *PQ =
+ new HybridBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
+
+llvm::ScheduleDAGSDNodes *
+llvm::createILPListDAGScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetSubtargetInfo &STI = IS->MF->getSubtarget();
+ const TargetInstrInfo *TII = STI.getInstrInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+ const TargetLowering *TLI = IS->TLI;
+
+ ILPBURRPriorityQueue *PQ =
+ new ILPBURRPriorityQueue(*IS->MF, true, false, TII, TRI, TLI);
+ ScheduleDAGRRList *SD = new ScheduleDAGRRList(*IS->MF, true, PQ, OptLevel);
+ PQ->setScheduleDAG(SD);
+ return SD;
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
new file mode 100644
index 000000000000..3be622f8c179
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -0,0 +1,913 @@
+//===--- ScheduleDAGSDNodes.cpp - Implement the ScheduleDAGSDNodes class --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the ScheduleDAGSDNodes class, a ScheduleDAG subclass used
+// by the SelectionDAG-based scheduling implementations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ScheduleDAGSDNodes.h"
+#include "InstrEmitter.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(LoadsClustered, "Number of loads clustered together");
+
+// This allows the latency-based scheduler to notice high latency instructions
+// without a target itinerary. The choice of number here has more to do with
+// balancing scheduler heuristics than with the actual machine latency.
+static cl::opt<int> HighLatencyCycles(
+ "sched-high-latency-cycles", cl::Hidden, cl::init(10),
+ cl::desc("Roughly estimate the number of cycles that 'long latency'"
+ " instructions take for targets with no itinerary"));
+
+ScheduleDAGSDNodes::ScheduleDAGSDNodes(MachineFunction &mf)
+ : ScheduleDAG(mf), BB(nullptr), DAG(nullptr),
+ InstrItins(mf.getSubtarget().getInstrItineraryData()) {}
+
+/// Run - perform scheduling.
+///
+void ScheduleDAGSDNodes::Run(SelectionDAG *dag, MachineBasicBlock *bb) {
+ BB = bb;
+ DAG = dag;
+
+ // Clear the scheduler's SUnit DAG.
+ ScheduleDAG::clearDAG();
+ Sequence.clear();
+
+ // Invoke the target's selection of scheduler.
+ Schedule();
+}
+
+/// newSUnit - Creates a new SUnit and returns a pointer to it.
+///
+SUnit *ScheduleDAGSDNodes::newSUnit(SDNode *N) {
+#ifndef NDEBUG
+ const SUnit *Addr = nullptr;
+ if (!SUnits.empty())
+ Addr = &SUnits[0];
+#endif
+ SUnits.emplace_back(N, (unsigned)SUnits.size());
+ assert((Addr == nullptr || Addr == &SUnits[0]) &&
+ "SUnits std::vector reallocated on the fly!");
+ SUnits.back().OrigNode = &SUnits.back();
+ SUnit *SU = &SUnits.back();
+ const TargetLowering &TLI = DAG->getTargetLoweringInfo();
+ if (!N ||
+ (N->isMachineOpcode() &&
+ N->getMachineOpcode() == TargetOpcode::IMPLICIT_DEF))
+ SU->SchedulingPref = Sched::None;
+ else
+ SU->SchedulingPref = TLI.getSchedulingPreference(N);
+ return SU;
+}
+
+SUnit *ScheduleDAGSDNodes::Clone(SUnit *Old) {
+ SUnit *SU = newSUnit(Old->getNode());
+ SU->OrigNode = Old->OrigNode;
+ SU->Latency = Old->Latency;
+ SU->isVRegCycle = Old->isVRegCycle;
+ SU->isCall = Old->isCall;
+ SU->isCallOp = Old->isCallOp;
+ SU->isTwoAddress = Old->isTwoAddress;
+ SU->isCommutable = Old->isCommutable;
+ SU->hasPhysRegDefs = Old->hasPhysRegDefs;
+ SU->hasPhysRegClobbers = Old->hasPhysRegClobbers;
+ SU->isScheduleHigh = Old->isScheduleHigh;
+ SU->isScheduleLow = Old->isScheduleLow;
+ SU->SchedulingPref = Old->SchedulingPref;
+ Old->isCloned = true;
+ return SU;
+}
+
+/// CheckForPhysRegDependency - Check if the dependency between def and use of
+/// a specified operand is a physical register dependency. If so, returns the
+/// register and the cost of copying the register.
+static void CheckForPhysRegDependency(SDNode *Def, SDNode *User, unsigned Op,
+ const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
+ unsigned &PhysReg, int &Cost) {
+ if (Op != 2 || User->getOpcode() != ISD::CopyToReg)
+ return;
+
+ unsigned Reg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ return;
+
+ unsigned ResNo = User->getOperand(2).getResNo();
+ if (Def->getOpcode() == ISD::CopyFromReg &&
+ cast<RegisterSDNode>(Def->getOperand(1))->getReg() == Reg) {
+ PhysReg = Reg;
+ } else if (Def->isMachineOpcode()) {
+ const MCInstrDesc &II = TII->get(Def->getMachineOpcode());
+ if (ResNo >= II.getNumDefs() &&
+ II.ImplicitDefs[ResNo - II.getNumDefs()] == Reg)
+ PhysReg = Reg;
+ }
+
+ if (PhysReg != 0) {
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, Def->getSimpleValueType(ResNo));
+ Cost = RC->getCopyCost();
+ }
+}
+
+// Helper for AddGlue to clone node operands.
+static void CloneNodeWithValues(SDNode *N, SelectionDAG *DAG, ArrayRef<EVT> VTs,
+ SDValue ExtraOper = SDValue()) {
+ SmallVector<SDValue, 8> Ops(N->op_begin(), N->op_end());
+ if (ExtraOper.getNode())
+ Ops.push_back(ExtraOper);
+
+ SDVTList VTList = DAG->getVTList(VTs);
+ MachineSDNode::mmo_iterator Begin = nullptr, End = nullptr;
+ MachineSDNode *MN = dyn_cast<MachineSDNode>(N);
+
+ // Store memory references.
+ if (MN) {
+ Begin = MN->memoperands_begin();
+ End = MN->memoperands_end();
+ }
+
+ DAG->MorphNodeTo(N, N->getOpcode(), VTList, Ops);
+
+ // Reset the memory references
+ if (MN)
+ MN->setMemRefs(Begin, End);
+}
+
+static bool AddGlue(SDNode *N, SDValue Glue, bool AddGlue, SelectionDAG *DAG) {
+ SDNode *GlueDestNode = Glue.getNode();
+
+ // Don't add glue from a node to itself.
+ if (GlueDestNode == N) return false;
+
+ // Don't add a glue operand to something that already uses glue.
+ if (GlueDestNode &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+ return false;
+ }
+ // Don't add glue to something that already has a glue value.
+ if (N->getValueType(N->getNumValues() - 1) == MVT::Glue) return false;
+
+ SmallVector<EVT, 4> VTs(N->value_begin(), N->value_end());
+ if (AddGlue)
+ VTs.push_back(MVT::Glue);
+
+ CloneNodeWithValues(N, DAG, VTs, Glue);
+
+ return true;
+}
+
+// Clean up after unsuccessful AddGlue. Use the standard method of morphing the
+// node even though simply shrinking the value list is sufficient.
+static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
+ assert((N->getValueType(N->getNumValues() - 1) == MVT::Glue &&
+ !N->hasAnyUseOfValue(N->getNumValues() - 1)) &&
+ "expected an unused glue value");
+
+ CloneNodeWithValues(N, DAG,
+ makeArrayRef(N->value_begin(), N->getNumValues() - 1));
+}
+
+/// ClusterNeighboringLoads - Force nearby loads together by "gluing" them.
+/// This function finds loads with the same base pointer but different offsets.
+/// If the offsets are not far apart (target specific), it adds MVT::Glue inputs
+/// and outputs to ensure they are scheduled together and in order. This
+/// optimization may benefit some targets by improving cache locality.
+void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
+ SDNode *Chain = nullptr;
+ unsigned NumOps = Node->getNumOperands();
+ if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
+ Chain = Node->getOperand(NumOps-1).getNode();
+ if (!Chain)
+ return;
+
+ // Look for other loads of the same chain. Find loads that are loading from
+ // the same base pointer and different offsets.
+ SmallPtrSet<SDNode*, 16> Visited;
+ SmallVector<int64_t, 4> Offsets;
+ DenseMap<long long, SDNode*> O2SMap; // Map from offset to SDNode.
+ bool Cluster = false;
+ SDNode *Base = Node;
+ // This algorithm requires a reasonably low use count before finding a match
+ // to avoid uselessly blowing up compile time in large blocks.
+ unsigned UseCount = 0;
+ for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
+ I != E && UseCount < 100; ++I, ++UseCount) {
+ SDNode *User = *I;
+ if (User == Node || !Visited.insert(User).second)
+ continue;
+ int64_t Offset1, Offset2;
+ if (!TII->areLoadsFromSameBasePtr(Base, User, Offset1, Offset2) ||
+ Offset1 == Offset2)
+ // FIXME: Should be ok if their addresses are identical. But earlier
+ // optimizations really should have eliminated one of the loads.
+ continue;
+ if (O2SMap.insert(std::make_pair(Offset1, Base)).second)
+ Offsets.push_back(Offset1);
+ O2SMap.insert(std::make_pair(Offset2, User));
+ Offsets.push_back(Offset2);
+ if (Offset2 < Offset1)
+ Base = User;
+ Cluster = true;
+ // Reset UseCount to allow more matches.
+ UseCount = 0;
+ }
+
+ if (!Cluster)
+ return;
+
+ // Sort them in increasing order.
+ std::sort(Offsets.begin(), Offsets.end());
+
+ // Check if the loads are close enough.
+ SmallVector<SDNode*, 4> Loads;
+ unsigned NumLoads = 0;
+ int64_t BaseOff = Offsets[0];
+ SDNode *BaseLoad = O2SMap[BaseOff];
+ Loads.push_back(BaseLoad);
+ for (unsigned i = 1, e = Offsets.size(); i != e; ++i) {
+ int64_t Offset = Offsets[i];
+ SDNode *Load = O2SMap[Offset];
+ if (!TII->shouldScheduleLoadsNear(BaseLoad, Load, BaseOff, Offset, NumLoads))
+ break; // Stop right here. Ignore loads that are further away.
+ Loads.push_back(Load);
+ ++NumLoads;
+ }
+
+ if (NumLoads == 0)
+ return;
+
+ // Cluster loads by adding MVT::Glue outputs and inputs. This also
+ // ensures they are scheduled in order of increasing addresses.
+ SDNode *Lead = Loads[0];
+ SDValue InGlue = SDValue(nullptr, 0);
+ if (AddGlue(Lead, InGlue, true, DAG))
+ InGlue = SDValue(Lead, Lead->getNumValues() - 1);
+ for (unsigned I = 1, E = Loads.size(); I != E; ++I) {
+ bool OutGlue = I < E - 1;
+ SDNode *Load = Loads[I];
+
+ // If AddGlue fails, we could leave an unused glue value. This should not
+ // cause any problems; the dangling glue is cleaned up below via RemoveUnusedGlue.
+ if (AddGlue(Load, InGlue, OutGlue, DAG)) {
+ if (OutGlue)
+ InGlue = SDValue(Load, Load->getNumValues() - 1);
+
+ ++LoadsClustered;
+ }
+ else if (!OutGlue && InGlue.getNode())
+ RemoveUnusedGlue(InGlue.getNode(), DAG);
+ }
+}
+
+/// ClusterNodes - Cluster certain nodes which should be scheduled together.
+///
+void ScheduleDAGSDNodes::ClusterNodes() {
+ for (SDNode &NI : DAG->allnodes()) {
+ SDNode *Node = &NI;
+ if (!Node || !Node->isMachineOpcode())
+ continue;
+
+ unsigned Opc = Node->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ if (MCID.mayLoad())
+ // Cluster loads from "near" addresses into combined SUnits.
+ ClusterNeighboringLoads(Node);
+ }
+}
+
+void ScheduleDAGSDNodes::BuildSchedUnits() {
+ // During scheduling, the NodeId field of SDNode is used to map SDNodes
+ // to their associated SUnits by holding SUnits table indices. A value
+ // of -1 means the SDNode does not yet have an associated SUnit.
+ unsigned NumNodes = 0;
+ for (SDNode &NI : DAG->allnodes()) {
+ NI.setNodeId(-1);
+ ++NumNodes;
+ }
+
+ // Reserve entries in the vector for each of the SUnits we are creating. This
+ // ensures that reallocation of the vector won't happen, so SUnit*'s won't get
+ // invalidated.
+ // FIXME: Multiply by 2 because we may clone nodes during scheduling.
+ // This is a temporary workaround.
+ SUnits.reserve(NumNodes * 2);
+
+ // Add all nodes in depth first order.
+ SmallVector<SDNode*, 64> Worklist;
+ SmallPtrSet<SDNode*, 32> Visited;
+ Worklist.push_back(DAG->getRoot().getNode());
+ Visited.insert(DAG->getRoot().getNode());
+
+ SmallVector<SUnit*, 8> CallSUnits;
+ while (!Worklist.empty()) {
+ SDNode *NI = Worklist.pop_back_val();
+
+ // Add all operands to the worklist unless they've already been added.
+ for (const SDValue &Op : NI->op_values())
+ if (Visited.insert(Op.getNode()).second)
+ Worklist.push_back(Op.getNode());
+
+ if (isPassiveNode(NI)) // Leaf node, e.g. a TargetImmediate.
+ continue;
+
+ // If this node has already been processed, stop now.
+ if (NI->getNodeId() != -1) continue;
+
+ SUnit *NodeSUnit = newSUnit(NI);
+
+ // See if anything is glued to this node; if so, add those nodes to the glued
+ // group. Nodes can have at most one glue input and one glue output. Glue
+ // is required to be the last operand and result of a node.
+
+ // Scan up to find glued preds.
+ SDNode *N = NI;
+ while (N->getNumOperands() &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue) {
+ N = N->getOperand(N->getNumOperands()-1).getNode();
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+ NodeSUnit->isCall = true;
+ }
+
+ // Scan down to find any glued succs.
+ N = NI;
+ while (N->getValueType(N->getNumValues()-1) == MVT::Glue) {
+ SDValue GlueVal(N, N->getNumValues()-1);
+
+ // There are either zero or one users of the Glue result.
+ bool HasGlueUse = false;
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end();
+ UI != E; ++UI)
+ if (GlueVal.isOperandOf(*UI)) {
+ HasGlueUse = true;
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+ N = *UI;
+ if (N->isMachineOpcode() && TII->get(N->getMachineOpcode()).isCall())
+ NodeSUnit->isCall = true;
+ break;
+ }
+ if (!HasGlueUse) break;
+ }
+
+ if (NodeSUnit->isCall)
+ CallSUnits.push_back(NodeSUnit);
+
+ // Schedule zero-latency TokenFactor below any nodes that may increase the
+ // schedule height. Otherwise, ancestors of the TokenFactor may appear to
+ // have false stalls.
+ if (NI->getOpcode() == ISD::TokenFactor)
+ NodeSUnit->isScheduleLow = true;
+
+ // If there are glue operands involved, N is now the bottom-most node
+ // of the sequence of nodes that are glued together.
+ // Update the SUnit.
+ NodeSUnit->setNode(N);
+ assert(N->getNodeId() == -1 && "Node already inserted!");
+ N->setNodeId(NodeSUnit->NodeNum);
+
+ // Compute NumRegDefsLeft. This must be done before AddSchedEdges.
+ InitNumRegDefsLeft(NodeSUnit);
+
+ // Assign the Latency field of NodeSUnit using target-provided information.
+ computeLatency(NodeSUnit);
+ }
+
+ // Find all call operands.
+ while (!CallSUnits.empty()) {
+ SUnit *SU = CallSUnits.pop_back_val();
+ for (const SDNode *SUNode = SU->getNode(); SUNode;
+ SUNode = SUNode->getGluedNode()) {
+ if (SUNode->getOpcode() != ISD::CopyToReg)
+ continue;
+ SDNode *SrcN = SUNode->getOperand(2).getNode();
+ if (isPassiveNode(SrcN)) continue; // Not scheduled.
+ SUnit *SrcSU = &SUnits[SrcN->getNodeId()];
+ SrcSU->isCallOp = true;
+ }
+ }
+}
+
+void ScheduleDAGSDNodes::AddSchedEdges() {
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
+ // Check to see if the scheduler cares about latencies.
+ bool UnitLatencies = forceUnitLatencies();
+
+ // Pass 2: add the preds, succs, etc.
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+ SUnit *SU = &SUnits[su];
+ SDNode *MainNode = SU->getNode();
+
+ if (MainNode->isMachineOpcode()) {
+ unsigned Opc = MainNode->getMachineOpcode();
+ const MCInstrDesc &MCID = TII->get(Opc);
+ for (unsigned i = 0; i != MCID.getNumOperands(); ++i) {
+ if (MCID.getOperandConstraint(i, MCOI::TIED_TO) != -1) {
+ SU->isTwoAddress = true;
+ break;
+ }
+ }
+ if (MCID.isCommutable())
+ SU->isCommutable = true;
+ }
+
+ // Find all predecessors and successors of the group.
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode()) {
+ if (N->isMachineOpcode() &&
+ TII->get(N->getMachineOpcode()).getImplicitDefs()) {
+ SU->hasPhysRegClobbers = true;
+ unsigned NumUsed = InstrEmitter::CountResults(N);
+ while (NumUsed != 0 && !N->hasAnyUseOfValue(NumUsed - 1))
+ --NumUsed; // Skip over unused values at the end.
+ if (NumUsed > TII->get(N->getMachineOpcode()).getNumDefs())
+ SU->hasPhysRegDefs = true;
+ }
+
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ SDNode *OpN = N->getOperand(i).getNode();
+ if (isPassiveNode(OpN)) continue; // Not scheduled.
+ SUnit *OpSU = &SUnits[OpN->getNodeId()];
+ assert(OpSU && "Node has no SUnit!");
+ if (OpSU == SU) continue; // In the same group.
+
+ EVT OpVT = N->getOperand(i).getValueType();
+ assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!");
+ bool isChain = OpVT == MVT::Other;
+
+ unsigned PhysReg = 0;
+ int Cost = 1;
+ // Determine if this is a physical register dependency.
+ CheckForPhysRegDependency(OpN, N, i, TRI, TII, PhysReg, Cost);
+ assert((PhysReg == 0 || !isChain) &&
+ "Chain dependence via physreg data?");
+ // FIXME: See ScheduleDAGSDNodes::EmitCopyFromReg. For now, scheduler
+ // emits a copy from the physical register to a virtual register unless
+ // it requires a cross class copy (cost < 0). That means we are only
+ // treating "expensive to copy" register dependency as physical register
+ // dependency. This may change in the future though.
+ if (Cost >= 0 && !StressSched)
+ PhysReg = 0;
+
+ // If this is a ctrl dep, latency is 1.
+ unsigned OpLatency = isChain ? 1 : OpSU->Latency;
+ // Special-case TokenFactor chains as zero-latency.
+ if (isChain && OpN->getOpcode() == ISD::TokenFactor)
+ OpLatency = 0;
+
+ SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
+ : SDep(OpSU, SDep::Data, PhysReg);
+ Dep.setLatency(OpLatency);
+ if (!isChain && !UnitLatencies) {
+ computeOperandLatency(OpN, N, i, Dep);
+ ST.adjustSchedDependency(OpSU, SU, Dep);
+ }
+
+ if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
+ // Multiple register uses are combined in the same SUnit. For example,
+ // we could have a set of glued nodes with all their defs consumed by
+ // another set of glued nodes. Register pressure tracking sees this as
+ // a single use, so to keep pressure balanced we reduce the defs.
+ //
+ // We can't tell (without more book-keeping) if this results from
+ // glued nodes or duplicate operands. As long as we don't reduce
+ // NumRegDefsLeft to zero, we handle the common cases well.
+ --OpSU->NumRegDefsLeft;
+ }
+ }
+ }
+ }
+}
+
+/// BuildSchedGraph - Build the SUnit graph from the SelectionDAG we are given
+/// as input. This SUnit graph is similar to the SelectionDAG, but
+/// excludes nodes that aren't interesting to scheduling, and represents
+/// glued together nodes with a single SUnit.
+void ScheduleDAGSDNodes::BuildSchedGraph(AliasAnalysis *AA) {
+ // Cluster certain nodes which should be scheduled together.
+ ClusterNodes();
+ // Populate the SUnits array.
+ BuildSchedUnits();
+ // Compute all the scheduling dependencies between nodes.
+ AddSchedEdges();
+}
+
+// Initialize NodeNumDefs for the current Node's opcode.
+void ScheduleDAGSDNodes::RegDefIter::InitNodeNumDefs() {
+ // Check for phys reg copy.
+ if (!Node)
+ return;
+
+ if (!Node->isMachineOpcode()) {
+ if (Node->getOpcode() == ISD::CopyFromReg)
+ NodeNumDefs = 1;
+ else
+ NodeNumDefs = 0;
+ return;
+ }
+ unsigned POpc = Node->getMachineOpcode();
+ if (POpc == TargetOpcode::IMPLICIT_DEF) {
+ // No register need be allocated for this.
+ NodeNumDefs = 0;
+ return;
+ }
+ if (POpc == TargetOpcode::PATCHPOINT &&
+ Node->getValueType(0) == MVT::Other) {
+ // PATCHPOINT is defined to have one result, but it might really have none
+ // if we're not using CallingConv::AnyReg. Don't mistake the chain for a
+ // real definition.
+ NodeNumDefs = 0;
+ return;
+ }
+ unsigned NRegDefs = SchedDAG->TII->get(Node->getMachineOpcode()).getNumDefs();
+ // Some instructions define regs that are not represented in the selection DAG
+ // (e.g. unused flags). See tMOVi8. Make sure we don't access past NumValues.
+ NodeNumDefs = std::min(Node->getNumValues(), NRegDefs);
+ DefIdx = 0;
+}
+
+// Construct a RegDefIter for this SUnit and find the first valid value.
+ScheduleDAGSDNodes::RegDefIter::RegDefIter(const SUnit *SU,
+ const ScheduleDAGSDNodes *SD)
+ : SchedDAG(SD), Node(SU->getNode()), DefIdx(0), NodeNumDefs(0) {
+ InitNodeNumDefs();
+ Advance();
+}
+
+// Advance to the next valid value defined by the SUnit.
+void ScheduleDAGSDNodes::RegDefIter::Advance() {
+ for (;Node;) { // Visit all glued nodes.
+ for (;DefIdx < NodeNumDefs; ++DefIdx) {
+ if (!Node->hasAnyUseOfValue(DefIdx))
+ continue;
+ ValueType = Node->getSimpleValueType(DefIdx);
+ ++DefIdx;
+ return; // Found a normal regdef.
+ }
+ Node = Node->getGluedNode();
+ if (!Node) {
+ return; // No values left to visit.
+ }
+ InitNodeNumDefs();
+ }
+}
+
+void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) {
+ assert(SU->NumRegDefsLeft == 0 && "expect a new node");
+ for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) {
+ assert(SU->NumRegDefsLeft < USHRT_MAX && "overflow is ok but unexpected");
+ ++SU->NumRegDefsLeft;
+ }
+}
+
+void ScheduleDAGSDNodes::computeLatency(SUnit *SU) {
+ SDNode *N = SU->getNode();
+
+ // TokenFactor operands are considered zero latency, and some schedulers
+ // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero
+ // whenever node latency is nonzero.
+ if (N && N->getOpcode() == ISD::TokenFactor) {
+ SU->Latency = 0;
+ return;
+ }
+
+ // Check to see if the scheduler cares about latencies.
+ if (forceUnitLatencies()) {
+ SU->Latency = 1;
+ return;
+ }
+
+ if (!InstrItins || InstrItins->isEmpty()) {
+ if (N && N->isMachineOpcode() &&
+ TII->isHighLatencyDef(N->getMachineOpcode()))
+ SU->Latency = HighLatencyCycles;
+ else
+ SU->Latency = 1;
+ return;
+ }
+
+ // Compute the latency for the node. We use the sum of the latencies for
+ // all nodes glued together into this SUnit.
+ SU->Latency = 0;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ if (N->isMachineOpcode())
+ SU->Latency += TII->getInstrLatency(InstrItins, N);
+}
+
+void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
+ unsigned OpIdx, SDep& dep) const{
+ // Check to see if the scheduler cares about latencies.
+ if (forceUnitLatencies())
+ return;
+
+ if (dep.getKind() != SDep::Data)
+ return;
+
+ unsigned DefIdx = Use->getOperand(OpIdx).getResNo();
+ if (Use->isMachineOpcode())
+ // Adjust the use operand index by num of defs.
+ OpIdx += TII->get(Use->getMachineOpcode()).getNumDefs();
+ int Latency = TII->getOperandLatency(InstrItins, Def, DefIdx, Use, OpIdx);
+ if (Latency > 1 && Use->getOpcode() == ISD::CopyToReg &&
+ !BB->succ_empty()) {
+ unsigned Reg = cast<RegisterSDNode>(Use->getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ // This copy is a liveout value. It is likely coalesced, so reduce the
+ // latency so as not to penalize the def.
+ // FIXME: need target specific adjustment here?
+ Latency = (Latency > 1) ? Latency - 1 : 1;
+ }
+ if (Latency >= 0)
+ dep.setLatency(Latency);
+}
+
+void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ if (!SU->getNode()) {
+ dbgs() << "PHYS REG COPY\n";
+ return;
+ }
+
+ SU->getNode()->dump(DAG);
+ dbgs() << "\n";
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ dbgs() << " ";
+ GluedNodes.back()->dump(DAG);
+ dbgs() << "\n";
+ GluedNodes.pop_back();
+ }
+#endif
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGSDNodes::dumpSchedule() const {
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ if (SUnit *SU = Sequence[i])
+ SU->dump(this);
+ else
+ dbgs() << "**** NOOP ****\n";
+ }
+}
+#endif
+
+#ifndef NDEBUG
+/// VerifyScheduledSequence - Verify that all SUnits were scheduled and that
+/// their state is consistent with the nodes listed in Sequence.
+///
+void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
+ unsigned ScheduledNodes = ScheduleDAG::VerifyScheduledDAG(isBottomUp);
+ unsigned Noops = 0;
+ for (unsigned i = 0, e = Sequence.size(); i != e; ++i)
+ if (!Sequence[i])
+ ++Noops;
+ assert(Sequence.size() - Noops == ScheduledNodes &&
+ "The number of nodes scheduled doesn't match the expected number!");
+}
+#endif // NDEBUG
+
+/// ProcessSDDbgValues - Process SDDbgValues associated with this node.
+static void
+ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+ SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
+ DenseMap<SDValue, unsigned> &VRBaseMap, unsigned Order) {
+ if (!N->getHasDebugValue())
+ return;
+
+ // Opportunistically insert immediate dbg_value uses, i.e. those with source
+ // order number right after the N.
+ MachineBasicBlock *BB = Emitter.getBlock();
+ MachineBasicBlock::iterator InsertPos = Emitter.getInsertPos();
+ ArrayRef<SDDbgValue*> DVs = DAG->GetDbgValues(N);
+ for (unsigned i = 0, e = DVs.size(); i != e; ++i) {
+ if (DVs[i]->isInvalidated())
+ continue;
+ unsigned DVOrder = DVs[i]->getOrder();
+ if (!Order || DVOrder == ++Order) {
+ MachineInstr *DbgMI = Emitter.EmitDbgValue(DVs[i], VRBaseMap);
+ if (DbgMI) {
+ Orders.push_back(std::make_pair(DVOrder, DbgMI));
+ BB->insert(InsertPos, DbgMI);
+ }
+ DVs[i]->setIsInvalidated();
+ }
+ }
+}
+
+// ProcessSourceNode - Process nodes with source order numbers. These are added
+// to a vector which EmitSchedule uses to determine how to insert dbg_value
+// instructions in the right order.
+static void
+ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
+ DenseMap<SDValue, unsigned> &VRBaseMap,
+ SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
+ SmallSet<unsigned, 8> &Seen) {
+ unsigned Order = N->getIROrder();
+ if (!Order || !Seen.insert(Order).second) {
+ // Process any valid SDDbgValues even if the node does not have an order
+ // assigned.
+ ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, 0);
+ return;
+ }
+
+ MachineBasicBlock *BB = Emitter.getBlock();
+ if (Emitter.getInsertPos() == BB->begin() || BB->back().isPHI() ||
+ // Fast-isel may have inserted some instructions, in which case the
+ // BB->back().isPHI() test will not fire when we want it to.
+ std::prev(Emitter.getInsertPos())->isPHI()) {
+ // Did not insert any instruction.
+ Orders.push_back(std::make_pair(Order, (MachineInstr*)nullptr));
+ return;
+ }
+
+ Orders.push_back(std::make_pair(Order, &*std::prev(Emitter.getInsertPos())));
+ ProcessSDDbgValues(N, DAG, Emitter, Orders, VRBaseMap, Order);
+}
+
+void ScheduleDAGSDNodes::
+EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+ MachineBasicBlock::iterator InsertPos) {
+ for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
+ I != E; ++I) {
+ if (I->isCtrl()) continue; // ignore chain preds
+ if (I->getSUnit()->CopyDstRC) {
+ // Copy to physical register.
+ DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
+ assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
+ // Find the destination physical register.
+ unsigned Reg = 0;
+ for (SUnit::const_succ_iterator II = SU->Succs.begin(),
+ EE = SU->Succs.end(); II != EE; ++II) {
+ if (II->isCtrl()) continue; // ignore chain succs
+ if (II->getReg()) {
+ Reg = II->getReg();
+ break;
+ }
+ }
+ BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), Reg)
+ .addReg(VRI->second);
+ } else {
+ // Copy from physical register.
+ assert(I->getReg() && "Unknown physical register!");
+ unsigned VRBase = MRI.createVirtualRegister(SU->CopyDstRC);
+ bool isNew = VRBaseMap.insert(std::make_pair(SU, VRBase)).second;
+ (void)isNew; // Silence compiler warning.
+ assert(isNew && "Node emitted out of order - early");
+ BuildMI(*BB, InsertPos, DebugLoc(), TII->get(TargetOpcode::COPY), VRBase)
+ .addReg(I->getReg());
+ }
+ break;
+ }
+}
+
+/// EmitSchedule - Emit the machine code in scheduled order. Return the new
+/// InsertPos and MachineBasicBlock that contains this insertion
+/// point. ScheduleDAGSDNodes holds a BB pointer for convenience, but this does
+/// not necessarily refer to the returned BB. The emitter may split blocks.
+MachineBasicBlock *ScheduleDAGSDNodes::
+EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
+ InstrEmitter Emitter(BB, InsertPos);
+ DenseMap<SDValue, unsigned> VRBaseMap;
+ DenseMap<SUnit*, unsigned> CopyVRBaseMap;
+ SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
+ SmallSet<unsigned, 8> Seen;
+ bool HasDbg = DAG->hasDebugValues();
+
+ // If this is the first BB, emit byval parameter dbg_value's.
+ if (HasDbg && BB->getParent()->begin() == MachineFunction::iterator(BB)) {
+ SDDbgInfo::DbgIterator PDI = DAG->ByvalParmDbgBegin();
+ SDDbgInfo::DbgIterator PDE = DAG->ByvalParmDbgEnd();
+ for (; PDI != PDE; ++PDI) {
+ MachineInstr *DbgMI = Emitter.EmitDbgValue(*PDI, VRBaseMap);
+ if (DbgMI)
+ BB->insert(InsertPos, DbgMI);
+ }
+ }
+
+ for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
+ SUnit *SU = Sequence[i];
+ if (!SU) {
+ // Null SUnit* is a noop.
+ TII->insertNoop(*Emitter.getBlock(), InsertPos);
+ continue;
+ }
+
+ // For pre-regalloc scheduling, create instructions corresponding to the
+ // SDNode and any glued SDNodes and append them to the block.
+ if (!SU->getNode()) {
+ // Emit a copy.
+ EmitPhysRegCopy(SU, CopyVRBaseMap, InsertPos);
+ continue;
+ }
+
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU->getNode()->getGluedNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ SDNode *N = GluedNodes.back();
+ Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned,
+ VRBaseMap);
+ // Remember the source order of the inserted instruction.
+ if (HasDbg)
+ ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
+ GluedNodes.pop_back();
+ }
+ Emitter.EmitNode(SU->getNode(), SU->OrigNode != SU, SU->isCloned,
+ VRBaseMap);
+ // Remember the source order of the inserted instruction.
+ if (HasDbg)
+ ProcessSourceNode(SU->getNode(), DAG, Emitter, VRBaseMap, Orders,
+ Seen);
+ }
+
+ // Insert all the dbg_values which have not already been inserted in source
+ // order sequence.
+ if (HasDbg) {
+ MachineBasicBlock::iterator BBBegin = BB->getFirstNonPHI();
+
+ // Sort the source order instructions and use the order to insert debug
+ // values.
+ std::sort(Orders.begin(), Orders.end(), less_first());
+
+ SDDbgInfo::DbgIterator DI = DAG->DbgBegin();
+ SDDbgInfo::DbgIterator DE = DAG->DbgEnd();
+ // Now emit the rest according to source order.
+ unsigned LastOrder = 0;
+ for (unsigned i = 0, e = Orders.size(); i != e && DI != DE; ++i) {
+ unsigned Order = Orders[i].first;
+ MachineInstr *MI = Orders[i].second;
+ // Insert all SDDbgValue's whose order(s) are before "Order".
+ if (!MI)
+ continue;
+ for (; DI != DE &&
+ (*DI)->getOrder() >= LastOrder && (*DI)->getOrder() < Order; ++DI) {
+ if ((*DI)->isInvalidated())
+ continue;
+ MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap);
+ if (DbgMI) {
+ if (!LastOrder)
+ // Insert to start of the BB (after PHIs).
+ BB->insert(BBBegin, DbgMI);
+ else {
+ // Insert at the instruction, which may be in a different
+ // block, if the block was split by a custom inserter.
+ MachineBasicBlock::iterator Pos = MI;
+ MI->getParent()->insert(Pos, DbgMI);
+ }
+ }
+ }
+ LastOrder = Order;
+ }
+ // Add trailing DbgValue's before the terminator. FIXME: May want to add
+ // some of them before one or more conditional branches?
+ SmallVector<MachineInstr*, 8> DbgMIs;
+ while (DI != DE) {
+ if (!(*DI)->isInvalidated())
+ if (MachineInstr *DbgMI = Emitter.EmitDbgValue(*DI, VRBaseMap))
+ DbgMIs.push_back(DbgMI);
+ ++DI;
+ }
+
+ MachineBasicBlock *InsertBB = Emitter.getBlock();
+ MachineBasicBlock::iterator Pos = InsertBB->getFirstTerminator();
+ InsertBB->insert(Pos, DbgMIs.begin(), DbgMIs.end());
+ }
+
+ InsertPos = Emitter.getInsertPos();
+ return Emitter.getBlock();
+}
+
+/// Return a name for the DAG, derived from the owning basic block's label.
+std::string ScheduleDAGSDNodes::getDAGName() const {
+ return "sunit-dag." + BB->getFullName();
+}
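
The clustering path above (ClusterNeighboringLoads) hinges on collecting each load's offset from a shared base pointer, sorting the offsets, and accepting neighbours until the target hook shouldScheduleLoadsNear declines. A rough standalone sketch of that grouping step, with a fixed distance window standing in for the target hook (all names here are illustrative, not the LLVM API):

#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

// Group offsets measured from one base pointer: starting from the smallest offset,
// keep accepting loads while they stay within MaxDist of the first one.
std::vector<int64_t> clusterOffsets(const std::map<int64_t, int> &OffsetToLoad,
                                    int64_t MaxDist) {
  std::vector<int64_t> Offsets;
  for (const auto &KV : OffsetToLoad)
    Offsets.push_back(KV.first);            // std::map already yields sorted keys
  std::vector<int64_t> Cluster;
  if (Offsets.empty())
    return Cluster;
  int64_t BaseOff = Offsets.front();
  for (int64_t Off : Offsets) {
    if (Off - BaseOff > MaxDist)
      break;                                // too far away: stop, as the real code does
    Cluster.push_back(Off);
  }
  return Cluster;
}

int main() {
  std::map<int64_t, int> Loads = {{0, 1}, {8, 2}, {16, 3}, {512, 4}};
  for (int64_t Off : clusterOffsets(Loads, 64))
    std::cout << "cluster load at offset " << Off << "\n"; // prints 0, 8, 16
}

In the real code the accepted loads are then glued together so the scheduler emits them back to back; the sketch stops at the grouping decision.
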
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
new file mode 100644
index 000000000000..5cc806668b12
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -0,0 +1,180 @@
+//===---- ScheduleDAGSDNodes.h - SDNode Scheduling --------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the ScheduleDAGSDNodes class, which implements
+// scheduling for an SDNode-based dependency graph.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SCHEDULEDAGSDNODES_H
+
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+namespace llvm {
+ /// ScheduleDAGSDNodes - A ScheduleDAG for scheduling SDNode-based DAGs.
+ ///
+ /// Edges between SUnits are initially based on edges in the SelectionDAG,
+ /// and additional edges can be added by the schedulers as heuristics.
+ /// SDNodes such as Constants, Registers, and a few others that are not
+ /// interesting to schedulers are not allocated SUnits.
+ ///
+ /// SDNodes with MVT::Glue operands are grouped along with the flagged
+ /// nodes into a single SUnit so that they are scheduled together.
+ ///
+ /// SDNode-based scheduling graphs do not use SDep::Anti or SDep::Output
+ /// edges. Physical register dependence information is not carried in
+ /// the DAG and must be handled explicitly by schedulers.
+ ///
+ class ScheduleDAGSDNodes : public ScheduleDAG {
+ public:
+ MachineBasicBlock *BB;
+ SelectionDAG *DAG; // DAG of the current basic block
+ const InstrItineraryData *InstrItins;
+
+ /// The schedule. Null SUnit*'s represent noop instructions.
+ std::vector<SUnit*> Sequence;
+
+ explicit ScheduleDAGSDNodes(MachineFunction &mf);
+
+ ~ScheduleDAGSDNodes() override {}
+
+ /// Run - perform scheduling.
+ ///
+ void Run(SelectionDAG *dag, MachineBasicBlock *bb);
+
+ /// isPassiveNode - Return true if the node is a non-scheduled leaf.
+ ///
+ static bool isPassiveNode(SDNode *Node) {
+ if (isa<ConstantSDNode>(Node)) return true;
+ if (isa<ConstantFPSDNode>(Node)) return true;
+ if (isa<RegisterSDNode>(Node)) return true;
+ if (isa<RegisterMaskSDNode>(Node)) return true;
+ if (isa<GlobalAddressSDNode>(Node)) return true;
+ if (isa<BasicBlockSDNode>(Node)) return true;
+ if (isa<FrameIndexSDNode>(Node)) return true;
+ if (isa<ConstantPoolSDNode>(Node)) return true;
+ if (isa<TargetIndexSDNode>(Node)) return true;
+ if (isa<JumpTableSDNode>(Node)) return true;
+ if (isa<ExternalSymbolSDNode>(Node)) return true;
+ if (isa<MCSymbolSDNode>(Node)) return true;
+ if (isa<BlockAddressSDNode>(Node)) return true;
+ if (Node->getOpcode() == ISD::EntryToken ||
+ isa<MDNodeSDNode>(Node)) return true;
+ return false;
+ }
+
+ /// newSUnit - Creates a new SUnit and returns a pointer to it.
+ ///
+ SUnit *newSUnit(SDNode *N);
+
+ /// Clone - Creates a clone of the specified SUnit. It does not copy the
+ /// predecessors / successors info nor the temporary scheduling states.
+ ///
+ SUnit *Clone(SUnit *N);
+
+ /// BuildSchedGraph - Build the SUnit graph from the SelectionDAG we are given
+ /// as input. This SUnit graph is similar to the SelectionDAG, but
+ /// excludes nodes that aren't interesting to scheduling, and represents
+ /// glued-together nodes with a single SUnit.
+ void BuildSchedGraph(AliasAnalysis *AA);
+
+ /// InitNumRegDefsLeft - Determine the # of regs defined by this node.
+ ///
+ void InitNumRegDefsLeft(SUnit *SU);
+
+ /// computeLatency - Compute node latency.
+ ///
+ virtual void computeLatency(SUnit *SU);
+
+ virtual void computeOperandLatency(SDNode *Def, SDNode *Use,
+ unsigned OpIdx, SDep& dep) const;
+
+ /// Schedule - Order nodes according to selected style, filling
+ /// in the Sequence member.
+ ///
+ virtual void Schedule() = 0;
+
+ /// VerifyScheduledSequence - Verify that all SUnits are scheduled and
+ /// consistent with the Sequence of scheduled instructions.
+ void VerifyScheduledSequence(bool isBottomUp);
+
+ /// EmitSchedule - Insert MachineInstrs into the MachineBasicBlock
+ /// according to the order specified in Sequence.
+ ///
+ virtual MachineBasicBlock*
+ EmitSchedule(MachineBasicBlock::iterator &InsertPos);
+
+ void dumpNode(const SUnit *SU) const override;
+
+ void dumpSchedule() const;
+
+ std::string getGraphNodeLabel(const SUnit *SU) const override;
+
+ std::string getDAGName() const override;
+
+ virtual void getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const;
+
+ /// RegDefIter - In place iteration over the values defined by an
+ /// SUnit. This does not need copies of the iterator or any other STLisms.
+ /// The iterator creates itself, rather than being provided by the SchedDAG.
+ class RegDefIter {
+ const ScheduleDAGSDNodes *SchedDAG;
+ const SDNode *Node;
+ unsigned DefIdx;
+ unsigned NodeNumDefs;
+ MVT ValueType;
+ public:
+ RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD);
+
+ bool IsValid() const { return Node != nullptr; }
+
+ MVT GetValue() const {
+ assert(IsValid() && "bad iterator");
+ return ValueType;
+ }
+
+ const SDNode *GetNode() const {
+ return Node;
+ }
+
+ unsigned GetIdx() const {
+ return DefIdx-1;
+ }
+
+ void Advance();
+ private:
+ void InitNodeNumDefs();
+ };
+
+ protected:
+ /// ForceUnitLatencies - Return true if all scheduling edges should be given
+ /// a latency value of one. The default is to return false; schedulers may
+ /// override this as needed.
+ virtual bool forceUnitLatencies() const { return false; }
+
+ private:
+ /// ClusterNeighboringLoads - Cluster loads from "near" addresses into
+ /// combined SUnits.
+ void ClusterNeighboringLoads(SDNode *Node);
+ /// ClusterNodes - Cluster certain nodes which should be scheduled together.
+ ///
+ void ClusterNodes();
+
+ /// BuildSchedUnits, AddSchedEdges - Helper functions for BuildSchedGraph.
+ void BuildSchedUnits();
+ void AddSchedEdges();
+
+ void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+ MachineBasicBlock::iterator InsertPos);
+ };
+}
+
+#endif
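
RegDefIter above is a small hand-rolled, self-advancing iterator: it caches how many values the current node defines, skips values with no uses, and hops to the next glued node once the current one is exhausted. A standalone sketch of that shape over a simple glued chain (the struct and class below are illustrative stand-ins, not the LLVM types):

#include <iostream>
#include <vector>

struct Node {
  std::vector<bool> ValueUsed; // which defined values actually have uses
  Node *Glued = nullptr;       // next node glued to this one, if any
};

class DefIter {
  const Node *N;
  unsigned Idx = 0;
public:
  explicit DefIter(const Node *Start) : N(Start) { advance(); }
  bool valid() const { return N != nullptr; }
  unsigned index() const { return Idx - 1; } // mirrors RegDefIter::GetIdx()
  const Node *node() const { return N; }
  void advance() {
    while (N) {
      for (; Idx < N->ValueUsed.size(); ++Idx) {
        if (!N->ValueUsed[Idx])
          continue;            // skip values with no uses, like the real iterator
        ++Idx;
        return;                // stop on a used value
      }
      N = N->Glued;            // exhausted this node; move to the glued one
      Idx = 0;
    }
  }
};

int main() {
  Node B{{false, true}, nullptr};
  Node A{{true, false}, &B};   // A defines one used value, B's second value is used
  for (DefIter I(&A); I.valid(); I.advance())
    std::cout << "def #" << I.index() << " on node "
              << (I.node() == &A ? "A" : "B") << "\n";
}
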
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
new file mode 100644
index 000000000000..eee4a4b06718
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGVLIW.cpp
@@ -0,0 +1,279 @@
+//===- ScheduleDAGVLIW.cpp - SelectionDAG list scheduler for VLIW -*- C++ -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements a top-down list scheduler, using standard algorithms.
+// The basic approach uses a priority queue of available nodes to schedule.
+// One at a time, nodes are taken from the priority queue (thus in priority
+// order), checked for legality to schedule, and emitted if legal.
+//
+// Nodes may not be legal to schedule either due to structural hazards (e.g.
+// pipeline or resource constraints) or because an input to the instruction has
+// not completed execution.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LatencyPriorityQueue.h"
+#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <climits>
+using namespace llvm;
+
+#define DEBUG_TYPE "pre-RA-sched"
+
+STATISTIC(NumNoops , "Number of noops inserted");
+STATISTIC(NumStalls, "Number of pipeline stalls");
+
+static RegisterScheduler
+ VLIWScheduler("vliw-td", "VLIW scheduler",
+ createVLIWDAGScheduler);
+
+namespace {
+//===----------------------------------------------------------------------===//
+/// ScheduleDAGVLIW - The actual DFA list scheduler implementation. This
+/// supports top-down scheduling.
+///
+class ScheduleDAGVLIW : public ScheduleDAGSDNodes {
+private:
+ /// AvailableQueue - The priority queue to use for the available SUnits.
+ ///
+ SchedulingPriorityQueue *AvailableQueue;
+
+ /// PendingQueue - This contains all of the instructions whose operands have
+ /// been issued, but their results are not ready yet (due to the latency of
+ /// the operation). Once the operands become available, the instruction is
+ /// added to the AvailableQueue.
+ std::vector<SUnit*> PendingQueue;
+
+ /// HazardRec - The hazard recognizer to use.
+ ScheduleHazardRecognizer *HazardRec;
+
+ /// AA - AliasAnalysis for making memory reference queries.
+ AliasAnalysis *AA;
+
+public:
+ ScheduleDAGVLIW(MachineFunction &mf,
+ AliasAnalysis *aa,
+ SchedulingPriorityQueue *availqueue)
+ : ScheduleDAGSDNodes(mf), AvailableQueue(availqueue), AA(aa) {
+ const TargetSubtargetInfo &STI = mf.getSubtarget();
+ HazardRec = STI.getInstrInfo()->CreateTargetHazardRecognizer(&STI, this);
+ }
+
+ ~ScheduleDAGVLIW() override {
+ delete HazardRec;
+ delete AvailableQueue;
+ }
+
+ void Schedule() override;
+
+private:
+ void releaseSucc(SUnit *SU, const SDep &D);
+ void releaseSuccessors(SUnit *SU);
+ void scheduleNodeTopDown(SUnit *SU, unsigned CurCycle);
+ void listScheduleTopDown();
+};
+} // end anonymous namespace
+
+/// Schedule - Schedule the DAG using list scheduling.
+void ScheduleDAGVLIW::Schedule() {
+ DEBUG(dbgs()
+ << "********** List Scheduling BB#" << BB->getNumber()
+ << " '" << BB->getName() << "' **********\n");
+
+ // Build the scheduling graph.
+ BuildSchedGraph(AA);
+
+ AvailableQueue->initNodes(SUnits);
+
+ listScheduleTopDown();
+
+ AvailableQueue->releaseState();
+}
+
+//===----------------------------------------------------------------------===//
+// Top-Down Scheduling
+//===----------------------------------------------------------------------===//
+
+/// releaseSucc - Decrement the NumPredsLeft count of a successor. Add it to
+/// the PendingQueue if the count reaches zero. Also update its cycle bound.
+void ScheduleDAGVLIW::releaseSucc(SUnit *SU, const SDep &D) {
+ SUnit *SuccSU = D.getSUnit();
+
+#ifndef NDEBUG
+ if (SuccSU->NumPredsLeft == 0) {
+ dbgs() << "*** Scheduling failed! ***\n";
+ SuccSU->dump(this);
+ dbgs() << " has been released too many times!\n";
+ llvm_unreachable(nullptr);
+ }
+#endif
+ assert(!D.isWeak() && "unexpected artificial DAG edge");
+
+ --SuccSU->NumPredsLeft;
+
+ SuccSU->setDepthToAtLeast(SU->getDepth() + D.getLatency());
+
+ // If all the node's predecessors are scheduled, this node is ready
+ // to be scheduled. Ignore the special ExitSU node.
+ if (SuccSU->NumPredsLeft == 0 && SuccSU != &ExitSU) {
+ PendingQueue.push_back(SuccSU);
+ }
+}
+
+void ScheduleDAGVLIW::releaseSuccessors(SUnit *SU) {
+ // Top down: release successors.
+ for (SUnit::succ_iterator I = SU->Succs.begin(), E = SU->Succs.end();
+ I != E; ++I) {
+ assert(!I->isAssignedRegDep() &&
+ "The list-td scheduler doesn't yet support physreg dependencies!");
+
+ releaseSucc(SU, *I);
+ }
+}
+
+/// scheduleNodeTopDown - Add the node to the schedule. Decrement the pending
+/// count of its successors. If a successor pending count is zero, add it to
+/// the Available queue.
+void ScheduleDAGVLIW::scheduleNodeTopDown(SUnit *SU, unsigned CurCycle) {
+ DEBUG(dbgs() << "*** Scheduling [" << CurCycle << "]: ");
+ DEBUG(SU->dump(this));
+
+ Sequence.push_back(SU);
+ assert(CurCycle >= SU->getDepth() && "Node scheduled above its depth!");
+ SU->setDepthToAtLeast(CurCycle);
+
+ releaseSuccessors(SU);
+ SU->isScheduled = true;
+ AvailableQueue->scheduledNode(SU);
+}
+
+/// listScheduleTopDown - The main loop of list scheduling for top-down
+/// schedulers.
+void ScheduleDAGVLIW::listScheduleTopDown() {
+ unsigned CurCycle = 0;
+
+ // Release any successors of the special Entry node.
+ releaseSuccessors(&EntrySU);
+
+ // Add all leaves to the AvailableQueue.
+ for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
+ // It is available if it has no predecessors.
+ if (SUnits[i].Preds.empty()) {
+ AvailableQueue->push(&SUnits[i]);
+ SUnits[i].isAvailable = true;
+ }
+ }
+
+ // While AvailableQueue is not empty, grab the node with the highest
+ // priority. If it is not ready, put it back. Schedule the node.
+ std::vector<SUnit*> NotReady;
+ Sequence.reserve(SUnits.size());
+ while (!AvailableQueue->empty() || !PendingQueue.empty()) {
+ // Check to see if any of the pending instructions are ready to issue. If
+ // so, add them to the available queue.
+ for (unsigned i = 0, e = PendingQueue.size(); i != e; ++i) {
+ if (PendingQueue[i]->getDepth() == CurCycle) {
+ AvailableQueue->push(PendingQueue[i]);
+ PendingQueue[i]->isAvailable = true;
+ PendingQueue[i] = PendingQueue.back();
+ PendingQueue.pop_back();
+ --i; --e;
+ }
+ else {
+ assert(PendingQueue[i]->getDepth() > CurCycle && "Negative latency?");
+ }
+ }
+
+ // If there are no instructions available, don't try to issue anything, and
+ // don't advance the hazard recognizer.
+ if (AvailableQueue->empty()) {
+ // Reset DFA state.
+ AvailableQueue->scheduledNode(nullptr);
+ ++CurCycle;
+ continue;
+ }
+
+ SUnit *FoundSUnit = nullptr;
+
+ bool HasNoopHazards = false;
+ while (!AvailableQueue->empty()) {
+ SUnit *CurSUnit = AvailableQueue->pop();
+
+ ScheduleHazardRecognizer::HazardType HT =
+ HazardRec->getHazardType(CurSUnit, 0/*no stalls*/);
+ if (HT == ScheduleHazardRecognizer::NoHazard) {
+ FoundSUnit = CurSUnit;
+ break;
+ }
+
+ // Remember if this is a noop hazard.
+ HasNoopHazards |= HT == ScheduleHazardRecognizer::NoopHazard;
+
+ NotReady.push_back(CurSUnit);
+ }
+
+ // Add the nodes that aren't ready back onto the available list.
+ if (!NotReady.empty()) {
+ AvailableQueue->push_all(NotReady);
+ NotReady.clear();
+ }
+
+ // If we found a node to schedule, do it now.
+ if (FoundSUnit) {
+ scheduleNodeTopDown(FoundSUnit, CurCycle);
+ HazardRec->EmitInstruction(FoundSUnit);
+
+ // If this is a pseudo-op node, we don't want to increment the current
+ // cycle.
+ if (FoundSUnit->Latency) // Don't increment CurCycle for pseudo-ops!
+ ++CurCycle;
+ } else if (!HasNoopHazards) {
+ // Otherwise, we have a pipeline stall, but no other problem, just advance
+ // the current cycle and try again.
+ DEBUG(dbgs() << "*** Advancing cycle, no work to do\n");
+ HazardRec->AdvanceCycle();
+ ++NumStalls;
+ ++CurCycle;
+ } else {
+ // Otherwise, we have no instructions to issue and we have instructions
+ // that will fault if we don't do this right. This is the case for
+ // processors without pipeline interlocks and other cases.
+ DEBUG(dbgs() << "*** Emitting noop\n");
+ HazardRec->EmitNoop();
+ Sequence.push_back(nullptr); // NULL here means noop
+ ++NumNoops;
+ ++CurCycle;
+ }
+ }
+
+#ifndef NDEBUG
+ VerifyScheduledSequence(/*isBottomUp=*/false);
+#endif
+}
+
+//===----------------------------------------------------------------------===//
+// Public Constructor Functions
+//===----------------------------------------------------------------------===//
+
+/// createVLIWDAGScheduler - This creates a top-down list scheduler.
+ScheduleDAGSDNodes *
+llvm::createVLIWDAGScheduler(SelectionDAGISel *IS, CodeGenOpt::Level) {
+ return new ScheduleDAGVLIW(*IS->MF, IS->AA, new ResourcePriorityQueue(IS));
+}
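
The scheduling loop in ScheduleDAGVLIW is a cycle-driven list scheduler: units wait in a pending set until their earliest legal cycle arrives, then move to an available queue and are issued by priority, with the cycle counter advancing on stalls. A compact standalone sketch of that control flow, with no hazard recognizer and "earliest ready cycle" standing in for the real priority function (names are illustrative):

#include <iostream>
#include <queue>
#include <vector>

struct Unit {
  int Id;
  unsigned ReadyCycle; // earliest cycle this unit may issue
};

int main() {
  std::vector<Unit> Pending = {{0, 0}, {1, 0}, {2, 2}, {3, 5}};
  auto ByReadyCycle = [](const Unit &A, const Unit &B) {
    return A.ReadyCycle > B.ReadyCycle; // min-heap on ready cycle
  };
  std::priority_queue<Unit, std::vector<Unit>, decltype(ByReadyCycle)>
      Available(ByReadyCycle);

  unsigned CurCycle = 0;
  std::vector<int> Sequence;
  while (!Pending.empty() || !Available.empty()) {
    // Move anything whose ready cycle has arrived into the available queue.
    for (unsigned i = 0; i < Pending.size();) {
      if (Pending[i].ReadyCycle <= CurCycle) {
        Available.push(Pending[i]);
        Pending[i] = Pending.back();
        Pending.pop_back();
      } else {
        ++i;
      }
    }
    if (Available.empty()) {
      ++CurCycle;                            // nothing can issue: stall and advance
      continue;
    }
    Sequence.push_back(Available.top().Id);  // issue the best ready unit
    Available.pop();
    ++CurCycle;
  }
  for (int Id : Sequence)
    std::cout << "issued SU#" << Id << "\n";
}

The real scheduler additionally consults a hazard recognizer and may emit noops on hazards; the sketch only shows the queue bookkeeping.
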
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
new file mode 100644
index 000000000000..8235522b14bd
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -0,0 +1,7306 @@
+//===-- SelectionDAG.cpp - Implement the SelectionDAG data structures -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Mutex.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cmath>
+#include <utility>
+
+using namespace llvm;
+
+/// makeVTList - Return an instance of the SDVTList struct initialized with the
+/// specified members.
+static SDVTList makeVTList(const EVT *VTs, unsigned NumVTs) {
+ SDVTList Res = {VTs, NumVTs};
+ return Res;
+}
+
+// Default null implementations of the callbacks.
+void SelectionDAG::DAGUpdateListener::NodeDeleted(SDNode*, SDNode*) {}
+void SelectionDAG::DAGUpdateListener::NodeUpdated(SDNode*) {}
+
+//===----------------------------------------------------------------------===//
+// ConstantFPSDNode Class
+//===----------------------------------------------------------------------===//
+
+/// isExactlyValue - We don't rely on operator== working on double values, as
+/// it returns true for things that are clearly not equal, like -0.0 and 0.0.
+/// As such, this method can be used to do an exact bit-for-bit comparison of
+/// two floating point values.
+bool ConstantFPSDNode::isExactlyValue(const APFloat& V) const {
+ return getValueAPF().bitwiseIsEqual(V);
+}
+
+bool ConstantFPSDNode::isValueValidForType(EVT VT,
+ const APFloat& Val) {
+ assert(VT.isFloatingPoint() && "Can only convert between FP types");
+
+ // convert modifies in place, so make a copy.
+ APFloat Val2 = APFloat(Val);
+ bool losesInfo;
+ (void) Val2.convert(SelectionDAG::EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven,
+ &losesInfo);
+ return !losesInfo;
+}
+
+//===----------------------------------------------------------------------===//
+// ISD Namespace
+//===----------------------------------------------------------------------===//
+
+bool ISD::isConstantSplatVector(const SDNode *N, APInt &SplatVal) {
+ auto *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+ APInt SplatUndef;
+ unsigned SplatBitSize;
+ bool HasUndefs;
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ return BV->isConstantSplat(SplatVal, SplatUndef, SplatBitSize, HasUndefs) &&
+ EltVT.getSizeInBits() >= SplatBitSize;
+}
+
+// FIXME: AllOnes and AllZeros duplicate a lot of code. Could these be
+// specializations of the more general isConstantSplatVector()?
+
+bool ISD::isBuildVectorAllOnes(const SDNode *N) {
+ // Look through a bit convert.
+ while (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ unsigned i = 0, e = N->getNumOperands();
+
+ // Skip over all of the undef values.
+ while (i != e && N->getOperand(i).isUndef())
+ ++i;
+
+ // Do not accept an all-undef vector.
+ if (i == e) return false;
+
+ // Do not accept build_vectors that aren't all constants or which have non-~0
+ // elements. We have to be a bit careful here, as the type of the constant
+ // may not be the same as the type of the vector elements due to type
+ // legalization (the elements are promoted to a legal type for the target and
+ // a vector of a type may be legal when the base element type is not).
+ // We only want to check enough bits to cover the vector elements, because
+ // we care if the resultant vector is all ones, not whether the individual
+ // constants are.
+ SDValue NotZero = N->getOperand(i);
+ unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(NotZero)) {
+ if (CN->getAPIntValue().countTrailingOnes() < EltSize)
+ return false;
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(NotZero)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countTrailingOnes() < EltSize)
+ return false;
+ } else
+ return false;
+
+ // Okay, we have at least one ~0 value, check to see if the rest match or are
+ // undefs. Even with the above element type twiddling, this should be OK, as
+ // the same type legalization should have applied to all the elements.
+ for (++i; i != e; ++i)
+ if (N->getOperand(i) != NotZero && !N->getOperand(i).isUndef())
+ return false;
+ return true;
+}
+
+bool ISD::isBuildVectorAllZeros(const SDNode *N) {
+ // Look through a bit convert.
+ while (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0).getNode();
+
+ if (N->getOpcode() != ISD::BUILD_VECTOR) return false;
+
+ bool IsAllUndef = true;
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ IsAllUndef = false;
+ // Do not accept build_vectors that aren't all constants or which have non-0
+ // elements. We have to be a bit careful here, as the type of the constant
+ // may not be the same as the type of the vector elements due to type
+ // legalization (the elements are promoted to a legal type for the target
+ // and a vector of a type may be legal when the base element type is not).
+ // We only want to check enough bits to cover the vector elements, because
+ // we care if the resultant vector is all zeros, not whether the individual
+ // constants are.
+ unsigned EltSize = N->getValueType(0).getVectorElementType().getSizeInBits();
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Op)) {
+ if (CN->getAPIntValue().countTrailingZeros() < EltSize)
+ return false;
+ } else if (ConstantFPSDNode *CFPN = dyn_cast<ConstantFPSDNode>(Op)) {
+ if (CFPN->getValueAPF().bitcastToAPInt().countTrailingZeros() < EltSize)
+ return false;
+ } else
+ return false;
+ }
+
+ // Do not accept an all-undef vector.
+ if (IsAllUndef)
+ return false;
+ return true;
+}
+
+bool ISD::isBuildVectorOfConstantSDNodes(const SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ if (!isa<ConstantSDNode>(Op))
+ return false;
+ }
+ return true;
+}
+
+bool ISD::isBuildVectorOfConstantFPSDNodes(const SDNode *N) {
+ if (N->getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+
+ for (const SDValue &Op : N->op_values()) {
+ if (Op.isUndef())
+ continue;
+ if (!isa<ConstantFPSDNode>(Op))
+ return false;
+ }
+ return true;
+}
+
+bool ISD::allOperandsUndef(const SDNode *N) {
+ // Return false if the node has no operands.
+ // This is "logically inconsistent" with the definition of "all" but
+ // is probably the desired behavior.
+ if (N->getNumOperands() == 0)
+ return false;
+
+ for (const SDValue &Op : N->op_values())
+ if (!Op.isUndef())
+ return false;
+
+ return true;
+}
+
+ISD::NodeType ISD::getExtForLoadExtType(bool IsFP, ISD::LoadExtType ExtType) {
+ switch (ExtType) {
+ case ISD::EXTLOAD:
+ return IsFP ? ISD::FP_EXTEND : ISD::ANY_EXTEND;
+ case ISD::SEXTLOAD:
+ return ISD::SIGN_EXTEND;
+ case ISD::ZEXTLOAD:
+ return ISD::ZERO_EXTEND;
+ default:
+ break;
+ }
+
+ llvm_unreachable("Invalid LoadExtType");
+}
+
+ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) {
+ // To perform this operation, we just need to swap the L and G bits of the
+ // operation.
+ unsigned OldL = (Operation >> 2) & 1;
+ unsigned OldG = (Operation >> 1) & 1;
+ return ISD::CondCode((Operation & ~6) | // Keep the N, U, E bits
+ (OldL << 1) | // New G bit
+ (OldG << 2)); // New L bit.
+}
+
+ISD::CondCode ISD::getSetCCInverse(ISD::CondCode Op, bool isInteger) {
+ unsigned Operation = Op;
+ if (isInteger)
+ Operation ^= 7; // Flip L, G, E bits, but not U.
+ else
+ Operation ^= 15; // Flip all of the condition bits.
+
+ if (Operation > ISD::SETTRUE2)
+ Operation &= ~8; // Don't let N and U bits get set.
+
+ return ISD::CondCode(Operation);
+}
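+// Illustration of the bit trick used by the two helpers above, assuming the CondCode
+// encoding in ISDOpcodes.h (E = bit 0, G = bit 1, L = bit 2, U = bit 3): swapping the
+// operands of SETULT (U|L) exchanges the L and G bits and yields SETUGT (U|G), while
+// inverting the integer code SETLT flips the L, G and E bits and yields SETGE.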
+
+
+/// For an integer comparison, return 1 if the comparison is a signed operation
+/// and 2 if it is an unsigned comparison. Return zero if the operation
+/// does not depend on the sign of the input (setne and seteq).
+static int isSignedOp(ISD::CondCode Opcode) {
+ switch (Opcode) {
+ default: llvm_unreachable("Illegal integer setcc operation!");
+ case ISD::SETEQ:
+ case ISD::SETNE: return 0;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE: return 1;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: return 2;
+ }
+}
+
+ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ bool isInteger) {
+ if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed integer setcc with an unsigned integer setcc.
+ return ISD::SETCC_INVALID;
+
+ unsigned Op = Op1 | Op2; // Combine all of the condition bits.
+
+ // If the N and U bits get set then the resultant comparison DOES suddenly
+ // care about orderedness, and is true when ordered.
+ if (Op > ISD::SETTRUE2)
+ Op &= ~16; // Clear the U bit if the N bit is set.
+
+ // Canonicalize illegal integer setcc's.
+ if (isInteger && Op == ISD::SETUNE) // e.g. SETUGT | SETULT
+ Op = ISD::SETNE;
+
+ return ISD::CondCode(Op);
+}
+
+ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
+ bool isInteger) {
+ if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+ // Cannot fold a signed setcc with an unsigned setcc.
+ return ISD::SETCC_INVALID;
+
+ // Combine all of the condition bits.
+ ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
+
+ // Canonicalize illegal integer setcc's.
+ if (isInteger) {
+ switch (Result) {
+ default: break;
+ case ISD::SETUO : Result = ISD::SETFALSE; break; // SETUGT & SETULT
+ case ISD::SETOEQ: // SETEQ & SETU[LG]E
+ case ISD::SETUEQ: Result = ISD::SETEQ ; break; // SETUGE & SETULE
+ case ISD::SETOLT: Result = ISD::SETULT ; break; // SETULT & SETNE
+ case ISD::SETOGT: Result = ISD::SETUGT ; break; // SETUGT & SETNE
+ }
+ }
+
+ return Result;
+}
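+
+// Illustrative sketch of how the two combiners behave for integer predicates
+// (worked examples only):
+//
+//   // (x > y) || (x < y)  simplifies to  x != y
+//   ISD::getSetCCOrOperation(ISD::SETGT, ISD::SETLT, /*isInteger=*/true)
+//       == ISD::SETNE;
+//   // (x <= y) && (x >= y)  simplifies to  x == y
+//   ISD::getSetCCAndOperation(ISD::SETLE, ISD::SETGE, /*isInteger=*/true)
+//       == ISD::SETEQ;
+//   // Signed and unsigned predicates cannot be folded together:
+//   ISD::getSetCCAndOperation(ISD::SETLT, ISD::SETULT, /*isInteger=*/true)
+//       == ISD::SETCC_INVALID;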
+
+//===----------------------------------------------------------------------===//
+// SDNode Profile Support
+//===----------------------------------------------------------------------===//
+
+/// AddNodeIDOpcode - Add the node opcode to the NodeID data.
+///
+static void AddNodeIDOpcode(FoldingSetNodeID &ID, unsigned OpC) {
+ ID.AddInteger(OpC);
+}
+
+/// AddNodeIDValueTypes - Value type lists are intern'd so we can represent them
+/// solely with their pointer.
+static void AddNodeIDValueTypes(FoldingSetNodeID &ID, SDVTList VTList) {
+ ID.AddPointer(VTList.VTs);
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ ArrayRef<SDValue> Ops) {
+ for (auto& Op : Ops) {
+ ID.AddPointer(Op.getNode());
+ ID.AddInteger(Op.getResNo());
+ }
+}
+
+/// AddNodeIDOperands - Various routines for adding operands to the NodeID data.
+///
+static void AddNodeIDOperands(FoldingSetNodeID &ID,
+ ArrayRef<SDUse> Ops) {
+ for (auto& Op : Ops) {
+ ID.AddPointer(Op.getNode());
+ ID.AddInteger(Op.getResNo());
+ }
+}
+
+static void AddNodeIDNode(FoldingSetNodeID &ID, unsigned short OpC,
+ SDVTList VTList, ArrayRef<SDValue> OpList) {
+ AddNodeIDOpcode(ID, OpC);
+ AddNodeIDValueTypes(ID, VTList);
+ AddNodeIDOperands(ID, OpList);
+}
+
+/// If this is an SDNode with special info, add this info to the NodeID data.
+static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
+ switch (N->getOpcode()) {
+ case ISD::TargetExternalSymbol:
+ case ISD::ExternalSymbol:
+ case ISD::MCSymbol:
+ llvm_unreachable("Should only be used on nodes with operands");
+ default: break; // Normal nodes don't need extra info.
+ case ISD::TargetConstant:
+ case ISD::Constant: {
+ const ConstantSDNode *C = cast<ConstantSDNode>(N);
+ ID.AddPointer(C->getConstantIntValue());
+ ID.AddBoolean(C->isOpaque());
+ break;
+ }
+ case ISD::TargetConstantFP:
+ case ISD::ConstantFP: {
+ ID.AddPointer(cast<ConstantFPSDNode>(N)->getConstantFPValue());
+ break;
+ }
+ case ISD::TargetGlobalAddress:
+ case ISD::GlobalAddress:
+ case ISD::TargetGlobalTLSAddress:
+ case ISD::GlobalTLSAddress: {
+ const GlobalAddressSDNode *GA = cast<GlobalAddressSDNode>(N);
+ ID.AddPointer(GA->getGlobal());
+ ID.AddInteger(GA->getOffset());
+ ID.AddInteger(GA->getTargetFlags());
+ ID.AddInteger(GA->getAddressSpace());
+ break;
+ }
+ case ISD::BasicBlock:
+ ID.AddPointer(cast<BasicBlockSDNode>(N)->getBasicBlock());
+ break;
+ case ISD::Register:
+ ID.AddInteger(cast<RegisterSDNode>(N)->getReg());
+ break;
+ case ISD::RegisterMask:
+ ID.AddPointer(cast<RegisterMaskSDNode>(N)->getRegMask());
+ break;
+ case ISD::SRCVALUE:
+ ID.AddPointer(cast<SrcValueSDNode>(N)->getValue());
+ break;
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ ID.AddInteger(cast<FrameIndexSDNode>(N)->getIndex());
+ break;
+ case ISD::JumpTable:
+ case ISD::TargetJumpTable:
+ ID.AddInteger(cast<JumpTableSDNode>(N)->getIndex());
+ ID.AddInteger(cast<JumpTableSDNode>(N)->getTargetFlags());
+ break;
+ case ISD::ConstantPool:
+ case ISD::TargetConstantPool: {
+ const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
+ ID.AddInteger(CP->getAlignment());
+ ID.AddInteger(CP->getOffset());
+ if (CP->isMachineConstantPoolEntry())
+ CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
+ else
+ ID.AddPointer(CP->getConstVal());
+ ID.AddInteger(CP->getTargetFlags());
+ break;
+ }
+ case ISD::TargetIndex: {
+ const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N);
+ ID.AddInteger(TI->getIndex());
+ ID.AddInteger(TI->getOffset());
+ ID.AddInteger(TI->getTargetFlags());
+ break;
+ }
+ case ISD::LOAD: {
+ const LoadSDNode *LD = cast<LoadSDNode>(N);
+ ID.AddInteger(LD->getMemoryVT().getRawBits());
+ ID.AddInteger(LD->getRawSubclassData());
+ ID.AddInteger(LD->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::STORE: {
+ const StoreSDNode *ST = cast<StoreSDNode>(N);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::ATOMIC_CMP_SWAP:
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS:
+ case ISD::ATOMIC_SWAP:
+ case ISD::ATOMIC_LOAD_ADD:
+ case ISD::ATOMIC_LOAD_SUB:
+ case ISD::ATOMIC_LOAD_AND:
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_XOR:
+ case ISD::ATOMIC_LOAD_NAND:
+ case ISD::ATOMIC_LOAD_MIN:
+ case ISD::ATOMIC_LOAD_MAX:
+ case ISD::ATOMIC_LOAD_UMIN:
+ case ISD::ATOMIC_LOAD_UMAX:
+ case ISD::ATOMIC_LOAD:
+ case ISD::ATOMIC_STORE: {
+ const AtomicSDNode *AT = cast<AtomicSDNode>(N);
+ ID.AddInteger(AT->getMemoryVT().getRawBits());
+ ID.AddInteger(AT->getRawSubclassData());
+ ID.AddInteger(AT->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::PREFETCH: {
+ const MemSDNode *PF = cast<MemSDNode>(N);
+ ID.AddInteger(PF->getPointerInfo().getAddrSpace());
+ break;
+ }
+ case ISD::VECTOR_SHUFFLE: {
+ const ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
+ for (unsigned i = 0, e = N->getValueType(0).getVectorNumElements();
+ i != e; ++i)
+ ID.AddInteger(SVN->getMaskElt(i));
+ break;
+ }
+ case ISD::TargetBlockAddress:
+ case ISD::BlockAddress: {
+ const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+ ID.AddPointer(BA->getBlockAddress());
+ ID.AddInteger(BA->getOffset());
+ ID.AddInteger(BA->getTargetFlags());
+ break;
+ }
+ } // end switch (N->getOpcode())
+
+ // Target specific memory nodes could also have address spaces to check.
+ if (N->isTargetMemoryOpcode())
+ ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace());
+}
+
+/// AddNodeIDNode - Generic routine for adding a node's info to the NodeID
+/// data.
+static void AddNodeIDNode(FoldingSetNodeID &ID, const SDNode *N) {
+ AddNodeIDOpcode(ID, N->getOpcode());
+ // Add the return value info.
+ AddNodeIDValueTypes(ID, N->getVTList());
+ // Add the operand info.
+ AddNodeIDOperands(ID, N->ops());
+
+  // Handle SDNode leaves with special info.
+ AddNodeIDCustom(ID, N);
+}
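+
+// Illustrative sketch of the CSE idiom this profiling support enables; the
+// same pattern appears in most of the node factories below (the newSDNode<>
+// argument list is elided here):
+//
+//   FoldingSetNodeID ID;
+//   AddNodeIDNode(ID, Opc, getVTList(VT), Ops);
+//   void *IP = nullptr;
+//   if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+//     return SDValue(E, 0);         // Reuse the structurally identical node.
+//   auto *N = newSDNode<...>(...);  // Otherwise allocate a new one ...
+//   CSEMap.InsertNode(N, IP);       // ... and remember it for later lookups.
+//   InsertNode(N);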
+
+/// encodeMemSDNodeFlags - Generic routine for computing a value for use in
+/// the CSE map that carries volatility, the non-temporal hint, indexing mode,
+/// extension/truncation information.
+///
+static inline unsigned
+encodeMemSDNodeFlags(int ConvType, ISD::MemIndexedMode AM, bool isVolatile,
+ bool isNonTemporal, bool isInvariant) {
+ assert((ConvType & 3) == ConvType &&
+ "ConvType may not require more than 2 bits!");
+ assert((AM & 7) == AM &&
+ "AM may not require more than 3 bits!");
+ return ConvType |
+ (AM << 2) |
+ (isVolatile << 5) |
+ (isNonTemporal << 6) |
+ (isInvariant << 7);
+}
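+
+// Illustrative sketch of the resulting bit layout, derived from the shifts
+// above: bits 0-1 hold the extension/truncation kind, bits 2-4 the indexed
+// addressing mode, and bits 5, 6 and 7 the volatile, non-temporal and
+// invariant flags respectively. For example, a non-extending, non-indexed,
+// volatile access encodes as:
+//
+//   unsigned Flags = encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
+//                                         /*isVolatile=*/true,
+//                                         /*isNonTemporal=*/false,
+//                                         /*isInvariant=*/false);
+//   // Flags == (1u << 5)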
+
+//===----------------------------------------------------------------------===//
+// SelectionDAG Class
+//===----------------------------------------------------------------------===//
+
+/// doNotCSE - Return true if CSE should not be performed for this node.
+static bool doNotCSE(SDNode *N) {
+ if (N->getValueType(0) == MVT::Glue)
+ return true; // Never CSE anything that produces a flag.
+
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::HANDLENODE:
+ case ISD::EH_LABEL:
+ return true; // Never CSE these nodes.
+ }
+
+ // Check that remaining values produced are not flags.
+ for (unsigned i = 1, e = N->getNumValues(); i != e; ++i)
+ if (N->getValueType(i) == MVT::Glue)
+ return true; // Never CSE anything that produces a flag.
+
+ return false;
+}
+
+/// RemoveDeadNodes - This method deletes all unreachable nodes in the
+/// SelectionDAG.
+void SelectionDAG::RemoveDeadNodes() {
+  // Create a dummy node (which is not added to allnodes) that adds a reference
+ // to the root node, preventing it from being deleted.
+ HandleSDNode Dummy(getRoot());
+
+ SmallVector<SDNode*, 128> DeadNodes;
+
+ // Add all obviously-dead nodes to the DeadNodes worklist.
+ for (SDNode &Node : allnodes())
+ if (Node.use_empty())
+ DeadNodes.push_back(&Node);
+
+ RemoveDeadNodes(DeadNodes);
+
+  // If the root changed (e.g. it was a dead load), update the root.
+ setRoot(Dummy.getValue());
+}
+
+/// RemoveDeadNodes - This method deletes the unreachable nodes in the
+/// given list, and any nodes that become unreachable as a result.
+void SelectionDAG::RemoveDeadNodes(SmallVectorImpl<SDNode *> &DeadNodes) {
+
+ // Process the worklist, deleting the nodes and adding their uses to the
+ // worklist.
+ while (!DeadNodes.empty()) {
+ SDNode *N = DeadNodes.pop_back_val();
+
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, nullptr);
+
+ // Take the node out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Next, brutally remove the operand list. This is safe to do, as there are
+ // no cycles in the graph.
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Operand = Use.getNode();
+ Use.set(SDValue());
+
+      // Now that we removed this operand, see if the operand has become dead.
+ if (Operand->use_empty())
+ DeadNodes.push_back(Operand);
+ }
+
+ DeallocateNode(N);
+ }
+}
+
+void SelectionDAG::RemoveDeadNode(SDNode *N){
+ SmallVector<SDNode*, 16> DeadNodes(1, N);
+
+ // Create a dummy node that adds a reference to the root node, preventing
+ // it from being deleted. (This matters if the root is an operand of the
+ // dead node.)
+ HandleSDNode Dummy(getRoot());
+
+ RemoveDeadNodes(DeadNodes);
+}
+
+void SelectionDAG::DeleteNode(SDNode *N) {
+ // First take this out of the appropriate CSE map.
+ RemoveNodeFromCSEMaps(N);
+
+ // Finally, remove uses due to operands of this node, remove from the
+ // AllNodes list, and delete the node.
+ DeleteNodeNotInCSEMaps(N);
+}
+
+void SelectionDAG::DeleteNodeNotInCSEMaps(SDNode *N) {
+ assert(N->getIterator() != AllNodes.begin() &&
+ "Cannot delete the entry node!");
+ assert(N->use_empty() && "Cannot delete a node that is not dead!");
+
+ // Drop all of the operands and decrement used node's use counts.
+ N->DropOperands();
+
+ DeallocateNode(N);
+}
+
+void SDDbgInfo::erase(const SDNode *Node) {
+ DbgValMapType::iterator I = DbgValMap.find(Node);
+ if (I == DbgValMap.end())
+ return;
+ for (auto &Val: I->second)
+ Val->setIsInvalidated();
+ DbgValMap.erase(I);
+}
+
+void SelectionDAG::DeallocateNode(SDNode *N) {
+ // If we have operands, deallocate them.
+ removeOperands(N);
+
+ // Set the opcode to DELETED_NODE to help catch bugs when node
+ // memory is reallocated.
+ N->NodeType = ISD::DELETED_NODE;
+
+ NodeAllocator.Deallocate(AllNodes.remove(N));
+
+ // If any of the SDDbgValue nodes refer to this SDNode, invalidate
+ // them and forget about that node.
+ DbgInfo->erase(N);
+}
+
+#ifndef NDEBUG
+/// VerifySDNode - Sanity check the given SDNode. Aborts if it is invalid.
+static void VerifySDNode(SDNode *N) {
+ switch (N->getOpcode()) {
+ default:
+ break;
+ case ISD::BUILD_PAIR: {
+ EVT VT = N->getValueType(0);
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(!VT.isVector() && (VT.isInteger() || VT.isFloatingPoint()) &&
+ "Wrong return type!");
+ assert(N->getNumOperands() == 2 && "Wrong number of operands!");
+ assert(N->getOperand(0).getValueType() == N->getOperand(1).getValueType() &&
+ "Mismatched operand types!");
+ assert(N->getOperand(0).getValueType().isInteger() == VT.isInteger() &&
+ "Wrong operand type!");
+ assert(VT.getSizeInBits() == 2 * N->getOperand(0).getValueSizeInBits() &&
+ "Wrong return type size");
+ break;
+ }
+ case ISD::BUILD_VECTOR: {
+ assert(N->getNumValues() == 1 && "Too many results!");
+ assert(N->getValueType(0).isVector() && "Wrong return type!");
+ assert(N->getNumOperands() == N->getValueType(0).getVectorNumElements() &&
+ "Wrong number of operands!");
+ EVT EltVT = N->getValueType(0).getVectorElementType();
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ++I) {
+ assert((I->getValueType() == EltVT ||
+ (EltVT.isInteger() && I->getValueType().isInteger() &&
+ EltVT.bitsLE(I->getValueType()))) &&
+ "Wrong operand type!");
+ assert(I->getValueType() == N->getOperand(0).getValueType() &&
+ "Operands must all have the same type");
+ }
+ break;
+ }
+ }
+}
+#endif // NDEBUG
+
+/// \brief Insert a newly allocated node into the DAG.
+///
+/// Handles insertion into the all nodes list and CSE map, as well as
+/// verification and other common operations when a new node is allocated.
+void SelectionDAG::InsertNode(SDNode *N) {
+ AllNodes.push_back(N);
+#ifndef NDEBUG
+ N->PersistentId = NextPersistentId++;
+ VerifySDNode(N);
+#endif
+}
+
+/// RemoveNodeFromCSEMaps - Take the specified node out of the CSE map that
+/// corresponds to it. This is useful when we're about to delete or repurpose
+/// the node. We don't want future requests for structurally identical nodes
+/// to return N anymore.
+bool SelectionDAG::RemoveNodeFromCSEMaps(SDNode *N) {
+ bool Erased = false;
+ switch (N->getOpcode()) {
+ case ISD::HANDLENODE: return false; // noop.
+ case ISD::CONDCODE:
+ assert(CondCodeNodes[cast<CondCodeSDNode>(N)->get()] &&
+ "Cond code doesn't exist!");
+ Erased = CondCodeNodes[cast<CondCodeSDNode>(N)->get()] != nullptr;
+ CondCodeNodes[cast<CondCodeSDNode>(N)->get()] = nullptr;
+ break;
+ case ISD::ExternalSymbol:
+ Erased = ExternalSymbols.erase(cast<ExternalSymbolSDNode>(N)->getSymbol());
+ break;
+ case ISD::TargetExternalSymbol: {
+ ExternalSymbolSDNode *ESN = cast<ExternalSymbolSDNode>(N);
+ Erased = TargetExternalSymbols.erase(
+ std::pair<std::string,unsigned char>(ESN->getSymbol(),
+ ESN->getTargetFlags()));
+ break;
+ }
+ case ISD::MCSymbol: {
+ auto *MCSN = cast<MCSymbolSDNode>(N);
+ Erased = MCSymbols.erase(MCSN->getMCSymbol());
+ break;
+ }
+ case ISD::VALUETYPE: {
+ EVT VT = cast<VTSDNode>(N)->getVT();
+ if (VT.isExtended()) {
+ Erased = ExtendedValueTypeNodes.erase(VT);
+ } else {
+ Erased = ValueTypeNodes[VT.getSimpleVT().SimpleTy] != nullptr;
+ ValueTypeNodes[VT.getSimpleVT().SimpleTy] = nullptr;
+ }
+ break;
+ }
+ default:
+ // Remove it from the CSE Map.
+ assert(N->getOpcode() != ISD::DELETED_NODE && "DELETED_NODE in CSEMap!");
+ assert(N->getOpcode() != ISD::EntryToken && "EntryToken in CSEMap!");
+ Erased = CSEMap.RemoveNode(N);
+ break;
+ }
+#ifndef NDEBUG
+ // Verify that the node was actually in one of the CSE maps, unless it has a
+ // flag result (which cannot be CSE'd) or is one of the special cases that are
+ // not subject to CSE.
+ if (!Erased && N->getValueType(N->getNumValues()-1) != MVT::Glue &&
+ !N->isMachineOpcode() && !doNotCSE(N)) {
+ N->dump(this);
+ dbgs() << "\n";
+ llvm_unreachable("Node is not in map!");
+ }
+#endif
+ return Erased;
+}
+
+/// AddModifiedNodeToCSEMaps - The specified node has been removed from the CSE
+/// maps and modified in place. Add it back to the CSE maps, unless an identical
+/// node already exists, in which case transfer all its users to the existing
+/// node. This transfer can potentially trigger recursive merging.
+///
+void
+SelectionDAG::AddModifiedNodeToCSEMaps(SDNode *N) {
+ // For node types that aren't CSE'd, just act as if no identical node
+ // already exists.
+ if (!doNotCSE(N)) {
+ SDNode *Existing = CSEMap.GetOrInsertNode(N);
+ if (Existing != N) {
+ // If there was already an existing matching node, use ReplaceAllUsesWith
+ // to replace the dead one with the existing one. This can cause
+ // recursive merging of other unrelated nodes down the line.
+ ReplaceAllUsesWith(N, Existing);
+
+ // N is now dead. Inform the listeners and delete it.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeDeleted(N, Existing);
+ DeleteNodeNotInCSEMaps(N);
+ return;
+ }
+ }
+
+ // If the node doesn't already exist, we updated it. Inform listeners.
+ for (DAGUpdateListener *DUL = UpdateListeners; DUL; DUL = DUL->Next)
+ DUL->NodeUpdated(N);
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, SDValue Op,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ SDValue Ops[] = { Op };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ if (const SDNodeFlags *Flags = N->getFlags())
+ Node->intersectFlagsWith(Flags);
+ return Node;
+}
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N,
+ SDValue Op1, SDValue Op2,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ SDValue Ops[] = { Op1, Op2 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ if (const SDNodeFlags *Flags = N->getFlags())
+ Node->intersectFlagsWith(Flags);
+ return Node;
+}
+
+
+/// FindModifiedNodeSlot - Find a slot for the specified node if its operands
+/// were replaced with those specified. If this node is never memoized,
+/// return null, otherwise return a pointer to the slot it would take. If a
+/// node already exists with these operands, the slot will be non-null.
+SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
+ void *&InsertPos) {
+ if (doNotCSE(N))
+ return nullptr;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, N->getOpcode(), N->getVTList(), Ops);
+ AddNodeIDCustom(ID, N);
+ SDNode *Node = FindNodeOrInsertPos(ID, SDLoc(N), InsertPos);
+ if (Node)
+ if (const SDNodeFlags *Flags = N->getFlags())
+ Node->intersectFlagsWith(Flags);
+ return Node;
+}
+
+unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
+ Type *Ty = VT == MVT::iPTR ?
+ PointerType::get(Type::getInt8Ty(*getContext()), 0) :
+ VT.getTypeForEVT(*getContext());
+
+ return getDataLayout().getABITypeAlignment(Ty);
+}
+
+// EntryNode could meaningfully have debug info if we can find it...
+SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
+ : TM(tm), TSI(nullptr), TLI(nullptr), OptLevel(OL),
+ EntryNode(ISD::EntryToken, 0, DebugLoc(), getVTList(MVT::Other)),
+ Root(getEntryNode()), NewNodesMustHaveLegalTypes(false),
+ UpdateListeners(nullptr) {
+ InsertNode(&EntryNode);
+ DbgInfo = new SDDbgInfo();
+}
+
+void SelectionDAG::init(MachineFunction &mf) {
+ MF = &mf;
+ TLI = getSubtarget().getTargetLowering();
+ TSI = getSubtarget().getSelectionDAGInfo();
+ Context = &mf.getFunction()->getContext();
+}
+
+SelectionDAG::~SelectionDAG() {
+ assert(!UpdateListeners && "Dangling registered DAGUpdateListeners");
+ allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
+ delete DbgInfo;
+}
+
+void SelectionDAG::allnodes_clear() {
+ assert(&*AllNodes.begin() == &EntryNode);
+ AllNodes.remove(AllNodes.begin());
+ while (!AllNodes.empty())
+ DeallocateNode(&AllNodes.front());
+#ifndef NDEBUG
+ NextPersistentId = 0;
+#endif
+}
+
+SDNode *SelectionDAG::GetBinarySDNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTs, SDValue N1, SDValue N2,
+ const SDNodeFlags *Flags) {
+ SDValue Ops[] = {N1, N2};
+
+ if (isBinOpWithFlags(Opcode)) {
+ // If no flags were passed in, use a default flags object.
+ SDNodeFlags F;
+ if (Flags == nullptr)
+ Flags = &F;
+
+ auto *FN = newSDNode<BinaryWithFlagsSDNode>(Opcode, DL.getIROrder(),
+ DL.getDebugLoc(), VTs, *Flags);
+ createOperands(FN, Ops);
+
+ return FN;
+ }
+
+ auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ return N;
+}
+
+SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+ void *&InsertPos) {
+ SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+ if (N) {
+ switch (N->getOpcode()) {
+ default: break;
+ case ISD::Constant:
+ case ISD::ConstantFP:
+ llvm_unreachable("Querying for Constant and ConstantFP nodes requires "
+ "debug location. Use another overload.");
+ }
+ }
+ return N;
+}
+
+SDNode *SelectionDAG::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
+ const SDLoc &DL, void *&InsertPos) {
+ SDNode *N = CSEMap.FindNodeOrInsertPos(ID, InsertPos);
+ if (N) {
+ switch (N->getOpcode()) {
+ case ISD::Constant:
+ case ISD::ConstantFP:
+ // Erase debug location from the node if the node is used at several
+ // different places. Do not propagate one location to all uses as it
+ // will cause a worse single stepping debugging experience.
+ if (N->getDebugLoc() != DL.getDebugLoc())
+ N->setDebugLoc(DebugLoc());
+ break;
+ default:
+ // When the node's point of use is located earlier in the instruction
+ // sequence than its prior point of use, update its debug info to the
+ // earlier location.
+ if (DL.getIROrder() && DL.getIROrder() < N->getIROrder())
+ N->setDebugLoc(DL.getDebugLoc());
+ break;
+ }
+ }
+ return N;
+}
+
+void SelectionDAG::clear() {
+ allnodes_clear();
+ OperandRecycler.clear(OperandAllocator);
+ OperandAllocator.Reset();
+ CSEMap.clear();
+
+ ExtendedValueTypeNodes.clear();
+ ExternalSymbols.clear();
+ TargetExternalSymbols.clear();
+ MCSymbols.clear();
+ std::fill(CondCodeNodes.begin(), CondCodeNodes.end(),
+ static_cast<CondCodeSDNode*>(nullptr));
+ std::fill(ValueTypeNodes.begin(), ValueTypeNodes.end(),
+ static_cast<SDNode*>(nullptr));
+
+ EntryNode.UseList = nullptr;
+ InsertNode(&EntryNode);
+ Root = getEntryNode();
+ DbgInfo->clear();
+}
+
+SDValue SelectionDAG::getAnyExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::ANY_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getSExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::SIGN_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
+ return VT.bitsGT(Op.getValueType()) ?
+ getNode(ISD::ZERO_EXTEND, DL, VT, Op) :
+ getNode(ISD::TRUNCATE, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
+ EVT OpVT) {
+ if (VT.bitsLE(Op.getValueType()))
+ return getNode(ISD::TRUNCATE, SL, VT, Op);
+
+ TargetLowering::BooleanContent BType = TLI->getBooleanContents(OpVT);
+ return getNode(TLI->getExtendForContent(BType), SL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
+ assert(!VT.isVector() &&
+ "getZeroExtendInReg should use the vector element type instead of "
+ "the vector type!");
+ if (Op.getValueType() == VT) return Op;
+ unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+ APInt Imm = APInt::getLowBitsSet(BitWidth,
+ VT.getSizeInBits());
+ return getNode(ISD::AND, DL, Op.getValueType(), Op,
+ getConstant(Imm, DL, Op.getValueType()));
+}
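+
+// Illustrative usage sketch (DAG, DL and Op are hypothetical names): zero
+// extending "in register" from a narrower type simply masks away the high
+// bits, so for an i32 value this produces (and Op, 0xFF):
+//
+//   SDValue Masked = DAG.getZeroExtendInReg(Op, DL, MVT::i8);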
+
+SDValue SelectionDAG::getAnyExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::ANY_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getSignExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
+SDValue SelectionDAG::getZeroExtendVectorInReg(SDValue Op, const SDLoc &DL,
+ EVT VT) {
+ assert(VT.isVector() && "This DAG node is restricted to vector types.");
+ assert(VT.getSizeInBits() == Op.getValueType().getSizeInBits() &&
+ "The sizes of the input and result must match in order to perform the "
+ "extend in-register.");
+ assert(VT.getVectorNumElements() < Op.getValueType().getVectorNumElements() &&
+ "The destination vector type must have fewer lanes than the input.");
+ return getNode(ISD::ZERO_EXTEND_VECTOR_INREG, DL, VT, Op);
+}
+
+/// getNOT - Create a bitwise NOT operation as (XOR Val, -1).
+///
+SDValue SelectionDAG::getNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ SDValue NegOne =
+ getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, VT);
+ return getNode(ISD::XOR, DL, VT, Val, NegOne);
+}
+
+SDValue SelectionDAG::getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ SDValue TrueValue;
+ switch (TLI->getBooleanContents(VT)) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ case TargetLowering::UndefinedBooleanContent:
+ TrueValue = getConstant(1, DL, VT);
+ break;
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ TrueValue = getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL,
+ VT);
+ break;
+ }
+ return getNode(ISD::XOR, DL, VT, Val, TrueValue);
+}
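+
+// Illustrative usage sketch (DAG, DL and Val are hypothetical names): both
+// helpers lower to an XOR and differ only in the constant used. getNOT always
+// XORs with all-ones, while getLogicalNOT XORs with the target's "true" value
+// (1 or all-ones, depending on its boolean contents).
+//
+//   SDValue BitNot = DAG.getNOT(DL, Val, MVT::i32);        // xor Val, -1
+//   SDValue LogNot = DAG.getLogicalNOT(DL, Val, MVT::i1);  // xor Val, true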
+
+SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
+ EVT EltVT = VT.getScalarType();
+ assert((EltVT.getSizeInBits() >= 64 ||
+ (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) &&
+ "getConstant with a uint64_t value that doesn't fit in the type!");
+ return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO);
+}
+
+SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
+ bool isT, bool isO) {
+ return getConstant(*ConstantInt::get(*Context, Val), DL, VT, isT, isO);
+}
+
+SDValue SelectionDAG::getConstant(const ConstantInt &Val, const SDLoc &DL,
+ EVT VT, bool isT, bool isO) {
+ assert(VT.isInteger() && "Cannot create FP integer constant!");
+
+ EVT EltVT = VT.getScalarType();
+ const ConstantInt *Elt = &Val;
+
+ // In some cases the vector type is legal but the element type is illegal and
+ // needs to be promoted, for example v8i8 on ARM. In this case, promote the
+ // inserted value (the type does not need to match the vector element type).
+ // Any extra bits introduced will be truncated away.
+ if (VT.isVector() && TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypePromoteInteger) {
+ EltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ APInt NewVal = Elt->getValue().zext(EltVT.getSizeInBits());
+ Elt = ConstantInt::get(*getContext(), NewVal);
+ }
+ // In other cases the element type is illegal and needs to be expanded, for
+ // example v2i64 on MIPS32. In this case, find the nearest legal type, split
+ // the value into n parts and use a vector type with n-times the elements.
+ // Then bitcast to the type requested.
+ // Legalizing constants too early makes the DAGCombiner's job harder so we
+ // only legalize if the DAG tells us we must produce legal types.
+ else if (NewNodesMustHaveLegalTypes && VT.isVector() &&
+ TLI->getTypeAction(*getContext(), EltVT) ==
+ TargetLowering::TypeExpandInteger) {
+ const APInt &NewVal = Elt->getValue();
+ EVT ViaEltVT = TLI->getTypeToTransformTo(*getContext(), EltVT);
+ unsigned ViaEltSizeInBits = ViaEltVT.getSizeInBits();
+ unsigned ViaVecNumElts = VT.getSizeInBits() / ViaEltSizeInBits;
+ EVT ViaVecVT = EVT::getVectorVT(*getContext(), ViaEltVT, ViaVecNumElts);
+
+    // Check that the temporary vector is the correct size. If this fails then
+ // getTypeToTransformTo() probably returned a type whose size (in bits)
+ // isn't a power-of-2 factor of the requested type size.
+ assert(ViaVecVT.getSizeInBits() == VT.getSizeInBits());
+
+ SmallVector<SDValue, 2> EltParts;
+ for (unsigned i = 0; i < ViaVecNumElts / VT.getVectorNumElements(); ++i) {
+ EltParts.push_back(getConstant(NewVal.lshr(i * ViaEltSizeInBits)
+ .trunc(ViaEltSizeInBits), DL,
+ ViaEltVT, isT, isO));
+ }
+
+ // EltParts is currently in little endian order. If we actually want
+ // big-endian order then reverse it now.
+ if (getDataLayout().isBigEndian())
+ std::reverse(EltParts.begin(), EltParts.end());
+
+ // The elements must be reversed when the element order is different
+ // to the endianness of the elements (because the BITCAST is itself a
+ // vector shuffle in this situation). However, we do not need any code to
+ // perform this reversal because getConstant() is producing a vector
+ // splat.
+ // This situation occurs in MIPS MSA.
+
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i < VT.getVectorNumElements(); ++i)
+ Ops.insert(Ops.end(), EltParts.begin(), EltParts.end());
+
+ SDValue Result = getNode(ISD::BITCAST, DL, VT,
+ getNode(ISD::BUILD_VECTOR, DL, ViaVecVT, Ops));
+ return Result;
+ }
+
+ assert(Elt->getBitWidth() == EltVT.getSizeInBits() &&
+ "APInt size does not match type size!");
+ unsigned Opc = isT ? ISD::TargetConstant : ISD::Constant;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
+ ID.AddPointer(Elt);
+ ID.AddBoolean(isO);
+ void *IP = nullptr;
+ SDNode *N = nullptr;
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+
+ if (!N) {
+ N = newSDNode<ConstantSDNode>(isT, isO, Elt, DL.getDebugLoc(), EltVT);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector())
+ Result = getSplatBuildVector(VT, DL, Result);
+ return Result;
+}
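+
+// Illustrative usage sketch (DAG and DL are hypothetical names): requesting a
+// constant of vector type yields a splat BUILD_VECTOR of the scalar constant
+// node, so both of the following typically share (via CSE) the same
+// underlying ConstantSDNode:
+//
+//   SDValue Zero  = DAG.getConstant(0, DL, MVT::i32);    // scalar constant
+//   SDValue VZero = DAG.getConstant(0, DL, MVT::v4i32);  // splat of four zeros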
+
+SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
+ bool isTarget) {
+ return getConstant(Val, DL, TLI->getPointerTy(getDataLayout()), isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
+ bool isTarget) {
+ return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
+}
+
+SDValue SelectionDAG::getConstantFP(const ConstantFP &V, const SDLoc &DL,
+ EVT VT, bool isTarget) {
+ assert(VT.isFloatingPoint() && "Cannot create integer FP constant!");
+
+ EVT EltVT = VT.getScalarType();
+
+ // Do the map lookup using the actual bit pattern for the floating point
+ // value, so that we don't have problems with 0.0 comparing equal to -0.0, and
+ // we don't have issues with SNANs.
+ unsigned Opc = isTarget ? ISD::TargetConstantFP : ISD::ConstantFP;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(EltVT), None);
+ ID.AddPointer(&V);
+ void *IP = nullptr;
+ SDNode *N = nullptr;
+ if ((N = FindNodeOrInsertPos(ID, DL, IP)))
+ if (!VT.isVector())
+ return SDValue(N, 0);
+
+ if (!N) {
+ N = newSDNode<ConstantFPSDNode>(isTarget, &V, DL.getDebugLoc(), EltVT);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ }
+
+ SDValue Result(N, 0);
+ if (VT.isVector())
+ Result = getSplatBuildVector(VT, DL, Result);
+ return Result;
+}
+
+SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
+ bool isTarget) {
+ EVT EltVT = VT.getScalarType();
+ if (EltVT == MVT::f32)
+ return getConstantFP(APFloat((float)Val), DL, VT, isTarget);
+ else if (EltVT == MVT::f64)
+ return getConstantFP(APFloat(Val), DL, VT, isTarget);
+ else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
+ EltVT == MVT::f16) {
+ bool Ignored;
+ APFloat APF = APFloat(Val);
+ APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
+ &Ignored);
+ return getConstantFP(APF, DL, VT, isTarget);
+ } else
+ llvm_unreachable("Unsupported type in getConstantFP");
+}
+
+SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, const SDLoc &DL,
+ EVT VT, int64_t Offset, bool isTargetGA,
+ unsigned char TargetFlags) {
+ assert((TargetFlags == 0 || isTargetGA) &&
+ "Cannot set target flags on target-independent globals");
+
+ // Truncate (with sign-extension) the offset value to the pointer size.
+ unsigned BitWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
+ if (BitWidth < 64)
+ Offset = SignExtend64(Offset, BitWidth);
+
+ unsigned Opc;
+ if (GV->isThreadLocal())
+ Opc = isTargetGA ? ISD::TargetGlobalTLSAddress : ISD::GlobalTLSAddress;
+ else
+ Opc = isTargetGA ? ISD::TargetGlobalAddress : ISD::GlobalAddress;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddPointer(GV);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ ID.AddInteger(GV->getType()->getAddressSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<GlobalAddressSDNode>(
+ Opc, DL.getIROrder(), DL.getDebugLoc(), GV, VT, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getFrameIndex(int FI, EVT VT, bool isTarget) {
+ unsigned Opc = isTarget ? ISD::TargetFrameIndex : ISD::FrameIndex;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddInteger(FI);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<FrameIndexSDNode>(FI, VT, isTarget);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
+ unsigned char TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent jump tables");
+ unsigned Opc = isTarget ? ISD::TargetJumpTable : ISD::JumpTable;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddInteger(JTI);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<JumpTableSDNode>(JTI, VT, isTarget, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
+ unsigned Alignment, int Offset,
+ bool isTarget,
+ unsigned char TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent globals");
+ if (Alignment == 0)
+ Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddInteger(Alignment);
+ ID.AddInteger(Offset);
+ ID.AddPointer(C);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+
+SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
+ unsigned Alignment, int Offset,
+ bool isTarget,
+ unsigned char TargetFlags) {
+ assert((TargetFlags == 0 || isTarget) &&
+ "Cannot set target flags on target-independent globals");
+ if (Alignment == 0)
+ Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
+ unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddInteger(Alignment);
+ ID.AddInteger(Offset);
+ C->addSelectionDAGCSEId(ID);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset,
+ unsigned char TargetFlags) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), None);
+ ID.AddInteger(Index);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<TargetIndexSDNode>(Index, VT, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), None);
+ ID.AddPointer(MBB);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<BasicBlockSDNode>(MBB);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getValueType(EVT VT) {
+ if (VT.isSimple() && (unsigned)VT.getSimpleVT().SimpleTy >=
+ ValueTypeNodes.size())
+ ValueTypeNodes.resize(VT.getSimpleVT().SimpleTy+1);
+
+ SDNode *&N = VT.isExtended() ?
+ ExtendedValueTypeNodes[VT] : ValueTypeNodes[VT.getSimpleVT().SimpleTy];
+
+ if (N) return SDValue(N, 0);
+ N = newSDNode<VTSDNode>(VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getExternalSymbol(const char *Sym, EVT VT) {
+ SDNode *&N = ExternalSymbols[Sym];
+ if (N) return SDValue(N, 0);
+ N = newSDNode<ExternalSymbolSDNode>(false, Sym, 0, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMCSymbol(MCSymbol *Sym, EVT VT) {
+ SDNode *&N = MCSymbols[Sym];
+ if (N)
+ return SDValue(N, 0);
+ N = newSDNode<MCSymbolSDNode>(Sym, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTargetExternalSymbol(const char *Sym, EVT VT,
+ unsigned char TargetFlags) {
+ SDNode *&N =
+ TargetExternalSymbols[std::pair<std::string,unsigned char>(Sym,
+ TargetFlags)];
+ if (N) return SDValue(N, 0);
+ N = newSDNode<ExternalSymbolSDNode>(true, Sym, TargetFlags, VT);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getCondCode(ISD::CondCode Cond) {
+ if ((unsigned)Cond >= CondCodeNodes.size())
+ CondCodeNodes.resize(Cond+1);
+
+ if (!CondCodeNodes[Cond]) {
+ auto *N = newSDNode<CondCodeSDNode>(Cond);
+ CondCodeNodes[Cond] = N;
+ InsertNode(N);
+ }
+
+ return SDValue(CondCodeNodes[Cond], 0);
+}
+
+/// Swaps the values of N1 and N2. Swaps all indices in the shuffle mask M that
+/// point at N1 to point at N2 and indices that point at N2 to point at N1.
+static void commuteShuffle(SDValue &N1, SDValue &N2, MutableArrayRef<int> M) {
+ std::swap(N1, N2);
+ ShuffleVectorSDNode::commuteMask(M);
+}
+
+SDValue SelectionDAG::getVectorShuffle(EVT VT, const SDLoc &dl, SDValue N1,
+ SDValue N2, ArrayRef<int> Mask) {
+ assert(VT.getVectorNumElements() == Mask.size() &&
+ "Must have the same number of vector elements as mask elements!");
+ assert(VT == N1.getValueType() && VT == N2.getValueType() &&
+ "Invalid VECTOR_SHUFFLE");
+
+ // Canonicalize shuffle undef, undef -> undef
+ if (N1.isUndef() && N2.isUndef())
+ return getUNDEF(VT);
+
+ // Validate that all indices in Mask are within the range of the elements
+ // input to the shuffle.
+ int NElts = Mask.size();
+ assert(all_of(Mask, [&](int M) { return M < (NElts * 2); }) &&
+ "Index out of range");
+
+ // Copy the mask so we can do any needed cleanup.
+ SmallVector<int, 8> MaskVec(Mask.begin(), Mask.end());
+
+ // Canonicalize shuffle v, v -> v, undef
+ if (N1 == N2) {
+ N2 = getUNDEF(VT);
+ for (int i = 0; i != NElts; ++i)
+ if (MaskVec[i] >= NElts) MaskVec[i] -= NElts;
+ }
+
+ // Canonicalize shuffle undef, v -> v, undef. Commute the shuffle mask.
+ if (N1.isUndef())
+ commuteShuffle(N1, N2, MaskVec);
+
+ // If shuffling a splat, try to blend the splat instead. We do this here so
+ // that even when this arises during lowering we don't have to re-handle it.
+ auto BlendSplat = [&](BuildVectorSDNode *BV, int Offset) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ if (!Splat)
+ return;
+
+ for (int i = 0; i < NElts; ++i) {
+ if (MaskVec[i] < Offset || MaskVec[i] >= (Offset + NElts))
+ continue;
+
+ // If this input comes from undef, mark it as such.
+ if (UndefElements[MaskVec[i] - Offset]) {
+ MaskVec[i] = -1;
+ continue;
+ }
+
+ // If we can blend a non-undef lane, use that instead.
+ if (!UndefElements[i])
+ MaskVec[i] = i + Offset;
+ }
+ };
+ if (auto *N1BV = dyn_cast<BuildVectorSDNode>(N1))
+ BlendSplat(N1BV, 0);
+ if (auto *N2BV = dyn_cast<BuildVectorSDNode>(N2))
+ BlendSplat(N2BV, NElts);
+
+  // Canonicalize all indices into lhs -> shuffle lhs, undef
+  // Canonicalize all indices into rhs -> shuffle rhs, undef
+ bool AllLHS = true, AllRHS = true;
+ bool N2Undef = N2.isUndef();
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= NElts) {
+ if (N2Undef)
+ MaskVec[i] = -1;
+ else
+ AllLHS = false;
+ } else if (MaskVec[i] >= 0) {
+ AllRHS = false;
+ }
+ }
+ if (AllLHS && AllRHS)
+ return getUNDEF(VT);
+ if (AllLHS && !N2Undef)
+ N2 = getUNDEF(VT);
+ if (AllRHS) {
+ N1 = getUNDEF(VT);
+ commuteShuffle(N1, N2, MaskVec);
+ }
+ // Reset our undef status after accounting for the mask.
+ N2Undef = N2.isUndef();
+ // Re-check whether both sides ended up undef.
+ if (N1.isUndef() && N2Undef)
+ return getUNDEF(VT);
+
+  // If this is an identity shuffle, return the first operand.
+ bool Identity = true, AllSame = true;
+ for (int i = 0; i != NElts; ++i) {
+ if (MaskVec[i] >= 0 && MaskVec[i] != i) Identity = false;
+ if (MaskVec[i] != MaskVec[0]) AllSame = false;
+ }
+ if (Identity && NElts)
+ return N1;
+
+ // Shuffling a constant splat doesn't change the result.
+ if (N2Undef) {
+ SDValue V = N1;
+
+ // Look through any bitcasts. We check that these don't change the number
+    // (and size) of elements and just change their types.
+ while (V.getOpcode() == ISD::BITCAST)
+ V = V->getOperand(0);
+
+ // A splat should always show up as a build vector node.
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(V)) {
+ BitVector UndefElements;
+ SDValue Splat = BV->getSplatValue(&UndefElements);
+ // If this is a splat of an undef, shuffling it is also undef.
+ if (Splat && Splat.isUndef())
+ return getUNDEF(VT);
+
+ bool SameNumElts =
+ V.getValueType().getVectorNumElements() == VT.getVectorNumElements();
+
+ // We only have a splat which can skip shuffles if there is a splatted
+ // value and no undef lanes rearranged by the shuffle.
+ if (Splat && UndefElements.none()) {
+ // Splat of <x, x, ..., x>, return <x, x, ..., x>, provided that the
+ // number of elements match or the value splatted is a zero constant.
+ if (SameNumElts)
+ return N1;
+ if (auto *C = dyn_cast<ConstantSDNode>(Splat))
+ if (C->isNullValue())
+ return N1;
+ }
+
+ // If the shuffle itself creates a splat, build the vector directly.
+ if (AllSame && SameNumElts) {
+ EVT BuildVT = BV->getValueType(0);
+ const SDValue &Splatted = BV->getOperand(MaskVec[0]);
+ SDValue NewBV = getSplatBuildVector(BuildVT, dl, Splatted);
+
+ // We may have jumped through bitcasts, so the type of the
+ // BUILD_VECTOR may not match the type of the shuffle.
+ if (BuildVT != VT)
+ NewBV = getNode(ISD::BITCAST, dl, VT, NewBV);
+ return NewBV;
+ }
+ }
+ }
+
+ FoldingSetNodeID ID;
+ SDValue Ops[2] = { N1, N2 };
+ AddNodeIDNode(ID, ISD::VECTOR_SHUFFLE, getVTList(VT), Ops);
+ for (int i = 0; i != NElts; ++i)
+ ID.AddInteger(MaskVec[i]);
+
+ void* IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ // Allocate the mask array for the node out of the BumpPtrAllocator, since
+ // SDNode doesn't have access to it. This memory will be "leaked" when
+  // the node is deallocated, but recovered when the OperandAllocator is
+  // released.
+ int *MaskAlloc = OperandAllocator.Allocate<int>(NElts);
+ std::copy(MaskVec.begin(), MaskVec.end(), MaskAlloc);
+
+ auto *N = newSDNode<ShuffleVectorSDNode>(VT, dl.getIROrder(),
+ dl.getDebugLoc(), MaskAlloc);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
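+
+// Illustrative sketch of the canonicalizations above (A is a hypothetical
+// v4i32 value): shuffling a vector with itself folds the mask onto the first
+// operand, and a mask that then selects lanes 0..3 in order is an identity
+// shuffle, so no shuffle node is created at all:
+//
+//   SDValue S = DAG.getVectorShuffle(MVT::v4i32, DL, A, A, {4, 1, 6, 3});
+//   // The mask becomes {0, 1, 2, 3} once both inputs match, so S == A.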
+
+SDValue SelectionDAG::getCommutedVectorShuffle(const ShuffleVectorSDNode &SV) {
+ MVT VT = SV.getSimpleValueType(0);
+ SmallVector<int, 8> MaskVec(SV.getMask().begin(), SV.getMask().end());
+ ShuffleVectorSDNode::commuteMask(MaskVec);
+
+ SDValue Op0 = SV.getOperand(0);
+ SDValue Op1 = SV.getOperand(1);
+ return getVectorShuffle(VT, SDLoc(&SV), Op1, Op0, MaskVec);
+}
+
+SDValue SelectionDAG::getConvertRndSat(EVT VT, const SDLoc &dl, SDValue Val,
+ SDValue DTy, SDValue STy, SDValue Rnd,
+ SDValue Sat, ISD::CvtCode Code) {
+ // If the src and dest types are the same and the conversion is between
+ // integer types of the same sign or two floats, no conversion is necessary.
+ if (DTy == STy &&
+ (Code == ISD::CVT_UU || Code == ISD::CVT_SS || Code == ISD::CVT_FF))
+ return Val;
+
+ FoldingSetNodeID ID;
+ SDValue Ops[] = { Val, DTy, STy, Rnd, Sat };
+ AddNodeIDNode(ID, ISD::CONVERT_RNDSAT, getVTList(VT), Ops);
+ void* IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N =
+ newSDNode<CvtRndSatSDNode>(VT, dl.getIROrder(), dl.getDebugLoc(), Code);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegister(unsigned RegNo, EVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::Register, getVTList(VT), None);
+ ID.AddInteger(RegNo);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<RegisterSDNode>(RegNo, VT);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getRegisterMask(const uint32_t *RegMask) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::RegisterMask, getVTList(MVT::Untyped), None);
+ ID.AddPointer(RegMask);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<RegisterMaskSDNode>(RegMask);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getEHLabel(const SDLoc &dl, SDValue Root,
+ MCSymbol *Label) {
+ FoldingSetNodeID ID;
+ SDValue Ops[] = { Root };
+ AddNodeIDNode(ID, ISD::EH_LABEL, getVTList(MVT::Other), Ops);
+ ID.AddPointer(Label);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<EHLabelSDNode>(dl.getIROrder(), dl.getDebugLoc(), Label);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset,
+ bool isTarget,
+ unsigned char TargetFlags) {
+ unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, getVTList(VT), None);
+ ID.AddPointer(BA);
+ ID.AddInteger(Offset);
+ ID.AddInteger(TargetFlags);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<BlockAddressSDNode>(Opc, VT, BA, Offset, TargetFlags);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getSrcValue(const Value *V) {
+ assert((!V || V->getType()->isPointerTy()) &&
+ "SrcValue is not a pointer?");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
+ ID.AddPointer(V);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<SrcValueSDNode>(V);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMDNode(const MDNode *MD) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MDNODE_SDNODE, getVTList(MVT::Other), None);
+ ID.AddPointer(MD);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<MDNodeSDNode>(MD);
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getBitcast(EVT VT, SDValue V) {
+ if (VT == V.getValueType())
+ return V;
+
+ return getNode(ISD::BITCAST, SDLoc(V), VT, V);
+}
+
+SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
+ unsigned SrcAS, unsigned DestAS) {
+ SDValue Ops[] = {Ptr};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::ADDRSPACECAST, getVTList(VT), Ops);
+ ID.AddInteger(SrcAS);
+ ID.AddInteger(DestAS);
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<AddrSpaceCastSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VT, SrcAS, DestAS);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+/// getShiftAmountOperand - Return the specified value cast to
+/// the target's desired shift amount type.
+SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
+ EVT OpTy = Op.getValueType();
+ EVT ShTy = TLI->getShiftAmountTy(LHSTy, getDataLayout());
+ if (OpTy == ShTy || OpTy.isVector()) return Op;
+
+ return getZExtOrTrunc(Op, SDLoc(Op), ShTy);
+}
+
+SDValue SelectionDAG::expandVAArg(SDNode *Node) {
+ SDLoc dl(Node);
+ const TargetLowering &TLI = getTargetLoweringInfo();
+ const Value *V = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ EVT VT = Node->getValueType(0);
+ SDValue Tmp1 = Node->getOperand(0);
+ SDValue Tmp2 = Node->getOperand(1);
+ unsigned Align = Node->getConstantOperandVal(3);
+
+ SDValue VAListLoad = getLoad(TLI.getPointerTy(getDataLayout()), dl, Tmp1,
+ Tmp2, MachinePointerInfo(V));
+ SDValue VAList = VAListLoad;
+
+ if (Align > TLI.getMinStackArgumentAlignment()) {
+ assert(((Align & (Align-1)) == 0) && "Expected Align to be a power of 2");
+
+ VAList = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
+ getConstant(Align - 1, dl, VAList.getValueType()));
+
+ VAList = getNode(ISD::AND, dl, VAList.getValueType(), VAList,
+ getConstant(-(int64_t)Align, dl, VAList.getValueType()));
+ }
+
+ // Increment the pointer, VAList, to the next vaarg
+ Tmp1 = getNode(ISD::ADD, dl, VAList.getValueType(), VAList,
+ getConstant(getDataLayout().getTypeAllocSize(
+ VT.getTypeForEVT(*getContext())),
+ dl, VAList.getValueType()));
+ // Store the incremented VAList to the legalized pointer
+ Tmp1 =
+ getStore(VAListLoad.getValue(1), dl, Tmp1, Tmp2, MachinePointerInfo(V));
+ // Load the actual argument out of the pointer VAList
+ return getLoad(VT, dl, Tmp1, VAList, MachinePointerInfo());
+}
+
+SDValue SelectionDAG::expandVACopy(SDNode *Node) {
+ SDLoc dl(Node);
+ const TargetLowering &TLI = getTargetLoweringInfo();
+ // This defaults to loading a pointer from the input and storing it to the
+ // output, returning the chain.
+ const Value *VD = cast<SrcValueSDNode>(Node->getOperand(3))->getValue();
+ const Value *VS = cast<SrcValueSDNode>(Node->getOperand(4))->getValue();
+ SDValue Tmp1 =
+ getLoad(TLI.getPointerTy(getDataLayout()), dl, Node->getOperand(0),
+ Node->getOperand(2), MachinePointerInfo(VS));
+ return getStore(Tmp1.getValue(1), dl, Tmp1, Node->getOperand(1),
+ MachinePointerInfo(VD));
+}
+
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+ MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+ unsigned ByteSize = VT.getStoreSize();
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ unsigned StackAlign =
+ std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
+
+ int FrameIdx = FrameInfo->CreateStackObject(ByteSize, StackAlign, false);
+ return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
+}
+
+SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
+ unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
+ Type *Ty1 = VT1.getTypeForEVT(*getContext());
+ Type *Ty2 = VT2.getTypeForEVT(*getContext());
+ const DataLayout &DL = getDataLayout();
+ unsigned Align =
+ std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
+
+ MachineFrameInfo *FrameInfo = getMachineFunction().getFrameInfo();
+ int FrameIdx = FrameInfo->CreateStackObject(Bytes, Align, false);
+ return getFrameIndex(FrameIdx, TLI->getPointerTy(getDataLayout()));
+}
+
+SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
+ ISD::CondCode Cond, const SDLoc &dl) {
+ // These setcc operations always fold.
+ switch (Cond) {
+ default: break;
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2: return getConstant(0, dl, VT);
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2: {
+ TargetLowering::BooleanContent Cnt =
+ TLI->getBooleanContents(N1->getValueType(0));
+ return getConstant(
+ Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
+ VT);
+ }
+
+ case ISD::SETOEQ:
+ case ISD::SETOGT:
+ case ISD::SETOGE:
+ case ISD::SETOLT:
+ case ISD::SETOLE:
+ case ISD::SETONE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ case ISD::SETUEQ:
+ case ISD::SETUNE:
+ assert(!N1.getValueType().isInteger() && "Illegal setcc for integer!");
+ break;
+ }
+
+ if (ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2)) {
+ const APInt &C2 = N2C->getAPIntValue();
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ switch (Cond) {
+ default: llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETEQ: return getConstant(C1 == C2, dl, VT);
+ case ISD::SETNE: return getConstant(C1 != C2, dl, VT);
+ case ISD::SETULT: return getConstant(C1.ult(C2), dl, VT);
+ case ISD::SETUGT: return getConstant(C1.ugt(C2), dl, VT);
+ case ISD::SETULE: return getConstant(C1.ule(C2), dl, VT);
+ case ISD::SETUGE: return getConstant(C1.uge(C2), dl, VT);
+ case ISD::SETLT: return getConstant(C1.slt(C2), dl, VT);
+ case ISD::SETGT: return getConstant(C1.sgt(C2), dl, VT);
+ case ISD::SETLE: return getConstant(C1.sle(C2), dl, VT);
+ case ISD::SETGE: return getConstant(C1.sge(C2), dl, VT);
+ }
+ }
+ }
+ if (ConstantFPSDNode *N1C = dyn_cast<ConstantFPSDNode>(N1)) {
+ if (ConstantFPSDNode *N2C = dyn_cast<ConstantFPSDNode>(N2)) {
+ APFloat::cmpResult R = N1C->getValueAPF().compare(N2C->getValueAPF());
+ switch (Cond) {
+ default: break;
+ case ISD::SETEQ: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOEQ: return getConstant(R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETNE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETONE: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpLessThan, dl, VT);
+ case ISD::SETLT: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOLT: return getConstant(R==APFloat::cmpLessThan, dl, VT);
+ case ISD::SETGT: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOGT: return getConstant(R==APFloat::cmpGreaterThan, dl, VT);
+ case ISD::SETLE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOLE: return getConstant(R==APFloat::cmpLessThan ||
+ R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETGE: if (R==APFloat::cmpUnordered)
+ return getUNDEF(VT);
+ // fall through
+ case ISD::SETOGE: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETO: return getConstant(R!=APFloat::cmpUnordered, dl, VT);
+ case ISD::SETUO: return getConstant(R==APFloat::cmpUnordered, dl, VT);
+ case ISD::SETUEQ: return getConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpEqual, dl, VT);
+ case ISD::SETUNE: return getConstant(R!=APFloat::cmpEqual, dl, VT);
+ case ISD::SETULT: return getConstant(R==APFloat::cmpUnordered ||
+ R==APFloat::cmpLessThan, dl, VT);
+ case ISD::SETUGT: return getConstant(R==APFloat::cmpGreaterThan ||
+ R==APFloat::cmpUnordered, dl, VT);
+ case ISD::SETULE: return getConstant(R!=APFloat::cmpGreaterThan, dl, VT);
+ case ISD::SETUGE: return getConstant(R!=APFloat::cmpLessThan, dl, VT);
+ }
+ } else {
+ // Ensure that the constant occurs on the RHS.
+ ISD::CondCode SwappedCond = ISD::getSetCCSwappedOperands(Cond);
+ MVT CompVT = N1.getValueType().getSimpleVT();
+ if (!TLI->isCondCodeLegal(SwappedCond, CompVT))
+ return SDValue();
+
+ return getSetCC(dl, VT, N2, N1, SwappedCond);
+ }
+ }
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// SignBitIsZero - Return true if the sign bit of Op is known to be zero. We
+/// use this predicate to simplify operations downstream.
+bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
+ // This predicate is not safe for vector operations.
+ if (Op.getValueType().isVector())
+ return false;
+
+ unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+ return MaskedValueIsZero(Op, APInt::getSignBit(BitWidth), Depth);
+}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero. We use
+/// this predicate to simplify operations downstream. Mask is known to be zero
+/// for bits that V cannot have.
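+/// As an illustrative usage sketch (Ptr, PtrBits and DAG are assumed names,
+/// not part of this file), a caller could test 4-byte alignment by asking
+/// whether the two low bits are known zero:
+///   APInt LowTwo = APInt::getLowBitsSet(PtrBits, 2);
+///   bool Aligned4 = DAG.MaskedValueIsZero(Ptr, LowTwo);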
+bool SelectionDAG::MaskedValueIsZero(SDValue Op, const APInt &Mask,
+ unsigned Depth) const {
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ return (KnownZero & Mask) == Mask;
+}
+
+/// Determine which bits of Op are known to be either zero or one and return
+/// them in the KnownZero/KnownOne bitsets.
+void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
+ APInt &KnownOne, unsigned Depth) const {
+ unsigned BitWidth = Op.getValueType().getScalarType().getSizeInBits();
+
+ KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
+ if (Depth == 6)
+ return; // Limit search depth.
+
+ APInt KnownZero2, KnownOne2;
+
+ switch (Op.getOpcode()) {
+ case ISD::Constant:
+ // We know all of the bits for a constant!
+ KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
+ KnownZero = ~KnownOne;
+ break;
+ case ISD::AND:
+ // If either the LHS or the RHS are Zero, the result is zero.
+ computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+ // Output known-0 are known to be clear if zero in either the LHS | RHS.
+ KnownZero |= KnownZero2;
+ break;
+ case ISD::OR:
+ computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+ // Output known-1 are known to be set if set in either the LHS | RHS.
+ KnownOne |= KnownOne2;
+ break;
+ case ISD::XOR: {
+ computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+ // Output known-1 are known to be set if set in only one of the LHS, RHS.
+ KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+ KnownZero = KnownZeroOut;
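+    // For example, if the LHS is known to be 0b01?0 (bit 1 unknown) and the
+    // RHS is the constant 0b0110, then bits 3, 2 and 0 agree in both inputs,
+    // so they are known zero in the result and only bit 1 stays unknown:
+    // the XOR is known to be 0b00?0.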
+ break;
+ }
+ case ISD::MUL: {
+ computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+ // If low bits are zero in either operand, output low known-0 bits.
+    // Also compute a conservative estimate for high known-0 bits.
+ // More trickiness is possible, but this is sufficient for the
+ // interesting case of alignment computation.
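+    // For example, if one operand is known to be a multiple of 4 (two
+    // trailing zero bits) and the other a multiple of 8 (three trailing zero
+    // bits), TrailZ is 2 + 3 = 5 and the product is known to be a multiple
+    // of 32.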
+ KnownOne.clearAllBits();
+ unsigned TrailZ = KnownZero.countTrailingOnes() +
+ KnownZero2.countTrailingOnes();
+ unsigned LeadZ = std::max(KnownZero.countLeadingOnes() +
+ KnownZero2.countLeadingOnes(),
+ BitWidth) - BitWidth;
+
+ TrailZ = std::min(TrailZ, BitWidth);
+ LeadZ = std::min(LeadZ, BitWidth);
+ KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
+ APInt::getHighBitsSet(BitWidth, LeadZ);
+ break;
+ }
+ case ISD::UDIV: {
+ // For the purposes of computing leading zeros we can conservatively
+ // treat a udiv as a logical right shift by the power of 2 known to
+ // be less than the denominator.
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+ unsigned LeadZ = KnownZero2.countLeadingOnes();
+
+ KnownOne2.clearAllBits();
+ KnownZero2.clearAllBits();
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+ unsigned RHSUnknownLeadingOnes = KnownOne2.countLeadingZeros();
+ if (RHSUnknownLeadingOnes != BitWidth)
+ LeadZ = std::min(BitWidth,
+ LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
+
+ KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+ break;
+ }
+ case ISD::SELECT:
+ computeKnownBits(Op.getOperand(2), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SELECT_CC:
+ computeKnownBits(Op.getOperand(3), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(2), KnownZero2, KnownOne2, Depth+1);
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ break;
+ // The boolean result conforms to getBooleanContents.
+ // If we know the result of a setcc has the top bits zero, use this info.
+    // We know that we have an integer-based boolean since these operations
+    // are only available for integers.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
+ case ISD::SETCC:
+ // If we know the result of a setcc has the top bits zero, use this info.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrOneBooleanContent &&
+ BitWidth > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
+ case ISD::SHL:
+ // (shl X, C1) & C2 == 0 iff (X & C2 >>u C1) == 0
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero <<= ShAmt;
+ KnownOne <<= ShAmt;
+ // low bits known zero.
+ KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt);
+ }
+ break;
+ case ISD::SRL:
+ // (ushr X, C1) & C2 == 0 iff (-1 >> C1) & C2 == 0
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+ KnownZero |= HighBits; // High bits known zero.
+ }
+ break;
+ case ISD::SRA:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ // Handle the sign bits.
+ APInt SignBit = APInt::getSignBit(BitWidth);
+ SignBit = SignBit.lshr(ShAmt); // Adjust to where it is now in the mask.
+
+ if (KnownZero.intersects(SignBit)) {
+ KnownZero |= HighBits; // New bits are known zero.
+ } else if (KnownOne.intersects(SignBit)) {
+ KnownOne |= HighBits; // New bits are known one.
+ }
+ }
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT EVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ unsigned EBits = EVT.getScalarType().getSizeInBits();
+
+ // Sign extension. Compute the demanded bits in the result that are not
+ // present in the input.
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - EBits);
+
+ APInt InSignBit = APInt::getSignBit(EBits);
+ APInt InputDemandedBits = APInt::getLowBitsSet(BitWidth, EBits);
+
+ // If the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InSignBit = InSignBit.zext(BitWidth);
+ if (NewBits.getBoolValue())
+ InputDemandedBits |= InSignBit;
+
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownOne &= InputDemandedBits;
+ KnownZero &= InputDemandedBits;
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+ if (KnownZero.intersects(InSignBit)) { // Input sign bit known clear
+ KnownZero |= NewBits;
+ KnownOne &= ~NewBits;
+ } else if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
+ }
+ break;
+ }
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTPOP: {
+ unsigned LowBits = Log2_32(BitWidth)+1;
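+    // For example, with BitWidth == 32 the result is at most 32, which fits
+    // in Log2_32(32)+1 = 6 bits, so everything above the low 6 bits is zero.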
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+ KnownOne.clearAllBits();
+ break;
+ }
+ case ISD::LOAD: {
+ LoadSDNode *LD = cast<LoadSDNode>(Op);
+ // If this is a ZEXTLoad and we are looking at the loaded value.
+ if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+ EVT VT = LD->getMemoryVT();
+ unsigned MemBits = VT.getScalarType().getSizeInBits();
+ KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+ } else if (const MDNode *Ranges = LD->getRanges()) {
+ if (LD->getExtensionType() == ISD::NON_EXTLOAD)
+ computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne);
+ }
+ break;
+ }
+ case ISD::ZERO_EXTEND: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarType().getSizeInBits();
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
+ KnownZero = KnownZero.trunc(InBits);
+ KnownOne = KnownOne.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+ KnownZero |= NewBits;
+ break;
+ }
+ case ISD::SIGN_EXTEND: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarType().getSizeInBits();
+ APInt NewBits = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
+
+ KnownZero = KnownZero.trunc(InBits);
+ KnownOne = KnownOne.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+
+ // Note if the sign bit is known to be zero or one.
+ bool SignBitKnownZero = KnownZero.isNegative();
+ bool SignBitKnownOne = KnownOne.isNegative();
+
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+
+ // If the sign bit is known zero or one, the top bits match.
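+    // For example, sign-extending an i8 whose sign bit is known clear to i32
+    // makes the 24 new high bits known zero.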
+ if (SignBitKnownZero)
+ KnownZero |= NewBits;
+ else if (SignBitKnownOne)
+ KnownOne |= NewBits;
+ break;
+ }
+ case ISD::ANY_EXTEND: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarType().getSizeInBits();
+ KnownZero = KnownZero.trunc(InBits);
+ KnownOne = KnownOne.trunc(InBits);
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarType().getSizeInBits();
+ KnownZero = KnownZero.zext(InBits);
+ KnownOne = KnownOne.zext(InBits);
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero = KnownZero.trunc(BitWidth);
+ KnownOne = KnownOne.trunc(BitWidth);
+ break;
+ }
+ case ISD::AssertZext: {
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits());
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ KnownZero |= (~InMask);
+ KnownOne &= (~KnownZero);
+ break;
+ }
+ case ISD::FGETSIGN:
+ // All bits are zero except the low bit.
+ KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+ break;
+
+ case ISD::SUB: {
+ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0))) {
+      // We know that the top bits of C-X are clear if X contains fewer bits
+ // than C (i.e. no wrap-around can happen). For example, 20-X is
+ // positive if we can prove that X is >= 0 and < 16.
+ if (CLHS->getAPIntValue().isNonNegative()) {
+ unsigned NLZ = (CLHS->getAPIntValue()+1).countLeadingZeros();
+ // NLZ can't be BitWidth with no sign bit
+ APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+
+ // If all of the MaskV bits are known to be zero, then we know the
+ // output top bits are zero, because we now know that the output is
+ // from [0-C].
+ if ((KnownZero2 & MaskV) == MaskV) {
+ unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
+ // Top bits known zero.
+ KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+ }
+ }
+ }
+ }
+ // fall through
+ case ISD::ADD:
+ case ISD::ADDE: {
+    // The low bits of the output are known zero up to the number of trailing
+    // zero bits common to both the LHS and RHS.  For example, 8+(X<<3) is
+    // known to have the low 3 bits clear.
+ // Output known-0 bits are also known if the top bits of each input are
+ // known to be clear. For example, if one input has the top 10 bits clear
+ // and the other has the top 8 bits clear, we know the top 7 bits of the
+ // output must be clear.
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+ unsigned KnownZeroHigh = KnownZero2.countLeadingOnes();
+ unsigned KnownZeroLow = KnownZero2.countTrailingOnes();
+
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+ KnownZeroHigh = std::min(KnownZeroHigh,
+ KnownZero2.countLeadingOnes());
+ KnownZeroLow = std::min(KnownZeroLow,
+ KnownZero2.countTrailingOnes());
+
+ if (Op.getOpcode() == ISD::ADD) {
+ KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
+ if (KnownZeroHigh > 1)
+ KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
+ break;
+ }
+
+ // With ADDE, a carry bit may be added in, so we can only use this
+ // information if we know (at least) that the low two bits are clear. We
+ // then return to the caller that the low bit is unknown but that other bits
+ // are known zero.
+ if (KnownZeroLow >= 2) // ADDE
+ KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow);
+ break;
+ }
+ case ISD::SREM:
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ const APInt &RA = Rem->getAPIntValue().abs();
+ if (RA.isPowerOf2()) {
+ APInt LowBits = RA - 1;
+        computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+
+ // The low bits of the first operand are unchanged by the srem.
+ KnownZero = KnownZero2 & LowBits;
+ KnownOne = KnownOne2 & LowBits;
+
+ // If the first operand is non-negative or has all low bits zero, then
+ // the upper bits are all zero.
+ if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+ KnownZero |= ~LowBits;
+
+ // If the first operand is negative and not all low bits are zero, then
+ // the upper bits are all one.
+ if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+ KnownOne |= ~LowBits;
+        assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ }
+ }
+ break;
+ case ISD::UREM: {
+ if (ConstantSDNode *Rem = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ const APInt &RA = Rem->getAPIntValue();
+ if (RA.isPowerOf2()) {
+ APInt LowBits = (RA - 1);
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1);
+
+ // The upper bits are all zero, the lower ones are unchanged.
+ KnownZero = KnownZero2 | ~LowBits;
+ KnownOne = KnownOne2 & LowBits;
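+        // For example, X % 8 with a 32-bit X keeps only the low 3 bits of X,
+        // so the upper 29 bits are known zero.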
+ break;
+ }
+ }
+
+ // Since the result is less than or equal to either operand, any leading
+ // zero bits in either operand must also exist in the result.
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, Depth+1);
+
+ uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
+ KnownZero2.countLeadingOnes());
+ KnownOne.clearAllBits();
+ KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+ break;
+ }
+ case ISD::EXTRACT_ELEMENT: {
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+ const unsigned Index =
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+ const unsigned BitWidth = Op.getValueType().getSizeInBits();
+
+ // Remove low part of known bits mask
+ KnownZero = KnownZero.getHiBits(KnownZero.getBitWidth() - Index * BitWidth);
+ KnownOne = KnownOne.getHiBits(KnownOne.getBitWidth() - Index * BitWidth);
+
+ // Remove high part of known bit mask
+ KnownZero = KnownZero.trunc(BitWidth);
+ KnownOne = KnownOne.trunc(BitWidth);
+ break;
+ }
+ case ISD::BSWAP: {
+ computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth+1);
+ KnownZero = KnownZero2.byteSwap();
+ KnownOne = KnownOne2.byteSwap();
+ break;
+ }
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX: {
+ APInt Op0Zero, Op0One;
+ APInt Op1Zero, Op1One;
+ computeKnownBits(Op.getOperand(0), Op0Zero, Op0One, Depth);
+ computeKnownBits(Op.getOperand(1), Op1Zero, Op1One, Depth);
+
+ KnownZero = Op0Zero & Op1Zero;
+ KnownOne = Op0One & Op1One;
+ break;
+ }
+ case ISD::FrameIndex:
+ case ISD::TargetFrameIndex:
+ if (unsigned Align = InferPtrAlignment(Op)) {
+ // The low bits are known zero if the pointer is aligned.
+ KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align));
+ break;
+ }
+ break;
+
+ default:
+ if (Op.getOpcode() < ISD::BUILTIN_OP_END)
+ break;
+ // Fallthrough
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ // Allow the target to implement this method for its nodes.
+ TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+ break;
+ }
+
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+}
+
+bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
+ // A left-shift of a constant one will have exactly one bit set because
+ // shifting the bit off the end is undefined.
+ if (Val.getOpcode() == ISD::SHL) {
+ auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
+ if (C && C->getAPIntValue() == 1)
+ return true;
+ }
+
+ // Similarly, a logical right-shift of a constant sign-bit will have exactly
+ // one bit set.
+ if (Val.getOpcode() == ISD::SRL) {
+ auto *C = dyn_cast<ConstantSDNode>(Val.getOperand(0));
+ if (C && C->getAPIntValue().isSignBit())
+ return true;
+ }
+
+ // More could be done here, though the above checks are enough
+ // to handle some common cases.
+
+ // Fall back to computeKnownBits to catch other known cases.
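+  // That succeeds when exactly one bit is known one and every other bit is
+  // known zero, e.g. for a fully-known constant such as 0b00010000.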
+ EVT OpVT = Val.getValueType();
+ unsigned BitWidth = OpVT.getScalarType().getSizeInBits();
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Val, KnownZero, KnownOne);
+ return (KnownZero.countPopulation() == BitWidth - 1) &&
+ (KnownOne.countPopulation() == 1);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isInteger() && "Invalid VT!");
+ unsigned VTBits = VT.getScalarType().getSizeInBits();
+ unsigned Tmp, Tmp2;
+ unsigned FirstAnswer = 1;
+
+ if (Depth == 6)
+ return 1; // Limit search depth.
+
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::AssertSext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp+1;
+ case ISD::AssertZext:
+ Tmp = cast<VTSDNode>(Op.getOperand(1))->getVT().getSizeInBits();
+ return VTBits-Tmp;
+
+ case ISD::Constant: {
+ const APInt &Val = cast<ConstantSDNode>(Op)->getAPIntValue();
+ return Val.getNumSignBits();
+ }
+
+ case ISD::SIGN_EXTEND:
+ Tmp =
+ VTBits-Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+ return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
+
+ case ISD::SIGN_EXTEND_INREG:
+ // Max of the input and what this extends.
+ Tmp =
+ cast<VTSDNode>(Op.getOperand(1))->getVT().getScalarType().getSizeInBits();
+ Tmp = VTBits-Tmp+1;
+
+ Tmp2 = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ return std::max(Tmp, Tmp2);
+
+ case ISD::SRA:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ // SRA X, C -> adds C sign bits.
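+    // For example, if X has at least 3 sign bits, (sra X, 4) has at least 7,
+    // capped at the bit width.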
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Tmp += C->getZExtValue();
+ if (Tmp > VTBits) Tmp = VTBits;
+ }
+ return Tmp;
+ case ISD::SHL:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ // shl destroys sign bits.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (C->getZExtValue() >= VTBits || // Bad shift.
+ C->getZExtValue() >= Tmp) break; // Shifted all sign bits out.
+ return Tmp - C->getZExtValue();
+ }
+ break;
+ case ISD::AND:
+ case ISD::OR:
+ case ISD::XOR: // NOT is handled here.
+ // Logical binary ops preserve the number of sign bits at the worst.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp != 1) {
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ FirstAnswer = std::min(Tmp, Tmp2);
+ // We computed what we know about the sign bits as our first
+ // answer. Now proceed to the generic code that uses
+ // computeKnownBits, and pick whichever answer is better.
+ }
+ break;
+
+ case ISD::SELECT:
+ Tmp = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(2), Depth+1);
+ return std::min(Tmp, Tmp2);
+ case ISD::SELECT_CC:
+ Tmp = ComputeNumSignBits(Op.getOperand(2), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(3), Depth+1);
+ return std::min(Tmp, Tmp2);
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ if (Tmp == 1)
+ return 1; // Early out.
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ return std::min(Tmp, Tmp2);
+ case ISD::SADDO:
+ case ISD::UADDO:
+ case ISD::SSUBO:
+ case ISD::USUBO:
+ case ISD::SMULO:
+ case ISD::UMULO:
+ if (Op.getResNo() != 1)
+ break;
+ // The boolean result conforms to getBooleanContents. Fall through.
+ // If setcc returns 0/-1, all bits are sign bits.
+    // We know that we have an integer-based boolean since these operations
+    // are only available for integers.
+ if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
+ case ISD::SETCC:
+ // If setcc returns 0/-1, all bits are sign bits.
+ if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+ TargetLowering::ZeroOrNegativeOneBooleanContent)
+ return VTBits;
+ break;
+ case ISD::ROTL:
+ case ISD::ROTR:
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned RotAmt = C->getZExtValue() & (VTBits-1);
+
+ // Handle rotate right by N like a rotate left by 32-N.
+ if (Op.getOpcode() == ISD::ROTR)
+ RotAmt = (VTBits-RotAmt) & (VTBits-1);
+
+ // If we aren't rotating out all of the known-in sign bits, return the
+ // number that are left. This handles rotl(sext(x), 1) for example.
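+      // E.g. an i32 sign-extended from i8 has 25 identical top bits; rotating
+      // left by 1 still leaves 24 of them in place.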
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp > RotAmt+1) return Tmp-RotAmt;
+ }
+ break;
+ case ISD::ADD:
+ // Add can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+
+ // Special case decrementing a value (ADD X, -1):
+ if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ if (CRHS->isAllOnesValue()) {
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1);
+
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+ return VTBits;
+
+ // If we are subtracting one from a positive number, there is no carry
+ // out of the result.
+ if (KnownZero.isNegative())
+ return Tmp;
+ }
+
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp2 == 1) return 1;
+ return std::min(Tmp, Tmp2)-1;
+
+ case ISD::SUB:
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ if (Tmp2 == 1) return 1;
+
+ // Handle NEG.
+ if (ConstantSDNode *CLHS = dyn_cast<ConstantSDNode>(Op.getOperand(0)))
+ if (CLHS->isNullValue()) {
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Op.getOperand(1), KnownZero, KnownOne, Depth+1);
+ // If the input is known to be 0 or 1, the output is 0/-1, which is all
+ // sign bits set.
+ if ((KnownZero | APInt(VTBits, 1)).isAllOnesValue())
+ return VTBits;
+
+ // If the input is known to be positive (the sign bit is known clear),
+ // the output of the NEG has the same number of sign bits as the input.
+ if (KnownZero.isNegative())
+ return Tmp2;
+
+ // Otherwise, we treat this like a SUB.
+ }
+
+ // Sub can have at most one carry bit. Thus we know that the output
+ // is, at worst, one more bit than the inputs.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2)-1;
+ case ISD::TRUNCATE:
+ // FIXME: it's tricky to do anything useful for this, but it is an important
+ // case for targets like X86.
+ break;
+ case ISD::EXTRACT_ELEMENT: {
+ const int KnownSign = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ const int BitWidth = Op.getValueType().getSizeInBits();
+ const int Items =
+ Op.getOperand(0).getValueType().getSizeInBits() / BitWidth;
+
+    // Get the reverse index (counting from the big end); the Op1 value
+    // indexes elements from the little end, while the sign sits at the
+    // big end.
+ const int rIndex = Items - 1 -
+ cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+
+    // If the sign portion ends in our element, the subtraction gives the
+    // correct result. Otherwise it gives either a negative or a
+    // greater-than-bitwidth result, which is clamped below.
+ return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
+ }
+ }
+
+ // If we are looking at the loaded value of the SDNode.
+ if (Op.getResNo() == 0) {
+ // Handle LOADX separately here. EXTLOAD case will fallthrough.
+ if (LoadSDNode *LD = dyn_cast<LoadSDNode>(Op)) {
+ unsigned ExtType = LD->getExtensionType();
+ switch (ExtType) {
+ default: break;
+ case ISD::SEXTLOAD: // '17' bits known
+ Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+ return VTBits-Tmp+1;
+ case ISD::ZEXTLOAD: // '16' bits known
+ Tmp = LD->getMemoryVT().getScalarType().getSizeInBits();
+ return VTBits-Tmp;
+ }
+ }
+ }
+
+ // Allow the target to implement this method for its nodes.
+ if (Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) {
+ unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
+ if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ APInt KnownZero, KnownOne;
+ computeKnownBits(Op, KnownZero, KnownOne, Depth);
+
+ APInt Mask;
+ if (KnownZero.isNegative()) { // sign bit is 0
+ Mask = KnownZero;
+ } else if (KnownOne.isNegative()) { // sign bit is 1;
+ Mask = KnownOne;
+ } else {
+ // Nothing known.
+ return FirstAnswer;
+ }
+
+ // Okay, we know that the sign bit in Mask is set. Use CLZ to determine
+ // the number of identical bits in the top of the input value.
+ Mask = ~Mask;
+ Mask <<= Mask.getBitWidth()-VTBits;
+ // Return # leading zeros. We use 'min' here in case Val was zero before
+ // shifting. We don't want to return '64' as for an i32 "0".
+ return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+}
+
+bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
+ if ((Op.getOpcode() != ISD::ADD && Op.getOpcode() != ISD::OR) ||
+ !isa<ConstantSDNode>(Op.getOperand(1)))
+ return false;
+
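+  // An OR only acts like an ADD when the constant touches no bits the base
+  // could have set; e.g. (or (shl X, 4), 12) is equivalent to
+  // (add (shl X, 4), 12) because the shift leaves the low four bits zero.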
+ if (Op.getOpcode() == ISD::OR &&
+ !MaskedValueIsZero(Op.getOperand(0),
+ cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue()))
+ return false;
+
+ return true;
+}
+
+bool SelectionDAG::isKnownNeverNaN(SDValue Op) const {
+ // If we're told that NaNs won't happen, assume they won't.
+ if (getTarget().Options.NoNaNsFPMath)
+ return true;
+
+ // If the value is a constant, we can obviously see if it is a NaN or not.
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+ return !C->getValueAPF().isNaN();
+
+ // TODO: Recognize more cases here.
+
+ return false;
+}
+
+bool SelectionDAG::isKnownNeverZero(SDValue Op) const {
+ // If the value is a constant, we can obviously see if it is a zero or not.
+ if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op))
+ return !C->isZero();
+
+ // TODO: Recognize more cases here.
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::OR:
+ if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ return !C->isNullValue();
+ break;
+ }
+
+ return false;
+}
+
+bool SelectionDAG::isEqualTo(SDValue A, SDValue B) const {
+ // Check the obvious case.
+ if (A == B) return true;
+
+  // For negative and positive zero.
+ if (const ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A))
+ if (const ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B))
+ if (CA->isZero() && CB->isZero()) return true;
+
+ // Otherwise they may not be equal.
+ return false;
+}
+
+bool SelectionDAG::haveNoCommonBitsSet(SDValue A, SDValue B) const {
+ assert(A.getValueType() == B.getValueType() &&
+ "Values must have the same type");
+ APInt AZero, AOne;
+ APInt BZero, BOne;
+ computeKnownBits(A, AZero, AOne);
+ computeKnownBits(B, BZero, BOne);
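+  // Every bit must be known zero in at least one of the two values, e.g.
+  // (shl X, 8) and (and Y, 0xff) can never have a common bit set.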
+ return (AZero | BZero).isAllOnesValue();
+}
+
+static SDValue FoldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops,
+ llvm::SelectionDAG &DAG) {
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ // Concat of UNDEFs is UNDEF.
+ if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
+ return DAG.getUNDEF(VT);
+
+ // A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
+ // simplified to one big BUILD_VECTOR.
+ // FIXME: Add support for SCALAR_TO_VECTOR as well.
+ EVT SVT = VT.getScalarType();
+ SmallVector<SDValue, 16> Elts;
+ for (SDValue Op : Ops) {
+ EVT OpVT = Op.getValueType();
+ if (Op.isUndef())
+ Elts.append(OpVT.getVectorNumElements(), DAG.getUNDEF(SVT));
+ else if (Op.getOpcode() == ISD::BUILD_VECTOR)
+ Elts.append(Op->op_begin(), Op->op_end());
+ else
+ return SDValue();
+ }
+
+ // BUILD_VECTOR requires all inputs to be of the same type, find the
+ // maximum type and extend them all.
+ for (SDValue Op : Elts)
+ SVT = (SVT.bitsLT(Op.getValueType()) ? Op.getValueType() : SVT);
+
+ if (SVT.bitsGT(VT.getScalarType()))
+ for (SDValue &Op : Elts)
+ Op = DAG.getTargetLoweringInfo().isZExtFree(Op.getValueType(), SVT)
+ ? DAG.getZExtOrTrunc(Op, DL, SVT)
+ : DAG.getSExtOrTrunc(Op, DL, SVT);
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, VT, Elts);
+}
+
+/// Gets or creates the specified node.
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, getVTList(VT), None);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(),
+ getVTList(VT));
+ CSEMap.InsertNode(N, IP);
+
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue Operand) {
+  // Constant fold unary operations with an integer constant operand. Even
+  // opaque constants will be folded, because the folding of unary operations
+ // doesn't create new constants with different values. Nevertheless, the
+ // opaque flag is preserved during folding to prevent future folding with
+ // other constants.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Operand)) {
+ const APInt &Val = C->getAPIntValue();
+ switch (Opcode) {
+ default: break;
+ case ISD::SIGN_EXTEND:
+ return getConstant(Val.sextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::TRUNCATE:
+ return getConstant(Val.zextOrTrunc(VT.getSizeInBits()), DL, VT,
+ C->isTargetOpcode(), C->isOpaque());
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP: {
+ APFloat apf(EVTToAPFloatSemantics(VT),
+ APInt::getNullValue(VT.getSizeInBits()));
+ (void)apf.convertFromAPInt(Val,
+ Opcode==ISD::SINT_TO_FP,
+ APFloat::rmNearestTiesToEven);
+ return getConstantFP(apf, DL, VT);
+ }
+ case ISD::BITCAST:
+ if (VT == MVT::f16 && C->getValueType(0) == MVT::i16)
+ return getConstantFP(APFloat(APFloat::IEEEhalf, Val), DL, VT);
+ if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
+ return getConstantFP(APFloat(APFloat::IEEEsingle, Val), DL, VT);
+ if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
+ return getConstantFP(APFloat(APFloat::IEEEdouble, Val), DL, VT);
+ if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
+ return getConstantFP(APFloat(APFloat::IEEEquad, Val), DL, VT);
+ break;
+ case ISD::BSWAP:
+ return getConstant(Val.byteSwap(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTPOP:
+ return getConstant(Val.countPopulation(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ return getConstant(Val.countLeadingZeros(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ return getConstant(Val.countTrailingZeros(), DL, VT, C->isTargetOpcode(),
+ C->isOpaque());
+ }
+ }
+
+ // Constant fold unary operations with a floating point constant operand.
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Operand)) {
+ APFloat V = C->getValueAPF(); // make copy
+ switch (Opcode) {
+ case ISD::FNEG:
+ V.changeSign();
+ return getConstantFP(V, DL, VT);
+ case ISD::FABS:
+ V.clearSign();
+ return getConstantFP(V, DL, VT);
+ case ISD::FCEIL: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FTRUNC: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FFLOOR: {
+ APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative);
+ if (fs == APFloat::opOK || fs == APFloat::opInexact)
+ return getConstantFP(V, DL, VT);
+ break;
+ }
+ case ISD::FP_EXTEND: {
+ bool ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &ignored);
+ return getConstantFP(V, DL, VT);
+ }
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ integerPart x[2];
+ bool ignored;
+ static_assert(integerPartWidth >= 64, "APFloat parts too small!");
+ // FIXME need to be more flexible about rounding mode.
+ APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
+ Opcode==ISD::FP_TO_SINT,
+ APFloat::rmTowardZero, &ignored);
+ if (s==APFloat::opInvalidOp) // inexact is OK, in fact usual
+ break;
+ APInt api(VT.getSizeInBits(), x);
+ return getConstant(api, DL, VT);
+ }
+ case ISD::BITCAST:
+ if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
+ return getConstant((uint16_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+ else if (VT == MVT::i32 && C->getValueType(0) == MVT::f32)
+ return getConstant((uint32_t)V.bitcastToAPInt().getZExtValue(), DL, VT);
+ else if (VT == MVT::i64 && C->getValueType(0) == MVT::f64)
+ return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
+ break;
+ }
+ }
+
+ // Constant fold unary operations with a vector integer or float operand.
+ if (BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Operand)) {
+ if (BV->isConstant()) {
+ switch (Opcode) {
+ default:
+ // FIXME: Entirely reasonable to perform folding of other unary
+ // operations here as the need arises.
+ break;
+ case ISD::FNEG:
+ case ISD::FABS:
+ case ISD::FCEIL:
+ case ISD::FTRUNC:
+ case ISD::FFLOOR:
+ case ISD::FP_EXTEND:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::TRUNCATE:
+ case ISD::UINT_TO_FP:
+ case ISD::SINT_TO_FP:
+ case ISD::BSWAP:
+ case ISD::CTLZ:
+ case ISD::CTLZ_ZERO_UNDEF:
+ case ISD::CTTZ:
+ case ISD::CTTZ_ZERO_UNDEF:
+ case ISD::CTPOP: {
+ SDValue Ops = { Operand };
+ if (SDValue Fold = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
+ return Fold;
+ }
+ }
+ }
+ }
+
+ unsigned OpOpcode = Operand.getNode()->getOpcode();
+ switch (Opcode) {
+ case ISD::TokenFactor:
+ case ISD::MERGE_VALUES:
+ case ISD::CONCAT_VECTORS:
+ return Operand; // Factor, merge or concat of one node? No need.
+ case ISD::FP_ROUND: llvm_unreachable("Invalid method to make FP_ROUND node");
+ case ISD::FP_EXTEND:
+ assert(VT.isFloatingPoint() &&
+ Operand.getValueType().isFloatingPoint() && "Invalid FP cast!");
+ if (Operand.getValueType() == VT) return Operand; // noop conversion.
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() ==
+ Operand.getValueType().getVectorNumElements()) &&
+ "Vector element count mismatch!");
+ assert(Operand.getValueType().bitsLT(VT) &&
+ "Invalid fpext node, dst < src!");
+ if (Operand.isUndef())
+ return getUNDEF(VT);
+ break;
+ case ISD::SIGN_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid SIGN_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() ==
+ Operand.getValueType().getVectorNumElements()) &&
+ "Vector element count mismatch!");
+ assert(Operand.getValueType().bitsLT(VT) &&
+ "Invalid sext node, dst < src!");
+ if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND)
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ else if (OpOpcode == ISD::UNDEF)
+ // sext(undef) = 0, because the top bits will all be the same.
+ return getConstant(0, DL, VT);
+ break;
+ case ISD::ZERO_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid ZERO_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() ==
+ Operand.getValueType().getVectorNumElements()) &&
+ "Vector element count mismatch!");
+ assert(Operand.getValueType().bitsLT(VT) &&
+ "Invalid zext node, dst < src!");
+ if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x)
+ return getNode(ISD::ZERO_EXTEND, DL, VT,
+ Operand.getNode()->getOperand(0));
+ else if (OpOpcode == ISD::UNDEF)
+ // zext(undef) = 0, because the top bits will be zero.
+ return getConstant(0, DL, VT);
+ break;
+ case ISD::ANY_EXTEND:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid ANY_EXTEND!");
+ if (Operand.getValueType() == VT) return Operand; // noop extension
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() ==
+ Operand.getValueType().getVectorNumElements()) &&
+ "Vector element count mismatch!");
+ assert(Operand.getValueType().bitsLT(VT) &&
+ "Invalid anyext node, dst < src!");
+
+ if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+ OpOpcode == ISD::ANY_EXTEND)
+ // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x)
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ else if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+
+    // (ext (trunc x)) -> x
+ if (OpOpcode == ISD::TRUNCATE) {
+ SDValue OpOp = Operand.getNode()->getOperand(0);
+ if (OpOp.getValueType() == VT)
+ return OpOp;
+ }
+ break;
+ case ISD::TRUNCATE:
+ assert(VT.isInteger() && Operand.getValueType().isInteger() &&
+ "Invalid TRUNCATE!");
+ if (Operand.getValueType() == VT) return Operand; // noop truncate
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() ==
+ Operand.getValueType().getVectorNumElements()) &&
+ "Vector element count mismatch!");
+ assert(Operand.getValueType().bitsGT(VT) &&
+ "Invalid truncate node, src < dst!");
+ if (OpOpcode == ISD::TRUNCATE)
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND ||
+ OpOpcode == ISD::ANY_EXTEND) {
+ // If the source is smaller than the dest, we still need an extend.
+ if (Operand.getNode()->getOperand(0).getValueType().getScalarType()
+ .bitsLT(VT.getScalarType()))
+ return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0));
+ if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT))
+ return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0));
+ return Operand.getNode()->getOperand(0);
+ }
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::BSWAP:
+ assert(VT.isInteger() && VT == Operand.getValueType() &&
+ "Invalid BSWAP!");
+ assert((VT.getScalarSizeInBits() % 16 == 0) &&
+ "BSWAP types must be a multiple of 16 bits!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::BITREVERSE:
+ assert(VT.isInteger() && VT == Operand.getValueType() &&
+ "Invalid BITREVERSE!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::BITCAST:
+ // Basic sanity checking.
+ assert(VT.getSizeInBits() == Operand.getValueType().getSizeInBits()
+ && "Cannot BITCAST between types of different sizes!");
+ if (VT == Operand.getValueType()) return Operand; // noop conversion.
+ if (OpOpcode == ISD::BITCAST) // bitconv(bitconv(x)) -> bitconv(x)
+ return getNode(ISD::BITCAST, DL, VT, Operand.getOperand(0));
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ break;
+ case ISD::SCALAR_TO_VECTOR:
+ assert(VT.isVector() && !Operand.getValueType().isVector() &&
+ (VT.getVectorElementType() == Operand.getValueType() ||
+ (VT.getVectorElementType().isInteger() &&
+ Operand.getValueType().isInteger() &&
+ VT.getVectorElementType().bitsLE(Operand.getValueType()))) &&
+ "Illegal SCALAR_TO_VECTOR node!");
+ if (OpOpcode == ISD::UNDEF)
+ return getUNDEF(VT);
+ // scalar_to_vector(extract_vector_elt V, 0) -> V, top bits are undefined.
+ if (OpOpcode == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Operand.getOperand(1)) &&
+ Operand.getConstantOperandVal(1) == 0 &&
+ Operand.getOperand(0).getValueType() == VT)
+ return Operand.getOperand(0);
+ break;
+ case ISD::FNEG:
+ // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+ if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB)
+ // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags?
+ return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1),
+ Operand.getNode()->getOperand(0),
+ &cast<BinaryWithFlagsSDNode>(Operand.getNode())->Flags);
+ if (OpOpcode == ISD::FNEG) // --X -> X
+ return Operand.getNode()->getOperand(0);
+ break;
+ case ISD::FABS:
+ if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
+ return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0));
+ break;
+ }
+
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {Operand};
+ if (VT != MVT::Glue) { // Don't CSE flag producing nodes
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
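+/// Try to constant fold a binary integer operation on two APInts.  The bool
+/// in the returned pair is false when no fold is possible (unknown opcode, or
+/// division/remainder by zero); e.g. an ISD::SRL of 0x80 by 3 folds to
+/// {0x10, true}.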
+static std::pair<APInt, bool> FoldValue(unsigned Opcode, const APInt &C1,
+ const APInt &C2) {
+ switch (Opcode) {
+ case ISD::ADD: return std::make_pair(C1 + C2, true);
+ case ISD::SUB: return std::make_pair(C1 - C2, true);
+ case ISD::MUL: return std::make_pair(C1 * C2, true);
+ case ISD::AND: return std::make_pair(C1 & C2, true);
+ case ISD::OR: return std::make_pair(C1 | C2, true);
+ case ISD::XOR: return std::make_pair(C1 ^ C2, true);
+ case ISD::SHL: return std::make_pair(C1 << C2, true);
+ case ISD::SRL: return std::make_pair(C1.lshr(C2), true);
+ case ISD::SRA: return std::make_pair(C1.ashr(C2), true);
+ case ISD::ROTL: return std::make_pair(C1.rotl(C2), true);
+ case ISD::ROTR: return std::make_pair(C1.rotr(C2), true);
+ case ISD::SMIN: return std::make_pair(C1.sle(C2) ? C1 : C2, true);
+ case ISD::SMAX: return std::make_pair(C1.sge(C2) ? C1 : C2, true);
+ case ISD::UMIN: return std::make_pair(C1.ule(C2) ? C1 : C2, true);
+ case ISD::UMAX: return std::make_pair(C1.uge(C2) ? C1 : C2, true);
+ case ISD::UDIV:
+ if (!C2.getBoolValue())
+ break;
+ return std::make_pair(C1.udiv(C2), true);
+ case ISD::UREM:
+ if (!C2.getBoolValue())
+ break;
+ return std::make_pair(C1.urem(C2), true);
+ case ISD::SDIV:
+ if (!C2.getBoolValue())
+ break;
+ return std::make_pair(C1.sdiv(C2), true);
+ case ISD::SREM:
+ if (!C2.getBoolValue())
+ break;
+ return std::make_pair(C1.srem(C2), true);
+ }
+ return std::make_pair(APInt(1, 0), false);
+}
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+ EVT VT, const ConstantSDNode *Cst1,
+ const ConstantSDNode *Cst2) {
+ if (Cst1->isOpaque() || Cst2->isOpaque())
+ return SDValue();
+
+ std::pair<APInt, bool> Folded = FoldValue(Opcode, Cst1->getAPIntValue(),
+ Cst2->getAPIntValue());
+ if (!Folded.second)
+ return SDValue();
+ return getConstant(Folded.first, DL, VT);
+}
+
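+/// Fold an ADD or SUB of a global address and a constant into a single
+/// global address with a combined offset, e.g. (add @g+8, 16) becomes @g+24,
+/// provided the target considers offset folding legal for @g.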
+SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
+ const GlobalAddressSDNode *GA,
+ const SDNode *N2) {
+ if (GA->getOpcode() != ISD::GlobalAddress)
+ return SDValue();
+ if (!TLI->isOffsetFoldingLegal(GA))
+ return SDValue();
+ const ConstantSDNode *Cst2 = dyn_cast<ConstantSDNode>(N2);
+ if (!Cst2)
+ return SDValue();
+ int64_t Offset = Cst2->getSExtValue();
+ switch (Opcode) {
+ case ISD::ADD: break;
+ case ISD::SUB: Offset = -uint64_t(Offset); break;
+ default: return SDValue();
+ }
+ return getGlobalAddress(GA->getGlobal(), SDLoc(Cst2), VT,
+ GA->getOffset() + uint64_t(Offset));
+}
+
+SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
+ EVT VT, SDNode *Cst1,
+ SDNode *Cst2) {
+ // If the opcode is a target-specific ISD node, there's nothing we can
+ // do here and the operand rules may not line up with the below, so
+ // bail early.
+ if (Opcode >= ISD::BUILTIN_OP_END)
+ return SDValue();
+
+ // Handle the case of two scalars.
+ if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
+ if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
+ SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, Scalar1, Scalar2);
+      assert((!Folded || !VT.isVector()) &&
+             "Can't fold vector ops with scalar operands");
+ return Folded;
+ }
+ }
+
+ // fold (add Sym, c) -> Sym+c
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst1))
+ return FoldSymbolOffset(Opcode, VT, GA, Cst2);
+ if (isCommutativeBinOp(Opcode))
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Cst2))
+ return FoldSymbolOffset(Opcode, VT, GA, Cst1);
+
+ // For vectors extract each constant element into Inputs so we can constant
+ // fold them individually.
+ BuildVectorSDNode *BV1 = dyn_cast<BuildVectorSDNode>(Cst1);
+ BuildVectorSDNode *BV2 = dyn_cast<BuildVectorSDNode>(Cst2);
+ if (!BV1 || !BV2)
+ return SDValue();
+
+ assert(BV1->getNumOperands() == BV2->getNumOperands() && "Out of sync!");
+
+ EVT SVT = VT.getScalarType();
+ SmallVector<SDValue, 4> Outputs;
+ for (unsigned I = 0, E = BV1->getNumOperands(); I != E; ++I) {
+ ConstantSDNode *V1 = dyn_cast<ConstantSDNode>(BV1->getOperand(I));
+ ConstantSDNode *V2 = dyn_cast<ConstantSDNode>(BV2->getOperand(I));
+ if (!V1 || !V2) // Not a constant, bail.
+ return SDValue();
+
+ if (V1->isOpaque() || V2->isOpaque())
+ return SDValue();
+
+ // Avoid BUILD_VECTOR nodes that perform implicit truncation.
+ // FIXME: This is valid and could be handled by truncating the APInts.
+ if (V1->getValueType(0) != SVT || V2->getValueType(0) != SVT)
+ return SDValue();
+
+ // Fold one vector element.
+ std::pair<APInt, bool> Folded = FoldValue(Opcode, V1->getAPIntValue(),
+ V2->getAPIntValue());
+ if (!Folded.second)
+ return SDValue();
+ Outputs.push_back(getConstant(Folded.first, DL, SVT));
+ }
+
+ assert(VT.getVectorNumElements() == Outputs.size() &&
+ "Vector size mismatch!");
+
+ // We may have a vector type but a scalar result. Create a splat.
+ Outputs.resize(VT.getVectorNumElements(), Outputs.back());
+
+ // Build a big vector out of the scalar elements we generated.
+ return getBuildVector(VT, SDLoc(), Outputs);
+}
+
+SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
+ const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops,
+ const SDNodeFlags *Flags) {
+ // If the opcode is a target-specific ISD node, there's nothing we can
+ // do here and the operand rules may not line up with the below, so
+ // bail early.
+ if (Opcode >= ISD::BUILTIN_OP_END)
+ return SDValue();
+
+ // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
+ if (!VT.isVector())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+
+ auto IsScalarOrSameVectorSize = [&](const SDValue &Op) {
+ return !Op.getValueType().isVector() ||
+ Op.getValueType().getVectorNumElements() == NumElts;
+ };
+
+ auto IsConstantBuildVectorOrUndef = [&](const SDValue &Op) {
+ BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(Op);
+ return (Op.isUndef()) || (Op.getOpcode() == ISD::CONDCODE) ||
+ (BV && BV->isConstant());
+ };
+
+ // All operands must be vector types with the same number of elements as
+ // the result type and must be either UNDEF or a build vector of constant
+ // or UNDEF scalars.
+ if (!std::all_of(Ops.begin(), Ops.end(), IsConstantBuildVectorOrUndef) ||
+ !std::all_of(Ops.begin(), Ops.end(), IsScalarOrSameVectorSize))
+ return SDValue();
+
+  // If we are comparing vectors, then the result needs to be an i1 boolean
+ // that is then sign-extended back to the legal result type.
+ EVT SVT = (Opcode == ISD::SETCC ? MVT::i1 : VT.getScalarType());
+
+  // Find a legal integer scalar type for constant promotion and
+  // ensure that its scalar size is at least as large as the source.
+ EVT LegalSVT = VT.getScalarType();
+ if (LegalSVT.isInteger()) {
+ LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
+ if (LegalSVT.bitsLT(VT.getScalarType()))
+ return SDValue();
+ }
+
+ // Constant fold each scalar lane separately.
+ SmallVector<SDValue, 4> ScalarResults;
+ for (unsigned i = 0; i != NumElts; i++) {
+ SmallVector<SDValue, 4> ScalarOps;
+ for (SDValue Op : Ops) {
+ EVT InSVT = Op.getValueType().getScalarType();
+ BuildVectorSDNode *InBV = dyn_cast<BuildVectorSDNode>(Op);
+ if (!InBV) {
+ // We've checked that this is UNDEF or a constant of some kind.
+ if (Op.isUndef())
+ ScalarOps.push_back(getUNDEF(InSVT));
+ else
+ ScalarOps.push_back(Op);
+ continue;
+ }
+
+ SDValue ScalarOp = InBV->getOperand(i);
+ EVT ScalarVT = ScalarOp.getValueType();
+
+ // Build vector (integer) scalar operands may need implicit
+ // truncation - do this before constant folding.
+ if (ScalarVT.isInteger() && ScalarVT.bitsGT(InSVT))
+ ScalarOp = getNode(ISD::TRUNCATE, DL, InSVT, ScalarOp);
+
+ ScalarOps.push_back(ScalarOp);
+ }
+
+ // Constant fold the scalar operands.
+ SDValue ScalarResult = getNode(Opcode, DL, SVT, ScalarOps, Flags);
+
+ // Legalize the (integer) scalar constant if necessary.
+ if (LegalSVT != SVT)
+ ScalarResult = getNode(ISD::SIGN_EXTEND, DL, LegalSVT, ScalarResult);
+
+ // Scalar folding only succeeded if the result is a constant or UNDEF.
+ if (!ScalarResult.isUndef() && ScalarResult.getOpcode() != ISD::Constant &&
+ ScalarResult.getOpcode() != ISD::ConstantFP)
+ return SDValue();
+ ScalarResults.push_back(ScalarResult);
+ }
+
+ return getBuildVector(VT, DL, ScalarResults);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2,
+ const SDNodeFlags *Flags) {
+ ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *N2C = dyn_cast<ConstantSDNode>(N2);
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+
+ // Canonicalize constant to RHS if commutative.
+ if (isCommutativeBinOp(Opcode)) {
+ if (N1C && !N2C) {
+ std::swap(N1C, N2C);
+ std::swap(N1, N2);
+ } else if (N1CFP && !N2CFP) {
+ std::swap(N1CFP, N2CFP);
+ std::swap(N1, N2);
+ }
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::TokenFactor:
+ assert(VT == MVT::Other && N1.getValueType() == MVT::Other &&
+ N2.getValueType() == MVT::Other && "Invalid token factor!");
+ // Fold trivial token factors.
+ if (N1.getOpcode() == ISD::EntryToken) return N2;
+ if (N2.getOpcode() == ISD::EntryToken) return N1;
+ if (N1 == N2) return N1;
+ break;
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ SDValue Ops[] = {N1, N2};
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::AND:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X & 0) -> 0. This commonly occurs when legalizing i64 values, so it's
+ // worth handling here.
+ if (N2C && N2C->isNullValue())
+ return N2;
+ if (N2C && N2C->isAllOnesValue()) // X & -1 -> X
+ return N1;
+ break;
+ case ISD::OR:
+ case ISD::XOR:
+ case ISD::ADD:
+ case ISD::SUB:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ // (X ^|+- 0) -> X. This commonly occurs when legalizing i64 values, so
+ // it's worth handling here.
+ if (N2C && N2C->isNullValue())
+ return N1;
+ break;
+ case ISD::UDIV:
+ case ISD::UREM:
+ case ISD::MULHU:
+ case ISD::MULHS:
+ case ISD::MUL:
+ case ISD::SDIV:
+ case ISD::SREM:
+ case ISD::SMIN:
+ case ISD::SMAX:
+ case ISD::UMIN:
+ case ISD::UMAX:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ break;
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ if (getTarget().Options.UnsafeFPMath) {
+ if (Opcode == ISD::FADD) {
+ // x+0 --> x
+ if (N2CFP && N2CFP->getValueAPF().isZero())
+ return N1;
+ } else if (Opcode == ISD::FSUB) {
+ // x-0 --> x
+ if (N2CFP && N2CFP->getValueAPF().isZero())
+ return N1;
+ } else if (Opcode == ISD::FMUL) {
+ // x*0 --> 0
+ if (N2CFP && N2CFP->isZero())
+ return N2;
+ // x*1 --> x
+ if (N2CFP && N2CFP->isExactlyValue(1.0))
+ return N1;
+ }
+ }
+ assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ break;
+ case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
+ assert(N1.getValueType() == VT &&
+ N1.getValueType().isFloatingPoint() &&
+ N2.getValueType().isFloatingPoint() &&
+ "Invalid FCOPYSIGN!");
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ assert(VT == N1.getValueType() &&
+ "Shift operators return type must be the same as their first arg");
+ assert(VT.isInteger() && N2.getValueType().isInteger() &&
+ "Shifts only work on integers");
+    assert((!VT.isVector() || VT == N2.getValueType()) &&
+           "Vector shift amounts must be in the same type as their first arg");
+    // Verify that the shift amount VT is big enough to hold valid shift
+ // amounts. This catches things like trying to shift an i1024 value by an
+ // i8, which is easy to fall into in generic code that uses
+ // TLI.getShiftAmount().
+ assert(N2.getValueType().getSizeInBits() >=
+ Log2_32_Ceil(N1.getValueType().getSizeInBits()) &&
+ "Invalid use of small shift amount with oversized value!");
+
+ // Always fold shifts of i1 values so the code generator doesn't need to
+ // handle them. Since we know the size of the shift has to be less than the
+ // size of the value, the shift/rotate count is guaranteed to be zero.
+ if (VT == MVT::i1)
+ return N1;
+ if (N2C && N2C->isNullValue())
+ return N1;
+ break;
+ case ISD::FP_ROUND_INREG: {
+ EVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg round!");
+ assert(VT.isFloatingPoint() && EVT.isFloatingPoint() &&
+ "Cannot FP_ROUND_INREG integer types");
+ assert(EVT.isVector() == VT.isVector() &&
+ "FP_ROUND_INREG type should be vector iff the operand "
+ "type is vector!");
+ assert((!EVT.isVector() ||
+ EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+ "Vector element counts must match in FP_ROUND_INREG");
+ assert(EVT.bitsLE(VT) && "Not rounding down!");
+ (void)EVT;
+ if (cast<VTSDNode>(N2)->getVT() == VT) return N1; // Not actually rounding.
+ break;
+ }
+ case ISD::FP_ROUND:
+ assert(VT.isFloatingPoint() &&
+ N1.getValueType().isFloatingPoint() &&
+ VT.bitsLE(N1.getValueType()) &&
+ N2C && "Invalid FP_ROUND!");
+ if (N1.getValueType() == VT) return N1; // noop conversion.
+ break;
+ case ISD::AssertSext:
+ case ISD::AssertZext: {
+ EVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(!EVT.isVector() &&
+ "AssertSExt/AssertZExt type should be the vector element type "
+ "rather than the vector type!");
+ assert(EVT.bitsLE(VT) && "Not extending!");
+ if (VT == EVT) return N1; // noop assertion.
+ break;
+ }
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT EVT = cast<VTSDNode>(N2)->getVT();
+ assert(VT == N1.getValueType() && "Not an inreg extend!");
+ assert(VT.isInteger() && EVT.isInteger() &&
+ "Cannot *_EXTEND_INREG FP types");
+ assert(EVT.isVector() == VT.isVector() &&
+ "SIGN_EXTEND_INREG type should be vector iff the operand "
+ "type is vector!");
+ assert((!EVT.isVector() ||
+ EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+ "Vector element counts must match in SIGN_EXTEND_INREG");
+ assert(EVT.bitsLE(VT) && "Not extending!");
+ if (EVT == VT) return N1; // Not actually extending
+
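+ // Helper that sign-extends the low FromBits bits of Val and returns it as a
+ // constant of the scalar type; e.g. with VT == i32 and EVT == i8,
+ // 0x000000FF becomes 0xFFFFFFFF (shift left by 24, then arithmetic shift
+ // right by 24).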
+ auto SignExtendInReg = [&](APInt Val) {
+ unsigned FromBits = EVT.getScalarType().getSizeInBits();
+ Val <<= Val.getBitWidth() - FromBits;
+ Val = Val.ashr(Val.getBitWidth() - FromBits);
+ return getConstant(Val, DL, VT.getScalarType());
+ };
+
+ if (N1C) {
+ const APInt &Val = N1C->getAPIntValue();
+ return SignExtendInReg(Val);
+ }
+ if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
+ SmallVector<SDValue, 8> Ops;
+ for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ SDValue Op = N1.getOperand(i);
+ if (Op.isUndef()) {
+ Ops.push_back(getUNDEF(VT.getScalarType()));
+ continue;
+ }
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = C->getAPIntValue();
+ Val = Val.zextOrTrunc(VT.getScalarSizeInBits());
+ Ops.push_back(SignExtendInReg(Val));
+ continue;
+ }
+ break;
+ }
+ if (Ops.size() == VT.getVectorNumElements())
+ return getBuildVector(VT, DL, Ops);
+ }
+ break;
+ }
+ case ISD::EXTRACT_VECTOR_ELT:
+ // EXTRACT_VECTOR_ELT of an UNDEF is an UNDEF.
+ if (N1.isUndef())
+ return getUNDEF(VT);
+
+ // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
+ if (N2C && N2C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+ return getUNDEF(VT);
+
+ // EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
+ // expanding copies of large vectors from registers.
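+ // For example, extracting element 5 from a concatenation of two v4i32
+ // vectors becomes an extract of element 5 % 4 == 1 from operand 5 / 4 == 1.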
+ if (N2C &&
+ N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getNumOperands() > 0) {
+ unsigned Factor =
+ N1.getOperand(0).getValueType().getVectorNumElements();
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
+ N1.getOperand(N2C->getZExtValue() / Factor),
+ getConstant(N2C->getZExtValue() % Factor, DL,
+ N2.getValueType()));
+ }
+
+ // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
+ // expanding large vector constants.
+ if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
+ SDValue Elt = N1.getOperand(N2C->getZExtValue());
+
+ if (VT != Elt.getValueType())
+ // If the vector element type is not legal, the BUILD_VECTOR operands
+ // are promoted and implicitly truncated, and the result implicitly
+ // extended. Make that explicit here.
+ Elt = getAnyExtOrTrunc(Elt, DL, VT);
+
+ return Elt;
+ }
+
+ // EXTRACT_VECTOR_ELT of INSERT_VECTOR_ELT is often formed when vector
+ // operations are lowered to scalars.
+ if (N1.getOpcode() == ISD::INSERT_VECTOR_ELT) {
+ // If the indices are the same, return the inserted element else
+ // if the indices are known different, extract the element from
+ // the original vector.
+ SDValue N1Op2 = N1.getOperand(2);
+ ConstantSDNode *N1Op2C = dyn_cast<ConstantSDNode>(N1Op2);
+
+ if (N1Op2C && N2C) {
+ if (N1Op2C->getZExtValue() == N2C->getZExtValue()) {
+ if (VT == N1.getOperand(1).getValueType())
+ return N1.getOperand(1);
+ else
+ return getSExtOrTrunc(N1.getOperand(1), DL, VT);
+ }
+
+ return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0), N2);
+ }
+ }
+ break;
+ case ISD::EXTRACT_ELEMENT:
+ assert(N2C && (unsigned)N2C->getZExtValue() < 2 && "Bad EXTRACT_ELEMENT!");
+ assert(!N1.getValueType().isVector() && !VT.isVector() &&
+ (N1.getValueType().isInteger() == VT.isInteger()) &&
+ N1.getValueType() != VT &&
+ "Wrong types for EXTRACT_ELEMENT!");
+
+ // EXTRACT_ELEMENT of BUILD_PAIR is often formed while legalize is expanding
+ // 64-bit integers into 32-bit parts. Instead of building the extract of
+ // the BUILD_PAIR, only to have legalize rip it apart, just do it now.
+ if (N1.getOpcode() == ISD::BUILD_PAIR)
+ return N1.getOperand(N2C->getZExtValue());
+
+ // EXTRACT_ELEMENT of a constant int is also very common.
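+ // For example, extracting element 1 of an i64 constant as an i32 shifts the
+ // constant right by 32 bits and truncates the result to 32 bits.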
+ if (N1C) {
+ unsigned ElementSize = VT.getSizeInBits();
+ unsigned Shift = ElementSize * N2C->getZExtValue();
+ APInt ShiftedVal = N1C->getAPIntValue().lshr(Shift);
+ return getConstant(ShiftedVal.trunc(ElementSize), DL, VT);
+ }
+ break;
+ case ISD::EXTRACT_SUBVECTOR:
+ if (VT.isSimple() && N1.getValueType().isSimple()) {
+ assert(VT.isVector() && N1.getValueType().isVector() &&
+ "Extract subvector VTs must be a vectors!");
+ assert(VT.getVectorElementType() ==
+ N1.getValueType().getVectorElementType() &&
+ "Extract subvector VTs must have the same element type!");
+ assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
+ "Extract subvector must be from larger vector to smaller vector!");
+
+ if (N2C) {
+ assert((VT.getVectorNumElements() + N2C->getZExtValue()
+ <= N1.getValueType().getVectorNumElements())
+ && "Extract subvector overflow!");
+ }
+
+ // Trivial extraction.
+ if (VT.getSimpleVT() == N1.getSimpleValueType())
+ return N1;
+ }
+ break;
+ }
+
+ // Perform trivial constant folding.
+ if (SDValue SV =
+ FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
+ return SV;
+
+ // Constant fold FP operations.
+ bool HasFPExceptions = TLI->hasFloatingPointExceptions();
+ if (N1CFP) {
+ if (N2CFP) {
+ APFloat V1 = N1CFP->getValueAPF(), V2 = N2CFP->getValueAPF();
+ APFloat::opStatus s;
+ switch (Opcode) {
+ case ISD::FADD:
+ s = V1.add(V2, APFloat::rmNearestTiesToEven);
+ if (!HasFPExceptions || s != APFloat::opInvalidOp)
+ return getConstantFP(V1, DL, VT);
+ break;
+ case ISD::FSUB:
+ s = V1.subtract(V2, APFloat::rmNearestTiesToEven);
+ if (!HasFPExceptions || s!=APFloat::opInvalidOp)
+ return getConstantFP(V1, DL, VT);
+ break;
+ case ISD::FMUL:
+ s = V1.multiply(V2, APFloat::rmNearestTiesToEven);
+ if (!HasFPExceptions || s!=APFloat::opInvalidOp)
+ return getConstantFP(V1, DL, VT);
+ break;
+ case ISD::FDIV:
+ s = V1.divide(V2, APFloat::rmNearestTiesToEven);
+ if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+ s!=APFloat::opDivByZero)) {
+ return getConstantFP(V1, DL, VT);
+ }
+ break;
+ case ISD::FREM :
+ s = V1.mod(V2);
+ if (!HasFPExceptions || (s!=APFloat::opInvalidOp &&
+ s!=APFloat::opDivByZero)) {
+ return getConstantFP(V1, DL, VT);
+ }
+ break;
+ case ISD::FCOPYSIGN:
+ V1.copySign(V2);
+ return getConstantFP(V1, DL, VT);
+ default: break;
+ }
+ }
+
+ if (Opcode == ISD::FP_ROUND) {
+ APFloat V = N1CFP->getValueAPF(); // make copy
+ bool ignored;
+ // This can return overflow, underflow, or inexact; we don't care.
+ // FIXME need to be more flexible about rounding mode.
+ (void)V.convert(EVTToAPFloatSemantics(VT),
+ APFloat::rmNearestTiesToEven, &ignored);
+ return getConstantFP(V, DL, VT);
+ }
+ }
+
+ // Canonicalize an UNDEF to the RHS, even over a constant.
+ if (N1.isUndef()) {
+ if (isCommutativeBinOp(Opcode)) {
+ std::swap(N1, N2);
+ } else {
+ switch (Opcode) {
+ case ISD::FP_ROUND_INREG:
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::SUB:
+ case ISD::FSUB:
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::SRA:
+ return N1; // fold op(undef, arg2) -> undef
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ case ISD::SRL:
+ case ISD::SHL:
+ if (!VT.isVector())
+ return getConstant(0, DL, VT); // fold op(undef, arg2) -> 0
+ // For vectors, we can't easily build an all zero vector, just return
+ // the other operand.
+ return N2;
+ }
+ }
+ }
+
+ // Fold a bunch of operators when the RHS is undef.
+ if (N2.isUndef()) {
+ switch (Opcode) {
+ case ISD::XOR:
+ if (N1.isUndef())
+ // Handle undef ^ undef -> 0 special case. This is a common
+ // idiom (misuse).
+ return getConstant(0, DL, VT);
+ // fallthrough
+ case ISD::ADD:
+ case ISD::ADDC:
+ case ISD::ADDE:
+ case ISD::SUB:
+ case ISD::UDIV:
+ case ISD::SDIV:
+ case ISD::UREM:
+ case ISD::SREM:
+ return N2; // fold op(arg1, undef) -> undef
+ case ISD::FADD:
+ case ISD::FSUB:
+ case ISD::FMUL:
+ case ISD::FDIV:
+ case ISD::FREM:
+ if (getTarget().Options.UnsafeFPMath)
+ return N2;
+ break;
+ case ISD::MUL:
+ case ISD::AND:
+ case ISD::SRL:
+ case ISD::SHL:
+ if (!VT.isVector())
+ return getConstant(0, DL, VT); // fold op(arg1, undef) -> 0
+ // For vectors, we can't easily build an all zero vector, just return
+ // the LHS.
+ return N1;
+ case ISD::OR:
+ if (!VT.isVector())
+ return getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+ // For vectors, we can't easily build an all one vector, just return
+ // the LHS.
+ return N1;
+ case ISD::SRA:
+ return N1;
+ }
+ }
+
+ // Memoize this node if possible.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ if (VT != MVT::Glue) {
+ SDValue Ops[] = {N1, N2};
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ if (Flags)
+ E->intersectFlagsWith(Flags);
+ return SDValue(E, 0);
+ }
+
+ N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = GetBinarySDNode(Opcode, DL, VTs, N1, N2, Flags);
+ }
+
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3) {
+ // Perform various simplifications.
+ switch (Opcode) {
+ case ISD::FMA: {
+ ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
+ ConstantFPSDNode *N2CFP = dyn_cast<ConstantFPSDNode>(N2);
+ ConstantFPSDNode *N3CFP = dyn_cast<ConstantFPSDNode>(N3);
+ if (N1CFP && N2CFP && N3CFP) {
+ APFloat V1 = N1CFP->getValueAPF();
+ const APFloat &V2 = N2CFP->getValueAPF();
+ const APFloat &V3 = N3CFP->getValueAPF();
+ APFloat::opStatus s =
+ V1.fusedMultiplyAdd(V2, V3, APFloat::rmNearestTiesToEven);
+ if (!TLI->hasFloatingPointExceptions() || s != APFloat::opInvalidOp)
+ return getConstantFP(V1, DL, VT);
+ }
+ break;
+ }
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::SETCC: {
+ // Use FoldSetCC to simplify SETCC's.
+ if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
+ return V;
+ // Vector constant folding.
+ SDValue Ops[] = {N1, N2, N3};
+ if (SDValue V = FoldConstantVectorArithmetic(Opcode, DL, VT, Ops))
+ return V;
+ break;
+ }
+ case ISD::SELECT:
+ if (ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1)) {
+ if (N1C->getZExtValue())
+ return N2; // select true, X, Y -> X
+ return N3; // select false, X, Y -> Y
+ }
+
+ if (N2 == N3) return N2; // select C, X, X -> X
+ break;
+ case ISD::VECTOR_SHUFFLE:
+ llvm_unreachable("should use getVectorShuffle constructor!");
+ case ISD::INSERT_SUBVECTOR: {
+ SDValue Index = N3;
+ if (VT.isSimple() && N1.getValueType().isSimple()
+ && N2.getValueType().isSimple()) {
+ assert(VT.isVector() && N1.getValueType().isVector() &&
+ N2.getValueType().isVector() &&
+ "Insert subvector VTs must be a vectors");
+ assert(VT == N1.getValueType() &&
+ "Dest and insert subvector source types must match!");
+ assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
+ "Insert subvector must be from smaller vector to larger vector!");
+ if (isa<ConstantSDNode>(Index)) {
+ assert((N2.getValueType().getVectorNumElements() +
+ cast<ConstantSDNode>(Index)->getZExtValue()
+ <= VT.getVectorNumElements())
+ && "Insert subvector overflow!");
+ }
+
+ // Trivial insertion.
+ if (VT.getSimpleVT() == N2.getSimpleValueType())
+ return N2;
+ }
+ break;
+ }
+ case ISD::BITCAST:
+ // Fold bit_convert nodes from a type to themselves.
+ if (N1.getValueType() == VT)
+ return N1;
+ break;
+ }
+
+ // Memoize node if it doesn't produce a flag.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = {N1, N2, N3};
+ if (VT != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VT, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VT, Ops);
+}
+
+/// getStackArgumentTokenFactor - Compute a TokenFactor to force all
+/// the incoming stack arguments to be loaded from the stack.
+SDValue SelectionDAG::getStackArgumentTokenFactor(SDValue Chain) {
+ SmallVector<SDValue, 8> ArgChains;
+
+ // Include the original chain at the beginning of the list. When this is
+ // used by target LowerCall hooks, this helps legalize find the
+ // CALLSEQ_BEGIN node.
+ ArgChains.push_back(Chain);
+
+ // Add a chain value for each stack argument.
+ for (SDNode::use_iterator U = getEntryNode().getNode()->use_begin(),
+ UE = getEntryNode().getNode()->use_end(); U != UE; ++U)
+ if (LoadSDNode *L = dyn_cast<LoadSDNode>(*U))
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(L->getBasePtr()))
+ if (FI->getIndex() < 0)
+ ArgChains.push_back(SDValue(L, 1));
+
+ // Build a tokenfactor for all the chains.
+ return getNode(ISD::TokenFactor, SDLoc(Chain), MVT::Other, ArgChains);
+}
+
+/// getMemsetValue - Vectorized representation of the memset value
+/// operand.
+static SDValue getMemsetValue(SDValue Value, EVT VT, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ assert(!Value.isUndef());
+
+ unsigned NumBits = VT.getScalarType().getSizeInBits();
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Value)) {
+ assert(C->getAPIntValue().getBitWidth() == 8);
+ APInt Val = APInt::getSplat(NumBits, C->getAPIntValue());
+ if (VT.isInteger())
+ return DAG.getConstant(Val, dl, VT);
+ return DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(VT), Val), dl,
+ VT);
+ }
+
+ assert(Value.getValueType() == MVT::i8 && "memset with non-byte fill value?");
+ EVT IntVT = VT.getScalarType();
+ if (!IntVT.isInteger())
+ IntVT = EVT::getIntegerVT(*DAG.getContext(), IntVT.getSizeInBits());
+
+ Value = DAG.getNode(ISD::ZERO_EXTEND, dl, IntVT, Value);
+ if (NumBits > 8) {
+ // Use a multiplication with 0x010101... to extend the input to the
+ // required length.
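+ // For example, with NumBits == 32 and a fill byte of 0xAB:
+ // 0xAB * 0x01010101 == 0xABABABAB.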
+ APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01));
+ Value = DAG.getNode(ISD::MUL, dl, IntVT, Value,
+ DAG.getConstant(Magic, dl, IntVT));
+ }
+
+ if (VT != Value.getValueType() && !VT.isInteger())
+ Value = DAG.getBitcast(VT.getScalarType(), Value);
+ if (VT != Value.getValueType())
+ Value = DAG.getSplatBuildVector(VT, dl, Value);
+
+ return Value;
+}
+
+/// getMemsetStringVal - Similar to getMemsetValue. Except this is only
+/// used when a memcpy is turned into a memset when the source is a constant
+/// string ptr.
+static SDValue getMemsetStringVal(EVT VT, const SDLoc &dl, SelectionDAG &DAG,
+ const TargetLowering &TLI, StringRef Str) {
+ // Handle vector with all elements zero.
+ if (Str.empty()) {
+ if (VT.isInteger())
+ return DAG.getConstant(0, dl, VT);
+ else if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128)
+ return DAG.getConstantFP(0.0, dl, VT);
+ else if (VT.isVector()) {
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT EltVT = (VT.getVectorElementType() == MVT::f32) ? MVT::i32 : MVT::i64;
+ return DAG.getNode(ISD::BITCAST, dl, VT,
+ DAG.getConstant(0, dl,
+ EVT::getVectorVT(*DAG.getContext(),
+ EltVT, NumElts)));
+ } else
+ llvm_unreachable("Expected type!");
+ }
+
+ assert(!VT.isVector() && "Can't handle vector type here!");
+ unsigned NumVTBits = VT.getSizeInBits();
+ unsigned NumVTBytes = NumVTBits / 8;
+ unsigned NumBytes = std::min(NumVTBytes, unsigned(Str.size()));
+
+ APInt Val(NumVTBits, 0);
+ if (DAG.getDataLayout().isLittleEndian()) {
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Val |= (uint64_t)(unsigned char)Str[i] << i*8;
+ } else {
+ for (unsigned i = 0; i != NumBytes; ++i)
+ Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8;
+ }
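+ // For example, packing "abcd" into an i32 yields 0x64636261 on a
+ // little-endian target and 0x61626364 on a big-endian one.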
+
+ // If the "cost" of materializing the integer immediate is less than the cost
+ // of a load, then it is cost effective to turn the load into the immediate.
+ Type *Ty = VT.getTypeForEVT(*DAG.getContext());
+ if (TLI.shouldConvertConstantLoadToIntImm(Val, Ty))
+ return DAG.getConstant(Val, dl, VT);
+ return SDValue(nullptr, 0);
+}
+
+SDValue SelectionDAG::getMemBasePlusOffset(SDValue Base, unsigned Offset,
+ const SDLoc &DL) {
+ EVT VT = Base.getValueType();
+ return getNode(ISD::ADD, DL, VT, Base, getConstant(Offset, DL, VT));
+}
+
+/// isMemSrcFromString - Returns true if memcpy source is a string constant.
+///
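+/// Recognizes either a plain GlobalAddress or (GlobalAddress + Constant); in
+/// the latter case the constant delta is folded into the offset passed to
+/// getConstantStringInfo.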
+static bool isMemSrcFromString(SDValue Src, StringRef &Str) {
+ uint64_t SrcDelta = 0;
+ GlobalAddressSDNode *G = nullptr;
+ if (Src.getOpcode() == ISD::GlobalAddress)
+ G = cast<GlobalAddressSDNode>(Src);
+ else if (Src.getOpcode() == ISD::ADD &&
+ Src.getOperand(0).getOpcode() == ISD::GlobalAddress &&
+ Src.getOperand(1).getOpcode() == ISD::Constant) {
+ G = cast<GlobalAddressSDNode>(Src.getOperand(0));
+ SrcDelta = cast<ConstantSDNode>(Src.getOperand(1))->getZExtValue();
+ }
+ if (!G)
+ return false;
+
+ return getConstantStringInfo(G->getGlobal(), Str,
+ SrcDelta + G->getOffset(), false);
+}
+
+/// Determines the optimal series of memory ops to replace the memset / memcpy.
+/// Return true if the number of memory ops is below the threshold (Limit).
+/// It returns the types of the sequence of memory ops to perform
+/// memset / memcpy by reference.
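+/// For example, a 15-byte operation on a target whose optimal type is i32 is
+/// decomposed into i32, i32, i32, i16, i8; overlapping unaligned pieces are
+/// only considered for types of 64 bits or more.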
+static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
+ unsigned Limit, uint64_t Size,
+ unsigned DstAlign, unsigned SrcAlign,
+ bool IsMemset,
+ bool ZeroMemset,
+ bool MemcpyStrSrc,
+ bool AllowOverlap,
+ unsigned DstAS, unsigned SrcAS,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ assert((SrcAlign == 0 || SrcAlign >= DstAlign) &&
+ "Expecting memcpy / memset source to meet alignment requirement!");
+ // If 'SrcAlign' is zero, that means the memory operation does not need to
+ // load the value, i.e. memset or memcpy from constant string. Otherwise,
+ // it's the inferred alignment of the source. 'DstAlign', on the other hand,
+ // is the specified alignment of the memory operation. If it is zero, that
+ // means it's possible to change the alignment of the destination.
+ // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
+ // not need to be loaded.
+ EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
+ IsMemset, ZeroMemset, MemcpyStrSrc,
+ DAG.getMachineFunction());
+
+ if (VT == MVT::Other) {
+ if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||
+ TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {
+ VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);
+ } else {
+ switch (DstAlign & 7) {
+ case 0: VT = MVT::i64; break;
+ case 4: VT = MVT::i32; break;
+ case 2: VT = MVT::i16; break;
+ default: VT = MVT::i8; break;
+ }
+ }
+
+ MVT LVT = MVT::i64;
+ while (!TLI.isTypeLegal(LVT))
+ LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
+ assert(LVT.isInteger());
+
+ if (VT.bitsGT(LVT))
+ VT = LVT;
+ }
+
+ unsigned NumMemOps = 0;
+ while (Size != 0) {
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ while (VTSize > Size) {
+ // For now, only use non-vector loads / stores for the left-over pieces.
+ EVT NewVT = VT;
+ unsigned NewVTSize;
+
+ bool Found = false;
+ if (VT.isVector() || VT.isFloatingPoint()) {
+ NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32;
+ if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) &&
+ TLI.isSafeMemOpType(NewVT.getSimpleVT()))
+ Found = true;
+ else if (NewVT == MVT::i64 &&
+ TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) &&
+ TLI.isSafeMemOpType(MVT::f64)) {
+ // i64 is usually not legal on 32-bit targets, but f64 may be.
+ NewVT = MVT::f64;
+ Found = true;
+ }
+ }
+
+ if (!Found) {
+ do {
+ NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1);
+ if (NewVT == MVT::i8)
+ break;
+ } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT()));
+ }
+ NewVTSize = NewVT.getSizeInBits() / 8;
+
+ // If the new VT cannot cover all of the remaining bits, then consider
+ // issuing a (or a pair of) unaligned and overlapping load / store.
+ // FIXME: Only does this for 64-bit or more since we don't have proper
+ // cost model for unaligned load / store.
+ bool Fast;
+ if (NumMemOps && AllowOverlap &&
+ VTSize >= 8 && NewVTSize < Size &&
+ TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign, &Fast) && Fast)
+ VTSize = Size;
+ else {
+ VT = NewVT;
+ VTSize = NewVTSize;
+ }
+ }
+
+ if (++NumMemOps > Limit)
+ return false;
+
+ MemOps.push_back(VT);
+ Size -= VTSize;
+ }
+
+ return true;
+}
+
+static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
+ // On Darwin, -Os means optimize for size without hurting performance, so
+ // only really optimize for size when -Oz (MinSize) is used.
+ if (MF.getTarget().getTargetTriple().isOSDarwin())
+ return MF.getFunction()->optForMinSize();
+ return MF.getFunction()->optForSize();
+}
+
+static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align,
+ bool isVol, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Turn a memcpy of undef to nop.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memcpy to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ // TODO: In the AlwaysInline case, if the size is big then generate a loop
+ // rather than maybe a humongous number of loads and stores.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+ if (Align > SrcAlign)
+ SrcAlign = Align;
+ StringRef Str;
+ bool CopyFromStr = isMemSrcFromString(Src, Str);
+ bool isZeroStr = CopyFromStr && Str.empty();
+ unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
+
+ if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+ (DstAlignCanChange ? 0 : Align),
+ (isZeroStr ? 0 : SrcAlign),
+ false, false, CopyFromStr, true,
+ DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(),
+ DAG, TLI))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+
+ // Don't promote to an alignment that would require dynamic stack
+ // realignment.
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!TRI->needsStackRealignment(MF))
+ while (NewAlign > Align &&
+ DAG.getDataLayout().exceedsNaturalStackAlignment(NewAlign))
+ NewAlign /= 2;
+
+ if (NewAlign > Align) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+ MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+ Align = NewAlign;
+ }
+ }
+
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+ SmallVector<SDValue, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ uint64_t SrcOff = 0, DstOff = 0;
+ for (unsigned i = 0; i != NumMemOps; ++i) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value, Store;
+
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
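+ // For example, if a 15-byte copy was lowered as two i64 pieces, the second
+ // load/store is moved back one byte to cover bytes 7-14, overlapping byte 7
+ // of the first.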
+ assert(i == NumMemOps-1 && i != 0);
+ SrcOff -= VTSize - Size;
+ DstOff -= VTSize - Size;
+ }
+
+ if (CopyFromStr &&
+ (isZeroStr || (VT.isInteger() && !VT.isVector()))) {
+ // It's unlikely a store of a vector immediate can be done in a single
+ // instruction. It would require a load from a constantpool first.
+ // We only handle zero vectors here.
+ // FIXME: Handle other cases where store of vector immediate is done in
+ // a single instruction.
+ Value = getMemsetStringVal(VT, dl, DAG, TLI, Str.substr(SrcOff));
+ if (Value.getNode())
+ Store = DAG.getStore(Chain, dl, Value,
+ DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+ }
+
+ if (!Store.getNode()) {
+ // The type might not be legal for the target. This should only happen
+ // if the type is smaller than a legal type, as on PPC, so the right
+ // thing to do is generate a LoadExt/StoreTrunc pair. These simplify
+ // to Load/Store if NVT==VT.
+ // FIXME does the case above also need this?
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
+ assert(NVT.bitsGE(VT));
+ Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
+ DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+ SrcPtrInfo.getWithOffset(SrcOff), VT,
+ MinAlign(SrcAlign, SrcOff), MMOFlags);
+ OutChains.push_back(Value.getValue(1));
+ Store = DAG.getTruncStore(
+ Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), VT, Align, MMOFlags);
+ }
+ OutChains.push_back(Store);
+ SrcOff += VTSize;
+ DstOff += VTSize;
+ Size -= VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align,
+ bool isVol, bool AlwaysInline,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ // Turn a memmove of undef to nop.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memmove to a series of load and store ops if the size operand falls
+ // below a certain threshold.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ unsigned SrcAlign = DAG.InferPtrAlignment(Src);
+ if (Align > SrcAlign)
+ SrcAlign = Align;
+ unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
+
+ if (!FindOptimalMemOpLowering(MemOps, Limit, Size,
+ (DstAlignCanChange ? 0 : Align), SrcAlign,
+ false, false, false, false,
+ DstPtrInfo.getAddrSpace(),
+ SrcPtrInfo.getAddrSpace(),
+ DAG, TLI))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+ if (NewAlign > Align) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+ MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+ Align = NewAlign;
+ }
+ }
+
+ MachineMemOperand::Flags MMOFlags =
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone;
+ uint64_t SrcOff = 0, DstOff = 0;
+ SmallVector<SDValue, 8> LoadValues;
+ SmallVector<SDValue, 8> LoadChains;
+ SmallVector<SDValue, 8> OutChains;
+ unsigned NumMemOps = MemOps.size();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Value;
+
+ Value =
+ DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+ SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, MMOFlags);
+ LoadValues.push_back(Value);
+ LoadChains.push_back(Value.getValue(1));
+ SrcOff += VTSize;
+ }
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
+ OutChains.clear();
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ SDValue Store;
+
+ Store = DAG.getStore(Chain, dl, LoadValues[i],
+ DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+ OutChains.push_back(Store);
+ DstOff += VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+/// \brief Lower the call to 'memset' intrinsic function into a series of store
+/// operations.
+///
+/// \param DAG Selection DAG where lowered code is placed.
+/// \param dl Link to corresponding IR location.
+/// \param Chain Control flow dependency.
+/// \param Dst Pointer to destination memory location.
+/// \param Src Value of byte to write into the memory.
+/// \param Size Number of bytes to write.
+/// \param Align Alignment of the destination in bytes.
+/// \param isVol True if destination is volatile.
+/// \param DstPtrInfo IR information on the memory pointer.
+/// \returns New head in the control flow, if lowering was successful, empty
+/// SDValue otherwise.
+///
+/// The function tries to replace 'llvm.memset' intrinsic with several store
+/// operations and value calculation code. This is usually profitable for small
+/// memory size.
+static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Chain, SDValue Dst, SDValue Src,
+ uint64_t Size, unsigned Align, bool isVol,
+ MachinePointerInfo DstPtrInfo) {
+ // Turn a memset of undef to nop.
+ if (Src.isUndef())
+ return Chain;
+
+ // Expand memset to a series of store ops if the size operand
+ // falls below a certain threshold.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::vector<EVT> MemOps;
+ bool DstAlignCanChange = false;
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ bool OptSize = shouldLowerMemFuncForSize(MF);
+ FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
+ if (FI && !MFI->isFixedObjectIndex(FI->getIndex()))
+ DstAlignCanChange = true;
+ bool IsZeroVal =
+ isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
+ if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize),
+ Size, (DstAlignCanChange ? 0 : Align), 0,
+ true, IsZeroVal, false, true,
+ DstPtrInfo.getAddrSpace(), ~0u,
+ DAG, TLI))
+ return SDValue();
+
+ if (DstAlignCanChange) {
+ Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
+ unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
+ if (NewAlign > Align) {
+ // Give the stack frame object a larger alignment if needed.
+ if (MFI->getObjectAlignment(FI->getIndex()) < NewAlign)
+ MFI->setObjectAlignment(FI->getIndex(), NewAlign);
+ Align = NewAlign;
+ }
+ }
+
+ SmallVector<SDValue, 8> OutChains;
+ uint64_t DstOff = 0;
+ unsigned NumMemOps = MemOps.size();
+
+ // Find the largest store and generate the bit pattern for it.
+ EVT LargestVT = MemOps[0];
+ for (unsigned i = 1; i < NumMemOps; i++)
+ if (MemOps[i].bitsGT(LargestVT))
+ LargestVT = MemOps[i];
+ SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl);
+
+ for (unsigned i = 0; i < NumMemOps; i++) {
+ EVT VT = MemOps[i];
+ unsigned VTSize = VT.getSizeInBits() / 8;
+ if (VTSize > Size) {
+ // Issuing an unaligned load / store pair that overlaps with the previous
+ // pair. Adjust the offset accordingly.
+ assert(i == NumMemOps-1 && i != 0);
+ DstOff -= VTSize - Size;
+ }
+
+ // If this store is smaller than the largest store, see whether we can get
+ // the smaller value for free with a truncate.
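+ // For example, a 64-bit splat of the fill byte truncated to i32 yields the
+ // same 32-bit pattern without recomputing it from the original value.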
+ SDValue Value = MemSetValue;
+ if (VT.bitsLT(LargestVT)) {
+ if (!LargestVT.isVector() && !VT.isVector() &&
+ TLI.isTruncateFree(LargestVT, VT))
+ Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue);
+ else
+ Value = getMemsetValue(Src, VT, DAG, dl);
+ }
+ assert(Value.getValueType() == VT && "Value with wrong type.");
+ SDValue Store = DAG.getStore(
+ Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Align,
+ isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
+ OutChains.push_back(Store);
+ DstOff += VT.getSizeInBits() / 8;
+ Size -= VTSize;
+ }
+
+ return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+}
+
+static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
+ unsigned AS) {
+ // Lowering memcpy / memset / memmove intrinsics to calls is only valid if all
+ // pointer operands can be losslessly bitcasted to pointers of address space 0
+ if (AS != 0 && !TLI->isNoopAddrSpaceCast(AS, 0)) {
+ report_fatal_error("cannot lower memory intrinsic in address space " +
+ Twine(AS));
+ }
+}
+
+SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool AlwaysInline, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+ // Check to see if we should lower the memcpy to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memcpy with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(),Align,
+ isVol, false, DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memcpy with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result = TSI->EmitTargetCodeForMemcpy(
+ *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
+ DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // If we really need inline code and the target declined to provide it,
+ // use a (potentially long) sequence of loads and stores.
+ if (AlwaysInline) {
+ assert(ConstantSize && "AlwaysInline requires a constant size!");
+ return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Align, isVol,
+ true, DstPtrInfo, SrcPtrInfo);
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+ checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+
+ // FIXME: If the memcpy is volatile (isVol), lowering it to a plain libc
+ // memcpy is not guaranteed to be safe. libc memcpys aren't required to
+ // respect volatile, so they may do things like read or write memory
+ // beyond the given memory regions. But fixing this isn't easy, and most
+ // people don't care.
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in SDLoc
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool isTailCall,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) {
+ assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+ // Check to see if we should lower the memmove to loads and stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memmove with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result =
+ getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Align, isVol,
+ false, DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memmove with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result = TSI->EmitTargetCodeForMemmove(
+ *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+ checkAddrSpaceIsValidForLibcall(TLI, SrcPtrInfo.getAddrSpace());
+
+ // FIXME: If the memmove is volatile, lowering it to plain libc memmove may
+ // not be safe. See memcpy above for more details.
+
+ // Emit a library call.
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Ty = getDataLayout().getIntPtrType(*getContext());
+ Entry.Node = Dst; Args.push_back(Entry);
+ Entry.Node = Src; Args.push_back(Entry);
+ Entry.Node = Size; Args.push_back(Entry);
+ // FIXME: pass in SDLoc
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
+ SDValue Src, SDValue Size, unsigned Align,
+ bool isVol, bool isTailCall,
+ MachinePointerInfo DstPtrInfo) {
+ assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
+
+ // Check to see if we should lower the memset to stores first.
+ // For cases within the target-specified limits, this is the best choice.
+ ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+ if (ConstantSize) {
+ // Memset with size zero? Just return the original chain.
+ if (ConstantSize->isNullValue())
+ return Chain;
+
+ SDValue Result =
+ getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+ Align, isVol, DstPtrInfo);
+
+ if (Result.getNode())
+ return Result;
+ }
+
+ // Then check to see if we should lower the memset with target-specific
+ // code. If the target chooses to do this, this is the next best.
+ if (TSI) {
+ SDValue Result = TSI->EmitTargetCodeForMemset(
+ *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
+ if (Result.getNode())
+ return Result;
+ }
+
+ checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
+
+ // Emit a library call.
+ Type *IntPtrTy = getDataLayout().getIntPtrType(*getContext());
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Dst; Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+ Entry.Node = Src;
+ Entry.Ty = Src.getValueType().getTypeForEVT(*getContext());
+ Args.push_back(Entry);
+ Entry.Node = Size;
+ Entry.Ty = IntPtrTy;
+ Args.push_back(Entry);
+
+ // FIXME: pass in SDLoc
+ TargetLowering::CallLoweringInfo CLI(*this);
+ CLI.setDebugLoc(dl)
+ .setChain(Chain)
+ .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+ Dst.getValueType().getTypeForEVT(*getContext()),
+ getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+ TLI->getPointerTy(getDataLayout())),
+ std::move(Args))
+ .setDiscardResult()
+ .setTailCall(isTailCall);
+
+ std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
+ return CallResult.second;
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDVTList VTList, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO,
+ AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering,
+ SynchronizationScope SynchScope) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(MemVT.getRawBits());
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void* IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<AtomicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<AtomicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO, SuccessOrdering,
+ FailureOrdering, SynchScope);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDVTList VTList, ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ return getAtomic(Opcode, dl, MemVT, VTList, Ops, MMO, Ordering,
+ Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomicCmpSwap(
+ unsigned Opcode, const SDLoc &dl, EVT MemVT, SDVTList VTs, SDValue Chain,
+ SDValue Ptr, SDValue Cmp, SDValue Swp, MachinePointerInfo PtrInfo,
+ unsigned Alignment, AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering, SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+ Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(MemVT);
+
+ MachineFunction &MF = getMachineFunction();
+
+ // FIXME: Volatile isn't really correct; we should keep track of atomic
+ // orderings in the memoperand.
+ auto Flags = MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore;
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, Flags, MemVT.getStoreSize(), Alignment);
+
+ return getAtomicCmpSwap(Opcode, dl, MemVT, VTs, Chain, Ptr, Cmp, Swp, MMO,
+ SuccessOrdering, FailureOrdering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomicCmpSwap(unsigned Opcode, const SDLoc &dl,
+ EVT MemVT, SDVTList VTs, SDValue Chain,
+ SDValue Ptr, SDValue Cmp, SDValue Swp,
+ MachineMemOperand *MMO,
+ AtomicOrdering SuccessOrdering,
+ AtomicOrdering FailureOrdering,
+ SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_CMP_SWAP ||
+ Opcode == ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS);
+ assert(Cmp.getValueType() == Swp.getValueType() && "Invalid Atomic Op Types");
+
+ SDValue Ops[] = {Chain, Ptr, Cmp, Swp};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO,
+ SuccessOrdering, FailureOrdering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDValue Chain, SDValue Ptr, SDValue Val,
+ const Value *PtrVal, unsigned Alignment,
+ AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(MemVT);
+
+ MachineFunction &MF = getMachineFunction();
+ // An atomic store does not load. An atomic load does not store.
+ // (An atomicrmw obviously both loads and stores.)
+ // For now, atomics are considered to be volatile always, and they are
+ // chained as such.
+ // FIXME: Volatile isn't really correct; we should keep track of atomic
+ // orderings in the memoperand.
+ auto Flags = MachineMemOperand::MOVolatile;
+ if (Opcode != ISD::ATOMIC_STORE)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Opcode != ISD::ATOMIC_LOAD)
+ Flags |= MachineMemOperand::MOStore;
+
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo(PtrVal), Flags,
+ MemVT.getStoreSize(), Alignment);
+
+ return getAtomic(Opcode, dl, MemVT, Chain, Ptr, Val, MMO,
+ Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ SDValue Chain, SDValue Ptr, SDValue Val,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ assert((Opcode == ISD::ATOMIC_LOAD_ADD ||
+ Opcode == ISD::ATOMIC_LOAD_SUB ||
+ Opcode == ISD::ATOMIC_LOAD_AND ||
+ Opcode == ISD::ATOMIC_LOAD_OR ||
+ Opcode == ISD::ATOMIC_LOAD_XOR ||
+ Opcode == ISD::ATOMIC_LOAD_NAND ||
+ Opcode == ISD::ATOMIC_LOAD_MIN ||
+ Opcode == ISD::ATOMIC_LOAD_MAX ||
+ Opcode == ISD::ATOMIC_LOAD_UMIN ||
+ Opcode == ISD::ATOMIC_LOAD_UMAX ||
+ Opcode == ISD::ATOMIC_SWAP ||
+ Opcode == ISD::ATOMIC_STORE) &&
+ "Invalid Atomic Op");
+
+ EVT VT = Val.getValueType();
+
+ SDVTList VTs = Opcode == ISD::ATOMIC_STORE ? getVTList(MVT::Other) :
+ getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr, Val};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope);
+}
+
+SDValue SelectionDAG::getAtomic(unsigned Opcode, const SDLoc &dl, EVT MemVT,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ MachineMemOperand *MMO, AtomicOrdering Ordering,
+ SynchronizationScope SynchScope) {
+ assert(Opcode == ISD::ATOMIC_LOAD && "Invalid Atomic Op");
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Chain, Ptr};
+ return getAtomic(Opcode, dl, MemVT, VTs, Ops, MMO, Ordering, SynchScope);
+}
+
+/// getMergeValues - Create a MERGE_VALUES node from the given operands.
+SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
+ if (Ops.size() == 1)
+ return Ops[0];
+
+ SmallVector<EVT, 4> VTs;
+ VTs.reserve(Ops.size());
+ for (unsigned i = 0; i < Ops.size(); ++i)
+ VTs.push_back(Ops[i].getValueType());
+ return getNode(ISD::MERGE_VALUES, dl, getVTList(VTs), Ops);
+}
+
+SDValue SelectionDAG::getMemIntrinsicNode(
+ unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
+ EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align, bool Vol,
+ bool ReadMem, bool WriteMem, unsigned Size) {
+ if (Align == 0) // Ensure that codegen never sees alignment 0
+ Align = getEVTAlignment(MemVT);
+
+ MachineFunction &MF = getMachineFunction();
+ auto Flags = MachineMemOperand::MONone;
+ if (WriteMem)
+ Flags |= MachineMemOperand::MOStore;
+ if (ReadMem)
+ Flags |= MachineMemOperand::MOLoad;
+ if (Vol)
+ Flags |= MachineMemOperand::MOVolatile;
+ if (!Size)
+ Size = MemVT.getStoreSize();
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, Flags, Size, Align);
+
+ return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
+}
+
+SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
+ SDVTList VTList,
+ ArrayRef<SDValue> Ops, EVT MemVT,
+ MachineMemOperand *MMO) {
+ assert((Opcode == ISD::INTRINSIC_VOID ||
+ Opcode == ISD::INTRINSIC_W_CHAIN ||
+ Opcode == ISD::PREFETCH ||
+ Opcode == ISD::LIFETIME_START ||
+ Opcode == ISD::LIFETIME_END ||
+ (Opcode <= INT_MAX &&
+ (int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
+ "Opcode is not a memory-accessing opcode!");
+
+ // Memoize the node unless it returns a flag.
+ MemIntrinsicSDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<MemIntrinsicSDNode>(Opcode, dl.getIROrder(), dl.getDebugLoc(),
+ VTList, MemVT, MMO);
+ createOperands(N, Ops);
+ }
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it. This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr,
+ int64_t Offset = 0) {
+ // If this is FI+Offset, we can model it.
+ if (const FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr))
+ return MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+ FI->getIndex(), Offset);
+
+ // If this is (FI+Offset1)+Offset2, we can model it.
+ if (Ptr.getOpcode() != ISD::ADD ||
+ !isa<ConstantSDNode>(Ptr.getOperand(1)) ||
+ !isa<FrameIndexSDNode>(Ptr.getOperand(0)))
+ return MachinePointerInfo();
+
+ int FI = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ return MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI,
+ Offset + cast<ConstantSDNode>(Ptr.getOperand(1))->getSExtValue());
+}
+
+/// InferPointerInfo - If the specified ptr/offset is a frame index, infer a
+/// MachinePointerInfo record from it. This is particularly useful because the
+/// code generator has many cases where it doesn't bother passing in a
+/// MachinePointerInfo to getLoad or getStore when it has "FI+Cst".
+static MachinePointerInfo InferPointerInfo(SelectionDAG &DAG, SDValue Ptr,
+ SDValue OffsetOp) {
+ // If the 'Offset' value isn't a constant, we can't handle this.
+ if (ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(OffsetOp))
+ return InferPointerInfo(DAG, Ptr, OffsetNode->getSExtValue());
+ if (OffsetOp.isUndef())
+ return InferPointerInfo(DAG, Ptr);
+ return MachinePointerInfo();
+}
+
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(VT);
+
+ MMOFlags |= MachineMemOperand::MOLoad;
+ assert((MMOFlags & MachineMemOperand::MOStore) == 0);
+ // If we don't have a PtrInfo, infer the trivial frame index case to simplify
+ // clients.
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(*this, Ptr, Offset);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
+ return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
+}
+
+SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
+ EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Offset, EVT MemVT,
+ MachineMemOperand *MMO) {
+ if (VT == MemVT) {
+ ExtType = ISD::NON_EXTLOAD;
+ } else if (ExtType == ISD::NON_EXTLOAD) {
+ assert(VT == MemVT && "Non-extending load from different memory type!");
+ } else {
+ // Extending load.
+ assert(MemVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be an extending load, not truncating!");
+ assert(VT.isInteger() == MemVT.isInteger() &&
+ "Cannot convert from FP to Int or Int -> FP!");
+ assert(VT.isVector() == MemVT.isVector() &&
+ "Cannot use an ext load to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() == MemVT.getVectorNumElements()) &&
+ "Cannot use an ext load to change the number of vector elements!");
+ }
+
+ bool Indexed = AM != ISD::UNINDEXED;
+ assert((Indexed || Offset.isUndef()) && "Unindexed load with an offset!");
+
+ SDVTList VTs = Indexed ?
+ getVTList(VT, Ptr.getValueType(), MVT::Other) : getVTList(VT, MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::LOAD, VTs, Ops);
+ ID.AddInteger(MemVT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(),
+ MMO->isNonTemporal(),
+ MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<LoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<LoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ExtType, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo, const MDNode *Ranges) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ PtrInfo, VT, Alignment, MMOFlags, AAInfo, Ranges);
+}
+
+SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ISD::NON_EXTLOAD, VT, dl, Chain, Ptr, Undef,
+ VT, MMO);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr,
+ MachinePointerInfo PtrInfo, EVT MemVT,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef, PtrInfo,
+ MemVT, Alignment, MMOFlags, AAInfo);
+}
+
+SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
+ EVT VT, SDValue Chain, SDValue Ptr, EVT MemVT,
+ MachineMemOperand *MMO) {
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ return getLoad(ISD::UNINDEXED, ExtType, VT, dl, Chain, Ptr, Undef,
+ MemVT, MMO);
+}
+
+SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ LoadSDNode *LD = cast<LoadSDNode>(OrigLoad);
+ assert(LD->getOffset().isUndef() && "Load is already an indexed load!");
+ // Don't propagate the invariant flag.
+ auto MMOFlags =
+ LD->getMemOperand()->getFlags() & ~MachineMemOperand::MOInvariant;
+ return getLoad(AM, LD->getExtensionType(), OrigLoad.getValueType(), dl,
+ LD->getChain(), Base, Offset, LD->getPointerInfo(),
+ LD->getMemoryVT(), LD->getAlignment(), MMOFlags);
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(Val.getValueType());
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(*this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
+ return getStore(Chain, dl, Val, Ptr, MMO);
+}
+
+SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachineMemOperand *MMO) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ EVT VT = Val.getValueType();
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+ MMO->isNonTemporal(), MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<StoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, false, VT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, MachinePointerInfo PtrInfo,
+ EVT SVT, unsigned Alignment,
+ MachineMemOperand::Flags MMOFlags,
+ const AAMDNodes &AAInfo) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ if (Alignment == 0) // Ensure that codegen never sees alignment 0
+ Alignment = getEVTAlignment(SVT);
+
+ MMOFlags |= MachineMemOperand::MOStore;
+ assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
+
+ if (PtrInfo.V.isNull())
+ PtrInfo = InferPointerInfo(*this, Ptr);
+
+ MachineFunction &MF = getMachineFunction();
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo, MMOFlags, SVT.getStoreSize(), Alignment, AAInfo);
+ return getTruncStore(Chain, dl, Val, Ptr, SVT, MMO);
+}
+
+SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
+ SDValue Ptr, EVT SVT,
+ MachineMemOperand *MMO) {
+ EVT VT = Val.getValueType();
+
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ if (VT == SVT)
+ return getStore(Chain, dl, Val, Ptr, MMO);
+
+ assert(SVT.getScalarType().bitsLT(VT.getScalarType()) &&
+ "Should only be a truncating store, not extending!");
+ assert(VT.isInteger() == SVT.isInteger() &&
+ "Can't do FP-INT conversion!");
+ assert(VT.isVector() == SVT.isVector() &&
+ "Cannot use trunc store to convert to or from a vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorNumElements() == SVT.getVectorNumElements()) &&
+ "Cannot use trunc store to change the number of vector elements!");
+
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Undef = getUNDEF(Ptr.getValueType());
+ SDValue Ops[] = { Chain, Val, Ptr, Undef };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(SVT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(),
+ MMO->isNonTemporal(), MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<StoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ISD::UNINDEXED, true, SVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
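+
+// Editor's illustrative sketch, not part of the imported source: store only
+// the low 8 bits of an i32 value through the truncating-store overload above.
+// The asserts in the MMO-based overload require the stored type to be
+// strictly narrower than the value type (if they are equal, a plain store is
+// emitted instead).
+static SDValue emitStoreLowByte(SelectionDAG &DAG, const SDLoc &DL,
+                                SDValue Chain, SDValue Val32, SDValue Ptr) {
+  assert(Val32.getValueType() == MVT::i32 && "sketch expects an i32 value");
+  return DAG.getTruncStore(Chain, DL, Val32, Ptr, MachinePointerInfo(),
+                           MVT::i8, /*Alignment=*/0,
+                           MachineMemOperand::MONone, AAMDNodes());
+}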
+
+SDValue SelectionDAG::getIndexedStore(SDValue OrigStore, const SDLoc &dl,
+ SDValue Base, SDValue Offset,
+ ISD::MemIndexedMode AM) {
+ StoreSDNode *ST = cast<StoreSDNode>(OrigStore);
+ assert(ST->getOffset().isUndef() && "Store is already an indexed store!");
+ SDVTList VTs = getVTList(Base.getValueType(), MVT::Other);
+ SDValue Ops[] = { ST->getChain(), ST->getValue(), Base, Offset };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::STORE, VTs, Ops);
+ ID.AddInteger(ST->getMemoryVT().getRawBits());
+ ID.AddInteger(ST->getRawSubclassData());
+ ID.AddInteger(ST->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<StoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs, AM,
+ ST->isTruncatingStore(), ST->getMemoryVT(),
+ ST->getMemOperand());
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedLoad(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue Mask, SDValue Src0,
+ EVT MemVT, MachineMemOperand *MMO,
+ ISD::LoadExtType ExtTy) {
+
+ SDVTList VTs = getVTList(VT, MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Mask, Src0 };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MLOAD, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(ExtTy, ISD::UNINDEXED,
+ MMO->isVolatile(),
+ MMO->isNonTemporal(),
+ MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedLoadSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<MaskedLoadSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ ExtTy, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedStore(SDValue Chain, const SDLoc &dl,
+ SDValue Val, SDValue Ptr, SDValue Mask,
+ EVT MemVT, MachineMemOperand *MMO,
+ bool isTrunc) {
+ assert(Chain.getValueType() == MVT::Other &&
+ "Invalid chain type");
+ EVT VT = Val.getValueType();
+ SDVTList VTs = getVTList(MVT::Other);
+ SDValue Ops[] = { Chain, Ptr, Mask, Val };
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MSTORE, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+ MMO->isNonTemporal(), MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedStoreSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<MaskedStoreSDNode>(dl.getIROrder(), dl.getDebugLoc(), VTs,
+ isTrunc, MemVT, MMO);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedGather(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) {
+ assert(Ops.size() == 5 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MGATHER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(ISD::NON_EXTLOAD, ISD::UNINDEXED,
+ MMO->isVolatile(),
+ MMO->isNonTemporal(),
+ MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedGatherSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+
+ auto *N = newSDNode<MaskedGatherSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, VT, MMO);
+ createOperands(N, Ops);
+
+ assert(N->getValue().getValueType() == N->getValueType(0) &&
+ "Incompatible type of the PassThru value in MaskedGatherSDNode");
+ assert(N->getMask().getValueType().getVectorNumElements() ==
+ N->getValueType(0).getVectorNumElements() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorNumElements() ==
+ N->getValueType(0).getVectorNumElements() &&
+ "Vector width mismatch between index and data");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getMaskedScatter(SDVTList VTs, EVT VT, const SDLoc &dl,
+ ArrayRef<SDValue> Ops,
+ MachineMemOperand *MMO) {
+ assert(Ops.size() == 5 && "Incompatible number of operands");
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::MSCATTER, VTs, Ops);
+ ID.AddInteger(VT.getRawBits());
+ ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(),
+ MMO->isNonTemporal(),
+ MMO->isInvariant()));
+ ID.AddInteger(MMO->getPointerInfo().getAddrSpace());
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, dl, IP)) {
+ cast<MaskedScatterSDNode>(E)->refineAlignment(MMO);
+ return SDValue(E, 0);
+ }
+ auto *N = newSDNode<MaskedScatterSDNode>(dl.getIROrder(), dl.getDebugLoc(),
+ VTs, VT, MMO);
+ createOperands(N, Ops);
+
+ assert(N->getMask().getValueType().getVectorNumElements() ==
+ N->getValue().getValueType().getVectorNumElements() &&
+ "Vector width mismatch between mask and data");
+ assert(N->getIndex().getValueType().getVectorNumElements() ==
+ N->getValue().getValueType().getVectorNumElements() &&
+ "Vector width mismatch between index and data");
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getVAArg(EVT VT, const SDLoc &dl, SDValue Chain,
+ SDValue Ptr, SDValue SV, unsigned Align) {
+ SDValue Ops[] = { Chain, Ptr, SV, getTargetConstant(Align, dl, MVT::i32) };
+ return getNode(ISD::VAARG, dl, getVTList(VT, MVT::Other), Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ ArrayRef<SDUse> Ops) {
+ switch (Ops.size()) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, static_cast<const SDValue>(Ops[0]));
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1]);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+ default: break;
+ }
+
+ // Copy from an SDUse array into an SDValue array for use with
+ // the regular getNode logic.
+ SmallVector<SDValue, 8> NewOps(Ops.begin(), Ops.end());
+ return getNode(Opcode, DL, VT, NewOps);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
+ ArrayRef<SDValue> Ops, const SDNodeFlags *Flags) {
+ unsigned NumOps = Ops.size();
+ switch (NumOps) {
+ case 0: return getNode(Opcode, DL, VT);
+ case 1: return getNode(Opcode, DL, VT, Ops[0]);
+ case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags);
+ case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]);
+ default: break;
+ }
+
+ switch (Opcode) {
+ default: break;
+ case ISD::CONCAT_VECTORS: {
+ // Attempt to fold CONCAT_VECTORS into BUILD_VECTOR or UNDEF.
+ if (SDValue V = FoldCONCAT_VECTORS(DL, VT, Ops, *this))
+ return V;
+ break;
+ }
+ case ISD::SELECT_CC: {
+ assert(NumOps == 5 && "SELECT_CC takes 5 operands!");
+ assert(Ops[0].getValueType() == Ops[1].getValueType() &&
+ "LHS and RHS of condition must have same type!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "True and False arms of SelectCC must have same type!");
+ assert(Ops[2].getValueType() == VT &&
+ "select_cc node must be of same type as true and false value!");
+ break;
+ }
+ case ISD::BR_CC: {
+ assert(NumOps == 5 && "BR_CC takes 5 operands!");
+ assert(Ops[2].getValueType() == Ops[3].getValueType() &&
+ "LHS/RHS of comparison should match types!");
+ break;
+ }
+ }
+
+ // Memoize nodes.
+ SDNode *N;
+ SDVTList VTs = getVTList(VT);
+
+ if (VT != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTs, Ops);
+ void *IP = nullptr;
+
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+ }
+
+ InsertNode(N);
+ return SDValue(N, 0);
+}
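+
+// Editor's illustrative sketch, not part of the imported source: the ArrayRef
+// form above dispatches small operand counts to the fixed-arity overloads and
+// memoizes anything larger through the folding set. Building a splat
+// BUILD_VECTOR is a typical caller.
+static SDValue buildSplat4xi32(SelectionDAG &DAG, const SDLoc &DL,
+                               uint64_t Imm) {
+  SDValue C = DAG.getConstant(Imm, DL, MVT::i32);
+  SDValue Ops[] = { C, C, C, C };
+  return DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i32, Ops);
+}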
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ ArrayRef<EVT> ResultTys, ArrayRef<SDValue> Ops) {
+ return getNode(Opcode, DL, getVTList(ResultTys), Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ ArrayRef<SDValue> Ops) {
+ if (VTList.NumVTs == 1)
+ return getNode(Opcode, DL, VTList.VTs[0], Ops);
+
+#if 0
+ switch (Opcode) {
+ // FIXME: figure out how to safely handle things like
+ // int foo(int x) { return 1 << (x & 255); }
+ // int bar() { return foo(256); }
+ case ISD::SRA_PARTS:
+ case ISD::SRL_PARTS:
+ case ISD::SHL_PARTS:
+ if (N3.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ cast<VTSDNode>(N3.getOperand(1))->getVT() != MVT::i1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ else if (N3.getOpcode() == ISD::AND)
+ if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N3.getOperand(1))) {
+ // If the and is only masking out bits that cannot affect the shift,
+ // eliminate the and.
+ unsigned NumBits = VT.getScalarType().getSizeInBits()*2;
+ if ((AndRHS->getValue() & (NumBits-1)) == NumBits-1)
+ return getNode(Opcode, DL, VT, N1, N2, N3.getOperand(0));
+ }
+ break;
+ }
+#endif
+
+ // Memoize the node unless it returns a flag.
+ SDNode *N;
+ if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
+ CSEMap.InsertNode(N, IP);
+ } else {
+ N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
+ createOperands(N, Ops);
+ }
+ InsertNode(N);
+ return SDValue(N, 0);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTList) {
+ return getNode(Opcode, DL, VTList, None);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1) {
+ SDValue Ops[] = { N1 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2) {
+ SDValue Ops[] = { N1, N2 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3) {
+ SDValue Ops[] = { N1, N2, N3 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4) {
+ SDValue Ops[] = { N1, N2, N3, N4 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
+ SDValue N1, SDValue N2, SDValue N3, SDValue N4,
+ SDValue N5) {
+ SDValue Ops[] = { N1, N2, N3, N4, N5 };
+ return getNode(Opcode, DL, VTList, Ops);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT) {
+ return makeVTList(SDNode::getValueTypeList(VT), 1);
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(2U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(2);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 2);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(3U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(3);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 3);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(EVT VT1, EVT VT2, EVT VT3, EVT VT4) {
+ FoldingSetNodeID ID;
+ ID.AddInteger(4U);
+ ID.AddInteger(VT1.getRawBits());
+ ID.AddInteger(VT2.getRawBits());
+ ID.AddInteger(VT3.getRawBits());
+ ID.AddInteger(VT4.getRawBits());
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(4);
+ Array[0] = VT1;
+ Array[1] = VT2;
+ Array[2] = VT3;
+ Array[3] = VT4;
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, 4);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
+
+SDVTList SelectionDAG::getVTList(ArrayRef<EVT> VTs) {
+ unsigned NumVTs = VTs.size();
+ FoldingSetNodeID ID;
+ ID.AddInteger(NumVTs);
+ for (unsigned index = 0; index < NumVTs; index++) {
+ ID.AddInteger(VTs[index].getRawBits());
+ }
+
+ void *IP = nullptr;
+ SDVTListNode *Result = VTListMap.FindNodeOrInsertPos(ID, IP);
+ if (!Result) {
+ EVT *Array = Allocator.Allocate<EVT>(NumVTs);
+ std::copy(VTs.begin(), VTs.end(), Array);
+ Result = new (Allocator) SDVTListNode(ID.Intern(Allocator), Array, NumVTs);
+ VTListMap.InsertNode(Result, IP);
+ }
+ return Result->getSDVTList();
+}
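+
+// Editor's illustrative sketch, not part of the imported source: value-type
+// lists are interned through VTListMap, so requesting the same combination
+// twice returns the same backing array.
+static void checkVTListUniquing(SelectionDAG &DAG) {
+  SDVTList A = DAG.getVTList(MVT::i32, MVT::Other);
+  SDVTList B = DAG.getVTList(MVT::i32, MVT::Other);
+  assert(A.VTs == B.VTs && A.NumVTs == B.NumVTs &&
+         "identical VT lists share interned storage");
+  (void)A;
+  (void)B;
+}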
+
+
+/// UpdateNodeOperands - *Mutate* the specified node in-place to have the
+/// specified operands. If the resultant node already exists in the DAG,
+/// this does not modify the specified node, instead it returns the node that
+/// already exists. If the resultant node does not exist in the DAG, the
+/// input node is returned. As a degenerate case, if you specify the same
+/// input operands as the node already has, the input node is returned.
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op) {
+ assert(N->getNumOperands() == 1 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op == N->getOperand(0)) return N;
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ N->OperandList[0].set(Op);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
+
+SDNode *SelectionDAG::UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2) {
+ assert(N->getNumOperands() == 2 && "Update with wrong number of operands");
+
+ // Check to see if there is no change.
+ if (Op1 == N->getOperand(0) && Op2 == N->getOperand(1))
+ return N; // No operands changed, just return the input node.
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Op1, Op2, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ if (N->OperandList[0] != Op1)
+ N->OperandList[0].set(Op1);
+ if (N->OperandList[1] != Op2)
+ N->OperandList[1].set(Op2);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2, SDValue Op3) {
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, SDValue Op1, SDValue Op2,
+ SDValue Op3, SDValue Op4, SDValue Op5) {
+ SDValue Ops[] = { Op1, Op2, Op3, Op4, Op5 };
+ return UpdateNodeOperands(N, Ops);
+}
+
+SDNode *SelectionDAG::
+UpdateNodeOperands(SDNode *N, ArrayRef<SDValue> Ops) {
+ unsigned NumOps = Ops.size();
+ assert(N->getNumOperands() == NumOps &&
+ "Update with wrong number of operands");
+
+ // If no operands changed just return the input node.
+ if (std::equal(Ops.begin(), Ops.end(), N->op_begin()))
+ return N;
+
+ // See if the modified node already exists.
+ void *InsertPos = nullptr;
+ if (SDNode *Existing = FindModifiedNodeSlot(N, Ops, InsertPos))
+ return Existing;
+
+ // Nope it doesn't. Remove the node from its current place in the maps.
+ if (InsertPos)
+ if (!RemoveNodeFromCSEMaps(N))
+ InsertPos = nullptr;
+
+ // Now we update the operands.
+ for (unsigned i = 0; i != NumOps; ++i)
+ if (N->OperandList[i] != Ops[i])
+ N->OperandList[i].set(Ops[i]);
+
+ // If this gets put into a CSE map, add it.
+ if (InsertPos) CSEMap.InsertNode(N, InsertPos);
+ return N;
+}
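+
+// Editor's illustrative sketch, not part of the imported source:
+// UpdateNodeOperands mutates in place but may hand back a different,
+// pre-existing node when the rewritten form is already in the CSE maps, so
+// callers must use the returned pointer rather than N.
+static SDNode *commuteBinaryNode(SelectionDAG &DAG, SDNode *N) {
+  assert(N->getNumOperands() == 2 && "sketch expects a binary node");
+  return DAG.UpdateNodeOperands(N, N->getOperand(1), N->getOperand(0));
+}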
+
+/// DropOperands - Release the operands and set this node to have
+/// zero operands.
+void SDNode::DropOperands() {
+ // Unlike the code in MorphNodeTo that does this, we don't need to
+ // watch for dead nodes here.
+ for (op_iterator I = op_begin(), E = op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ Use.set(SDValue());
+ }
+}
+
+/// SelectNodeTo - These are wrappers around MorphNodeTo that accept a
+/// machine opcode.
+///
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, None);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return SelectNodeTo(N, MachineOpc, VTs, None);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, EVT VT3,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, EVT VT3, EVT VT4,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2,
+ SDValue Op1) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return SelectNodeTo(N, MachineOpc, VTs, Ops);
+}
+
+SDNode *SelectionDAG::SelectNodeTo(SDNode *N, unsigned MachineOpc,
+ SDVTList VTs, ArrayRef<SDValue> Ops) {
+ SDNode *New = MorphNodeTo(N, ~MachineOpc, VTs, Ops);
+ // Reset the NodeID to -1.
+ New->setNodeId(-1);
+ if (New != N) {
+ ReplaceAllUsesWith(N, New);
+ RemoveDeadNode(N);
+ }
+ return New;
+}
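+
+// Editor's illustrative sketch, not part of the imported source, purely to
+// show the call shape of SelectNodeTo during instruction selection: mutate a
+// selected node into a generic IMPLICIT_DEF of the same type. Real selectors
+// pass a target-specific machine opcode here.
+static SDNode *selectToImplicitDef(SelectionDAG &DAG, SDNode *N) {
+  return DAG.SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+}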
+
+/// UpdadeSDLocOnMergedSDNode - If the opt level is -O0 then it throws away
+/// the line number information on the merged node since it is not possible to
+/// preserve the information that the operation is associated with multiple
+/// lines. This makes the debugger work better at -O0, where there is a higher
+/// probability of other instructions being associated with that line.
+///
+/// For IROrder, we keep the smaller of the two.
+SDNode *SelectionDAG::UpdadeSDLocOnMergedSDNode(SDNode *N, const SDLoc &OLoc) {
+ DebugLoc NLoc = N->getDebugLoc();
+ if (NLoc && OptLevel == CodeGenOpt::None && OLoc.getDebugLoc() != NLoc) {
+ N->setDebugLoc(DebugLoc());
+ }
+ unsigned Order = std::min(N->getIROrder(), OLoc.getIROrder());
+ N->setIROrder(Order);
+ return N;
+}
+
+/// MorphNodeTo - This *mutates* the specified node to have the specified
+/// return type, opcode, and operands.
+///
+/// Note that MorphNodeTo returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one. Note that the SDLoc need not be the same.
+///
+/// Using MorphNodeTo is faster than creating a new node and swapping it in
+/// with ReplaceAllUsesWith both because it often avoids allocating a new
+/// node, and because it doesn't require CSE recalculation for any of
+/// the node's users.
+///
+/// However, note that MorphNodeTo recursively deletes dead nodes from the DAG.
+/// As a consequence it isn't appropriate to use from within the DAG combiner or
+/// the legalizer which maintain worklists that would need to be updated when
+/// deleting things.
+SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
+ SDVTList VTs, ArrayRef<SDValue> Ops) {
+ // If an identical node already exists, use it.
+ void *IP = nullptr;
+ if (VTs.VTs[VTs.NumVTs-1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opc, VTs, Ops);
+ if (SDNode *ON = FindNodeOrInsertPos(ID, SDLoc(N), IP))
+ return UpdadeSDLocOnMergedSDNode(ON, SDLoc(N));
+ }
+
+ if (!RemoveNodeFromCSEMaps(N))
+ IP = nullptr;
+
+ // Start the morphing.
+ N->NodeType = Opc;
+ N->ValueList = VTs.VTs;
+ N->NumValues = VTs.NumVTs;
+
+ // Clear the operands list, updating used nodes to remove this from their
+ // use list. Keep track of any operands that become dead as a result.
+ SmallPtrSet<SDNode*, 16> DeadNodeSet;
+ for (SDNode::op_iterator I = N->op_begin(), E = N->op_end(); I != E; ) {
+ SDUse &Use = *I++;
+ SDNode *Used = Use.getNode();
+ Use.set(SDValue());
+ if (Used->use_empty())
+ DeadNodeSet.insert(Used);
+ }
+
+ // For MachineNode, initialize the memory references information.
+ if (MachineSDNode *MN = dyn_cast<MachineSDNode>(N))
+ MN->setMemRefs(nullptr, nullptr);
+
+ // Swap for an appropriately sized array from the recycler.
+ removeOperands(N);
+ createOperands(N, Ops);
+
+ // Delete any nodes that are still dead after adding the uses for the
+ // new operands.
+ if (!DeadNodeSet.empty()) {
+ SmallVector<SDNode *, 16> DeadNodes;
+ for (SDNode *N : DeadNodeSet)
+ if (N->use_empty())
+ DeadNodes.push_back(N);
+ RemoveDeadNodes(DeadNodes);
+ }
+
+ if (IP)
+ CSEMap.InsertNode(N, IP); // Memoize the new node.
+ return N;
+}
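+
+// Editor's illustrative sketch, not part of the imported source: morph an ADD
+// into a SUB with the same operands. Per the comment above, the caller must
+// be prepared for MorphNodeTo to return an already-existing equivalent node
+// rather than N itself, and must not use this from worklist-driven passes.
+static SDNode *morphAddToSub(SelectionDAG &DAG, SDNode *N) {
+  assert(N->getOpcode() == ISD::ADD && "sketch expects an ADD node");
+  SDValue Ops[] = { N->getOperand(0), N->getOperand(1) };
+  return DAG.MorphNodeTo(N, ISD::SUB, DAG.getVTList(N->getValueType(0)), Ops);
+}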
+
+
+/// getMachineNode - These are used for target selectors to create a new node
+/// with specified return type(s), MachineInstr opcode, and operands.
+///
+/// Note that getMachineNode returns the resultant node. If there is already a
+/// node of the specified opcode and operands, it returns that node instead of
+/// the current one.
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT) {
+ SDVTList VTs = getVTList(VT);
+ return getMachineNode(Opcode, dl, VTs, None);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT, ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return getMachineNode(Opcode, dl, VTs, None);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, SDValue Op1,
+ SDValue Op2, SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ SDValue Op1, SDValue Op2,
+ SDValue Op3) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ SDValue Ops[] = { Op1, Op2, Op3 };
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ EVT VT1, EVT VT2, EVT VT3, EVT VT4,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(VT1, VT2, VT3, VT4);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &dl,
+ ArrayRef<EVT> ResultTys,
+ ArrayRef<SDValue> Ops) {
+ SDVTList VTs = getVTList(ResultTys);
+ return getMachineNode(Opcode, dl, VTs, Ops);
+}
+
+MachineSDNode *SelectionDAG::getMachineNode(unsigned Opcode, const SDLoc &DL,
+ SDVTList VTs,
+ ArrayRef<SDValue> Ops) {
+ bool DoCSE = VTs.VTs[VTs.NumVTs-1] != MVT::Glue;
+ MachineSDNode *N;
+ void *IP = nullptr;
+
+ if (DoCSE) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ~Opcode, VTs, Ops);
+ IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP)) {
+ return cast<MachineSDNode>(UpdadeSDLocOnMergedSDNode(E, DL));
+ }
+ }
+
+ // Allocate a new MachineSDNode.
+ N = newSDNode<MachineSDNode>(~Opcode, DL.getIROrder(), DL.getDebugLoc(), VTs);
+ createOperands(N, Ops);
+
+ if (DoCSE)
+ CSEMap.InsertNode(N, IP);
+
+ InsertNode(N);
+ return N;
+}
+
+/// getTargetExtractSubreg - A convenience function for creating
+/// TargetOpcode::EXTRACT_SUBREG nodes.
+SDValue SelectionDAG::getTargetExtractSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand) {
+ SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
+ SDNode *Subreg = getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ VT, Operand, SRIdxVal);
+ return SDValue(Subreg, 0);
+}
+
+/// getTargetInsertSubreg - A convenience function for creating
+/// TargetOpcode::INSERT_SUBREG nodes.
+SDValue SelectionDAG::getTargetInsertSubreg(int SRIdx, const SDLoc &DL, EVT VT,
+ SDValue Operand, SDValue Subreg) {
+ SDValue SRIdxVal = getTargetConstant(SRIdx, DL, MVT::i32);
+ SDNode *Result = getMachineNode(TargetOpcode::INSERT_SUBREG, DL,
+ VT, Operand, Subreg, SRIdxVal);
+ return SDValue(Result, 0);
+}
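+
+// Editor's illustrative sketch, not part of the imported source: the
+// convenience wrappers above hide the EXTRACT_SUBREG/INSERT_SUBREG
+// machine-node plumbing. 'SubIdx' is a hypothetical target-specific
+// subregister index supplied by the caller.
+static SDValue extractSubRegValue(SelectionDAG &DAG, const SDLoc &DL,
+                                  EVT SubVT, SDValue Reg, int SubIdx) {
+  return DAG.getTargetExtractSubreg(SubIdx, DL, SubVT, Reg);
+}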
+
+/// getNodeIfExists - Get the specified node if it's already available, or
+/// else return NULL.
+SDNode *SelectionDAG::getNodeIfExists(unsigned Opcode, SDVTList VTList,
+ ArrayRef<SDValue> Ops,
+ const SDNodeFlags *Flags) {
+ if (VTList.VTs[VTList.NumVTs - 1] != MVT::Glue) {
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, Opcode, VTList, Ops);
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, SDLoc(), IP)) {
+ if (Flags)
+ E->intersectFlagsWith(Flags);
+ return E;
+ }
+ }
+ return nullptr;
+}
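+
+// Editor's illustrative sketch, not part of the imported source: query the
+// CSE maps for an existing (add X, Y) node without creating one when it is
+// missing.
+static SDNode *findExistingAdd(SelectionDAG &DAG, SDValue X, SDValue Y) {
+  SDValue Ops[] = { X, Y };
+  return DAG.getNodeIfExists(ISD::ADD, DAG.getVTList(X.getValueType()), Ops,
+                             /*Flags=*/nullptr);
+}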
+
+/// getDbgValue - Creates a SDDbgValue node.
+///
+/// SDNode
+SDDbgValue *SelectionDAG::getDbgValue(MDNode *Var, MDNode *Expr, SDNode *N,
+ unsigned R, bool IsIndirect, uint64_t Off,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc())
+ SDDbgValue(Var, Expr, N, R, IsIndirect, Off, DL, O);
+}
+
+/// Constant
+SDDbgValue *SelectionDAG::getConstantDbgValue(MDNode *Var, MDNode *Expr,
+ const Value *C, uint64_t Off,
+ const DebugLoc &DL, unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, C, Off, DL, O);
+}
+
+/// FrameIndex
+SDDbgValue *SelectionDAG::getFrameIndexDbgValue(MDNode *Var, MDNode *Expr,
+ unsigned FI, uint64_t Off,
+ const DebugLoc &DL,
+ unsigned O) {
+ assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ return new (DbgInfo->getAlloc()) SDDbgValue(Var, Expr, FI, Off, DL, O);
+}
+
+namespace {
+
+/// RAUWUpdateListener - Helper for ReplaceAllUsesWith - When the node
+/// pointed to by a use iterator is deleted, increment the use iterator
+/// so that it doesn't dangle.
+///
+class RAUWUpdateListener : public SelectionDAG::DAGUpdateListener {
+ SDNode::use_iterator &UI;
+ SDNode::use_iterator &UE;
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ // Increment the iterator as needed.
+ while (UI != UE && N == *UI)
+ ++UI;
+ }
+
+public:
+ RAUWUpdateListener(SelectionDAG &d,
+ SDNode::use_iterator &ui,
+ SDNode::use_iterator &ue)
+ : SelectionDAG::DAGUpdateListener(d), UI(ui), UE(ue) {}
+};
+
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes From has a single result value.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDValue FromN, SDValue To) {
+ SDNode *From = FromN.getNode();
+ assert(From->getNumValues() == 1 && FromN.getResNo() == 0 &&
+ "Cannot replace with this method!");
+ assert(From != To.getNode() && "Cannot replace uses of a value with itself");
+
+ // Iterate over all the existing uses of From. New uses will be added
+ // to the beginning of the use list, which we avoid visiting.
+ // This specifically avoids visiting uses of From that arise while the
+ // replacement is happening, because any such uses would be the result
+ // of CSE: If an existing node looks like From after one of its operands
+ // is replaced by To, we don't want to replace all of its users with To
+ // too. See PR3018 for more info.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.set(To);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // Preserve Debug Values
+ TransferDbgValues(FromN, To);
+
+ // If we just RAUW'd the root, take note.
+ if (FromN == getRoot())
+ setRoot(To);
+}
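+
+// Editor's illustrative sketch, not part of the imported source: a
+// combiner-style fold that redirects every user of (add X, 0) to X. Root
+// updating is handled by ReplaceAllUsesWith itself, as is transferring any
+// attached debug values.
+static void foldAddOfZero(SelectionDAG &DAG, SDNode *N) {
+  if (N->getOpcode() == ISD::ADD && isNullConstant(N->getOperand(1)))
+    DAG.ReplaceAllUsesWith(SDValue(N, 0), N->getOperand(0));
+}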
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version assumes that for each value of From, there is a
+/// corresponding value in To in the same position with the same type.
+///
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, SDNode *To) {
+#ifndef NDEBUG
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ assert((!From->hasAnyUseOfValue(i) ||
+ From->getValueType(i) == To->getValueType(i)) &&
+ "Cannot use this version of ReplaceAllUsesWith!");
+#endif
+
+ // Handle the trivial case.
+ if (From == To)
+ return;
+
+ // Preserve Debug Info. Only do this if there's a use.
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ if (From->hasAnyUseOfValue(i)) {
+ assert((i < To->getNumValues()) && "Invalid To location");
+ TransferDbgValues(SDValue(From, i), SDValue(To, i));
+ }
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ ++UI;
+ Use.setNode(To);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot().getNode())
+ setRoot(SDValue(To, getRoot().getResNo()));
+}
+
+/// ReplaceAllUsesWith - Modify anything using 'From' to use 'To' instead.
+/// This can cause recursive merging of nodes in the DAG.
+///
+/// This version can replace From with any result values. To must match the
+/// number and types of values returned by From.
+void SelectionDAG::ReplaceAllUsesWith(SDNode *From, const SDValue *To) {
+ if (From->getNumValues() == 1) // Handle the simple case efficiently.
+ return ReplaceAllUsesWith(SDValue(From, 0), To[0]);
+
+ // Preserve Debug Info.
+ for (unsigned i = 0, e = From->getNumValues(); i != e; ++i)
+ TransferDbgValues(SDValue(From, i), To[i]);
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From->use_begin(), UE = From->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+ const SDValue &ToOp = To[Use.getResNo()];
+ ++UI;
+ Use.set(ToOp);
+ } while (UI != UE && *UI == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot().getNode())
+ setRoot(SDValue(To[getRoot().getResNo()]));
+}
+
+/// ReplaceAllUsesOfValueWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. The Deleted
+/// vector is handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValueWith(SDValue From, SDValue To){
+ // Handle the really simple, really trivial case efficiently.
+ if (From == To) return;
+
+ // Handle the simple, trivial case efficiently.
+ if (From.getNode()->getNumValues() == 1) {
+ ReplaceAllUsesWith(From, To);
+ return;
+ }
+
+ // Preserve Debug Info.
+ TransferDbgValues(From, To);
+
+ // Iterate over just the existing users of From. See the comments in
+ // the ReplaceAllUsesWith above.
+ SDNode::use_iterator UI = From.getNode()->use_begin(),
+ UE = From.getNode()->use_end();
+ RAUWUpdateListener Listener(*this, UI, UE);
+ while (UI != UE) {
+ SDNode *User = *UI;
+ bool UserRemovedFromCSEMaps = false;
+
+ // A user can appear in a use list multiple times, and when this
+ // happens the uses are usually next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ SDUse &Use = UI.getUse();
+
+ // Skip uses of different values from the same node.
+ if (Use.getResNo() != From.getResNo()) {
+ ++UI;
+ continue;
+ }
+
+ // If this node hasn't been modified yet, it's still in the CSE maps,
+ // so remove its old self from the CSE maps.
+ if (!UserRemovedFromCSEMaps) {
+ RemoveNodeFromCSEMaps(User);
+ UserRemovedFromCSEMaps = true;
+ }
+
+ ++UI;
+ Use.set(To);
+ } while (UI != UE && *UI == User);
+
+ // We are iterating over all uses of the From node, so if a use
+ // doesn't use the specific value, no changes are made.
+ if (!UserRemovedFromCSEMaps)
+ continue;
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+
+ // If we just RAUW'd the root, take note.
+ if (From == getRoot())
+ setRoot(To);
+}
+
+namespace {
+ /// UseMemo - This class is used by SelectionDAG::ReplaceAllUsesOfValuesWith
+ /// to record information about a use.
+ struct UseMemo {
+ SDNode *User;
+ unsigned Index;
+ SDUse *Use;
+ };
+
+ /// operator< - Sort Memos by User.
+ bool operator<(const UseMemo &L, const UseMemo &R) {
+ return (intptr_t)L.User < (intptr_t)R.User;
+ }
+}
+
+/// ReplaceAllUsesOfValuesWith - Replace any uses of From with To, leaving
+/// uses of other values produced by From.getNode() alone. The same value
+/// may appear in both the From and To list. The Deleted vector is
+/// handled the same way as for ReplaceAllUsesWith.
+void SelectionDAG::ReplaceAllUsesOfValuesWith(const SDValue *From,
+ const SDValue *To,
+ unsigned Num){
+ // Handle the simple, trivial case efficiently.
+ if (Num == 1)
+ return ReplaceAllUsesOfValueWith(*From, *To);
+
+ TransferDbgValues(*From, *To);
+
+ // Read up all the uses and make records of them. This helps
+ // processing new uses that are introduced during the
+ // replacement process.
+ SmallVector<UseMemo, 4> Uses;
+ for (unsigned i = 0; i != Num; ++i) {
+ unsigned FromResNo = From[i].getResNo();
+ SDNode *FromNode = From[i].getNode();
+ for (SDNode::use_iterator UI = FromNode->use_begin(),
+ E = FromNode->use_end(); UI != E; ++UI) {
+ SDUse &Use = UI.getUse();
+ if (Use.getResNo() == FromResNo) {
+ UseMemo Memo = { *UI, i, &Use };
+ Uses.push_back(Memo);
+ }
+ }
+ }
+
+ // Sort the uses, so that all the uses from a given User are together.
+ std::sort(Uses.begin(), Uses.end());
+
+ for (unsigned UseIndex = 0, UseIndexEnd = Uses.size();
+ UseIndex != UseIndexEnd; ) {
+ // We know that this user uses some value of From. If it is the right
+ // value, update it.
+ SDNode *User = Uses[UseIndex].User;
+
+ // This node is about to morph, remove its old self from the CSE maps.
+ RemoveNodeFromCSEMaps(User);
+
+ // The Uses array is sorted, so all the uses for a given User
+ // are next to each other in the list.
+ // To help reduce the number of CSE recomputations, process all
+ // the uses of this user that we can find this way.
+ do {
+ unsigned i = Uses[UseIndex].Index;
+ SDUse &Use = *Uses[UseIndex].Use;
+ ++UseIndex;
+
+ Use.set(To[i]);
+ } while (UseIndex != UseIndexEnd && Uses[UseIndex].User == User);
+
+ // Now that we have modified User, add it back to the CSE maps. If it
+ // already exists there, recursively merge the results together.
+ AddModifiedNodeToCSEMaps(User);
+ }
+}
+
+/// AssignTopologicalOrder - Assign a unique node id for each node in the DAG
+/// based on their topological order. It returns the number of nodes in the
+/// DAG, which is one more than the largest id assigned.
+unsigned SelectionDAG::AssignTopologicalOrder() {
+
+ unsigned DAGSize = 0;
+
+ // SortedPos tracks the progress of the algorithm. Nodes before it are
+ // sorted, nodes after it are unsorted. When the algorithm completes
+ // it is at the end of the list.
+ allnodes_iterator SortedPos = allnodes_begin();
+
+ // Visit all the nodes. Move nodes with no operands to the front of
+ // the list immediately. Annotate nodes that do have operands with their
+ // operand count. Before we do this, the Node Id fields of the nodes
+ // may contain arbitrary values. After, the Node Id fields for nodes
+ // before SortedPos will contain the topological sort index, and the
+ // Node Id fields for nodes at SortedPos and after will contain the
+ // count of outstanding operands.
+ for (allnodes_iterator I = allnodes_begin(),E = allnodes_end(); I != E; ) {
+ SDNode *N = &*I++;
+ checkForCycles(N, this);
+ unsigned Degree = N->getNumOperands();
+ if (Degree == 0) {
+ // A node with no operands: move it into sorted position immediately.
+ N->setNodeId(DAGSize++);
+ allnodes_iterator Q(N);
+ if (Q != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(Q));
+ assert(SortedPos != AllNodes.end() && "Overran node list");
+ ++SortedPos;
+ } else {
+ // Temporarily use the Node Id as scratch space for the degree count.
+ N->setNodeId(Degree);
+ }
+ }
+
+ // Visit all the nodes. As we iterate, move nodes into sorted order,
+ // such that by the time the end is reached all nodes will be sorted.
+ for (SDNode &Node : allnodes()) {
+ SDNode *N = &Node;
+ checkForCycles(N, this);
+ // N is in sorted position, so each of its users now has one fewer
+ // operand that still needs to be sorted.
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end();
+ UI != UE; ++UI) {
+ SDNode *P = *UI;
+ unsigned Degree = P->getNodeId();
+ assert(Degree != 0 && "Invalid node degree");
+ --Degree;
+ if (Degree == 0) {
+ // All of P's operands are sorted, so P may be sorted now.
+ P->setNodeId(DAGSize++);
+ if (P->getIterator() != SortedPos)
+ SortedPos = AllNodes.insert(SortedPos, AllNodes.remove(P));
+ assert(SortedPos != AllNodes.end() && "Overran node list");
+ ++SortedPos;
+ } else {
+ // Update P's outstanding operand count.
+ P->setNodeId(Degree);
+ }
+ }
+ if (Node.getIterator() == SortedPos) {
+#ifndef NDEBUG
+ allnodes_iterator I(N);
+ SDNode *S = &*++I;
+ dbgs() << "Overran sorted position:\n";
+ S->dumprFull(this); dbgs() << "\n";
+ dbgs() << "Checking if this is due to cycles\n";
+ checkForCycles(this, true);
+#endif
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ assert(SortedPos == AllNodes.end() &&
+ "Topological sort incomplete!");
+ assert(AllNodes.front().getOpcode() == ISD::EntryToken &&
+ "First node in topological sort is not the entry token!");
+ assert(AllNodes.front().getNodeId() == 0 &&
+ "First node in topological sort has non-zero id!");
+ assert(AllNodes.front().getNumOperands() == 0 &&
+ "First node in topological sort has operands!");
+ assert(AllNodes.back().getNodeId() == (int)DAGSize-1 &&
+ "Last node in topological sort has unexpected id!");
+ assert(AllNodes.back().use_empty() &&
+ "Last node in topological sort has users!");
+ assert(DAGSize == allnodes_size() && "Node count mismatch!");
+ return DAGSize;
+}
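+
+// Editor's illustrative sketch, not part of the imported source: after the
+// call above, node IDs run 0..N-1 in allnodes() order, so a forward walk over
+// allnodes() visits every node after all of its operands.
+static void visitNodesTopologically(SelectionDAG &DAG) {
+  unsigned DAGSize = DAG.AssignTopologicalOrder();
+  (void)DAGSize;
+  for (SDNode &N : DAG.allnodes())
+    assert(N.getNodeId() >= 0 && (unsigned)N.getNodeId() < DAGSize &&
+           "IDs are assigned densely in topological order");
+}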
+
+/// AddDbgValue - Add a dbg_value SDNode. If SD is non-null that means the
+/// value is produced by SD.
+void SelectionDAG::AddDbgValue(SDDbgValue *DB, SDNode *SD, bool isParameter) {
+ if (SD) {
+ assert(DbgInfo->getSDDbgValues(SD).empty() || SD->getHasDebugValue());
+ SD->setHasDebugValue(true);
+ }
+ DbgInfo->add(DB, SD, isParameter);
+}
+
+/// TransferDbgValues - Transfer SDDbgValues. Called when replacing nodes.
+void SelectionDAG::TransferDbgValues(SDValue From, SDValue To) {
+ if (From == To || !From.getNode()->getHasDebugValue())
+ return;
+ SDNode *FromNode = From.getNode();
+ SDNode *ToNode = To.getNode();
+ ArrayRef<SDDbgValue *> DVs = GetDbgValues(FromNode);
+ for (ArrayRef<SDDbgValue *>::iterator I = DVs.begin(), E = DVs.end();
+ I != E; ++I) {
+ SDDbgValue *Dbg = *I;
+ // Only add DbgValues attached to the same ResNo.
+ if (Dbg->getKind() == SDDbgValue::SDNODE &&
+ Dbg->getResNo() == From.getResNo()) {
+ SDDbgValue *Clone =
+ getDbgValue(Dbg->getVariable(), Dbg->getExpression(), ToNode,
+ To.getResNo(), Dbg->isIndirect(), Dbg->getOffset(),
+ Dbg->getDebugLoc(), Dbg->getOrder());
+ AddDbgValue(Clone, ToNode, false);
+ }
+ }
+}
+
+//===----------------------------------------------------------------------===//
+// SDNode Class
+//===----------------------------------------------------------------------===//
+
+bool llvm::isNullConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isNullValue();
+}
+
+bool llvm::isNullFPConstant(SDValue V) {
+ ConstantFPSDNode *Const = dyn_cast<ConstantFPSDNode>(V);
+ return Const != nullptr && Const->isZero() && !Const->isNegative();
+}
+
+bool llvm::isAllOnesConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isAllOnesValue();
+}
+
+bool llvm::isOneConstant(SDValue V) {
+ ConstantSDNode *Const = dyn_cast<ConstantSDNode>(V);
+ return Const != nullptr && Const->isOne();
+}
+
+bool llvm::isBitwiseNot(SDValue V) {
+ return V.getOpcode() == ISD::XOR && isAllOnesConstant(V.getOperand(1));
+}
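+
+// Editor's illustrative sketch, not part of the imported source: the constant
+// predicates above make simple pattern checks concise, e.g. recognizing an
+// integer negation written as (sub 0, X).
+static bool isNegationOf(SDValue V, SDValue X) {
+  return V.getOpcode() == ISD::SUB && isNullConstant(V.getOperand(0)) &&
+         V.getOperand(1) == X;
+}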
+
+HandleSDNode::~HandleSDNode() {
+ DropOperands();
+}
+
+GlobalAddressSDNode::GlobalAddressSDNode(unsigned Opc, unsigned Order,
+ const DebugLoc &DL,
+ const GlobalValue *GA, EVT VT,
+ int64_t o, unsigned char TF)
+ : SDNode(Opc, Order, DL, getSDVTList(VT)), Offset(o), TargetFlags(TF) {
+ TheGlobal = GA;
+}
+
+AddrSpaceCastSDNode::AddrSpaceCastSDNode(unsigned Order, const DebugLoc &dl,
+ EVT VT, unsigned SrcAS,
+ unsigned DestAS)
+ : SDNode(ISD::ADDRSPACECAST, Order, dl, getSDVTList(VT)),
+ SrcAddrSpace(SrcAS), DestAddrSpace(DestAS) {}
+
+MemSDNode::MemSDNode(unsigned Opc, unsigned Order, const DebugLoc &dl,
+ SDVTList VTs, EVT memvt, MachineMemOperand *mmo)
+ : SDNode(Opc, Order, dl, VTs), MemoryVT(memvt), MMO(mmo) {
+ SubclassData = encodeMemSDNodeFlags(0, ISD::UNINDEXED, MMO->isVolatile(),
+ MMO->isNonTemporal(), MMO->isInvariant());
+ assert(isVolatile() == MMO->isVolatile() && "Volatile encoding error!");
+ assert(isNonTemporal() == MMO->isNonTemporal() &&
+ "Non-temporal encoding error!");
+ // We check here that the size of the memory operand fits within the size of
+ // the MMO. This is because the MMO might indicate only a possible address
+ // range instead of specifying the affected memory addresses precisely.
+ assert(memvt.getStoreSize() <= MMO->getSize() && "Size mismatch!");
+}
+
+/// Profile - Gather unique data for the node.
+///
+void SDNode::Profile(FoldingSetNodeID &ID) const {
+ AddNodeIDNode(ID, this);
+}
+
+namespace {
+ struct EVTArray {
+ std::vector<EVT> VTs;
+
+ EVTArray() {
+ VTs.reserve(MVT::LAST_VALUETYPE);
+ for (unsigned i = 0; i < MVT::LAST_VALUETYPE; ++i)
+ VTs.push_back(MVT((MVT::SimpleValueType)i));
+ }
+ };
+}
+
+static ManagedStatic<std::set<EVT, EVT::compareRawBits> > EVTs;
+static ManagedStatic<EVTArray> SimpleVTArray;
+static ManagedStatic<sys::SmartMutex<true> > VTMutex;
+
+/// getValueTypeList - Return a pointer to the specified value type.
+///
+const EVT *SDNode::getValueTypeList(EVT VT) {
+ if (VT.isExtended()) {
+ sys::SmartScopedLock<true> Lock(*VTMutex);
+ return &(*EVTs->insert(VT).first);
+ } else {
+ assert(VT.getSimpleVT() < MVT::LAST_VALUETYPE &&
+ "Value type out of range!");
+ return &SimpleVTArray->VTs[VT.getSimpleVT().SimpleTy];
+ }
+}
+
+/// hasNUsesOfValue - Return true if there are exactly NUSES uses of the
+/// indicated value. This method ignores uses of other values defined by this
+/// operation.
+bool SDNode::hasNUsesOfValue(unsigned NUses, unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ // TODO: Only iterate over uses of a given value of the node
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI) {
+ if (UI.getUse().getResNo() == Value) {
+ if (NUses == 0)
+ return false;
+ --NUses;
+ }
+ }
+
+ // Found exactly the right number of uses?
+ return NUses == 0;
+}
+
+
+/// hasAnyUseOfValue - Return true if there is any use of the indicated
+/// value. This method ignores uses of other values defined by this operation.
+bool SDNode::hasAnyUseOfValue(unsigned Value) const {
+ assert(Value < getNumValues() && "Bad value!");
+
+ for (SDNode::use_iterator UI = use_begin(), E = use_end(); UI != E; ++UI)
+ if (UI.getUse().getResNo() == Value)
+ return true;
+
+ return false;
+}
+
+
+/// isOnlyUserOf - Return true if this node is the only use of N.
+///
+bool SDNode::isOnlyUserOf(const SDNode *N) const {
+ bool Seen = false;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDNode *User = *I;
+ if (User == this)
+ Seen = true;
+ else
+ return false;
+ }
+
+ return Seen;
+}
+
+/// isOperandOf - Return true if this value is an operand of N.
+///
+bool SDValue::isOperandOf(const SDNode *N) const {
+ for (const SDValue &Op : N->op_values())
+ if (*this == Op)
+ return true;
+ return false;
+}
+
+bool SDNode::isOperandOf(const SDNode *N) const {
+ for (const SDValue &Op : N->op_values())
+ if (this == Op.getNode())
+ return true;
+ return false;
+}
+
+/// reachesChainWithoutSideEffects - Return true if this operand (which must
+/// be a chain) reaches the specified operand without crossing any
+/// side-effecting instructions on any chain path. In practice, this looks
+/// through token factors and non-volatile loads. In order to remain efficient,
+/// this only looks a couple of nodes in; it does not do an exhaustive search.
+bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
+ unsigned Depth) const {
+ if (*this == Dest) return true;
+
+ // Don't search too deeply, we just want to be able to see through
+ // TokenFactor's etc.
+ if (Depth == 0) return false;
+
+ // If this is a token factor, all inputs to the TF happen in parallel. If any
+ // of the operands of the TF does not reach dest, then we cannot do the xform.
+ if (getOpcode() == ISD::TokenFactor) {
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
+ if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
+ return false;
+ return true;
+ }
+
+ // Loads don't have side effects, look through them.
+ if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(*this)) {
+ if (!Ld->isVolatile())
+ return Ld->getChain().reachesChainWithoutSideEffects(Dest, Depth-1);
+ }
+ return false;
+}
+
+bool SDNode::hasPredecessor(const SDNode *N) const {
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Worklist.push_back(this);
+ return hasPredecessorHelper(N, Visited, Worklist);
+}
+
+uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
+ assert(Num < NumOperands && "Invalid child # of SDNode!");
+ return cast<ConstantSDNode>(OperandList[Num])->getZExtValue();
+}
+
+const SDNodeFlags *SDNode::getFlags() const {
+ if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
+ return &FlagsNode->Flags;
+ return nullptr;
+}
+
+void SDNode::intersectFlagsWith(const SDNodeFlags *Flags) {
+ if (auto *FlagsNode = dyn_cast<BinaryWithFlagsSDNode>(this))
+ FlagsNode->Flags.intersectWith(Flags);
+}
+
+SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
+ assert(N->getNumValues() == 1 &&
+ "Can't unroll a vector with multiple results!");
+
+ EVT VT = N->getValueType(0);
+ unsigned NE = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+ SDLoc dl(N);
+
+ SmallVector<SDValue, 8> Scalars;
+ SmallVector<SDValue, 4> Operands(N->getNumOperands());
+
+ // If ResNE is 0, fully unroll the vector op.
+ if (ResNE == 0)
+ ResNE = NE;
+ else if (NE > ResNE)
+ NE = ResNE;
+
+ unsigned i;
+ for (i = 0; i != NE; ++i) {
+ for (unsigned j = 0, e = N->getNumOperands(); j != e; ++j) {
+ SDValue Operand = N->getOperand(j);
+ EVT OperandVT = Operand.getValueType();
+ if (OperandVT.isVector()) {
+ // A vector operand; extract a single element.
+ EVT OperandEltVT = OperandVT.getVectorElementType();
+ Operands[j] =
+ getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
+ getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout())));
+ } else {
+ // A scalar operand; just use it as is.
+ Operands[j] = Operand;
+ }
+ }
+
+ switch (N->getOpcode()) {
+ default: {
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands,
+ N->getFlags()));
+ break;
+ }
+ case ISD::VSELECT:
+ Scalars.push_back(getNode(ISD::SELECT, dl, EltVT, Operands));
+ break;
+ case ISD::SHL:
+ case ISD::SRA:
+ case ISD::SRL:
+ case ISD::ROTL:
+ case ISD::ROTR:
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT, Operands[0],
+ getShiftAmountOperand(Operands[0].getValueType(),
+ Operands[1])));
+ break;
+ case ISD::SIGN_EXTEND_INREG:
+ case ISD::FP_ROUND_INREG: {
+ EVT ExtVT = cast<VTSDNode>(Operands[1])->getVT().getVectorElementType();
+ Scalars.push_back(getNode(N->getOpcode(), dl, EltVT,
+ Operands[0],
+ getValueType(ExtVT)));
+ }
+ }
+ }
+
+ for (; i < ResNE; ++i)
+ Scalars.push_back(getUNDEF(EltVT));
+
+ return getNode(ISD::BUILD_VECTOR, dl,
+ EVT::getVectorVT(*getContext(), EltVT, ResNE), Scalars);
+}
+
+bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
+ LoadSDNode *Base,
+ unsigned Bytes,
+ int Dist) const {
+ if (LD->isVolatile() || Base->isVolatile())
+ return false;
+ if (LD->isIndexed() || Base->isIndexed())
+ return false;
+ if (LD->getChain() != Base->getChain())
+ return false;
+ EVT VT = LD->getValueType(0);
+ if (VT.getSizeInBits() / 8 != Bytes)
+ return false;
+
+ SDValue Loc = LD->getOperand(1);
+ SDValue BaseLoc = Base->getOperand(1);
+ if (Loc.getOpcode() == ISD::FrameIndex) {
+ if (BaseLoc.getOpcode() != ISD::FrameIndex)
+ return false;
+ const MachineFrameInfo *MFI = getMachineFunction().getFrameInfo();
+ int FI = cast<FrameIndexSDNode>(Loc)->getIndex();
+ int BFI = cast<FrameIndexSDNode>(BaseLoc)->getIndex();
+ int FS = MFI->getObjectSize(FI);
+ int BFS = MFI->getObjectSize(BFI);
+ if (FS != BFS || FS != (int)Bytes) return false;
+ return MFI->getObjectOffset(FI) == (MFI->getObjectOffset(BFI) + Dist*Bytes);
+ }
+
+ // Handle X + C.
+ if (isBaseWithConstantOffset(Loc)) {
+ int64_t LocOffset = cast<ConstantSDNode>(Loc.getOperand(1))->getSExtValue();
+ if (Loc.getOperand(0) == BaseLoc) {
+ // If the base location is a simple address with no offset itself, then
+ // the second load's first add operand should be the base address.
+ if (LocOffset == Dist * (int)Bytes)
+ return true;
+ } else if (isBaseWithConstantOffset(BaseLoc)) {
+ // The base location itself has an offset, so subtract that value from the
+ // second load's offset before comparing to distance * size.
+ int64_t BOffset =
+ cast<ConstantSDNode>(BaseLoc.getOperand(1))->getSExtValue();
+ if (Loc.getOperand(0) == BaseLoc.getOperand(0)) {
+ if ((LocOffset - BOffset) == Dist * (int)Bytes)
+ return true;
+ }
+ }
+ }
+ const GlobalValue *GV1 = nullptr;
+ const GlobalValue *GV2 = nullptr;
+ int64_t Offset1 = 0;
+ int64_t Offset2 = 0;
+ bool isGA1 = TLI->isGAPlusOffset(Loc.getNode(), GV1, Offset1);
+ bool isGA2 = TLI->isGAPlusOffset(BaseLoc.getNode(), GV2, Offset2);
+ if (isGA1 && isGA2 && GV1 == GV2)
+ return Offset1 == (Offset2 + Dist*Bytes);
+ return false;
+}
+
+
+/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
+/// it cannot be inferred.
+unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+ // If this is a GlobalAddress + cst, return the alignment.
+ const GlobalValue *GV;
+ int64_t GVOffset = 0;
+ if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) {
+ unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType());
+ APInt KnownZero(PtrWidth, 0), KnownOne(PtrWidth, 0);
+ llvm::computeKnownBits(const_cast<GlobalValue *>(GV), KnownZero, KnownOne,
+ getDataLayout());
+ unsigned AlignBits = KnownZero.countTrailingOnes();
+ unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
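+    // For example, if the low four bits of the address are known to be zero,
+    // AlignBits is 4 and Align is 16; MinAlign below then folds in the
+    // constant offset from the global.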
+ if (Align)
+ return MinAlign(Align, GVOffset);
+ }
+
+ // If this is a direct reference to a stack slot, use information about the
+ // stack slot's alignment.
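+  // A frame index of (1 << 31) below serves as a sentinel meaning that no
+  // frame index was found.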
+ int FrameIdx = 1 << 31;
+ int64_t FrameOffset = 0;
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Ptr)) {
+ FrameIdx = FI->getIndex();
+ } else if (isBaseWithConstantOffset(Ptr) &&
+ isa<FrameIndexSDNode>(Ptr.getOperand(0))) {
+ // Handle FI+Cst
+ FrameIdx = cast<FrameIndexSDNode>(Ptr.getOperand(0))->getIndex();
+ FrameOffset = Ptr.getConstantOperandVal(1);
+ }
+
+ if (FrameIdx != (1 << 31)) {
+ const MachineFrameInfo &MFI = *getMachineFunction().getFrameInfo();
+ unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
+ FrameOffset);
+ return FIInfoAlign;
+ }
+
+ return 0;
+}
+
+/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
+/// which is split (or expanded) into two not necessarily identical pieces.
+std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
+ // Currently all types are split in half.
+ EVT LoVT, HiVT;
+ if (!VT.isVector()) {
+ LoVT = HiVT = TLI->getTypeToTransformTo(*getContext(), VT);
+ } else {
+ unsigned NumElements = VT.getVectorNumElements();
+ assert(!(NumElements & 1) && "Splitting vector, but not in half!");
+ LoVT = HiVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
+ NumElements/2);
+ }
+ return std::make_pair(LoVT, HiVT);
+}
+
+/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
+/// low/high part.
+std::pair<SDValue, SDValue>
+SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
+ const EVT &HiVT) {
+ assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
+ N.getValueType().getVectorNumElements() &&
+ "More vector elements requested than available!");
+ SDValue Lo, Hi;
+ Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
+ getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
+ Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
+ getConstant(LoVT.getVectorNumElements(), DL,
+ TLI->getVectorIdxTy(getDataLayout())));
+ return std::make_pair(Lo, Hi);
+}
+
+void SelectionDAG::ExtractVectorElements(SDValue Op,
+ SmallVectorImpl<SDValue> &Args,
+ unsigned Start, unsigned Count) {
+ EVT VT = Op.getValueType();
+ if (Count == 0)
+ Count = VT.getVectorNumElements();
+
+ EVT EltVT = VT.getVectorElementType();
+ EVT IdxTy = TLI->getVectorIdxTy(getDataLayout());
+ SDLoc SL(Op);
+ for (unsigned i = Start, e = Start + Count; i != e; ++i) {
+ Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
+ Op, getConstant(i, SL, IdxTy)));
+ }
+}
+
+// getAddressSpace - Return the address space this GlobalAddress belongs to.
+unsigned GlobalAddressSDNode::getAddressSpace() const {
+ return getGlobal()->getType()->getAddressSpace();
+}
+
+
+Type *ConstantPoolSDNode::getType() const {
+ if (isMachineConstantPoolEntry())
+ return Val.MachineCPVal->getType();
+ return Val.ConstVal->getType();
+}
+
+bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
+ APInt &SplatUndef,
+ unsigned &SplatBitSize,
+ bool &HasAnyUndefs,
+ unsigned MinSplatBits,
+ bool isBigEndian) const {
+ EVT VT = getValueType(0);
+ assert(VT.isVector() && "Expected a vector type");
+ unsigned sz = VT.getSizeInBits();
+ if (MinSplatBits > sz)
+ return false;
+
+ SplatValue = APInt(sz, 0);
+ SplatUndef = APInt(sz, 0);
+
+ // Get the bits. Bits with undefined values (when the corresponding element
+ // of the vector is an ISD::UNDEF value) are set in SplatUndef and cleared
+ // in SplatValue. If any of the values are not constant, give up and return
+ // false.
+ unsigned int nOps = getNumOperands();
+ assert(nOps > 0 && "isConstantSplat has 0-size build vector");
+ unsigned EltBitSize = VT.getVectorElementType().getSizeInBits();
+
+ for (unsigned j = 0; j < nOps; ++j) {
+ unsigned i = isBigEndian ? nOps-1-j : j;
+ SDValue OpVal = getOperand(i);
+ unsigned BitPos = j * EltBitSize;
+
+ if (OpVal.isUndef())
+ SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+ else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
+ SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
+ zextOrTrunc(sz) << BitPos;
+ else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
+ SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
+ else
+ return false;
+ }
+
+ // The build_vector is all constants or undefs. Find the smallest element
+ // size that splats the vector.
+
+ HasAnyUndefs = (SplatUndef != 0);
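+  // For example, a <4 x i8> build_vector with every element equal to 1 packs
+  // SplatValue as 0x01010101; the halving loop below then shrinks sz from 32
+  // to 8, so the reported SplatBitSize is 8.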
+ while (sz > 8) {
+
+ unsigned HalfSize = sz / 2;
+ APInt HighValue = SplatValue.lshr(HalfSize).trunc(HalfSize);
+ APInt LowValue = SplatValue.trunc(HalfSize);
+ APInt HighUndef = SplatUndef.lshr(HalfSize).trunc(HalfSize);
+ APInt LowUndef = SplatUndef.trunc(HalfSize);
+
+ // If the two halves do not match (ignoring undef bits), stop here.
+ if ((HighValue & ~LowUndef) != (LowValue & ~HighUndef) ||
+ MinSplatBits > HalfSize)
+ break;
+
+ SplatValue = HighValue | LowValue;
+ SplatUndef = HighUndef & LowUndef;
+
+ sz = HalfSize;
+ }
+
+ SplatBitSize = sz;
+ return true;
+}
+
+SDValue BuildVectorSDNode::getSplatValue(BitVector *UndefElements) const {
+ if (UndefElements) {
+ UndefElements->clear();
+ UndefElements->resize(getNumOperands());
+ }
+ SDValue Splatted;
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ SDValue Op = getOperand(i);
+ if (Op.isUndef()) {
+ if (UndefElements)
+ (*UndefElements)[i] = true;
+ } else if (!Splatted) {
+ Splatted = Op;
+ } else if (Splatted != Op) {
+ return SDValue();
+ }
+ }
+
+ if (!Splatted) {
+ assert(getOperand(0).isUndef() &&
+ "Can only have a splat without a constant for all undefs.");
+ return getOperand(0);
+ }
+
+ return Splatted;
+}
+
+ConstantSDNode *
+BuildVectorSDNode::getConstantSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantSDNode>(getSplatValue(UndefElements));
+}
+
+ConstantFPSDNode *
+BuildVectorSDNode::getConstantFPSplatNode(BitVector *UndefElements) const {
+ return dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements));
+}
+
+int32_t
+BuildVectorSDNode::getConstantFPSplatPow2ToLog2Int(BitVector *UndefElements,
+ uint32_t BitWidth) const {
+ if (ConstantFPSDNode *CN =
+ dyn_cast_or_null<ConstantFPSDNode>(getSplatValue(UndefElements))) {
+ bool IsExact;
+ APSInt IntVal(BitWidth);
+ const APFloat &APF = CN->getValueAPF();
+ if (APF.convertToInteger(IntVal, APFloat::rmTowardZero, &IsExact) !=
+ APFloat::opOK ||
+ !IsExact)
+ return -1;
+
+ return IntVal.exactLogBase2();
+ }
+ return -1;
+}
+
+bool BuildVectorSDNode::isConstant() const {
+ for (const SDValue &Op : op_values()) {
+ unsigned Opc = Op.getOpcode();
+ if (Opc != ISD::UNDEF && Opc != ISD::Constant && Opc != ISD::ConstantFP)
+ return false;
+ }
+ return true;
+}
+
+bool ShuffleVectorSDNode::isSplatMask(const int *Mask, EVT VT) {
+ // Find the first non-undef value in the shuffle mask.
+ unsigned i, e;
+ for (i = 0, e = VT.getVectorNumElements(); i != e && Mask[i] < 0; ++i)
+ /* search */;
+
+ assert(i != e && "VECTOR_SHUFFLE node with all undef indices!");
+
+ // Make sure all remaining elements are either undef or the same as the first
+ // non-undef value.
+ for (int Idx = Mask[i]; i != e; ++i)
+ if (Mask[i] >= 0 && Mask[i] != Idx)
+ return false;
+ return true;
+}
+
+// \brief Returns the SDNode if it is a constant integer BuildVector
+// or constant integer.
+SDNode *SelectionDAG::isConstantIntBuildVectorOrConstantInt(SDValue N) {
+ if (isa<ConstantSDNode>(N))
+ return N.getNode();
+ if (ISD::isBuildVectorOfConstantSDNodes(N.getNode()))
+ return N.getNode();
+ // Treat a GlobalAddress supporting constant offset folding as a
+ // constant integer.
+ if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N))
+ if (GA->getOpcode() == ISD::GlobalAddress &&
+ TLI->isOffsetFoldingLegal(GA))
+ return GA;
+ return nullptr;
+}
+
+#ifndef NDEBUG
+static void checkForCyclesHelper(const SDNode *N,
+ SmallPtrSetImpl<const SDNode*> &Visited,
+ SmallPtrSetImpl<const SDNode*> &Checked,
+ const llvm::SelectionDAG *DAG) {
+ // If this node has already been checked, don't check it again.
+ if (Checked.count(N))
+ return;
+
+ // If a node has already been visited on this depth-first walk, reject it as
+ // a cycle.
+ if (!Visited.insert(N).second) {
+ errs() << "Detected cycle in SelectionDAG\n";
+ dbgs() << "Offending node:\n";
+ N->dumprFull(DAG); dbgs() << "\n";
+ abort();
+ }
+
+ for (const SDValue &Op : N->op_values())
+ checkForCyclesHelper(Op.getNode(), Visited, Checked, DAG);
+
+ Checked.insert(N);
+ Visited.erase(N);
+}
+#endif
+
+void llvm::checkForCycles(const llvm::SDNode *N,
+ const llvm::SelectionDAG *DAG,
+ bool force) {
+#ifndef NDEBUG
+ bool check = force;
+#ifdef EXPENSIVE_CHECKS
+ check = true;
+#endif // EXPENSIVE_CHECKS
+ if (check) {
+ assert(N && "Checking nonexistent SDNode");
+ SmallPtrSet<const SDNode*, 32> visited;
+ SmallPtrSet<const SDNode*, 32> checked;
+ checkForCyclesHelper(N, visited, checked, DAG);
+ }
+#endif // !NDEBUG
+}
+
+void llvm::checkForCycles(const llvm::SelectionDAG *DAG, bool force) {
+ checkForCycles(DAG->getRoot().getNode(), DAG, force);
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
new file mode 100644
index 000000000000..e03282cad6b8
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -0,0 +1,9127 @@
+//===-- SelectionDAGBuilder.cpp - Selection-DAG building ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SelectionDAGBuilder.h"
+#include "SDNodeDbgValue.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GetElementPtrTypeIterator.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <utility>
+using namespace llvm;
+
+#define DEBUG_TYPE "isel"
+
+/// LimitFloatPrecision - Generate low-precision inline sequences for
+/// some float libcalls (6, 8 or 12 bits).
+static unsigned LimitFloatPrecision;
+
+static cl::opt<unsigned, true>
+LimitFPPrecision("limit-float-precision",
+ cl::desc("Generate low-precision inline sequences "
+ "for some float libcalls"),
+ cl::location(LimitFloatPrecision),
+ cl::init(0));
+
+static cl::opt<bool>
+EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
+ cl::desc("Enable fast-math-flags for DAG nodes"));
+
+/// Minimum jump table density for normal functions.
+static cl::opt<unsigned>
+JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "a normal function"));
+
+/// Minimum jump table density for -Os or -Oz functions.
+static cl::opt<unsigned>
+OptsizeJumpTableDensity("optsize-jump-table-density", cl::init(40), cl::Hidden,
+ cl::desc("Minimum density for building a jump table in "
+ "an optsize function"));
+
+
+// Limit the width of DAG chains. This is important in general to prevent
+// DAG-based analysis from blowing up. For example, alias analysis and
+// load clustering may not complete in reasonable time. It is difficult to
+// recognize and avoid this situation within each individual analysis, and
+// future analyses are likely to have the same behavior. Limiting DAG width is
+// the safe approach and will be especially important with global DAGs.
+//
+// MaxParallelChains default is arbitrarily high to avoid affecting
+// optimization, but could be lowered to improve compile time. Any ld-ld-st-st
+// sequence over this should have been converted to llvm.memcpy by the
+// frontend. It is easy to induce this behavior with .ll code such as:
+// %buffer = alloca [4096 x i8]
+// %data = load [4096 x i8]* %argPtr
+// store [4096 x i8] %data, [4096 x i8]* %buffer
+static const unsigned MaxParallelChains = 64;
+
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V);
+
+/// getCopyFromParts - Create a value that contains the specified legal parts
+/// combined into the value they represent. If the parts combine to a type
+/// larger than ValueVT then AssertOp can be used to specify whether the extra
+/// bits are known to be zero (ISD::AssertZext) or sign extended from ValueVT
+/// (ISD::AssertSext).
+static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V,
+ Optional<ISD::NodeType> AssertOp = None) {
+ if (ValueVT.isVector())
+ return getCopyFromPartsVector(DAG, DL, Parts, NumParts,
+ PartVT, ValueVT, V);
+
+ assert(NumParts > 0 && "No parts to assemble!");
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Val = Parts[0];
+
+ if (NumParts > 1) {
+ // Assemble the value from multiple parts.
+ if (ValueVT.isInteger()) {
+ unsigned PartBits = PartVT.getSizeInBits();
+ unsigned ValueBits = ValueVT.getSizeInBits();
+
+ // Assemble the power of 2 part.
+ unsigned RoundParts = NumParts & (NumParts - 1) ?
+ 1 << Log2_32(NumParts) : NumParts;
+ unsigned RoundBits = PartBits * RoundParts;
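+      // For example, NumParts == 3 gives RoundParts == 2: the power-of-2
+      // prefix of the parts is assembled here and the odd trailing part is
+      // merged in below.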
+ EVT RoundVT = RoundBits == ValueBits ?
+ ValueVT : EVT::getIntegerVT(*DAG.getContext(), RoundBits);
+ SDValue Lo, Hi;
+
+ EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), RoundBits/2);
+
+ if (RoundParts > 2) {
+ Lo = getCopyFromParts(DAG, DL, Parts, RoundParts / 2,
+ PartVT, HalfVT, V);
+ Hi = getCopyFromParts(DAG, DL, Parts + RoundParts / 2,
+ RoundParts / 2, PartVT, HalfVT, V);
+ } else {
+ Lo = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[0]);
+ Hi = DAG.getNode(ISD::BITCAST, DL, HalfVT, Parts[1]);
+ }
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ Val = DAG.getNode(ISD::BUILD_PAIR, DL, RoundVT, Lo, Hi);
+
+ if (RoundParts < NumParts) {
+ // Assemble the trailing non-power-of-2 part.
+ unsigned OddParts = NumParts - RoundParts;
+ EVT OddVT = EVT::getIntegerVT(*DAG.getContext(), OddParts * PartBits);
+ Hi = getCopyFromParts(DAG, DL,
+ Parts + RoundParts, OddParts, PartVT, OddVT, V);
+
+ // Combine the round and odd parts.
+ Lo = Val;
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+ EVT TotalVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Hi = DAG.getNode(ISD::ANY_EXTEND, DL, TotalVT, Hi);
+ Hi =
+ DAG.getNode(ISD::SHL, DL, TotalVT, Hi,
+ DAG.getConstant(Lo.getValueType().getSizeInBits(), DL,
+ TLI.getPointerTy(DAG.getDataLayout())));
+ Lo = DAG.getNode(ISD::ZERO_EXTEND, DL, TotalVT, Lo);
+ Val = DAG.getNode(ISD::OR, DL, TotalVT, Lo, Hi);
+ }
+ } else if (PartVT.isFloatingPoint()) {
+ // FP split into multiple FP parts (for ppcf128)
+ assert(ValueVT == EVT(MVT::ppcf128) && PartVT == MVT::f64 &&
+ "Unexpected split");
+ SDValue Lo, Hi;
+ Lo = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[0]);
+ Hi = DAG.getNode(ISD::BITCAST, DL, EVT(MVT::f64), Parts[1]);
+ if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
+ std::swap(Lo, Hi);
+ Val = DAG.getNode(ISD::BUILD_PAIR, DL, ValueVT, Lo, Hi);
+ } else {
+ // FP split into integer parts (soft fp)
+ assert(ValueVT.isFloatingPoint() && PartVT.isInteger() &&
+ !PartVT.isVector() && "Unexpected split");
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = getCopyFromParts(DAG, DL, Parts, NumParts, PartVT, IntVT, V);
+ }
+ }
+
+ // There is now one part, held in Val. Correct it to match ValueVT.
+ // PartEVT is the type of the register class that holds the value.
+ // ValueVT is the type of the inline asm operation.
+ EVT PartEVT = Val.getValueType();
+
+ if (PartEVT == ValueVT)
+ return Val;
+
+ if (PartEVT.isInteger() && ValueVT.isFloatingPoint() &&
+ ValueVT.bitsLT(PartEVT)) {
+ // For an FP value in an integer part, we need to truncate to the right
+ // width first.
+ PartEVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = DAG.getNode(ISD::TRUNCATE, DL, PartEVT, Val);
+ }
+
+ // Handle types that have the same size.
+ if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // Handle types with different sizes.
+ if (PartEVT.isInteger() && ValueVT.isInteger()) {
+ if (ValueVT.bitsLT(PartEVT)) {
+ // For a truncate, see if we have any information to
+ // indicate whether the truncated bits will always be
+      // zero or a sign-extension of the narrower value.
+ if (AssertOp.hasValue())
+ Val = DAG.getNode(*AssertOp, DL, PartEVT, Val,
+ DAG.getValueType(ValueVT));
+ return DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ }
+ return DAG.getNode(ISD::ANY_EXTEND, DL, ValueVT, Val);
+ }
+
+ if (PartEVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ // FP_ROUND's are always exact here.
+ if (ValueVT.bitsLT(Val.getValueType()))
+ return DAG.getNode(
+ ISD::FP_ROUND, DL, ValueVT, Val,
+ DAG.getTargetConstant(1, DL, TLI.getPointerTy(DAG.getDataLayout())));
+
+ return DAG.getNode(ISD::FP_EXTEND, DL, ValueVT, Val);
+ }
+
+ llvm_unreachable("Unknown mismatch!");
+}
+
+static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
+ const Twine &ErrMsg) {
+ const Instruction *I = dyn_cast_or_null<Instruction>(V);
+ if (!V)
+ return Ctx.emitError(ErrMsg);
+
+ const char *AsmError = ", possible invalid constraint for vector type";
+ if (const CallInst *CI = dyn_cast<CallInst>(I))
+ if (isa<InlineAsm>(CI->getCalledValue()))
+ return Ctx.emitError(I, ErrMsg + AsmError);
+
+ return Ctx.emitError(I, ErrMsg);
+}
+
+/// getCopyFromPartsVector - Create a value that contains the specified legal
+/// parts combined into the vector value they represent. Unlike the scalar
+/// getCopyFromParts above, this variant takes no AssertOp hint about the
+/// contents of any extra high bits.
+static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, const Value *V) {
+ assert(ValueVT.isVector() && "Not a vector value");
+ assert(NumParts > 0 && "No parts to assemble!");
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Val = Parts[0];
+
+ // Handle a multi-element vector.
+ if (NumParts > 1) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs =
+ TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+ assert(RegisterVT.getSizeInBits() ==
+ Parts[0].getSimpleValueType().getSizeInBits() &&
+ "Part type sizes don't match!");
+
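+    // For example, on a hypothetical target whose widest legal vector type is
+    // v4i16, a v8i16 value breaks down into two v4i16 intermediates, each
+    // carried in a single register part.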
+ // Assemble the parts into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ if (NumIntermediates == NumParts) {
+ // If the register was not expanded, truncate or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i], 1,
+ PartVT, IntermediateVT, V);
+ } else if (NumParts > 0) {
+ // If the intermediate type was expanded, build the intermediate
+ // operands from the parts.
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ Ops[i] = getCopyFromParts(DAG, DL, &Parts[i * Factor], Factor,
+ PartVT, IntermediateVT, V);
+ }
+
+ // Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
+ // intermediate operands.
+ Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
+ : ISD::BUILD_VECTOR,
+ DL, ValueVT, Ops);
+ }
+
+ // There is now one part, held in Val. Correct it to match ValueVT.
+ EVT PartEVT = Val.getValueType();
+
+ if (PartEVT == ValueVT)
+ return Val;
+
+ if (PartEVT.isVector()) {
+ // If the element type of the source/dest vectors are the same, but the
+ // parts vector has more elements than the value vector, then we have a
+ // vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
+ // elements we want.
+ if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
+ assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
+ "Cannot narrow, it would be a lossy transformation");
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ // Vector/Vector bitcast.
+ if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
+ "Cannot handle this kind of promotion");
+ // Promoted vector extract
+ return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
+
+ }
+
+ // Trivial bitcast if the types are the same size and the destination
+ // vector type is legal.
+ if (PartEVT.getSizeInBits() == ValueVT.getSizeInBits() &&
+ TLI.isTypeLegal(ValueVT))
+ return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+
+ // Handle cases such as i8 -> <1 x i1>
+ if (ValueVT.getVectorNumElements() != 1) {
+ diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
+ "non-trivial scalar-to-vector conversion");
+ return DAG.getUNDEF(ValueVT);
+ }
+
+ if (ValueVT.getVectorNumElements() == 1 &&
+ ValueVT.getVectorElementType() != PartEVT)
+ Val = DAG.getAnyExtOrTrunc(Val, DL, ValueVT.getScalarType());
+
+ return DAG.getNode(ISD::BUILD_VECTOR, DL, ValueVT, Val);
+}
+
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &dl,
+ SDValue Val, SDValue *Parts, unsigned NumParts,
+ MVT PartVT, const Value *V);
+
+/// getCopyToParts - Create a series of nodes that contain the specified value
+/// split into legal parts. If the parts contain more bits than Val, then, for
+/// integers, ExtendKind can be used to specify how to generate the extra bits.
+static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ const Value *V,
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+ EVT ValueVT = Val.getValueType();
+
+ // Handle the vector case separately.
+ if (ValueVT.isVector())
+ return getCopyToPartsVector(DAG, DL, Val, Parts, NumParts, PartVT, V);
+
+ unsigned PartBits = PartVT.getSizeInBits();
+ unsigned OrigNumParts = NumParts;
+ assert(DAG.getTargetLoweringInfo().isTypeLegal(PartVT) &&
+ "Copying to an illegal type!");
+
+ if (NumParts == 0)
+ return;
+
+ assert(!ValueVT.isVector() && "Vector case handled elsewhere");
+ EVT PartEVT = PartVT;
+ if (PartEVT == ValueVT) {
+ assert(NumParts == 1 && "No-op copy with multiple parts!");
+ Parts[0] = Val;
+ return;
+ }
+
+ if (NumParts * PartBits > ValueVT.getSizeInBits()) {
+ // If the parts cover more bits than the value has, promote the value.
+ if (PartVT.isFloatingPoint() && ValueVT.isFloatingPoint()) {
+ assert(NumParts == 1 && "Do not know what to promote to!");
+ Val = DAG.getNode(ISD::FP_EXTEND, DL, PartVT, Val);
+ } else {
+ if (ValueVT.isFloatingPoint()) {
+ // FP values need to be bitcast, then extended if they are being put
+ // into a larger container.
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), ValueVT.getSizeInBits());
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ }
+ assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+ ValueVT.isInteger() &&
+ "Unknown mismatch!");
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ExtendKind, DL, ValueVT, Val);
+ if (PartVT == MVT::x86mmx)
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+ } else if (PartBits == ValueVT.getSizeInBits()) {
+ // Different types of the same size.
+ assert(NumParts == 1 && PartEVT != ValueVT);
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ } else if (NumParts * PartBits < ValueVT.getSizeInBits()) {
+    // If the parts cover fewer bits than the value has, truncate the value.
+ assert((PartVT.isInteger() || PartVT == MVT::x86mmx) &&
+ ValueVT.isInteger() &&
+ "Unknown mismatch!");
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ if (PartVT == MVT::x86mmx)
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+
+ // The value may have changed - recompute ValueVT.
+ ValueVT = Val.getValueType();
+ assert(NumParts * PartBits == ValueVT.getSizeInBits() &&
+ "Failed to tile the value with PartVT!");
+
+ if (NumParts == 1) {
+ if (PartEVT != ValueVT) {
+ diagnosePossiblyInvalidConstraint(*DAG.getContext(), V,
+ "scalar-to-vector conversion failed");
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ }
+
+ Parts[0] = Val;
+ return;
+ }
+
+ // Expand the value into multiple parts.
+ if (NumParts & (NumParts - 1)) {
+ // The number of parts is not a power of 2. Split off and copy the tail.
+ assert(PartVT.isInteger() && ValueVT.isInteger() &&
+ "Do not know what to expand to!");
+ unsigned RoundParts = 1 << Log2_32(NumParts);
+ unsigned RoundBits = RoundParts * PartBits;
+ unsigned OddParts = NumParts - RoundParts;
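+    // For example, splitting a 96-bit value into three 32-bit parts gives
+    // RoundParts == 2 and OddParts == 1: the high 32 bits are shifted down
+    // and copied out as the tail, and the remaining 64 bits are bisected
+    // below with EXTRACT_ELEMENT.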
+ SDValue OddVal = DAG.getNode(ISD::SRL, DL, ValueVT, Val,
+ DAG.getIntPtrConstant(RoundBits, DL));
+ getCopyToParts(DAG, DL, OddVal, Parts + RoundParts, OddParts, PartVT, V);
+
+ if (DAG.getDataLayout().isBigEndian())
+ // The odd parts were reversed by getCopyToParts - unreverse them.
+ std::reverse(Parts + RoundParts, Parts + NumParts);
+
+ NumParts = RoundParts;
+ ValueVT = EVT::getIntegerVT(*DAG.getContext(), NumParts * PartBits);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, ValueVT, Val);
+ }
+
+ // The number of parts is a power of 2. Repeatedly bisect the value using
+ // EXTRACT_ELEMENT.
+ Parts[0] = DAG.getNode(ISD::BITCAST, DL,
+ EVT::getIntegerVT(*DAG.getContext(),
+ ValueVT.getSizeInBits()),
+ Val);
+
+ for (unsigned StepSize = NumParts; StepSize > 1; StepSize /= 2) {
+ for (unsigned i = 0; i < NumParts; i += StepSize) {
+ unsigned ThisBits = StepSize * PartBits / 2;
+ EVT ThisVT = EVT::getIntegerVT(*DAG.getContext(), ThisBits);
+ SDValue &Part0 = Parts[i];
+ SDValue &Part1 = Parts[i+StepSize/2];
+
+ Part1 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+ ThisVT, Part0, DAG.getIntPtrConstant(1, DL));
+ Part0 = DAG.getNode(ISD::EXTRACT_ELEMENT, DL,
+ ThisVT, Part0, DAG.getIntPtrConstant(0, DL));
+
+ if (ThisBits == PartBits && ThisVT != PartVT) {
+ Part0 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part0);
+ Part1 = DAG.getNode(ISD::BITCAST, DL, PartVT, Part1);
+ }
+ }
+ }
+
+ if (DAG.getDataLayout().isBigEndian())
+ std::reverse(Parts, Parts + OrigNumParts);
+}
+
+
+/// getCopyToPartsVector - Create a series of nodes that contain the specified
+/// value split into legal parts.
+static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue Val, SDValue *Parts, unsigned NumParts,
+ MVT PartVT, const Value *V) {
+ EVT ValueVT = Val.getValueType();
+ assert(ValueVT.isVector() && "Not a vector");
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (NumParts == 1) {
+ EVT PartEVT = PartVT;
+ if (PartEVT == ValueVT) {
+ // Nothing to do.
+ } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
+ // Bitconvert vector->vector case.
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ } else if (PartVT.isVector() &&
+ PartEVT.getVectorElementType() == ValueVT.getVectorElementType() &&
+ PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
+ EVT ElementVT = PartVT.getVectorElementType();
+ // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
+ // undef elements.
+ SmallVector<SDValue, 16> Ops;
+ for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
+ Ops.push_back(DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+
+ for (unsigned i = ValueVT.getVectorNumElements(),
+ e = PartVT.getVectorNumElements(); i != e; ++i)
+ Ops.push_back(DAG.getUNDEF(ElementVT));
+
+ Val = DAG.getNode(ISD::BUILD_VECTOR, DL, PartVT, Ops);
+
+ // FIXME: Use CONCAT for 2x -> 4x.
+
+ //SDValue UndefElts = DAG.getUNDEF(VectorTy);
+ //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
+ } else if (PartVT.isVector() &&
+ PartEVT.getVectorElementType().bitsGE(
+ ValueVT.getVectorElementType()) &&
+ PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+
+ // Promoted vector extract
+ Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+    } else {
+ // Vector -> scalar conversion.
+ assert(ValueVT.getVectorNumElements() == 1 &&
+ "Only trivial vector-to-scalar conversions should get here!");
+ Val = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
+ }
+
+ Parts[0] = Val;
+ return;
+ }
+
+ // Handle a multi-element vector.
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ unsigned NumRegs = TLI.getVectorTypeBreakdown(*DAG.getContext(), ValueVT,
+ IntermediateVT,
+ NumIntermediates, RegisterVT);
+ unsigned NumElements = ValueVT.getVectorNumElements();
+
+ assert(NumRegs == NumParts && "Part count doesn't match vector breakdown!");
+ NumParts = NumRegs; // Silence a compiler warning.
+ assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
+
+ // Split the vector into intermediate operands.
+ SmallVector<SDValue, 8> Ops(NumIntermediates);
+ for (unsigned i = 0; i != NumIntermediates; ++i) {
+ if (IntermediateVT.isVector())
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+ DAG.getConstant(i * (NumElements / NumIntermediates), DL,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ else
+ Ops[i] = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
+ DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+
+ // Split the intermediate operands into legal parts.
+ if (NumParts == NumIntermediates) {
+ // If the register was not expanded, promote or copy the value,
+ // as appropriate.
+ for (unsigned i = 0; i != NumParts; ++i)
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i], 1, PartVT, V);
+ } else if (NumParts > 0) {
+    // If the intermediate type was expanded, split each intermediate value
+    // into legal parts.
+ assert(NumIntermediates != 0 && "division by zero");
+ assert(NumParts % NumIntermediates == 0 &&
+ "Must expand into a divisible number of parts!");
+ unsigned Factor = NumParts / NumIntermediates;
+ for (unsigned i = 0; i != NumIntermediates; ++i)
+ getCopyToParts(DAG, DL, Ops[i], &Parts[i*Factor], Factor, PartVT, V);
+ }
+}
+
+RegsForValue::RegsForValue() {}
+
+RegsForValue::RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt,
+ EVT valuevt)
+ : ValueVTs(1, valuevt), RegVTs(1, regvt), Regs(regs) {}
+
+RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty) {
+ ComputeValueVTs(TLI, DL, Ty, ValueVTs);
+
+ for (EVT ValueVT : ValueVTs) {
+ unsigned NumRegs = TLI.getNumRegisters(Context, ValueVT);
+ MVT RegisterVT = TLI.getRegisterType(Context, ValueVT);
+ for (unsigned i = 0; i != NumRegs; ++i)
+ Regs.push_back(Reg + i);
+ RegVTs.push_back(RegisterVT);
+ Reg += NumRegs;
+ }
+}
+
+/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
+/// this value and returns the result as a ValueVT value. This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
+ FunctionLoweringInfo &FuncInfo,
+ const SDLoc &dl, SDValue &Chain,
+ SDValue *Flag, const Value *V) const {
+ // A Value with type {} or [0 x %t] needs no registers.
+ if (ValueVTs.empty())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ // Assemble the legal parts into the final values.
+ SmallVector<SDValue, 4> Values(ValueVTs.size());
+ SmallVector<SDValue, 8> Parts;
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ // Copy the legal parts from the registers.
+ EVT ValueVT = ValueVTs[Value];
+ unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+ MVT RegisterVT = RegVTs[Value];
+
+ Parts.resize(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue P;
+ if (!Flag) {
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT);
+ } else {
+ P = DAG.getCopyFromReg(Chain, dl, Regs[Part+i], RegisterVT, *Flag);
+ *Flag = P.getValue(2);
+ }
+
+ Chain = P.getValue(1);
+ Parts[i] = P;
+
+ // If the source register was virtual and if we know something about it,
+ // add an assert node.
+ if (!TargetRegisterInfo::isVirtualRegister(Regs[Part+i]) ||
+ !RegisterVT.isInteger() || RegisterVT.isVector())
+ continue;
+
+ const FunctionLoweringInfo::LiveOutInfo *LOI =
+ FuncInfo.GetLiveOutRegInfo(Regs[Part+i]);
+ if (!LOI)
+ continue;
+
+ unsigned RegSize = RegisterVT.getSizeInBits();
+ unsigned NumSignBits = LOI->NumSignBits;
+ unsigned NumZeroBits = LOI->KnownZero.countLeadingOnes();
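+      // countLeadingOnes() on KnownZero is the number of high bits of the
+      // register that are known to be zero.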
+
+ if (NumZeroBits == RegSize) {
+ // The current value is a zero.
+ // Explicitly express that as it would be easier for
+ // optimizations to kick in.
+ Parts[i] = DAG.getConstant(0, dl, RegisterVT);
+ continue;
+ }
+
+ // FIXME: We capture more information than the dag can represent. For
+ // now, just use the tightest assertzext/assertsext possible.
+ bool isSExt = true;
+ EVT FromVT(MVT::Other);
+ if (NumSignBits == RegSize) {
+ isSExt = true; // ASSERT SEXT 1
+ FromVT = MVT::i1;
+ } else if (NumZeroBits >= RegSize - 1) {
+ isSExt = false; // ASSERT ZEXT 1
+ FromVT = MVT::i1;
+ } else if (NumSignBits > RegSize - 8) {
+ isSExt = true; // ASSERT SEXT 8
+ FromVT = MVT::i8;
+ } else if (NumZeroBits >= RegSize - 8) {
+ isSExt = false; // ASSERT ZEXT 8
+ FromVT = MVT::i8;
+ } else if (NumSignBits > RegSize - 16) {
+ isSExt = true; // ASSERT SEXT 16
+ FromVT = MVT::i16;
+ } else if (NumZeroBits >= RegSize - 16) {
+ isSExt = false; // ASSERT ZEXT 16
+ FromVT = MVT::i16;
+ } else if (NumSignBits > RegSize - 32) {
+ isSExt = true; // ASSERT SEXT 32
+ FromVT = MVT::i32;
+ } else if (NumZeroBits >= RegSize - 32) {
+ isSExt = false; // ASSERT ZEXT 32
+ FromVT = MVT::i32;
+ } else {
+ continue;
+ }
+ // Add an assertion node.
+ assert(FromVT != MVT::Other);
+ Parts[i] = DAG.getNode(isSExt ? ISD::AssertSext : ISD::AssertZext, dl,
+ RegisterVT, P, DAG.getValueType(FromVT));
+ }
+
+ Values[Value] = getCopyFromParts(DAG, dl, Parts.begin(),
+ NumRegs, RegisterVT, ValueVT, V);
+ Part += NumRegs;
+ Parts.clear();
+ }
+
+ return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
+}
+
+/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
+/// specified value into the registers specified by this object. This uses
+/// Chain/Flag as the input and updates them for the output Chain/Flag.
+/// If the Flag pointer is NULL, no flag is used.
+void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
+ const SDLoc &dl, SDValue &Chain, SDValue *Flag,
+ const Value *V,
+ ISD::NodeType PreferredExtendType) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ ISD::NodeType ExtendKind = PreferredExtendType;
+
+  // Get the list of the value's legal parts.
+ unsigned NumRegs = Regs.size();
+ SmallVector<SDValue, 8> Parts(NumRegs);
+ for (unsigned Value = 0, Part = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ EVT ValueVT = ValueVTs[Value];
+ unsigned NumParts = TLI.getNumRegisters(*DAG.getContext(), ValueVT);
+ MVT RegisterVT = RegVTs[Value];
+
+ if (ExtendKind == ISD::ANY_EXTEND && TLI.isZExtFree(Val, RegisterVT))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ getCopyToParts(DAG, dl, Val.getValue(Val.getResNo() + Value),
+ &Parts[Part], NumParts, RegisterVT, V, ExtendKind);
+ Part += NumParts;
+ }
+
+ // Copy the parts into the registers.
+ SmallVector<SDValue, 8> Chains(NumRegs);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ SDValue Part;
+ if (!Flag) {
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i]);
+ } else {
+ Part = DAG.getCopyToReg(Chain, dl, Regs[i], Parts[i], *Flag);
+ *Flag = Part.getValue(1);
+ }
+
+ Chains[i] = Part.getValue(0);
+ }
+
+ if (NumRegs == 1 || Flag)
+ // If NumRegs > 1 && Flag is used then the use of the last CopyToReg is
+    // flagged to it. That is, the CopyToReg nodes and the user are considered
+ // a single scheduling unit. If we create a TokenFactor and return it as
+ // chain, then the TokenFactor is both a predecessor (operand) of the
+ // user as well as a successor (the TF operands are flagged to the user).
+ // c1, f1 = CopyToReg
+ // c2, f2 = CopyToReg
+ // c3 = TokenFactor c1, c2
+ // ...
+ // = op c3, ..., f2
+ Chain = Chains[NumRegs-1];
+ else
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+}
+
+/// AddInlineAsmOperands - Add this value to the specified inlineasm node
+/// operand list. This adds the code marker and includes the number of
+/// values added into it.
+void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
+ unsigned MatchingIdx, const SDLoc &dl,
+ SelectionDAG &DAG,
+ std::vector<SDValue> &Ops) const {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ unsigned Flag = InlineAsm::getFlagWord(Code, Regs.size());
+ if (HasMatching)
+ Flag = InlineAsm::getFlagWordForMatchingOp(Flag, MatchingIdx);
+ else if (!Regs.empty() &&
+ TargetRegisterInfo::isVirtualRegister(Regs.front())) {
+ // Put the register class of the virtual registers in the flag word. That
+ // way, later passes can recompute register class constraints for inline
+ // assembly as well as normal instructions.
+ // Don't do this for tied operands that can use the regclass information
+ // from the def.
+ const MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+
+ SDValue Res = DAG.getTargetConstant(Flag, dl, MVT::i32);
+ Ops.push_back(Res);
+
+ unsigned SP = TLI.getStackPointerRegisterToSaveRestore();
+ for (unsigned Value = 0, Reg = 0, e = ValueVTs.size(); Value != e; ++Value) {
+ unsigned NumRegs = TLI.getNumRegisters(*DAG.getContext(), ValueVTs[Value]);
+ MVT RegisterVT = RegVTs[Value];
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ assert(Reg < Regs.size() && "Mismatch in # registers expected");
+ unsigned TheReg = Regs[Reg++];
+ Ops.push_back(DAG.getRegister(TheReg, RegisterVT));
+
+ if (TheReg == SP && Code == InlineAsm::Kind_Clobber) {
+ // If we clobbered the stack pointer, MFI should know about it.
+ assert(DAG.getMachineFunction().getFrameInfo()->
+ hasOpaqueSPAdjustment());
+ }
+ }
+ }
+}
+
+void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ const TargetLibraryInfo *li) {
+ AA = &aa;
+ GFI = gfi;
+ LibInfo = li;
+ DL = &DAG.getDataLayout();
+ Context = DAG.getContext();
+ LPadToCallSiteMap.clear();
+}
+
+/// clear - Clear out the current SelectionDAG and the associated
+/// state and prepare this SelectionDAGBuilder object to be used
+/// for a new block. This doesn't clear out information about
+/// additional blocks that are needed to complete switch lowering
+/// or PHI node updating; that information is cleared out as it is
+/// consumed.
+void SelectionDAGBuilder::clear() {
+ NodeMap.clear();
+ UnusedArgNodeMap.clear();
+ PendingLoads.clear();
+ PendingExports.clear();
+ CurInst = nullptr;
+ HasTailCall = false;
+ SDNodeOrder = LowestSDNodeOrder;
+ StatepointLowering.clear();
+}
+
+/// clearDanglingDebugInfo - Clear the dangling debug information
+/// map. This function is separated from the clear so that debug
+/// information that is dangling in a basic block can be properly
+/// resolved in a different basic block. This allows the
+/// SelectionDAG to resolve dangling debug information attached
+/// to PHI nodes.
+void SelectionDAGBuilder::clearDanglingDebugInfo() {
+ DanglingDebugInfoMap.clear();
+}
+
+/// getRoot - Return the current virtual root of the Selection DAG,
+/// flushing any PendingLoad items. This must be done before emitting
+/// a store or any other node that may need to be ordered after any
+/// prior load instructions.
+///
+SDValue SelectionDAGBuilder::getRoot() {
+ if (PendingLoads.empty())
+ return DAG.getRoot();
+
+ if (PendingLoads.size() == 1) {
+ SDValue Root = PendingLoads[0];
+ DAG.setRoot(Root);
+ PendingLoads.clear();
+ return Root;
+ }
+
+ // Otherwise, we have to make a token factor node.
+ SDValue Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
+ PendingLoads);
+ PendingLoads.clear();
+ DAG.setRoot(Root);
+ return Root;
+}
+
+/// getControlRoot - Similar to getRoot, but instead of flushing all the
+/// PendingLoad items, flush all the PendingExports items. It is necessary
+/// to do this before emitting a terminator instruction.
+///
+SDValue SelectionDAGBuilder::getControlRoot() {
+ SDValue Root = DAG.getRoot();
+
+ if (PendingExports.empty())
+ return Root;
+
+ // Turn all of the CopyToReg chains into one factored node.
+ if (Root.getOpcode() != ISD::EntryToken) {
+ unsigned i = 0, e = PendingExports.size();
+ for (; i != e; ++i) {
+ assert(PendingExports[i].getNode()->getNumOperands() > 1);
+ if (PendingExports[i].getNode()->getOperand(0) == Root)
+ break; // Don't add the root if we already indirectly depend on it.
+ }
+
+ if (i == e)
+ PendingExports.push_back(Root);
+ }
+
+ Root = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other,
+ PendingExports);
+ PendingExports.clear();
+ DAG.setRoot(Root);
+ return Root;
+}
+
+/// Copy swift error to the final virtual register at end of a basic block, as
+/// specified by SwiftErrorWorklist, if necessary.
+static void copySwiftErrorsToFinalVRegs(SelectionDAGBuilder &SDB) {
+ const TargetLowering &TLI = SDB.DAG.getTargetLoweringInfo();
+ if (!TLI.supportSwiftError())
+ return;
+
+ if (!SDB.FuncInfo.SwiftErrorWorklist.count(SDB.FuncInfo.MBB))
+ return;
+
+ // Go through entries in SwiftErrorWorklist, and create copy as necessary.
+ FunctionLoweringInfo::SwiftErrorVRegs &WorklistEntry =
+ SDB.FuncInfo.SwiftErrorWorklist[SDB.FuncInfo.MBB];
+ FunctionLoweringInfo::SwiftErrorVRegs &MapEntry =
+ SDB.FuncInfo.SwiftErrorMap[SDB.FuncInfo.MBB];
+ for (unsigned I = 0, E = WorklistEntry.size(); I < E; I++) {
+ unsigned WorkReg = WorklistEntry[I];
+
+ // Find the swifterror virtual register for the value in SwiftErrorMap.
+ unsigned MapReg = MapEntry[I];
+ assert(TargetRegisterInfo::isVirtualRegister(MapReg) &&
+ "Entries in SwiftErrorMap should be virtual registers");
+
+ if (WorkReg == MapReg)
+ continue;
+
+ // Create copy from SwiftErrorMap to SwiftWorklist.
+ auto &DL = SDB.DAG.getDataLayout();
+ SDValue CopyNode = SDB.DAG.getCopyToReg(
+ SDB.getRoot(), SDB.getCurSDLoc(), WorkReg,
+ SDB.DAG.getRegister(MapReg, EVT(TLI.getPointerTy(DL))));
+ MapEntry[I] = WorkReg;
+ SDB.DAG.setRoot(CopyNode);
+ }
+}
+
+void SelectionDAGBuilder::visit(const Instruction &I) {
+ // Set up outgoing PHI node register values before emitting the terminator.
+ if (isa<TerminatorInst>(&I)) {
+ copySwiftErrorsToFinalVRegs(*this);
+ HandlePHINodesInSuccessorBlocks(I.getParent());
+ }
+
+ ++SDNodeOrder;
+
+ CurInst = &I;
+
+ visit(I.getOpcode(), I);
+
+ if (!isa<TerminatorInst>(&I) && !HasTailCall &&
+ !isStatepoint(&I)) // statepoints handle their exports internally
+ CopyToExportRegsIfNeeded(&I);
+
+ CurInst = nullptr;
+}
+
+void SelectionDAGBuilder::visitPHI(const PHINode &) {
+ llvm_unreachable("SelectionDAGBuilder shouldn't visit PHI nodes!");
+}
+
+void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) {
+ // Note: this doesn't use InstVisitor, because it has to work with
+ // ConstantExpr's in addition to instructions.
+ switch (Opcode) {
+ default: llvm_unreachable("Unknown instruction type encountered!");
+ // Build the switch statement using the Instruction.def file.
+#define HANDLE_INST(NUM, OPCODE, CLASS) \
+ case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break;
+#include "llvm/IR/Instruction.def"
+ }
+}
+
+// resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+// generate the debug data structures now that we've seen its definition.
+void SelectionDAGBuilder::resolveDanglingDebugInfo(const Value *V,
+ SDValue Val) {
+ DanglingDebugInfo &DDI = DanglingDebugInfoMap[V];
+ if (DDI.getDI()) {
+ const DbgValueInst *DI = DDI.getDI();
+ DebugLoc dl = DDI.getdl();
+ unsigned DbgSDNodeOrder = DDI.getSDNodeOrder();
+ DILocalVariable *Variable = DI->getVariable();
+ DIExpression *Expr = DI->getExpression();
+ assert(Variable->isValidLocationForIntrinsic(dl) &&
+ "Expected inlined-at fields to agree");
+ uint64_t Offset = DI->getOffset();
+ SDDbgValue *SDV;
+ if (Val.getNode()) {
+ if (!EmitFuncArgumentDbgValue(V, Variable, Expr, dl, Offset, false,
+ Val)) {
+ SDV = DAG.getDbgValue(Variable, Expr, Val.getNode(), Val.getResNo(),
+ false, Offset, dl, DbgSDNodeOrder);
+ DAG.AddDbgValue(SDV, Val.getNode(), false);
+ }
+ } else
+ DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ DanglingDebugInfoMap[V] = DanglingDebugInfo();
+ }
+}
+
+/// getCopyFromRegs - If there was virtual register allocated for the value V
+/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
+SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
+ DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
+ SDValue Result;
+
+ if (It != FuncInfo.ValueMap.end()) {
+ unsigned InReg = It->second;
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), InReg, Ty);
+ SDValue Chain = DAG.getEntryNode();
+ Result = RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
+ resolveDanglingDebugInfo(V, Result);
+ }
+
+ return Result;
+}
+
+/// getValue - Return an SDValue for the given Value.
+SDValue SelectionDAGBuilder::getValue(const Value *V) {
+ // If we already have an SDValue for this value, use it. It's important
+ // to do this first, so that we don't create a CopyFromReg if we already
+ // have a regular SDValue.
+ SDValue &N = NodeMap[V];
+ if (N.getNode()) return N;
+
+ // If there's a virtual register allocated and initialized for this
+ // value, use it.
+ if (SDValue copyFromReg = getCopyFromRegs(V, V->getType()))
+ return copyFromReg;
+
+ // Otherwise create a new SDValue and remember it.
+ SDValue Val = getValueImpl(V);
+ NodeMap[V] = Val;
+ resolveDanglingDebugInfo(V, Val);
+ return Val;
+}
+
+// Return true if SDValue exists for the given Value
+bool SelectionDAGBuilder::findValue(const Value *V) const {
+ return (NodeMap.find(V) != NodeMap.end()) ||
+ (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
+}
+
+/// getNonRegisterValue - Return an SDValue for the given Value, but
+/// don't look in FuncInfo.ValueMap for a virtual register.
+SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
+ // If we already have an SDValue for this value, use it.
+ SDValue &N = NodeMap[V];
+ if (N.getNode()) {
+ if (isa<ConstantSDNode>(N) || isa<ConstantFPSDNode>(N)) {
+ // Remove the debug location from the node as the node is about to be used
+ // in a location which may differ from the original debug location. This
+ // is relevant to Constant and ConstantFP nodes because they can appear
+ // as constant expressions inside PHI nodes.
+ N->setDebugLoc(DebugLoc());
+ }
+ return N;
+ }
+
+ // Otherwise create a new SDValue and remember it.
+ SDValue Val = getValueImpl(V);
+ NodeMap[V] = Val;
+ resolveDanglingDebugInfo(V, Val);
+ return Val;
+}
+
+/// getValueImpl - Helper function for getValue and getNonRegisterValue.
+/// Create an SDValue for the given value.
+SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), V->getType(), true);
+
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(C))
+ return DAG.getConstant(*CI, getCurSDLoc(), VT);
+
+ if (const GlobalValue *GV = dyn_cast<GlobalValue>(C))
+ return DAG.getGlobalAddress(GV, getCurSDLoc(), VT);
+
+ if (isa<ConstantPointerNull>(C)) {
+ unsigned AS = V->getType()->getPointerAddressSpace();
+ return DAG.getConstant(0, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout(), AS));
+ }
+
+ if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
+ return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);
+
+ if (isa<UndefValue>(C) && !V->getType()->isAggregateType())
+ return DAG.getUNDEF(VT);
+
+ if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) {
+ visit(CE->getOpcode(), *CE);
+ SDValue N1 = NodeMap[V];
+ assert(N1.getNode() && "visit didn't populate the NodeMap!");
+ return N1;
+ }
+
+ if (isa<ConstantStruct>(C) || isa<ConstantArray>(C)) {
+ SmallVector<SDValue, 4> Constants;
+ for (User::const_op_iterator OI = C->op_begin(), OE = C->op_end();
+ OI != OE; ++OI) {
+ SDNode *Val = getValue(*OI).getNode();
+ // If the operand is an empty aggregate, there are no values.
+ if (!Val) continue;
+ // Add each leaf value from the operand to the Constants list
+ // to form a flattened list of all the values.
+ for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+ Constants.push_back(SDValue(Val, i));
+ }
+
+ return DAG.getMergeValues(Constants, getCurSDLoc());
+ }
+
+ if (const ConstantDataSequential *CDS =
+ dyn_cast<ConstantDataSequential>(C)) {
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
+ SDNode *Val = getValue(CDS->getElementAsConstant(i)).getNode();
+ // Add each leaf value from the operand to the Constants list
+ // to form a flattened list of all the values.
+ for (unsigned i = 0, e = Val->getNumValues(); i != e; ++i)
+ Ops.push_back(SDValue(Val, i));
+ }
+
+ if (isa<ArrayType>(CDS->getType()))
+ return DAG.getMergeValues(Ops, getCurSDLoc());
+ return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(),
+ VT, Ops);
+ }
+
+ if (C->getType()->isStructTy() || C->getType()->isArrayTy()) {
+ assert((isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) &&
+ "Unknown struct or array constant!");
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), C->getType(), ValueVTs);
+ unsigned NumElts = ValueVTs.size();
+ if (NumElts == 0)
+ return SDValue(); // empty struct
+ SmallVector<SDValue, 4> Constants(NumElts);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ EVT EltVT = ValueVTs[i];
+ if (isa<UndefValue>(C))
+ Constants[i] = DAG.getUNDEF(EltVT);
+ else if (EltVT.isFloatingPoint())
+ Constants[i] = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
+ else
+ Constants[i] = DAG.getConstant(0, getCurSDLoc(), EltVT);
+ }
+
+ return DAG.getMergeValues(Constants, getCurSDLoc());
+ }
+
+ if (const BlockAddress *BA = dyn_cast<BlockAddress>(C))
+ return DAG.getBlockAddress(BA, VT);
+
+ VectorType *VecTy = cast<VectorType>(V->getType());
+ unsigned NumElements = VecTy->getNumElements();
+
+ // Now that we know the number and type of the elements, get that number of
+ // elements into the Ops array based on what kind of constant it is.
+ SmallVector<SDValue, 16> Ops;
+ if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+ for (unsigned i = 0; i != NumElements; ++i)
+ Ops.push_back(getValue(CV->getOperand(i)));
+ } else {
+ assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
+ EVT EltVT =
+ TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());
+
+ SDValue Op;
+ if (EltVT.isFloatingPoint())
+ Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
+ else
+ Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
+ Ops.assign(NumElements, Op);
+ }
+
+ // Create a BUILD_VECTOR node.
+ return NodeMap[V] = DAG.getNode(ISD::BUILD_VECTOR, getCurSDLoc(), VT, Ops);
+ }
+
+ // If this is a static alloca, generate it as the frameindex instead of
+ // computation.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end())
+ return DAG.getFrameIndex(SI->second,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ }
+
+ // If this is an instruction which fast-isel has deferred, select it now.
+ if (const Instruction *Inst = dyn_cast<Instruction>(V)) {
+ unsigned InReg = FuncInfo.InitializeRegForValue(Inst);
+ RegsForValue RFV(*DAG.getContext(), TLI, DAG.getDataLayout(), InReg,
+ Inst->getType());
+ SDValue Chain = DAG.getEntryNode();
+ return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
+ }
+
+ llvm_unreachable("Can't get register for value!");
+}
+
+void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
+ bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
+ MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
+ // In MSVC C++ and CoreCLR, catch blocks are funclets and need prologues.
+ if (IsMSVCCXX || IsCoreCLR)
+ CatchPadMBB->setIsEHFuncletEntry();
+
+ DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other, getControlRoot()));
+}
+
+void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
+ // Update machine-CFG edge.
+ MachineBasicBlock *TargetMBB = FuncInfo.MBBMap[I.getSuccessor()];
+ FuncInfo.MBB->addSuccessor(TargetMBB);
+
+ auto Pers = classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsSEH = isAsynchronousEHPersonality(Pers);
+ if (IsSEH) {
+ // If this is not a fall-through branch or optimizations are switched off,
+ // emit the branch.
+ if (TargetMBB != NextBlock(FuncInfo.MBB) ||
+ TM.getOptLevel() == CodeGenOpt::None)
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(TargetMBB)));
+ return;
+ }
+
+ // Figure out the funclet membership for the catchret's successor.
+ // This will be used by the FuncletLayout pass to determine how to order the
+ // BB's.
+ // A 'catchret' returns to the outer scope's color.
+ Value *ParentPad = I.getCatchSwitchParentPad();
+ const BasicBlock *SuccessorColor;
+ if (isa<ConstantTokenNone>(ParentPad))
+ SuccessorColor = &FuncInfo.Fn->getEntryBlock();
+ else
+ SuccessorColor = cast<Instruction>(ParentPad)->getParent();
+ assert(SuccessorColor && "No parent funclet for catchret!");
+ MachineBasicBlock *SuccessorColorMBB = FuncInfo.MBBMap[SuccessorColor];
+ assert(SuccessorColorMBB && "No MBB for SuccessorColor!");
+
+ // Create the terminator node.
+ SDValue Ret = DAG.getNode(ISD::CATCHRET, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(TargetMBB),
+ DAG.getBasicBlock(SuccessorColorMBB));
+ DAG.setRoot(Ret);
+}
+
+void SelectionDAGBuilder::visitCleanupPad(const CleanupPadInst &CPI) {
+ // Don't emit any special code for the cleanuppad instruction. It just marks
+ // the start of a funclet.
+ FuncInfo.MBB->setIsEHFuncletEntry();
+ FuncInfo.MBB->setIsCleanupFuncletEntry();
+}
+
+/// When an invoke or a cleanupret unwinds to the next EH pad, there are
+/// many places it could ultimately go. In the IR, we have a single unwind
+/// destination, but in the machine CFG, we enumerate all the possible blocks.
+/// This function skips over imaginary basic blocks that hold catchswitch
+/// instructions, and finds all the "real" machine
+/// basic block destinations. As those destinations may not be successors of
+/// EHPadBB, here we also calculate the edge probability to those destinations.
+/// The passed-in Prob is the edge probability to EHPadBB.
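+/// For example, an invoke whose unwind edge reaches a catchswitch with two
+/// handlers %catch.a and %catch.b yields both handler blocks as destinations,
+/// and the chain is then followed through the catchswitch's unwind label (if
+/// any) until a landingpad or cleanuppad is reached.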
+static void findUnwindDestinations(
+ FunctionLoweringInfo &FuncInfo, const BasicBlock *EHPadBB,
+ BranchProbability Prob,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, BranchProbability>>
+ &UnwindDests) {
+ EHPersonality Personality =
+ classifyEHPersonality(FuncInfo.Fn->getPersonalityFn());
+ bool IsMSVCCXX = Personality == EHPersonality::MSVC_CXX;
+ bool IsCoreCLR = Personality == EHPersonality::CoreCLR;
+
+ while (EHPadBB) {
+ const Instruction *Pad = EHPadBB->getFirstNonPHI();
+ BasicBlock *NewEHPadBB = nullptr;
+ if (isa<LandingPadInst>(Pad)) {
+ // Stop on landingpads. They are not funclets.
+ UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ break;
+ } else if (isa<CleanupPadInst>(Pad)) {
+ // Stop on cleanup pads. Cleanups are always funclet entries for all known
+ // personalities.
+ UnwindDests.emplace_back(FuncInfo.MBBMap[EHPadBB], Prob);
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ break;
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(Pad)) {
+ // Add the catchpad handlers to the possible destinations.
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ UnwindDests.emplace_back(FuncInfo.MBBMap[CatchPadBB], Prob);
+ // For MSVC++ and the CLR, catch blocks are funclets and need prologues.
+ if (IsMSVCCXX || IsCoreCLR)
+ UnwindDests.back().first->setIsEHFuncletEntry();
+ }
+ NewEHPadBB = CatchSwitch->getUnwindDest();
+ } else {
+ continue;
+ }
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ if (BPI && NewEHPadBB)
+ Prob *= BPI->getEdgeProbability(EHPadBB, NewEHPadBB);
+ EHPadBB = NewEHPadBB;
+ }
+}
+
+void SelectionDAGBuilder::visitCleanupRet(const CleanupReturnInst &I) {
+ // Update successor info.
+ SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
+ auto UnwindDest = I.getUnwindDest();
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ BranchProbability UnwindDestProb =
+ (BPI && UnwindDest)
+ ? BPI->getEdgeProbability(FuncInfo.MBB->getBasicBlock(), UnwindDest)
+ : BranchProbability::getZero();
+ findUnwindDestinations(FuncInfo, UnwindDest, UnwindDestProb, UnwindDests);
+ for (auto &UnwindDest : UnwindDests) {
+ UnwindDest.first->setIsEHPad();
+ addSuccessorWithProb(FuncInfo.MBB, UnwindDest.first, UnwindDest.second);
+ }
+ FuncInfo.MBB->normalizeSuccProbs();
+
+ // Create the terminator node.
+ SDValue Ret =
+ DAG.getNode(ISD::CLEANUPRET, getCurSDLoc(), MVT::Other, getControlRoot());
+ DAG.setRoot(Ret);
+}
+
+void SelectionDAGBuilder::visitCatchSwitch(const CatchSwitchInst &CSI) {
+ report_fatal_error("visitCatchSwitch not yet implemented!");
+}
+
+void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto &DL = DAG.getDataLayout();
+ SDValue Chain = getControlRoot();
+ SmallVector<ISD::OutputArg, 8> Outs;
+ SmallVector<SDValue, 8> OutVals;
+
+ // Calls to @llvm.experimental.deoptimize don't generate a return value, so
+ // lower
+ //
+ // %val = call <ty> @llvm.experimental.deoptimize()
+ // ret <ty> %val
+ //
+ // differently.
+ if (I.getParent()->getTerminatingDeoptimizeCall()) {
+ LowerDeoptimizingReturn();
+ return;
+ }
+
+ if (!FuncInfo.CanLowerReturn) {
+ unsigned DemoteReg = FuncInfo.DemoteRegister;
+ const Function *F = I.getParent()->getParent();
+
+ // Emit a store of the return value through the virtual register.
+ // Leave Outs empty so that LowerReturn won't try to load return
+ // registers the usual way.
+ SmallVector<EVT, 1> PtrValueVTs;
+ ComputeValueVTs(TLI, DL, PointerType::getUnqual(F->getReturnType()),
+ PtrValueVTs);
+
+ SDValue RetPtr = DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(),
+ DemoteReg, PtrValueVTs[0]);
+ SDValue RetOp = getValue(I.getOperand(0));
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs, &Offsets);
+ unsigned NumValues = ValueVTs.size();
+
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ SmallVector<SDValue, 4> Chains(NumValues);
+ for (unsigned i = 0; i != NumValues; ++i) {
+ SDValue Add = DAG.getNode(ISD::ADD, getCurSDLoc(),
+ RetPtr.getValueType(), RetPtr,
+ DAG.getIntPtrConstant(Offsets[i],
+ getCurSDLoc()),
+ &Flags);
+ Chains[i] = DAG.getStore(Chain, getCurSDLoc(),
+ SDValue(RetOp.getNode(), RetOp.getResNo() + i),
+ // FIXME: better loc info would be nice.
+ Add, MachinePointerInfo());
+ }
+
+ Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
+ MVT::Other, Chains);
+ } else if (I.getNumOperands() != 0) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DL, I.getOperand(0)->getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues) {
+ SDValue RetOp = getValue(I.getOperand(0));
+
+ const Function *F = I.getParent()->getParent();
+
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+ if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ Attribute::SExt))
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ Attribute::ZExt))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ LLVMContext &Context = F->getContext();
+ bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+ Attribute::InReg);
+
+ for (unsigned j = 0; j != NumValues; ++j) {
+ EVT VT = ValueVTs[j];
+
+ if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger())
+ VT = TLI.getTypeForExtReturn(Context, VT, ExtendKind);
+
+ unsigned NumParts = TLI.getNumRegisters(Context, VT);
+ MVT PartVT = TLI.getRegisterType(Context, VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ getCopyToParts(DAG, getCurSDLoc(),
+ SDValue(RetOp.getNode(), RetOp.getResNo() + j),
+ &Parts[0], NumParts, PartVT, &I, ExtendKind);
+
+ // 'inreg' on function refers to return value
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (RetInReg)
+ Flags.setInReg();
+
+ // Propagate extension type if any
+ if (ExtendKind == ISD::SIGN_EXTEND)
+ Flags.setSExt();
+ else if (ExtendKind == ISD::ZERO_EXTEND)
+ Flags.setZExt();
+
+ for (unsigned i = 0; i < NumParts; ++i) {
+ Outs.push_back(ISD::OutputArg(Flags, Parts[i].getValueType(),
+ VT, /*isfixed=*/true, 0, 0));
+ OutVals.push_back(Parts[i]);
+ }
+ }
+ }
+ }
+
+ // Push the swifterror virtual register in as the last element of Outs. This
+ // makes sure the swifterror virtual register will be returned in the
+ // swifterror physical register.
+ const Function *F = I.getParent()->getParent();
+ if (TLI.supportSwiftError() &&
+ F->getAttributes().hasAttrSomewhere(Attribute::SwiftError)) {
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ Flags.setSwiftError();
+ Outs.push_back(ISD::OutputArg(Flags, EVT(TLI.getPointerTy(DL)) /*vt*/,
+ EVT(TLI.getPointerTy(DL)) /*argvt*/,
+ true /*isfixed*/, 1 /*origidx*/,
+ 0 /*partOffs*/));
+ // Create SDNode for the swifterror virtual register.
+ OutVals.push_back(DAG.getRegister(FuncInfo.SwiftErrorMap[FuncInfo.MBB][0],
+ EVT(TLI.getPointerTy(DL))));
+ }
+
+ bool isVarArg = DAG.getMachineFunction().getFunction()->isVarArg();
+ CallingConv::ID CallConv =
+ DAG.getMachineFunction().getFunction()->getCallingConv();
+ Chain = DAG.getTargetLoweringInfo().LowerReturn(
+ Chain, CallConv, isVarArg, Outs, OutVals, getCurSDLoc(), DAG);
+
+ // Verify that the target's LowerReturn behaved as expected.
+ assert(Chain.getNode() && Chain.getValueType() == MVT::Other &&
+ "LowerReturn didn't return a valid chain!");
+
+ // Update the DAG with the new chain value resulting from return lowering.
+ DAG.setRoot(Chain);
+}
+
+/// CopyToExportRegsIfNeeded - If the given value has virtual registers
+/// created for it, emit nodes to copy the value into the virtual
+/// registers.
+void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ return;
+
+ DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end()) {
+ assert(!V->use_empty() && "Unused value assigned virtual registers!");
+ CopyValueToVirtualRegister(V, VMI->second);
+ }
+}
+
+/// ExportFromCurrentBlock - If this condition isn't known to be exported from
+/// the current basic block, add it to ValueMap now so that we'll get a
+/// CopyTo/FromReg.
+void SelectionDAGBuilder::ExportFromCurrentBlock(const Value *V) {
+ // No need to export constants.
+ if (!isa<Instruction>(V) && !isa<Argument>(V)) return;
+
+ // Already exported?
+ if (FuncInfo.isExportedInst(V)) return;
+
+ unsigned Reg = FuncInfo.InitializeRegForValue(V);
+ CopyValueToVirtualRegister(V, Reg);
+}
+
+bool SelectionDAGBuilder::isExportableFromCurrentBlock(const Value *V,
+ const BasicBlock *FromBB) {
+ // The operands of the setcc have to be in this block. We don't know
+ // how to export them from some other block.
+ if (const Instruction *VI = dyn_cast<Instruction>(V)) {
+ // Can export from current BB.
+ if (VI->getParent() == FromBB)
+ return true;
+
+ // Is already exported, noop.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // If this is an argument, we can export it if the BB is the entry block or
+ // if it is already exported.
+ if (isa<Argument>(V)) {
+ if (FromBB == &FromBB->getParent()->getEntryBlock())
+ return true;
+
+ // Otherwise, can only export this if it is already exported.
+ return FuncInfo.isExportedInst(V);
+ }
+
+ // Otherwise, constants can always be exported.
+ return true;
+}
+
+/// Return branch probability calculated by BranchProbabilityInfo for IR blocks.
+BranchProbability
+SelectionDAGBuilder::getEdgeProbability(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const {
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ const BasicBlock *SrcBB = Src->getBasicBlock();
+ const BasicBlock *DstBB = Dst->getBasicBlock();
+ if (!BPI) {
+ // If BPI is not available, set the default probability as 1 / N, where N is
+ // the number of successors.
+ auto SuccSize = std::max<uint32_t>(
+ std::distance(succ_begin(SrcBB), succ_end(SrcBB)), 1);
+ return BranchProbability(1, SuccSize);
+ }
+ return BPI->getEdgeProbability(SrcBB, DstBB);
+}
+
+void SelectionDAGBuilder::addSuccessorWithProb(MachineBasicBlock *Src,
+ MachineBasicBlock *Dst,
+ BranchProbability Prob) {
+ if (!FuncInfo.BPI)
+ Src->addSuccessorWithoutProb(Dst);
+ else {
+ if (Prob.isUnknown())
+ Prob = getEdgeProbability(Src, Dst);
+ Src->addSuccessor(Dst, Prob);
+ }
+}
+
+static bool InBlock(const Value *V, const BasicBlock *BB) {
+ if (const Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == BB;
+ return true;
+}
+
+/// EmitBranchForMergedCondition - Helper method for FindMergedConditions.
+/// This function emits a branch and is used at the leaves of an OR or an
+/// AND operator tree.
+///
+void
+SelectionDAGBuilder::EmitBranchForMergedCondition(const Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ BranchProbability TProb,
+ BranchProbability FProb) {
+ const BasicBlock *BB = CurBB->getBasicBlock();
+
+ // If the leaf of the tree is a comparison, merge the condition into
+ // the caseblock.
+ if (const CmpInst *BOp = dyn_cast<CmpInst>(Cond)) {
+ // The operands of the cmp have to be in this block. We don't know
+ // how to export them from some other block. If this is the first block
+ // of the sequence, no exporting is needed.
+ if (CurBB == SwitchBB ||
+ (isExportableFromCurrentBlock(BOp->getOperand(0), BB) &&
+ isExportableFromCurrentBlock(BOp->getOperand(1), BB))) {
+ ISD::CondCode Condition;
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(Cond)) {
+ Condition = getICmpCondCode(IC->getPredicate());
+ } else {
+ const FCmpInst *FC = cast<FCmpInst>(Cond);
+ Condition = getFCmpCondCode(FC->getPredicate());
+ if (TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+ }
+
+ CaseBlock CB(Condition, BOp->getOperand(0), BOp->getOperand(1), nullptr,
+ TBB, FBB, CurBB, TProb, FProb);
+ SwitchCases.push_back(CB);
+ return;
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CaseBlock CB(ISD::SETEQ, Cond, ConstantInt::getTrue(*DAG.getContext()),
+ nullptr, TBB, FBB, CurBB, TProb, FProb);
+ SwitchCases.push_back(CB);
+}
+
+/// FindMergedConditions - If Cond is an expression like (X && Y) or (X || Y),
+/// recursively decompose it and emit a branch for each leaf condition by way
+/// of EmitBranchForMergedCondition.
+void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ Instruction::BinaryOps Opc,
+ BranchProbability TProb,
+ BranchProbability FProb) {
+ // If this node is not part of the or/and tree, emit it as a branch.
+ const Instruction *BOp = dyn_cast<Instruction>(Cond);
+ if (!BOp || !(isa<BinaryOperator>(BOp) || isa<CmpInst>(BOp)) ||
+ (unsigned)BOp->getOpcode() != Opc || !BOp->hasOneUse() ||
+ BOp->getParent() != CurBB->getBasicBlock() ||
+ !InBlock(BOp->getOperand(0), CurBB->getBasicBlock()) ||
+ !InBlock(BOp->getOperand(1), CurBB->getBasicBlock())) {
+ EmitBranchForMergedCondition(Cond, TBB, FBB, CurBB, SwitchBB,
+ TProb, FProb);
+ return;
+ }
+
+ // Create TmpBB after CurBB.
+ MachineFunction::iterator BBI(CurBB);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineBasicBlock *TmpBB = MF.CreateMachineBasicBlock(CurBB->getBasicBlock());
+ CurBB->getParent()->insert(++BBI, TmpBB);
+
+ if (Opc == Instruction::Or) {
+ // Codegen X | Y as:
+ // BB1:
+ // jmp_if_X TBB
+ // jmp TmpBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // TrueProb for BB1 + (FalseProb for BB1 * TrueProb for TmpBB)
+ // = TrueProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A/2 and A/2+B, and set TmpBB's probabilities to
+ // A/(1+B) and 2B/(1+B). This choice assumes that
+ // TrueProb for BB1 == FalseProb for BB1 * TrueProb for TmpBB.
+ // Another choice is to assume TrueProb for BB1 equals to TrueProb for
+ // TmpBB, but the math is more complicated.
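+ //
+ // For example, with A = 0.6 and B = 0.4, BB1 gets probabilities 0.3/0.7 and
+ // TmpBB gets 3/7 and 4/7 after normalization; indeed
+ // 0.3 + 0.7 * (3/7) = 0.6 = A, matching the requirement above.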
+
+ auto NewTrueProb = TProb / 2;
+ auto NewFalseProb = TProb / 2 + FProb;
+ // Emit the LHS condition.
+ FindMergedConditions(BOp->getOperand(0), TBB, TmpBB, CurBB, SwitchBB, Opc,
+ NewTrueProb, NewFalseProb);
+
+ // Normalize A/2 and B to get A/(1+B) and 2B/(1+B).
+ SmallVector<BranchProbability, 2> Probs{TProb / 2, FProb};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
+ Probs[0], Probs[1]);
+ } else {
+ assert(Opc == Instruction::And && "Unknown merge op!");
+ // Codegen X & Y as:
+ // BB1:
+ // jmp_if_X TmpBB
+ // jmp FBB
+ // TmpBB:
+ // jmp_if_Y TBB
+ // jmp FBB
+ //
+ // This requires creation of TmpBB after CurBB.
+
+ // We have flexibility in setting Prob for BB1 and Prob for TmpBB.
+ // The requirement is that
+ // FalseProb for BB1 + (TrueProb for BB1 * FalseProb for TmpBB)
+ // = FalseProb for original BB.
+ // Assuming the original probabilities are A and B, one choice is to set
+ // BB1's probabilities to A+B/2 and B/2, and set TmpBB's probabilities to
+ // 2A/(1+A) and B/(1+A). This choice assumes that FalseProb for BB1 ==
+ // TrueProb for BB1 * FalseProb for TmpBB.
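+ //
+ // For example, with A = 0.6 and B = 0.4, BB1 gets probabilities 0.8/0.2 and
+ // TmpBB gets 0.75/0.25 after normalization; indeed
+ // 0.2 + 0.8 * 0.25 = 0.4 = B, matching the requirement above.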
+
+ auto NewTrueProb = TProb + FProb / 2;
+ auto NewFalseProb = FProb / 2;
+ // Emit the LHS condition.
+ FindMergedConditions(BOp->getOperand(0), TmpBB, FBB, CurBB, SwitchBB, Opc,
+ NewTrueProb, NewFalseProb);
+
+ // Normalize A and B/2 to get 2A/(1+A) and B/(1+A).
+ SmallVector<BranchProbability, 2> Probs{TProb, FProb / 2};
+ BranchProbability::normalizeProbabilities(Probs.begin(), Probs.end());
+ // Emit the RHS condition into TmpBB.
+ FindMergedConditions(BOp->getOperand(1), TBB, FBB, TmpBB, SwitchBB, Opc,
+ Probs[0], Probs[1]);
+ }
+}
+
+/// If the set of cases should be emitted as a series of branches, return true.
+/// If we should emit this as a bunch of and/or'd together conditions, return
+/// false.
+bool
+SelectionDAGBuilder::ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases) {
+ if (Cases.size() != 2) return true;
+
+ // If this is two comparisons of the same values or'd or and'd together, they
+ // will get folded into a single comparison, so don't emit two blocks.
+ if ((Cases[0].CmpLHS == Cases[1].CmpLHS &&
+ Cases[0].CmpRHS == Cases[1].CmpRHS) ||
+ (Cases[0].CmpRHS == Cases[1].CmpLHS &&
+ Cases[0].CmpLHS == Cases[1].CmpRHS)) {
+ return false;
+ }
+
+ // Handle: (X != null) | (Y != null) --> (X|Y) != 0
+ // Handle: (X == null) & (Y == null) --> (X|Y) == 0
+ if (Cases[0].CmpRHS == Cases[1].CmpRHS &&
+ Cases[0].CC == Cases[1].CC &&
+ isa<Constant>(Cases[0].CmpRHS) &&
+ cast<Constant>(Cases[0].CmpRHS)->isNullValue()) {
+ if (Cases[0].CC == ISD::SETEQ && Cases[0].TrueBB == Cases[1].ThisBB)
+ return false;
+ if (Cases[0].CC == ISD::SETNE && Cases[0].FalseBB == Cases[1].ThisBB)
+ return false;
+ }
+
+ return true;
+}
+
+void SelectionDAGBuilder::visitBr(const BranchInst &I) {
+ MachineBasicBlock *BrMBB = FuncInfo.MBB;
+
+ // Update machine-CFG edges.
+ MachineBasicBlock *Succ0MBB = FuncInfo.MBBMap[I.getSuccessor(0)];
+
+ if (I.isUnconditional()) {
+ // Update machine-CFG edges.
+ BrMBB->addSuccessor(Succ0MBB);
+
+ // If this is not a fall-through branch or optimizations are switched off,
+ // emit the branch.
+ if (Succ0MBB != NextBlock(BrMBB) || TM.getOptLevel() == CodeGenOpt::None)
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Succ0MBB)));
+
+ return;
+ }
+
+ // If this condition is one of the special cases we handle, do special stuff
+ // now.
+ const Value *CondVal = I.getCondition();
+ MachineBasicBlock *Succ1MBB = FuncInfo.MBBMap[I.getSuccessor(1)];
+
+ // If this is a series of conditions that are or'd or and'd together, emit
+ // this as a sequence of branches instead of setcc's with and/or operations.
+ // As long as jumps are not expensive, this should improve performance.
+ // For example, instead of something like:
+ // cmp A, B
+ // C = seteq
+ // cmp D, E
+ // F = setle
+ // or C, F
+ // jnz foo
+ // Emit:
+ // cmp A, B
+ // je foo
+ // cmp D, E
+ // jle foo
+ //
+ if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
+ Instruction::BinaryOps Opcode = BOp->getOpcode();
+ if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
+ !I.getMetadata(LLVMContext::MD_unpredictable) &&
+ (Opcode == Instruction::And || Opcode == Instruction::Or)) {
+ FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
+ Opcode,
+ getEdgeProbability(BrMBB, Succ0MBB),
+ getEdgeProbability(BrMBB, Succ1MBB));
+ // If the compares in later blocks need to use values not currently
+ // exported from this block, export them now. This block should always
+ // be the first entry.
+ assert(SwitchCases[0].ThisBB == BrMBB && "Unexpected lowering!");
+
+ // Allow some cases to be rejected.
+ if (ShouldEmitAsBranches(SwitchCases)) {
+ for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i) {
+ ExportFromCurrentBlock(SwitchCases[i].CmpLHS);
+ ExportFromCurrentBlock(SwitchCases[i].CmpRHS);
+ }
+
+ // Emit the branch for this block.
+ visitSwitchCase(SwitchCases[0], BrMBB);
+ SwitchCases.erase(SwitchCases.begin());
+ return;
+ }
+
+ // Okay, we decided not to do this, remove any inserted MBB's and clear
+ // SwitchCases.
+ for (unsigned i = 1, e = SwitchCases.size(); i != e; ++i)
+ FuncInfo.MF->erase(SwitchCases[i].ThisBB);
+
+ SwitchCases.clear();
+ }
+ }
+
+ // Create a CaseBlock record representing this branch.
+ CaseBlock CB(ISD::SETEQ, CondVal, ConstantInt::getTrue(*DAG.getContext()),
+ nullptr, Succ0MBB, Succ1MBB, BrMBB);
+
+ // Use visitSwitchCase to actually insert the fast branch sequence for this
+ // cond branch.
+ visitSwitchCase(CB, BrMBB);
+}
+
+/// visitSwitchCase - Emits the necessary code to represent a single node in
+/// the binary search tree resulting from lowering a switch instruction.
+void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB,
+ MachineBasicBlock *SwitchBB) {
+ SDValue Cond;
+ SDValue CondLHS = getValue(CB.CmpLHS);
+ SDLoc dl = getCurSDLoc();
+
+ // Build the setcc now.
+ if (!CB.CmpMHS) {
+ // Fold "(X == true)" to X and "(X == false)" to !X to
+ // handle common cases produced by branch lowering.
+ if (CB.CmpRHS == ConstantInt::getTrue(*DAG.getContext()) &&
+ CB.CC == ISD::SETEQ)
+ Cond = CondLHS;
+ else if (CB.CmpRHS == ConstantInt::getFalse(*DAG.getContext()) &&
+ CB.CC == ISD::SETEQ) {
+ SDValue True = DAG.getConstant(1, dl, CondLHS.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, CondLHS.getValueType(), CondLHS, True);
+ } else
+ Cond = DAG.getSetCC(dl, MVT::i1, CondLHS, getValue(CB.CmpRHS), CB.CC);
+ } else {
+ assert(CB.CC == ISD::SETLE && "Can handle only LE ranges now");
+
+ const APInt& Low = cast<ConstantInt>(CB.CmpLHS)->getValue();
+ const APInt& High = cast<ConstantInt>(CB.CmpRHS)->getValue();
+
+ SDValue CmpOp = getValue(CB.CmpMHS);
+ EVT VT = CmpOp.getValueType();
+
+ if (cast<ConstantInt>(CB.CmpLHS)->isMinValue(true)) {
+ Cond = DAG.getSetCC(dl, MVT::i1, CmpOp, DAG.getConstant(High, dl, VT),
+ ISD::SETLE);
+ } else {
+ SDValue SUB = DAG.getNode(ISD::SUB, dl,
+ VT, CmpOp, DAG.getConstant(Low, dl, VT));
+ Cond = DAG.getSetCC(dl, MVT::i1, SUB,
+ DAG.getConstant(High-Low, dl, VT), ISD::SETULE);
+ }
+ }
+
+ // Update successor info
+ addSuccessorWithProb(SwitchBB, CB.TrueBB, CB.TrueProb);
+ // TrueBB and FalseBB are always different unless the incoming IR is
+ // degenerate. This only happens when running llc on weird IR.
+ if (CB.TrueBB != CB.FalseBB)
+ addSuccessorWithProb(SwitchBB, CB.FalseBB, CB.FalseProb);
+ SwitchBB->normalizeSuccProbs();
+
+ // If the lhs block is the next block, invert the condition so that we can
+ // fall through to the lhs instead of the rhs block.
+ if (CB.TrueBB == NextBlock(SwitchBB)) {
+ std::swap(CB.TrueBB, CB.FalseBB);
+ SDValue True = DAG.getConstant(1, dl, Cond.getValueType());
+ Cond = DAG.getNode(ISD::XOR, dl, Cond.getValueType(), Cond, True);
+ }
+
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, getControlRoot(), Cond,
+ DAG.getBasicBlock(CB.TrueBB));
+
+ // Insert the false branch. Do this even if it's a fall through branch,
+ // this makes it easier to do DAG optimizations which require inverting
+ // the branch condition.
+ BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+ DAG.getBasicBlock(CB.FalseBB));
+
+ DAG.setRoot(BrCond);
+}
+
+/// visitJumpTable - Emit JumpTable node in the current MBB
+void SelectionDAGBuilder::visitJumpTable(JumpTable &JT) {
+ // Emit the code for the jump table
+ assert(JT.Reg != -1U && "Should lower JT Header first!");
+ EVT PTy = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue Index = DAG.getCopyFromReg(getControlRoot(), getCurSDLoc(),
+ JT.Reg, PTy);
+ SDValue Table = DAG.getJumpTable(JT.JTI, PTy);
+ SDValue BrJumpTable = DAG.getNode(ISD::BR_JT, getCurSDLoc(),
+ MVT::Other, Index.getValue(1),
+ Table, Index);
+ DAG.setRoot(BrJumpTable);
+}
+
+/// visitJumpTableHeader - This function emits the code necessary to produce an
+/// index into the jump table from the value being switched on.
+void SelectionDAGBuilder::visitJumpTableHeader(JumpTable &JT,
+ JumpTableHeader &JTH,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+
+ // Subtract the lowest switch case value from the value being switched on and
+ // conditional branch to default mbb if the result is greater than the
+ // difference between smallest and largest cases.
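+ //
+ // For example, for cases covering [10, 20] this emits, in effect,
+ // Index = Value - 10, followed by an unsigned comparison Index > 10 that
+ // branches to the default block; otherwise Index selects the jump table entry.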
+ SDValue SwitchOp = getValue(JTH.SValue);
+ EVT VT = SwitchOp.getValueType();
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp,
+ DAG.getConstant(JTH.First, dl, VT));
+
+ // The SDNode we just created, which holds the value being switched on minus
+ // the smallest case value, needs to be copied to a virtual register so it
+ // can be used as an index into the jump table in a subsequent basic block.
+ // This value may be smaller or larger than the target's pointer type, and
+ // may therefore require extension or truncation.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SwitchOp = DAG.getZExtOrTrunc(Sub, dl, TLI.getPointerTy(DAG.getDataLayout()));
+
+ unsigned JumpTableReg =
+ FuncInfo.CreateReg(TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl,
+ JumpTableReg, SwitchOp);
+ JT.Reg = JumpTableReg;
+
+ // Emit the range check for the jump table, and branch to the default block
+ // for the switch statement if the value being switched on exceeds the largest
+ // case in the switch.
+ SDValue CMP = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(JTH.Last - JTH.First, dl, VT), ISD::SETUGT);
+
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, CopyTo, CMP,
+ DAG.getBasicBlock(JT.Default));
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (JT.MBB != NextBlock(SwitchBB))
+ BrCond = DAG.getNode(ISD::BR, dl, MVT::Other, BrCond,
+ DAG.getBasicBlock(JT.MBB));
+
+ DAG.setRoot(BrCond);
+}
+
+/// Create a LOAD_STACK_GUARD node, and let it carry the target-specific global
+/// variable if one exists.
+static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
+ SDValue &Chain) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ Value *Global = TLI.getSDagStackGuard(*MF.getFunction()->getParent());
+ MachineSDNode *Node =
+ DAG.getMachineNode(TargetOpcode::LOAD_STACK_GUARD, DL, PtrTy, Chain);
+ if (Global) {
+ MachinePointerInfo MPInfo(Global);
+ MachineInstr::mmo_iterator MemRefs = MF.allocateMemRefsArray(1);
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant;
+ *MemRefs = MF.getMachineMemOperand(MPInfo, Flags, PtrTy.getSizeInBits() / 8,
+ DAG.getEVTAlignment(PtrTy));
+ Node->setMemRefs(MemRefs, MemRefs + 1);
+ }
+ return SDValue(Node, 0);
+}
+
+/// Codegen a new tail for a stack protector check ParentMBB which has had its
+/// tail spliced into a stack protector check success bb.
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB) {
+
+ // First create the loads to the guard/stack slot for the comparison.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+
+ MachineFrameInfo *MFI = ParentBB->getParent()->getFrameInfo();
+ int FI = MFI->getStackProtectorIndex();
+
+ SDValue Guard;
+ SDLoc dl = getCurSDLoc();
+ SDValue StackSlotPtr = DAG.getFrameIndex(FI, PtrTy);
+ const Module &M = *ParentBB->getParent()->getFunction()->getParent();
+ unsigned Align = DL->getPrefTypeAlignment(Type::getInt8PtrTy(M.getContext()));
+
+ // Generate code to load the content of the guard slot.
+ SDValue StackSlot = DAG.getLoad(
+ PtrTy, dl, DAG.getEntryNode(), StackSlotPtr,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI), Align,
+ MachineMemOperand::MOVolatile);
+
+ // Retrieve guard check function, nullptr if instrumentation is inlined.
+ if (const Value *GuardCheck = TLI.getSSPStackGuardCheck(M)) {
+ // The target provides a guard check function to validate the guard value.
+ // Generate a call to that function with the content of the guard slot as
+ // argument.
+ auto *Fn = cast<Function>(GuardCheck);
+ FunctionType *FnTy = Fn->getFunctionType();
+ assert(FnTy->getNumParams() == 1 && "Invalid function signature");
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = StackSlot;
+ Entry.Ty = FnTy->getParamType(0);
+ if (Fn->hasAttribute(1, Attribute::AttrKind::InReg))
+ Entry.isInReg = true;
+ Args.push_back(Entry);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(DAG.getEntryNode())
+ .setCallee(Fn->getCallingConv(), FnTy->getReturnType(),
+ getValue(GuardCheck), std::move(Args));
+
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ DAG.setRoot(Result.second);
+ return;
+ }
+
+ // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD.
+ // Otherwise, emit a volatile load to retrieve the stack guard value.
+ SDValue Chain = DAG.getEntryNode();
+ if (TLI.useLoadStackGuardNode()) {
+ Guard = getLoadStackGuard(DAG, dl, Chain);
+ } else {
+ const Value *IRGuard = TLI.getSDagStackGuard(M);
+ SDValue GuardPtr = getValue(IRGuard);
+
+ Guard =
+ DAG.getLoad(PtrTy, dl, Chain, GuardPtr, MachinePointerInfo(IRGuard, 0),
+ Align, MachineMemOperand::MOVolatile);
+ }
+
+ // Perform the comparison via a subtract/getsetcc.
+ EVT VT = Guard.getValueType();
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, StackSlot);
+
+ SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(0, dl, VT), ISD::SETNE);
+
+ // If the sub is not 0, then we know the guard and stack slot values are not
+ // equal, so branch to the failure MBB.
+ SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, StackSlot.getOperand(0),
+ Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
+ // Otherwise branch to success MBB.
+ SDValue Br = DAG.getNode(ISD::BR, dl,
+ MVT::Other, BrCond,
+ DAG.getBasicBlock(SPD.getSuccessMBB()));
+
+ DAG.setRoot(Br);
+}
+
+/// Codegen the failure basic block for a stack protector check.
+///
+/// A failure stack protector machine basic block consists simply of a call to
+/// __stack_chk_fail().
+///
+/// For a high level explanation of how this fits into the stack protector
+/// generation see the comment on the declaration of class
+/// StackProtectorDescriptor.
+void
+SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Chain =
+ TLI.makeLibCall(DAG, RTLIB::STACKPROTECTOR_CHECK_FAIL, MVT::isVoid,
+ None, false, getCurSDLoc(), false, false).second;
+ DAG.setRoot(Chain);
+}
+
+/// visitBitTestHeader - This function emits the code necessary to produce a
+/// value suitable for "bit tests".
+void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+
+ // Subtract the minimum value
+ SDValue SwitchOp = getValue(B.SValue);
+ EVT VT = SwitchOp.getValueType();
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SwitchOp,
+ DAG.getConstant(B.First, dl, VT));
+
+ // Check range
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue RangeCmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(),
+ Sub.getValueType()),
+ Sub, DAG.getConstant(B.Range, dl, VT), ISD::SETUGT);
+
+ // Determine the type of the test operands.
+ bool UsePtrType = false;
+ if (!TLI.isTypeLegal(VT))
+ UsePtrType = true;
+ else {
+ for (unsigned i = 0, e = B.Cases.size(); i != e; ++i)
+ if (!isUIntN(VT.getSizeInBits(), B.Cases[i].Mask)) {
+ // Switch table case ranges are encoded into a series of masks.
+ // Just use the pointer type; it's guaranteed to fit.
+ UsePtrType = true;
+ break;
+ }
+ }
+ if (UsePtrType) {
+ VT = TLI.getPointerTy(DAG.getDataLayout());
+ Sub = DAG.getZExtOrTrunc(Sub, dl, VT);
+ }
+
+ B.RegVT = VT.getSimpleVT();
+ B.Reg = FuncInfo.CreateReg(B.RegVT);
+ SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), dl, B.Reg, Sub);
+
+ MachineBasicBlock* MBB = B.Cases[0].ThisBB;
+
+ addSuccessorWithProb(SwitchBB, B.Default, B.DefaultProb);
+ addSuccessorWithProb(SwitchBB, MBB, B.Prob);
+ SwitchBB->normalizeSuccProbs();
+
+ SDValue BrRange = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, CopyTo, RangeCmp,
+ DAG.getBasicBlock(B.Default));
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (MBB != NextBlock(SwitchBB))
+ BrRange = DAG.getNode(ISD::BR, dl, MVT::Other, BrRange,
+ DAG.getBasicBlock(MBB));
+
+ DAG.setRoot(BrRange);
+}
+
+/// visitBitTestCase - This function produces one "bit test".
+void SelectionDAGBuilder::visitBitTestCase(BitTestBlock &BB,
+ MachineBasicBlock* NextMBB,
+ BranchProbability BranchProbToNext,
+ unsigned Reg,
+ BitTestCase &B,
+ MachineBasicBlock *SwitchBB) {
+ SDLoc dl = getCurSDLoc();
+ MVT VT = BB.RegVT;
+ SDValue ShiftOp = DAG.getCopyFromReg(getControlRoot(), dl, Reg, VT);
+ SDValue Cmp;
+ unsigned PopCount = countPopulation(B.Mask);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (PopCount == 1) {
+ // Testing for a single bit; just compare the shift count with what it
+ // would need to be to shift a 1 bit in that position.
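+ // For example, for Mask == 0x10 the test becomes (ShiftOp == 4).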
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(countTrailingZeros(B.Mask), dl, VT),
+ ISD::SETEQ);
+ } else if (PopCount == BB.Range) {
+ // There is only one zero bit in the range, test for it directly.
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ ShiftOp, DAG.getConstant(countTrailingOnes(B.Mask), dl, VT),
+ ISD::SETNE);
+ } else {
+ // Make desired shift
+ SDValue SwitchVal = DAG.getNode(ISD::SHL, dl, VT,
+ DAG.getConstant(1, dl, VT), ShiftOp);
+
+ // Emit bit tests and jumps
+ SDValue AndOp = DAG.getNode(ISD::AND, dl,
+ VT, SwitchVal, DAG.getConstant(B.Mask, dl, VT));
+ Cmp = DAG.getSetCC(
+ dl, TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT),
+ AndOp, DAG.getConstant(0, dl, VT), ISD::SETNE);
+ }
+
+ // The branch probability from SwitchBB to B.TargetBB is B.ExtraProb.
+ addSuccessorWithProb(SwitchBB, B.TargetBB, B.ExtraProb);
+ // The branch probability from SwitchBB to NextMBB is BranchProbToNext.
+ addSuccessorWithProb(SwitchBB, NextMBB, BranchProbToNext);
+ // The sum of B.ExtraProb and BranchProbToNext is not guaranteed to be one,
+ // as they are relative probabilities (and thus work more like weights),
+ // so we need to normalize them so that they sum to one.
+ SwitchBB->normalizeSuccProbs();
+
+ SDValue BrAnd = DAG.getNode(ISD::BRCOND, dl,
+ MVT::Other, getControlRoot(),
+ Cmp, DAG.getBasicBlock(B.TargetBB));
+
+ // Avoid emitting unnecessary branches to the next block.
+ if (NextMBB != NextBlock(SwitchBB))
+ BrAnd = DAG.getNode(ISD::BR, dl, MVT::Other, BrAnd,
+ DAG.getBasicBlock(NextMBB));
+
+ DAG.setRoot(BrAnd);
+}
+
+void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
+ MachineBasicBlock *InvokeMBB = FuncInfo.MBB;
+
+ // Retrieve successors. Look through artificial IR level blocks like
+ // catchswitch for successors.
+ MachineBasicBlock *Return = FuncInfo.MBBMap[I.getSuccessor(0)];
+ const BasicBlock *EHPadBB = I.getSuccessor(1);
+
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
+ "Cannot lower invokes with arbitrary operand bundles yet!");
+
+ const Value *Callee(I.getCalledValue());
+ const Function *Fn = dyn_cast<Function>(Callee);
+ if (isa<InlineAsm>(Callee))
+ visitInlineAsm(&I);
+ else if (Fn && Fn->isIntrinsic()) {
+ switch (Fn->getIntrinsicID()) {
+ default:
+ llvm_unreachable("Cannot invoke this intrinsic");
+ case Intrinsic::donothing:
+ // Ignore invokes to @llvm.donothing: jump directly to the next BB.
+ break;
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64:
+ visitPatchpoint(&I, EHPadBB);
+ break;
+ case Intrinsic::experimental_gc_statepoint:
+ LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
+ break;
+ }
+ } else if (I.countOperandBundlesOfType(LLVMContext::OB_deopt)) {
+ // Currently we do not lower any intrinsic calls with deopt operand bundles.
+ // Eventually we will support lowering the @llvm.experimental.deoptimize
+ // intrinsic, and right now there are no plans to support other intrinsics
+ // with deopt state.
+ LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
+ } else {
+ LowerCallTo(&I, getValue(Callee), false, EHPadBB);
+ }
+
+ // If the value of the invoke is used outside of its defining block, make it
+ // available as a virtual register.
+ // We already took care of the exported value for the statepoint instruction
+ // during the call to LowerStatepoint.
+ if (!isStatepoint(I)) {
+ CopyToExportRegsIfNeeded(&I);
+ }
+
+ SmallVector<std::pair<MachineBasicBlock *, BranchProbability>, 1> UnwindDests;
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ BranchProbability EHPadBBProb =
+ BPI ? BPI->getEdgeProbability(InvokeMBB->getBasicBlock(), EHPadBB)
+ : BranchProbability::getZero();
+ findUnwindDestinations(FuncInfo, EHPadBB, EHPadBBProb, UnwindDests);
+
+ // Update successor info.
+ addSuccessorWithProb(InvokeMBB, Return);
+ for (auto &UnwindDest : UnwindDests) {
+ UnwindDest.first->setIsEHPad();
+ addSuccessorWithProb(InvokeMBB, UnwindDest.first, UnwindDest.second);
+ }
+ InvokeMBB->normalizeSuccProbs();
+
+ // Drop into normal successor.
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(),
+ MVT::Other, getControlRoot(),
+ DAG.getBasicBlock(Return)));
+}
+
+void SelectionDAGBuilder::visitResume(const ResumeInst &RI) {
+ llvm_unreachable("SelectionDAGBuilder shouldn't visit resume instructions!");
+}
+
+void SelectionDAGBuilder::visitLandingPad(const LandingPadInst &LP) {
+ assert(FuncInfo.MBB->isEHPad() &&
+ "Call to landingpad not in landing pad!");
+
+ MachineBasicBlock *MBB = FuncInfo.MBB;
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ AddLandingPadInfo(LP, MMI, MBB);
+
+ // If there aren't registers to copy the values into (e.g., during SjLj
+ // exceptions), then don't bother to create these DAG nodes.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Constant *PersonalityFn = FuncInfo.Fn->getPersonalityFn();
+ if (TLI.getExceptionPointerRegister(PersonalityFn) == 0 &&
+ TLI.getExceptionSelectorRegister(PersonalityFn) == 0)
+ return;
+
+ // If the landingpad's return type is a token type, we don't create DAG nodes
+ // for its exception pointer and selector values. Extracting the exception
+ // pointer or selector value from a token-typed landingpad is not currently
+ // supported.
+ if (LP.getType()->isTokenTy())
+ return;
+
+ SmallVector<EVT, 2> ValueVTs;
+ SDLoc dl = getCurSDLoc();
+ ComputeValueVTs(TLI, DAG.getDataLayout(), LP.getType(), ValueVTs);
+ assert(ValueVTs.size() == 2 && "Only two-valued landingpads are supported");
+
+ // Get the two live-in registers as SDValues. The physregs have already been
+ // copied into virtual registers.
+ SDValue Ops[2];
+ if (FuncInfo.ExceptionPointerVirtReg) {
+ Ops[0] = DAG.getZExtOrTrunc(
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ FuncInfo.ExceptionPointerVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ dl, ValueVTs[0]);
+ } else {
+ Ops[0] = DAG.getConstant(0, dl, TLI.getPointerTy(DAG.getDataLayout()));
+ }
+ Ops[1] = DAG.getZExtOrTrunc(
+ DAG.getCopyFromReg(DAG.getEntryNode(), dl,
+ FuncInfo.ExceptionSelectorVirtReg,
+ TLI.getPointerTy(DAG.getDataLayout())),
+ dl, ValueVTs[1]);
+
+ // Merge into one.
+ SDValue Res = DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(ValueVTs), Ops);
+ setValue(&LP, Res);
+}
+
+void SelectionDAGBuilder::sortAndRangeify(CaseClusterVector &Clusters) {
+#ifndef NDEBUG
+ for (const CaseCluster &CC : Clusters)
+ assert(CC.Low == CC.High && "Input clusters must be single-case");
+#endif
+
+ std::sort(Clusters.begin(), Clusters.end(),
+ [](const CaseCluster &a, const CaseCluster &b) {
+ return a.Low->getValue().slt(b.Low->getValue());
+ });
+
+ // Merge adjacent clusters with the same destination.
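+ // For example, the single-case clusters 1->BB_A, 2->BB_A, 3->BB_B are merged
+ // into [1, 2]->BB_A and [3, 3]->BB_B, with the probabilities of merged cases
+ // summed.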
+ const unsigned N = Clusters.size();
+ unsigned DstIndex = 0;
+ for (unsigned SrcIndex = 0; SrcIndex < N; ++SrcIndex) {
+ CaseCluster &CC = Clusters[SrcIndex];
+ const ConstantInt *CaseVal = CC.Low;
+ MachineBasicBlock *Succ = CC.MBB;
+
+ if (DstIndex != 0 && Clusters[DstIndex - 1].MBB == Succ &&
+ (CaseVal->getValue() - Clusters[DstIndex - 1].High->getValue()) == 1) {
+ // If this case has the same successor and is a neighbour, merge it into
+ // the previous cluster.
+ Clusters[DstIndex - 1].High = CaseVal;
+ Clusters[DstIndex - 1].Prob += CC.Prob;
+ } else {
+ std::memmove(&Clusters[DstIndex++], &Clusters[SrcIndex],
+ sizeof(Clusters[SrcIndex]));
+ }
+ }
+ Clusters.resize(DstIndex);
+}
+
+void SelectionDAGBuilder::UpdateSplitBlock(MachineBasicBlock *First,
+ MachineBasicBlock *Last) {
+ // Update JTCases.
+ for (unsigned i = 0, e = JTCases.size(); i != e; ++i)
+ if (JTCases[i].first.HeaderBB == First)
+ JTCases[i].first.HeaderBB = Last;
+
+ // Update BitTestCases.
+ for (unsigned i = 0, e = BitTestCases.size(); i != e; ++i)
+ if (BitTestCases[i].Parent == First)
+ BitTestCases[i].Parent = Last;
+}
+
+void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) {
+ MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB;
+
+ // Update machine-CFG edges with unique successors.
+ SmallSet<BasicBlock*, 32> Done;
+ for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) {
+ BasicBlock *BB = I.getSuccessor(i);
+ bool Inserted = Done.insert(BB).second;
+ if (!Inserted)
+ continue;
+
+ MachineBasicBlock *Succ = FuncInfo.MBBMap[BB];
+ addSuccessorWithProb(IndirectBrMBB, Succ);
+ }
+ IndirectBrMBB->normalizeSuccProbs();
+
+ DAG.setRoot(DAG.getNode(ISD::BRIND, getCurSDLoc(),
+ MVT::Other, getControlRoot(),
+ getValue(I.getAddress())));
+}
+
+void SelectionDAGBuilder::visitUnreachable(const UnreachableInst &I) {
+ if (DAG.getTarget().Options.TrapUnreachable)
+ DAG.setRoot(
+ DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+}
+
+void SelectionDAGBuilder::visitFSub(const User &I) {
+ // -0.0 - X --> fneg
+ Type *Ty = I.getType();
+ if (isa<Constant>(I.getOperand(0)) &&
+ I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) {
+ SDValue Op2 = getValue(I.getOperand(1));
+ setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(),
+ Op2.getValueType(), Op2));
+ return;
+ }
+
+ visitBinary(I, ISD::FSUB);
+}
+
+/// Checks if the given instruction performs a vector reduction, in which case
+/// we have the freedom to alter the elements in the result as long as the
+/// reduction of them stays unchanged.
+static bool isVectorReductionOp(const User *I) {
+ const Instruction *Inst = dyn_cast<Instruction>(I);
+ if (!Inst || !Inst->getType()->isVectorTy())
+ return false;
+
+ auto OpCode = Inst->getOpcode();
+ switch (OpCode) {
+ case Instruction::Add:
+ case Instruction::Mul:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ break;
+ case Instruction::FAdd:
+ case Instruction::FMul:
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+ if (FPOp->getFastMathFlags().unsafeAlgebra())
+ break;
+ // Fall through.
+ default:
+ return false;
+ }
+
+ unsigned ElemNum = Inst->getType()->getVectorNumElements();
+ unsigned ElemNumToReduce = ElemNum;
+
+ // Do DFS search on the def-use chain from the given instruction. We only
+ // allow four kinds of operations during the search until we reach the
+ // instruction that extracts the first element from the vector:
+ //
+ // 1. The reduction operation of the same opcode as the given instruction.
+ //
+ // 2. PHI node.
+ //
+ // 3. ShuffleVector instruction together with a reduction operation that
+ // does a partial reduction.
+ //
+ // 4. ExtractElement that extracts the first element from the vector, and we
+ // stop searching the def-use chain here.
+ //
+ // 3 & 4 above perform a reduction on all elements of the vector. We push defs
+ // from 1-3 to the stack to continue the DFS. The given instruction is not
+ // a reduction operation if we meet any other instructions other than those
+ // listed above.
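+ //
+ // For illustration, a fully reduced <4 x i32> add typically feeds a tail of
+ // the following shape (names are made up):
+ //
+ // %rdx.shuf = shufflevector <4 x i32> %sum, <4 x i32> undef,
+ //                           <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+ // %bin.rdx = add <4 x i32> %sum, %rdx.shuf
+ // %rdx.shuf1 = shufflevector <4 x i32> %bin.rdx, <4 x i32> undef,
+ //                            <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+ // %bin.rdx1 = add <4 x i32> %bin.rdx, %rdx.shuf1
+ // %res = extractelement <4 x i32> %bin.rdx1, i32 0
+ //
+ // Starting the DFS at the add producing %sum, the two shuffles halve
+ // ElemNumToReduce from 4 to 1, and the final extract of element 0 marks the
+ // reduction as recognized.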
+
+ SmallVector<const User *, 16> UsersToVisit{Inst};
+ SmallPtrSet<const User *, 16> Visited;
+ bool ReduxExtracted = false;
+
+ while (!UsersToVisit.empty()) {
+ auto User = UsersToVisit.back();
+ UsersToVisit.pop_back();
+ if (!Visited.insert(User).second)
+ continue;
+
+ for (const auto &U : User->users()) {
+ auto Inst = dyn_cast<Instruction>(U);
+ if (!Inst)
+ return false;
+
+ if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
+ if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().unsafeAlgebra())
+ return false;
+ UsersToVisit.push_back(U);
+ } else if (const ShuffleVectorInst *ShufInst =
+ dyn_cast<ShuffleVectorInst>(U)) {
+ // Detect the following pattern: a ShuffleVector instruction together
+ // with a reduction operation that does a partial reduction over the first
+ // and second halves (ElemNumToReduce / 2 elements each), storing the result
+ // in the first ElemNumToReduce / 2 elements of another vector.
+
+ unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
+ if (ResultElements < ElemNum)
+ return false;
+
+ if (ElemNumToReduce == 1)
+ return false;
+ if (!isa<UndefValue>(U->getOperand(1)))
+ return false;
+ for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
+ if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
+ return false;
+ for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
+ if (ShufInst->getMaskValue(i) != -1)
+ return false;
+
+ // The ShuffleVector instruction must have exactly one user, and that user
+ // must be the reduction operation.
+ if (!U->hasOneUse())
+ return false;
+
+ auto U2 = dyn_cast<Instruction>(*U->user_begin());
+ if (!U2 || U2->getOpcode() != OpCode)
+ return false;
+
+ // Check operands of the reduction operation.
+ if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
+ (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
+ UsersToVisit.push_back(U2);
+ ElemNumToReduce /= 2;
+ } else
+ return false;
+ } else if (isa<ExtractElementInst>(U)) {
+ // At this moment we should have reduced all elements in the vector.
+ if (ElemNumToReduce != 1)
+ return false;
+
+ const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
+ if (!Val || Val->getZExtValue() != 0)
+ return false;
+
+ ReduxExtracted = true;
+ } else
+ return false;
+ }
+ }
+ return ReduxExtracted;
+}
+
+void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ bool nuw = false;
+ bool nsw = false;
+ bool exact = false;
+ bool vec_redux = false;
+ FastMathFlags FMF;
+
+ if (const OverflowingBinaryOperator *OFBinOp =
+ dyn_cast<const OverflowingBinaryOperator>(&I)) {
+ nuw = OFBinOp->hasNoUnsignedWrap();
+ nsw = OFBinOp->hasNoSignedWrap();
+ }
+ if (const PossiblyExactOperator *ExactOp =
+ dyn_cast<const PossiblyExactOperator>(&I))
+ exact = ExactOp->isExact();
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&I))
+ FMF = FPOp->getFastMathFlags();
+
+ if (isVectorReductionOp(&I)) {
+ vec_redux = true;
+ DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
+ }
+
+ SDNodeFlags Flags;
+ Flags.setExact(exact);
+ Flags.setNoSignedWrap(nsw);
+ Flags.setNoUnsignedWrap(nuw);
+ Flags.setVectorReduction(vec_redux);
+ if (EnableFMFInDAG) {
+ Flags.setAllowReciprocal(FMF.allowReciprocal());
+ Flags.setNoInfs(FMF.noInfs());
+ Flags.setNoNaNs(FMF.noNaNs());
+ Flags.setNoSignedZeros(FMF.noSignedZeros());
+ Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+ }
+ SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
+ Op1, Op2, &Flags);
+ setValue(&I, BinNodeValue);
+}
+
+void SelectionDAGBuilder::visitShift(const User &I, unsigned Opcode) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ EVT ShiftTy = DAG.getTargetLoweringInfo().getShiftAmountTy(
+ Op2.getValueType(), DAG.getDataLayout());
+
+ // Coerce the shift amount to the right type if we can.
+ if (!I.getType()->isVectorTy() && Op2.getValueType() != ShiftTy) {
+ unsigned ShiftSize = ShiftTy.getSizeInBits();
+ unsigned Op2Size = Op2.getValueType().getSizeInBits();
+ SDLoc DL = getCurSDLoc();
+
+ // If the operand is smaller than the shift count type, promote it.
+ if (ShiftSize > Op2Size)
+ Op2 = DAG.getNode(ISD::ZERO_EXTEND, DL, ShiftTy, Op2);
+
+ // If the operand is larger than the shift count type but the shift
+ // count type has enough bits to represent any shift value, truncate
+ // it now. This is a common case and it exposes the truncate to
+ // optimization early.
+ else if (ShiftSize >= Log2_32_Ceil(Op2.getValueType().getSizeInBits()))
+ Op2 = DAG.getNode(ISD::TRUNCATE, DL, ShiftTy, Op2);
+ // Otherwise we'll need to temporarily settle for some other convenient
+ // type. Type legalization will make adjustments once the shiftee is split.
+ else
+ Op2 = DAG.getZExtOrTrunc(Op2, DL, MVT::i32);
+ }
+
+ bool nuw = false;
+ bool nsw = false;
+ bool exact = false;
+
+ if (Opcode == ISD::SRL || Opcode == ISD::SRA || Opcode == ISD::SHL) {
+
+ if (const OverflowingBinaryOperator *OFBinOp =
+ dyn_cast<const OverflowingBinaryOperator>(&I)) {
+ nuw = OFBinOp->hasNoUnsignedWrap();
+ nsw = OFBinOp->hasNoSignedWrap();
+ }
+ if (const PossiblyExactOperator *ExactOp =
+ dyn_cast<const PossiblyExactOperator>(&I))
+ exact = ExactOp->isExact();
+ }
+ SDNodeFlags Flags;
+ Flags.setExact(exact);
+ Flags.setNoSignedWrap(nsw);
+ Flags.setNoUnsignedWrap(nuw);
+ SDValue Res = DAG.getNode(Opcode, getCurSDLoc(), Op1.getValueType(), Op1, Op2,
+ &Flags);
+ setValue(&I, Res);
+}
+
+void SelectionDAGBuilder::visitSDiv(const User &I) {
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+
+ SDNodeFlags Flags;
+ Flags.setExact(isa<PossiblyExactOperator>(&I) &&
+ cast<PossiblyExactOperator>(&I)->isExact());
+ setValue(&I, DAG.getNode(ISD::SDIV, getCurSDLoc(), Op1.getValueType(), Op1,
+ Op2, &Flags));
+}
+
+void SelectionDAGBuilder::visitICmp(const User &I) {
+ ICmpInst::Predicate predicate = ICmpInst::BAD_ICMP_PREDICATE;
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(&I))
+ predicate = IC->getPredicate();
+ else if (const ConstantExpr *IC = dyn_cast<ConstantExpr>(&I))
+ predicate = ICmpInst::Predicate(IC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Opcode = getICmpCondCode(predicate);
+
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Opcode));
+}
+
+void SelectionDAGBuilder::visitFCmp(const User &I) {
+ FCmpInst::Predicate predicate = FCmpInst::BAD_FCMP_PREDICATE;
+ if (const FCmpInst *FC = dyn_cast<FCmpInst>(&I))
+ predicate = FC->getPredicate();
+ else if (const ConstantExpr *FC = dyn_cast<ConstantExpr>(&I))
+ predicate = FCmpInst::Predicate(FC->getPredicate());
+ SDValue Op1 = getValue(I.getOperand(0));
+ SDValue Op2 = getValue(I.getOperand(1));
+ ISD::CondCode Condition = getFCmpCondCode(predicate);
+
+ // FIXME: Fcmp instructions have fast-math-flags in IR, so we should use them.
+ // FIXME: We should propagate the fast-math-flags to the DAG node itself for
+ // further optimization, but currently FMF is only applicable to binary nodes.
+ if (TM.Options.NoNaNsFPMath)
+ Condition = getFCmpCodeWithoutNaN(Condition);
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getSetCC(getCurSDLoc(), DestVT, Op1, Op2, Condition));
+}
+
+// Check that the condition of the select is used only by select instructions,
+// so that converting to a min/max does not leave the compare alive for any
+// other user.
+static bool hasOnlySelectUsers(const Value *Cond) {
+ return std::all_of(Cond->user_begin(), Cond->user_end(), [](const Value *V) {
+ return isa<SelectInst>(V);
+ });
+}
+
+void SelectionDAGBuilder::visitSelect(const User &I) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SDValue Cond = getValue(I.getOperand(0));
+ SDValue LHSVal = getValue(I.getOperand(1));
+ SDValue RHSVal = getValue(I.getOperand(2));
+ auto BaseOps = {Cond};
+ ISD::NodeType OpCode = Cond.getValueType().isVector() ?
+ ISD::VSELECT : ISD::SELECT;
+
+ // Min/max matching is only viable if all output VTs are the same.
+ if (std::equal(ValueVTs.begin(), ValueVTs.end(), ValueVTs.begin())) {
+ EVT VT = ValueVTs[0];
+ LLVMContext &Ctx = *DAG.getContext();
+ auto &TLI = DAG.getTargetLoweringInfo();
+
+ // We care about the legality of the operation after it has been type
+ // legalized.
+ while (TLI.getTypeAction(Ctx, VT) != TargetLoweringBase::TypeLegal &&
+ VT != TLI.getTypeToTransformTo(Ctx, VT))
+ VT = TLI.getTypeToTransformTo(Ctx, VT);
+
+ // If the vselect is legal, assume we want to leave this as a vector setcc +
+ // vselect. Otherwise, if this is going to be scalarized, we want to see if
+ // min/max is legal on the scalar type.
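+ // For example, a select on <4 x float> that will be scalarized is matched
+ // against FMINNUM/FMAXNUM legality on f32 rather than on v4f32.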
+ bool UseScalarMinMax = VT.isVector() &&
+ !TLI.isOperationLegalOrCustom(ISD::VSELECT, VT);
+
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(const_cast<User*>(&I), LHS, RHS);
+ ISD::NodeType Opc = ISD::DELETED_NODE;
+ switch (SPR.Flavor) {
+ case SPF_UMAX: Opc = ISD::UMAX; break;
+ case SPF_UMIN: Opc = ISD::UMIN; break;
+ case SPF_SMAX: Opc = ISD::SMAX; break;
+ case SPF_SMIN: Opc = ISD::SMIN; break;
+ case SPF_FMINNUM:
+ switch (SPR.NaNBehavior) {
+ case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
+ case SPNB_RETURNS_NAN: Opc = ISD::FMINNAN; break;
+ case SPNB_RETURNS_OTHER: Opc = ISD::FMINNUM; break;
+ case SPNB_RETURNS_ANY: {
+ if (TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT))
+ Opc = ISD::FMINNUM;
+ else if (TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT))
+ Opc = ISD::FMINNAN;
+ else if (UseScalarMinMax)
+ Opc = TLI.isOperationLegalOrCustom(ISD::FMINNUM, VT.getScalarType()) ?
+ ISD::FMINNUM : ISD::FMINNAN;
+ break;
+ }
+ }
+ break;
+ case SPF_FMAXNUM:
+ switch (SPR.NaNBehavior) {
+ case SPNB_NA: llvm_unreachable("No NaN behavior for FP op?");
+ case SPNB_RETURNS_NAN: Opc = ISD::FMAXNAN; break;
+ case SPNB_RETURNS_OTHER: Opc = ISD::FMAXNUM; break;
+ case SPNB_RETURNS_ANY:
+ if (TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT))
+ Opc = ISD::FMAXNUM;
+ else if (TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT))
+ Opc = ISD::FMAXNAN;
+ else if (UseScalarMinMax)
+ Opc = TLI.isOperationLegalOrCustom(ISD::FMAXNUM, VT.getScalarType()) ?
+ ISD::FMAXNUM : ISD::FMAXNAN;
+ break;
+ }
+ break;
+ default: break;
+ }
+
+ if (Opc != ISD::DELETED_NODE &&
+ (TLI.isOperationLegalOrCustom(Opc, VT) ||
+ (UseScalarMinMax &&
+ TLI.isOperationLegalOrCustom(Opc, VT.getScalarType()))) &&
+ // If the underlying comparison instruction is used by any other
+ // instruction, the consumed instructions won't be destroyed, so it is
+ // not profitable to convert to a min/max.
+ hasOnlySelectUsers(cast<SelectInst>(I).getCondition())) {
+ OpCode = Opc;
+ LHSVal = getValue(LHS);
+ RHSVal = getValue(RHS);
+ BaseOps = {};
+ }
+ }
+
+ for (unsigned i = 0; i != NumValues; ++i) {
+ SmallVector<SDValue, 3> Ops(BaseOps.begin(), BaseOps.end());
+ Ops.push_back(SDValue(LHSVal.getNode(), LHSVal.getResNo() + i));
+ Ops.push_back(SDValue(RHSVal.getNode(), RHSVal.getResNo() + i));
+ Values[i] = DAG.getNode(OpCode, getCurSDLoc(),
+ LHSVal.getNode()->getValueType(LHSVal.getResNo()+i),
+ Ops);
+ }
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitTrunc(const User &I) {
+ // TruncInst cannot be a no-op cast because sizeof(src) > sizeof(dest).
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitZExt(const User &I) {
+ // ZExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // ZExt also can't be a cast to bool for the same reason, so there is
+  // nothing much to do here.
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::ZERO_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSExt(const User &I) {
+ // SExt cannot be a no-op cast because sizeof(src) < sizeof(dest).
+  // SExt also can't be a cast to bool for the same reason, so there is
+  // nothing much to do here.
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::SIGN_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPTrunc(const User &I) {
+ // FPTrunc is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ SDLoc dl = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_ROUND, dl, DestVT, N,
+ DAG.getTargetConstant(
+ 0, dl, TLI.getPointerTy(DAG.getDataLayout()))));
+}
+
+void SelectionDAGBuilder::visitFPExt(const User &I) {
+ // FPExt is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToUI(const User &I) {
+ // FPToUI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_UINT, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitFPToSI(const User &I) {
+ // FPToSI is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::FP_TO_SINT, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitUIToFP(const User &I) {
+ // UIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::UINT_TO_FP, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitSIToFP(const User &I) {
+ // SIToFP is never a no-op cast, no need to check
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getNode(ISD::SINT_TO_FP, getCurSDLoc(), DestVT, N));
+}
+
+void SelectionDAGBuilder::visitPtrToInt(const User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
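+  // E.g. with 64-bit pointers, a ptrtoint to i32 truncates, to i128 zero
+  // extends, and to i64 is a no-op copy of the value.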
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitIntToPtr(const User &I) {
+ // What to do depends on the size of the integer and the size of the pointer.
+ // We can either truncate, zero extend, or no-op, accordingly.
+ SDValue N = getValue(I.getOperand(0));
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+ setValue(&I, DAG.getZExtOrTrunc(N, getCurSDLoc(), DestVT));
+}
+
+void SelectionDAGBuilder::visitBitCast(const User &I) {
+ SDValue N = getValue(I.getOperand(0));
+ SDLoc dl = getCurSDLoc();
+ EVT DestVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType());
+
+ // BitCast assures us that source and destination are the same size so this is
+ // either a BITCAST or a no-op.
+ if (DestVT != N.getValueType())
+ setValue(&I, DAG.getNode(ISD::BITCAST, dl,
+ DestVT, N)); // convert types.
+  // Check if the original LLVM IR Operand was a ConstantInt, because getValue()
+  // might fold any kind of constant expression to an integer constant and that
+  // is not what we are looking for. Only recognize a bitcast of a genuine
+  // constant integer as an opaque constant.
+  else if (ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
+ setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
+ /*isOpaque*/true));
+ else
+ setValue(&I, N); // noop cast.
+}
+
+void SelectionDAGBuilder::visitAddrSpaceCast(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Value *SV = I.getOperand(0);
+ SDValue N = getValue(SV);
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ unsigned SrcAS = SV->getType()->getPointerAddressSpace();
+ unsigned DestAS = I.getType()->getPointerAddressSpace();
+
+ if (!TLI.isNoopAddrSpaceCast(SrcAS, DestAS))
+ N = DAG.getAddrSpaceCast(getCurSDLoc(), DestVT, N, SrcAS, DestAS);
+
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitInsertElement(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InVal = getValue(I.getOperand(1));
+ SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(2)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ setValue(&I, DAG.getNode(ISD::INSERT_VECTOR_ELT, getCurSDLoc(),
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InVal, InIdx));
+}
+
+void SelectionDAGBuilder::visitExtractElement(const User &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue InVec = getValue(I.getOperand(0));
+ SDValue InIdx = DAG.getSExtOrTrunc(getValue(I.getOperand(1)), getCurSDLoc(),
+ TLI.getVectorIdxTy(DAG.getDataLayout()));
+ setValue(&I, DAG.getNode(ISD::EXTRACT_VECTOR_ELT, getCurSDLoc(),
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ InVec, InIdx));
+}
+
+void SelectionDAGBuilder::visitShuffleVector(const User &I) {
+ SDValue Src1 = getValue(I.getOperand(0));
+ SDValue Src2 = getValue(I.getOperand(1));
+
+ SmallVector<int, 8> Mask;
+ ShuffleVectorInst::getShuffleMask(cast<Constant>(I.getOperand(2)), Mask);
+ unsigned MaskNumElts = Mask.size();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ EVT SrcVT = Src1.getValueType();
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
+
+ if (SrcNumElts == MaskNumElts) {
+ setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2, Mask));
+ return;
+ }
+
+ // Normalize the shuffle vector since mask and vector length don't match.
+ if (SrcNumElts < MaskNumElts && MaskNumElts % SrcNumElts == 0) {
+ // Mask is longer than the source vectors and is a multiple of the source
+ // vectors. We can use concatenate vector to make the mask and vectors
+ // lengths match.
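+    // For example, shuffling two <2 x i32> sources with mask <0,1,2,3> is
+    // exactly concat(Src1, Src2). A mask such as <0,3,1,2> is not a
+    // concatenation, so both sources are padded with undef up to the mask
+    // length and the indices that referred to Src2 are shifted to the start
+    // of the widened second operand (3 becomes 5 here).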
+
+ unsigned NumConcat = MaskNumElts / SrcNumElts;
+
+ // Check if the shuffle is some kind of concatenation of the input vectors.
+ bool IsConcat = true;
+ SmallVector<int, 8> ConcatSrcs(NumConcat, -1);
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx < 0)
+ continue;
+ // Ensure the indices in each SrcVT sized piece are sequential and that
+ // the same source is used for the whole piece.
+ if ((Idx % SrcNumElts != (i % SrcNumElts)) ||
+ (ConcatSrcs[i / SrcNumElts] >= 0 &&
+ ConcatSrcs[i / SrcNumElts] != (int)(Idx / SrcNumElts))) {
+ IsConcat = false;
+ break;
+ }
+ // Remember which source this index came from.
+ ConcatSrcs[i / SrcNumElts] = Idx / SrcNumElts;
+ }
+
+ // The shuffle is concatenating multiple vectors together. Just emit
+ // a CONCAT_VECTORS operation.
+ if (IsConcat) {
+ SmallVector<SDValue, 8> ConcatOps;
+ for (auto Src : ConcatSrcs) {
+ if (Src < 0)
+ ConcatOps.push_back(DAG.getUNDEF(SrcVT));
+ else if (Src == 0)
+ ConcatOps.push_back(Src1);
+ else
+ ConcatOps.push_back(Src2);
+ }
+ setValue(&I, DAG.getNode(ISD::CONCAT_VECTORS, getCurSDLoc(),
+ VT, ConcatOps));
+ return;
+ }
+
+ // Pad both vectors with undefs to make them the same length as the mask.
+ SDValue UndefVal = DAG.getUNDEF(SrcVT);
+
+ SmallVector<SDValue, 8> MOps1(NumConcat, UndefVal);
+ SmallVector<SDValue, 8> MOps2(NumConcat, UndefVal);
+ MOps1[0] = Src1;
+ MOps2[0] = Src2;
+
+ Src1 = Src1.isUndef() ? DAG.getUNDEF(VT)
+ : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurSDLoc(), VT, MOps1);
+ Src2 = Src2.isUndef() ? DAG.getUNDEF(VT)
+ : DAG.getNode(ISD::CONCAT_VECTORS,
+ getCurSDLoc(), VT, MOps2);
+
+ // Readjust mask for new input vector length.
+ SmallVector<int, 8> MappedOps;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx >= (int)SrcNumElts)
+ Idx -= SrcNumElts - MaskNumElts;
+ MappedOps.push_back(Idx);
+ }
+
+ setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
+ MappedOps));
+ return;
+ }
+
+ if (SrcNumElts > MaskNumElts) {
+    // Analyze the access pattern of the vector to see if we can extract
+    // two subvectors and do the shuffle. The analysis is done by calculating
+    // the range of elements the mask accesses in both vectors.
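+    // For example, an <8 x i32> source shuffled by the 4-element mask
+    // <4,5,6,7> only touches the upper half, so we can extract the subvector
+    // starting at element 4 and rewrite the mask as <0,1,2,3>.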
+ int MinRange[2] = { static_cast<int>(SrcNumElts),
+ static_cast<int>(SrcNumElts)};
+ int MaxRange[2] = {-1, -1};
+
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ unsigned Input = 0;
+ if (Idx < 0)
+ continue;
+
+ if (Idx >= (int)SrcNumElts) {
+ Input = 1;
+ Idx -= SrcNumElts;
+ }
+ if (Idx > MaxRange[Input])
+ MaxRange[Input] = Idx;
+ if (Idx < MinRange[Input])
+ MinRange[Input] = Idx;
+ }
+
+    // Check if the access is smaller than the vector size and whether we can
+    // find a reasonable extract index.
+    int RangeUse[2] = { -1, -1 }; // 0 = Unused, 1 = Extract,
+                                  // -1 = Can not extract.
+ int StartIdx[2]; // StartIdx to extract from
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) {
+ RangeUse[Input] = 0; // Unused
+ StartIdx[Input] = 0;
+ continue;
+ }
+
+ // Find a good start index that is a multiple of the mask length. Then
+ // see if the rest of the elements are in range.
+ StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
+ if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
+ StartIdx[Input] + MaskNumElts <= SrcNumElts)
+ RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+ }
+
+ if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+ setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
+ return;
+ }
+ if (RangeUse[0] >= 0 && RangeUse[1] >= 0) {
+ // Extract appropriate subvector and generate a vector shuffle
+ for (unsigned Input = 0; Input < 2; ++Input) {
+ SDValue &Src = Input == 0 ? Src1 : Src2;
+ if (RangeUse[Input] == 0)
+ Src = DAG.getUNDEF(VT);
+ else {
+ SDLoc dl = getCurSDLoc();
+ Src = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, VT, Src,
+ DAG.getConstant(StartIdx[Input], dl,
+ TLI.getVectorIdxTy(DAG.getDataLayout())));
+ }
+ }
+
+ // Calculate new mask.
+ SmallVector<int, 8> MappedOps;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ if (Idx >= 0) {
+ if (Idx < (int)SrcNumElts)
+ Idx -= StartIdx[0];
+ else
+ Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+ }
+ MappedOps.push_back(Idx);
+ }
+
+ setValue(&I, DAG.getVectorShuffle(VT, getCurSDLoc(), Src1, Src2,
+ MappedOps));
+ return;
+ }
+ }
+
+  // We can't use either concat vectors or extract subvectors, so fall back to
+  // replacing the shuffle with extracts and a build vector.
+ EVT EltVT = VT.getVectorElementType();
+ EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ SDLoc dl = getCurSDLoc();
+ SmallVector<SDValue,8> Ops;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ int Idx = Mask[i];
+ SDValue Res;
+
+ if (Idx < 0) {
+ Res = DAG.getUNDEF(EltVT);
+ } else {
+ SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
+ if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
+ EltVT, Src, DAG.getConstant(Idx, dl, IdxVT));
+ }
+
+ Ops.push_back(Res);
+ }
+
+ setValue(&I, DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops));
+}
+
+void SelectionDAGBuilder::visitInsertValue(const InsertValueInst &I) {
+ const Value *Op0 = I.getOperand(0);
+ const Value *Op1 = I.getOperand(1);
+ Type *AggTy = I.getType();
+ Type *ValTy = Op1->getType();
+ bool IntoUndef = isa<UndefValue>(Op0);
+ bool FromUndef = isa<UndefValue>(Op1);
+
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 4> AggValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), AggTy, AggValueVTs);
+ SmallVector<EVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
+
+ unsigned NumAggValues = AggValueVTs.size();
+ unsigned NumValValues = ValValueVTs.size();
+ SmallVector<SDValue, 4> Values(NumAggValues);
+
+ // Ignore an insertvalue that produces an empty object
+ if (!NumAggValues) {
+ setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+ return;
+ }
+
+ SDValue Agg = getValue(Op0);
+ unsigned i = 0;
+ // Copy the beginning value(s) from the original aggregate.
+ for (; i != LinearIndex; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+ // Copy values from the inserted value(s).
+ if (NumValValues) {
+ SDValue Val = getValue(Op1);
+ for (; i != LinearIndex + NumValValues; ++i)
+ Values[i] = FromUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Val.getNode(), Val.getResNo() + i - LinearIndex);
+ }
+ // Copy remaining value(s) from the original aggregate.
+ for (; i != NumAggValues; ++i)
+ Values[i] = IntoUndef ? DAG.getUNDEF(AggValueVTs[i]) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(AggValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitExtractValue(const ExtractValueInst &I) {
+ const Value *Op0 = I.getOperand(0);
+ Type *AggTy = Op0->getType();
+ Type *ValTy = I.getType();
+ bool OutOfUndef = isa<UndefValue>(Op0);
+
+ unsigned LinearIndex = ComputeLinearIndex(AggTy, I.getIndices());
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 4> ValValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), ValTy, ValValueVTs);
+
+ unsigned NumValValues = ValValueVTs.size();
+
+  // Ignore an extractvalue that produces an empty object.
+ if (!NumValValues) {
+ setValue(&I, DAG.getUNDEF(MVT(MVT::Other)));
+ return;
+ }
+
+ SmallVector<SDValue, 4> Values(NumValValues);
+
+ SDValue Agg = getValue(Op0);
+ // Copy out the selected value(s).
+ for (unsigned i = LinearIndex; i != LinearIndex + NumValValues; ++i)
+ Values[i - LinearIndex] =
+ OutOfUndef ?
+ DAG.getUNDEF(Agg.getNode()->getValueType(Agg.getResNo() + i)) :
+ SDValue(Agg.getNode(), Agg.getResNo() + i);
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
+ Value *Op0 = I.getOperand(0);
+ // Note that the pointer operand may be a vector of pointers. Take the scalar
+ // element which holds a pointer.
+ unsigned AS = Op0->getType()->getScalarType()->getPointerAddressSpace();
+ SDValue N = getValue(Op0);
+ SDLoc dl = getCurSDLoc();
+
+ // Normalize Vector GEP - all scalar operands should be converted to the
+ // splat vector.
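+  // For example, for
+  //   %p = getelementptr i32, i32* %base, <4 x i64> %idx
+  // the scalar %base is splatted into a 4-element BUILD_VECTOR so the address
+  // arithmetic below happens lane-wise.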
+ unsigned VectorWidth = I.getType()->isVectorTy() ?
+ cast<VectorType>(I.getType())->getVectorNumElements() : 0;
+
+ if (VectorWidth && !N.getValueType().isVector()) {
+ LLVMContext &Context = *DAG.getContext();
+ EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
+ SmallVector<SDValue, 16> Ops(VectorWidth, N);
+ N = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
+ for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
+ GTI != E; ++GTI) {
+ const Value *Idx = GTI.getOperand();
+ if (StructType *StTy = dyn_cast<StructType>(*GTI)) {
+ unsigned Field = cast<Constant>(Idx)->getUniqueInteger().getZExtValue();
+ if (Field) {
+ // N = N + Offset
+ uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
+
+        // In an inbounds GEP with an offset that is nonnegative even when
+        // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N,
+ DAG.getConstant(Offset, dl, N.getValueType()), &Flags);
+ }
+ } else {
+ MVT PtrTy =
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout(), AS);
+ unsigned PtrSize = PtrTy.getSizeInBits();
+ APInt ElementSize(PtrSize, DL->getTypeAllocSize(GTI.getIndexedType()));
+
+ // If this is a scalar constant or a splat vector of constants,
+ // handle it quickly.
+ const auto *CI = dyn_cast<ConstantInt>(Idx);
+ if (!CI && isa<ConstantDataVector>(Idx) &&
+ cast<ConstantDataVector>(Idx)->getSplatValue())
+ CI = cast<ConstantInt>(cast<ConstantDataVector>(Idx)->getSplatValue());
+
+ if (CI) {
+ if (CI->isZero())
+ continue;
+ APInt Offs = ElementSize * CI->getValue().sextOrTrunc(PtrSize);
+ SDValue OffsVal = VectorWidth ?
+ DAG.getConstant(Offs, dl, MVT::getVectorVT(PtrTy, VectorWidth)) :
+ DAG.getConstant(Offs, dl, PtrTy);
+
+          // In an inbounds GEP with an offset that is nonnegative even when
+          // interpreted as signed, assume there is no unsigned overflow.
+ SDNodeFlags Flags;
+ if (Offs.isNonNegative() && cast<GEPOperator>(I).isInBounds())
+ Flags.setNoUnsignedWrap(true);
+
+ N = DAG.getNode(ISD::ADD, dl, N.getValueType(), N, OffsVal, &Flags);
+ continue;
+ }
+
+ // N = N + Idx * ElementSize;
+ SDValue IdxN = getValue(Idx);
+
+ if (!IdxN.getValueType().isVector() && VectorWidth) {
+ MVT VT = MVT::getVectorVT(IdxN.getValueType().getSimpleVT(), VectorWidth);
+ SmallVector<SDValue, 16> Ops(VectorWidth, IdxN);
+ IdxN = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, Ops);
+ }
+ // If the index is smaller or larger than intptr_t, truncate or extend
+ // it.
+ IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
+
+ // If this is a multiply by a power of two, turn it into a shl
+ // immediately. This is a very common case.
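+        // E.g. an element size of 8 bytes becomes "IdxN << 3".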
+ if (ElementSize != 1) {
+ if (ElementSize.isPowerOf2()) {
+ unsigned Amt = ElementSize.logBase2();
+ IdxN = DAG.getNode(ISD::SHL, dl,
+ N.getValueType(), IdxN,
+ DAG.getConstant(Amt, dl, IdxN.getValueType()));
+ } else {
+ SDValue Scale = DAG.getConstant(ElementSize, dl, IdxN.getValueType());
+ IdxN = DAG.getNode(ISD::MUL, dl,
+ N.getValueType(), IdxN, Scale);
+ }
+ }
+
+ N = DAG.getNode(ISD::ADD, dl,
+ N.getValueType(), N, IdxN);
+ }
+ }
+
+ setValue(&I, N);
+}
+
+void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
+ // If this is a fixed sized alloca in the entry block of the function,
+ // allocate it statically on the stack.
+ if (FuncInfo.StaticAllocaMap.count(&I))
+ return; // getValue will auto-populate this.
+
+ SDLoc dl = getCurSDLoc();
+ Type *Ty = I.getAllocatedType();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto &DL = DAG.getDataLayout();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
+ unsigned Align =
+ std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());
+
+ SDValue AllocSize = getValue(I.getArraySize());
+
+ EVT IntPtr = TLI.getPointerTy(DAG.getDataLayout());
+ if (AllocSize.getValueType() != IntPtr)
+ AllocSize = DAG.getZExtOrTrunc(AllocSize, dl, IntPtr);
+
+ AllocSize = DAG.getNode(ISD::MUL, dl, IntPtr,
+ AllocSize,
+ DAG.getConstant(TySize, dl, IntPtr));
+
+  // Handle alignment. If the requested alignment is less than or equal to
+  // the stack alignment, ignore it. If the requested alignment is greater
+  // than the stack alignment, we note it in the DYNAMIC_STACKALLOC node.
+ unsigned StackAlign =
+ DAG.getSubtarget().getFrameLowering()->getStackAlignment();
+ if (Align <= StackAlign)
+ Align = 0;
+
+  // Round the size of the allocation up to the stack alignment size
+  // by adding SA-1 to the size. This doesn't overflow because we're computing
+  // an address inside an alloca.
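+  // For example, with a 16-byte stack alignment a request for 20 bytes is
+  // rounded up as (20 + 15) & ~15 = 32.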
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+ AllocSize = DAG.getNode(ISD::ADD, dl,
+ AllocSize.getValueType(), AllocSize,
+ DAG.getIntPtrConstant(StackAlign - 1, dl), &Flags);
+
+ // Mask out the low bits for alignment purposes.
+ AllocSize = DAG.getNode(ISD::AND, dl,
+ AllocSize.getValueType(), AllocSize,
+ DAG.getIntPtrConstant(~(uint64_t)(StackAlign - 1),
+ dl));
+
+ SDValue Ops[] = { getRoot(), AllocSize, DAG.getIntPtrConstant(Align, dl) };
+ SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
+ SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
+ setValue(&I, DSA);
+ DAG.setRoot(DSA.getValue(1));
+
+ assert(FuncInfo.MF->getFrameInfo()->hasVarSizedObjects());
+}
+
+void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
+ if (I.isAtomic())
+ return visitAtomicLoad(I);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const Value *SV = I.getOperand(0);
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(SV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitLoadFromSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(SV)) {
+ if (Alloca->isSwiftError())
+ return visitLoadFromSwiftError(I);
+ }
+ }
+
+ SDValue Ptr = getValue(SV);
+
+ Type *Ty = I.getType();
+
+ bool isVolatile = I.isVolatile();
+ bool isNonTemporal = I.getMetadata(LLVMContext::MD_nontemporal) != nullptr;
+
+ // The IR notion of invariant_load only guarantees that all *non-faulting*
+ // invariant loads result in the same value. The MI notion of invariant load
+ // guarantees that the load can be legally moved to any location within its
+ // containing function. The MI notion of invariant_load is stronger than the
+ // IR notion of invariant_load -- an MI invariant_load is an IR invariant_load
+ // with a guarantee that the location being loaded from is dereferenceable
+ // throughout the function's lifetime.
+
+ bool isInvariant = I.getMetadata(LLVMContext::MD_invariant_load) != nullptr &&
+ isDereferenceablePointer(SV, DAG.getDataLayout());
+ unsigned Alignment = I.getAlignment();
+
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+ const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Ty, ValueVTs, &Offsets);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+ SDValue Root;
+ bool ConstantMemory = false;
+ if (isVolatile || NumValues > MaxParallelChains)
+ // Serialize volatile loads with other side effects.
+ Root = getRoot();
+ else if (AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) {
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ Root = DAG.getEntryNode();
+ ConstantMemory = true;
+ } else {
+ // Do not serialize non-volatile loads against each other.
+ Root = DAG.getRoot();
+ }
+
+ SDLoc dl = getCurSDLoc();
+
+ if (isVolatile)
+ Root = TLI.prepareVolatileOrAtomicLoad(Root, dl, DAG);
+
+ // An aggregate load cannot wrap around the address space, so offsets to its
+ // parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
+ EVT PtrVT = Ptr.getValueType();
+ unsigned ChainI = 0;
+ for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+ // Serializing loads here may result in excessive register pressure, and
+ // TokenFactor places arbitrary choke points on the scheduler. SD scheduling
+ // could recover a bit by hoisting nodes upward in the chain by recognizing
+ // they are side-effect free or do not alias. The optimizer should really
+ // avoid this case by converting large object/array copies to llvm.memcpy
+    // (MaxParallelChains should always remain as a failsafe).
+ if (ChainI == MaxParallelChains) {
+ assert(PendingLoads.empty() && "PendingLoads must be serialized first");
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(Chains.data(), ChainI));
+ Root = Chain;
+ ChainI = 0;
+ }
+ SDValue A = DAG.getNode(ISD::ADD, dl,
+ PtrVT, Ptr,
+ DAG.getConstant(Offsets[i], dl, PtrVT),
+ &Flags);
+ auto MMOFlags = MachineMemOperand::MONone;
+ if (isVolatile)
+ MMOFlags |= MachineMemOperand::MOVolatile;
+ if (isNonTemporal)
+ MMOFlags |= MachineMemOperand::MONonTemporal;
+ if (isInvariant)
+ MMOFlags |= MachineMemOperand::MOInvariant;
+
+ SDValue L = DAG.getLoad(ValueVTs[i], dl, Root, A,
+ MachinePointerInfo(SV, Offsets[i]), Alignment,
+ MMOFlags, AAInfo, Ranges);
+
+ Values[i] = L;
+ Chains[ChainI] = L.getValue(1);
+ }
+
+ if (!ConstantMemory) {
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(Chains.data(), ChainI));
+ if (isVolatile)
+ DAG.setRoot(Chain);
+ else
+ PendingLoads.push_back(Chain);
+ }
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, dl,
+ DAG.getVTList(ValueVTs), Values));
+}
+
+void SelectionDAGBuilder::visitStoreToSwiftError(const StoreInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ assert(TLI.supportSwiftError() &&
+ "call visitStoreToSwiftError when backend supports swifterror");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ const Value *SrcV = I.getOperand(0);
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &Offsets);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ SDValue Src = getValue(SrcV);
+  // Create a virtual register, then copy the swifterror value into it.
+ auto &DL = DAG.getDataLayout();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ // Chain, DL, Reg, N or Chain, DL, Reg, N, Glue
+ // Chain can be getRoot or getControlRoot.
+ SDValue CopyNode = DAG.getCopyToReg(getRoot(), getCurSDLoc(), VReg,
+ SDValue(Src.getNode(), Src.getResNo()));
+ DAG.setRoot(CopyNode);
+ FuncInfo.setSwiftErrorVReg(FuncInfo.MBB, I.getOperand(1), VReg);
+}
+
+void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) {
+ assert(DAG.getTargetLoweringInfo().supportSwiftError() &&
+ "call visitLoadFromSwiftError when backend supports swifterror");
+
+  assert(!I.isVolatile() &&
+         I.getMetadata(LLVMContext::MD_nontemporal) == nullptr &&
+         I.getMetadata(LLVMContext::MD_invariant_load) == nullptr &&
+         "volatile, non-temporal and invariant loads are not supported "
+         "for load_from_swift_error");
+
+ const Value *SV = I.getOperand(0);
+ Type *Ty = I.getType();
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+ assert(!AA->pointsToConstantMemory(MemoryLocation(
+ SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) &&
+ "load_from_swift_error should not be constant memory");
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), Ty,
+ ValueVTs, &Offsets);
+ assert(ValueVTs.size() == 1 && Offsets[0] == 0 &&
+ "expect a single EVT for swifterror");
+
+ // Chain, DL, Reg, VT, Glue or Chain, DL, Reg, VT
+ SDValue L = DAG.getCopyFromReg(getRoot(), getCurSDLoc(),
+ FuncInfo.findSwiftErrorVReg(FuncInfo.MBB, SV),
+ ValueVTs[0]);
+
+ setValue(&I, L);
+}
+
+void SelectionDAGBuilder::visitStore(const StoreInst &I) {
+ if (I.isAtomic())
+ return visitAtomicStore(I);
+
+ const Value *SrcV = I.getOperand(0);
+ const Value *PtrV = I.getOperand(1);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.supportSwiftError()) {
+ // Swifterror values can come from either a function parameter with
+ // swifterror attribute or an alloca with swifterror attribute.
+ if (const Argument *Arg = dyn_cast<Argument>(PtrV)) {
+ if (Arg->hasSwiftErrorAttr())
+ return visitStoreToSwiftError(I);
+ }
+
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(PtrV)) {
+ if (Alloca->isSwiftError())
+ return visitStoreToSwiftError(I);
+ }
+ }
+
+ SmallVector<EVT, 4> ValueVTs;
+ SmallVector<uint64_t, 4> Offsets;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(),
+ SrcV->getType(), ValueVTs, &Offsets);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0)
+ return;
+
+  // Get the lowered operands. Note that we do this after
+  // checking if NumValues is zero, because with zero values
+  // the operands won't have entries in the map.
+ SDValue Src = getValue(SrcV);
+ SDValue Ptr = getValue(PtrV);
+
+ SDValue Root = getRoot();
+ SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
+ SDLoc dl = getCurSDLoc();
+ EVT PtrVT = Ptr.getValueType();
+ unsigned Alignment = I.getAlignment();
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+
+ auto MMOFlags = MachineMemOperand::MONone;
+ if (I.isVolatile())
+ MMOFlags |= MachineMemOperand::MOVolatile;
+ if (I.getMetadata(LLVMContext::MD_nontemporal) != nullptr)
+ MMOFlags |= MachineMemOperand::MONonTemporal;
+
+  // An aggregate store cannot wrap around the address space, so offsets to its
+  // parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ unsigned ChainI = 0;
+ for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
+ // See visitLoad comments.
+ if (ChainI == MaxParallelChains) {
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(Chains.data(), ChainI));
+ Root = Chain;
+ ChainI = 0;
+ }
+ SDValue Add = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
+ DAG.getConstant(Offsets[i], dl, PtrVT), &Flags);
+ SDValue St = DAG.getStore(
+ Root, dl, SDValue(Src.getNode(), Src.getResNo() + i), Add,
+ MachinePointerInfo(PtrV, Offsets[i]), Alignment, MMOFlags, AAInfo);
+ Chains[ChainI] = St;
+ }
+
+ SDValue StoreNode = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ makeArrayRef(Chains.data(), ChainI));
+ DAG.setRoot(StoreNode);
+}
+
+void SelectionDAGBuilder::visitMaskedStore(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+ // llvm.masked.store.*(Src0, Ptr, alignment, Mask)
+ Value *PtrOperand = I.getArgOperand(1);
+ SDValue Ptr = getValue(PtrOperand);
+ SDValue Src0 = getValue(I.getArgOperand(0));
+ SDValue Mask = getValue(I.getArgOperand(3));
+ EVT VT = Src0.getValueType();
+ unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlignment(VT);
+
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(PtrOperand),
+ MachineMemOperand::MOStore, VT.getStoreSize(),
+ Alignment, AAInfo);
+ SDValue StoreNode = DAG.getMaskedStore(getRoot(), sdl, Src0, Ptr, Mask, VT,
+ MMO, false);
+ DAG.setRoot(StoreNode);
+ setValue(&I, StoreNode);
+}
+
+// Get a uniform base for the Gather/Scatter intrinsic.
+// The first argument of the Gather/Scatter intrinsic is a vector of pointers.
+// We try to represent it as a base pointer + vector of indices.
+// Usually, the vector of pointers comes from a 'getelementptr' instruction.
+// The first operand of the GEP may be a single pointer or a vector of pointers
+// Example:
+// %gep.ptr = getelementptr i32, <8 x i32*> %vptr, <8 x i32> %ind
+// or
+// %gep.ptr = getelementptr i32, i32* %ptr, <8 x i32> %ind
+// %res = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %gep.ptr, ..
+//
+// When the first GEP operand is a single pointer, it is the uniform base we
+// are looking for. If the first operand of the GEP is a splat vector, we
+// extract the splat value and use it as the uniform base.
+// In all other cases the function returns 'false'.
+//
+static bool getUniformBase(const Value *&Ptr, SDValue &Base, SDValue &Index,
+                           SelectionDAGBuilder *SDB) {
+  SelectionDAG &DAG = SDB->DAG;
+ LLVMContext &Context = *DAG.getContext();
+
+  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || GEP->getNumOperands() > 2)
+ return false;
+
+ const Value *GEPPtr = GEP->getPointerOperand();
+ if (!GEPPtr->getType()->isVectorTy())
+ Ptr = GEPPtr;
+ else if (!(Ptr = getSplatValue(GEPPtr)))
+ return false;
+
+ Value *IndexVal = GEP->getOperand(1);
+
+ // The operands of the GEP may be defined in another basic block.
+ // In this case we'll not find nodes for the operands.
+ if (!SDB->findValue(Ptr) || !SDB->findValue(IndexVal))
+ return false;
+
+ Base = SDB->getValue(Ptr);
+ Index = SDB->getValue(IndexVal);
+
+ // Suppress sign extension.
+ if (SExtInst* Sext = dyn_cast<SExtInst>(IndexVal)) {
+ if (SDB->findValue(Sext->getOperand(0))) {
+ IndexVal = Sext->getOperand(0);
+ Index = SDB->getValue(IndexVal);
+ }
+ }
+ if (!Index.getValueType().isVector()) {
+ unsigned GEPWidth = GEP->getType()->getVectorNumElements();
+ EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
+ SmallVector<SDValue, 16> Ops(GEPWidth, Index);
+ Index = DAG.getNode(ISD::BUILD_VECTOR, SDLoc(Index), VT, Ops);
+ }
+ return true;
+}
+
+void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+  // llvm.masked.scatter.*(Src0, Ptrs, alignment, Mask)
+ const Value *Ptr = I.getArgOperand(1);
+ SDValue Src0 = getValue(I.getArgOperand(0));
+ SDValue Mask = getValue(I.getArgOperand(3));
+ EVT VT = Src0.getValueType();
+ unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlignment(VT);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+
+ SDValue Base;
+ SDValue Index;
+ const Value *BasePtr = Ptr;
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
+
+ const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
+ MachineMemOperand *MMO = DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
+ MachineMemOperand::MOStore, VT.getStoreSize(),
+ Alignment, AAInfo);
+ if (!UniformBase) {
+ Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(Ptr);
+ }
+ SDValue Ops[] = { getRoot(), Src0, Mask, Base, Index };
+ SDValue Scatter = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), VT, sdl,
+ Ops, MMO);
+ DAG.setRoot(Scatter);
+ setValue(&I, Scatter);
+}
+
+void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+ // @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
+ Value *PtrOperand = I.getArgOperand(0);
+ SDValue Ptr = getValue(PtrOperand);
+ SDValue Src0 = getValue(I.getArgOperand(3));
+ SDValue Mask = getValue(I.getArgOperand(2));
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlignment(VT);
+
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+ const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+ // Do not serialize masked loads of constant memory with anything.
+ bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation(
+ PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo));
+ SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
+
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(PtrOperand),
+ MachineMemOperand::MOLoad, VT.getStoreSize(),
+ Alignment, AAInfo, Ranges);
+
+ SDValue Load = DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Mask, Src0, VT, MMO,
+ ISD::NON_EXTLOAD);
+ if (AddToChain) {
+ SDValue OutChain = Load.getValue(1);
+ DAG.setRoot(OutChain);
+ }
+ setValue(&I, Load);
+}
+
+void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
+ SDLoc sdl = getCurSDLoc();
+
+ // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
+ const Value *Ptr = I.getArgOperand(0);
+ SDValue Src0 = getValue(I.getArgOperand(3));
+ SDValue Mask = getValue(I.getArgOperand(2));
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
+ if (!Alignment)
+ Alignment = DAG.getEVTAlignment(VT);
+
+ AAMDNodes AAInfo;
+ I.getAAMetadata(AAInfo);
+ const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range);
+
+ SDValue Root = DAG.getRoot();
+ SDValue Base;
+ SDValue Index;
+ const Value *BasePtr = Ptr;
+ bool UniformBase = getUniformBase(BasePtr, Base, Index, this);
+ bool ConstantMemory = false;
+ if (UniformBase &&
+ AA->pointsToConstantMemory(MemoryLocation(
+ BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()),
+ AAInfo))) {
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ Root = DAG.getEntryNode();
+ ConstantMemory = true;
+ }
+
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
+ MachineMemOperand::MOLoad, VT.getStoreSize(),
+ Alignment, AAInfo, Ranges);
+
+ if (!UniformBase) {
+ Base = DAG.getTargetConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
+ Index = getValue(Ptr);
+ }
+ SDValue Ops[] = { Root, Src0, Mask, Base, Index };
+ SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
+ Ops, MMO);
+
+ SDValue OutChain = Gather.getValue(1);
+ if (!ConstantMemory)
+ PendingLoads.push_back(OutChain);
+ setValue(&I, Gather);
+}
+
+void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
+ SDLoc dl = getCurSDLoc();
+ AtomicOrdering SuccessOrder = I.getSuccessOrdering();
+ AtomicOrdering FailureOrder = I.getFailureOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
+ SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
+ SDValue L = DAG.getAtomicCmpSwap(
+ ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, dl, MemVT, VTs, InChain,
+ getValue(I.getPointerOperand()), getValue(I.getCompareOperand()),
+ getValue(I.getNewValOperand()), MachinePointerInfo(I.getPointerOperand()),
+ /*Alignment=*/ 0, SuccessOrder, FailureOrder, Scope);
+
+ SDValue OutChain = L.getValue(2);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
+ SDLoc dl = getCurSDLoc();
+ ISD::NodeType NT;
+ switch (I.getOperation()) {
+ default: llvm_unreachable("Unknown atomicrmw operation");
+ case AtomicRMWInst::Xchg: NT = ISD::ATOMIC_SWAP; break;
+ case AtomicRMWInst::Add: NT = ISD::ATOMIC_LOAD_ADD; break;
+ case AtomicRMWInst::Sub: NT = ISD::ATOMIC_LOAD_SUB; break;
+ case AtomicRMWInst::And: NT = ISD::ATOMIC_LOAD_AND; break;
+ case AtomicRMWInst::Nand: NT = ISD::ATOMIC_LOAD_NAND; break;
+ case AtomicRMWInst::Or: NT = ISD::ATOMIC_LOAD_OR; break;
+ case AtomicRMWInst::Xor: NT = ISD::ATOMIC_LOAD_XOR; break;
+ case AtomicRMWInst::Max: NT = ISD::ATOMIC_LOAD_MAX; break;
+ case AtomicRMWInst::Min: NT = ISD::ATOMIC_LOAD_MIN; break;
+ case AtomicRMWInst::UMax: NT = ISD::ATOMIC_LOAD_UMAX; break;
+ case AtomicRMWInst::UMin: NT = ISD::ATOMIC_LOAD_UMIN; break;
+ }
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ SDValue L =
+ DAG.getAtomic(NT, dl,
+ getValue(I.getValOperand()).getSimpleValueType(),
+ InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getValOperand()),
+ I.getPointerOperand(),
+ /* Alignment=*/ 0, Order, Scope);
+
+ SDValue OutChain = L.getValue(1);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitFence(const FenceInst &I) {
+ SDLoc dl = getCurSDLoc();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Ops[3];
+ Ops[0] = getRoot();
+ Ops[1] = DAG.getConstant((unsigned)I.getOrdering(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ Ops[2] = DAG.getConstant(I.getSynchScope(), dl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ DAG.setRoot(DAG.getNode(ISD::ATOMIC_FENCE, dl, MVT::Other, Ops));
+}
+
+void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
+ SDLoc dl = getCurSDLoc();
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+ if (I.getAlignment() < VT.getSizeInBits() / 8)
+ report_fatal_error("Cannot generate unaligned atomic load");
+
+ MachineMemOperand *MMO =
+ DAG.getMachineFunction().
+ getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
+ MachineMemOperand::MOVolatile |
+ MachineMemOperand::MOLoad,
+ VT.getStoreSize(),
+ I.getAlignment() ? I.getAlignment() :
+ DAG.getEVTAlignment(VT));
+
+ InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
+ SDValue L =
+ DAG.getAtomic(ISD::ATOMIC_LOAD, dl, VT, VT, InChain,
+ getValue(I.getPointerOperand()), MMO,
+ Order, Scope);
+
+ SDValue OutChain = L.getValue(1);
+
+ setValue(&I, L);
+ DAG.setRoot(OutChain);
+}
+
+void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
+ SDLoc dl = getCurSDLoc();
+
+ AtomicOrdering Order = I.getOrdering();
+ SynchronizationScope Scope = I.getSynchScope();
+
+ SDValue InChain = getRoot();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT VT =
+ TLI.getValueType(DAG.getDataLayout(), I.getValueOperand()->getType());
+
+ if (I.getAlignment() < VT.getSizeInBits() / 8)
+ report_fatal_error("Cannot generate unaligned atomic store");
+
+ SDValue OutChain =
+ DAG.getAtomic(ISD::ATOMIC_STORE, dl, VT,
+ InChain,
+ getValue(I.getPointerOperand()),
+ getValue(I.getValueOperand()),
+ I.getPointerOperand(), I.getAlignment(),
+ Order, Scope);
+
+ DAG.setRoot(OutChain);
+}
+
+/// visitTargetIntrinsic - Lower a call of a target intrinsic to an INTRINSIC
+/// node.
+void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
+ unsigned Intrinsic) {
+ bool HasChain = !I.doesNotAccessMemory();
+ bool OnlyLoad = HasChain && I.onlyReadsMemory();
+
+ // Build the operand list.
+ SmallVector<SDValue, 8> Ops;
+ if (HasChain) { // If this intrinsic has side-effects, chainify it.
+ if (OnlyLoad) {
+ // We don't need to serialize loads against other loads.
+ Ops.push_back(DAG.getRoot());
+ } else {
+ Ops.push_back(getRoot());
+ }
+ }
+
+  // Info is set by getTgtMemIntrinsic.
+ TargetLowering::IntrinsicInfo Info;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool IsTgtIntrinsic = TLI.getTgtMemIntrinsic(Info, I, Intrinsic);
+
+  // Add the intrinsic ID as an integer operand if it's not a target intrinsic
+  // that lowers to a custom opcode; generic INTRINSIC_VOID and
+  // INTRINSIC_W_CHAIN nodes still take the intrinsic ID operand.
+ if (!IsTgtIntrinsic || Info.opc == ISD::INTRINSIC_VOID ||
+ Info.opc == ISD::INTRINSIC_W_CHAIN)
+ Ops.push_back(DAG.getTargetConstant(Intrinsic, getCurSDLoc(),
+ TLI.getPointerTy(DAG.getDataLayout())));
+
+ // Add all operands of the call to the operand list.
+ for (unsigned i = 0, e = I.getNumArgOperands(); i != e; ++i) {
+ SDValue Op = getValue(I.getArgOperand(i));
+ Ops.push_back(Op);
+ }
+
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), I.getType(), ValueVTs);
+
+ if (HasChain)
+ ValueVTs.push_back(MVT::Other);
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+
+ // Create the node.
+ SDValue Result;
+ if (IsTgtIntrinsic) {
+    // This is a target intrinsic that touches memory.
+ Result = DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(),
+ VTs, Ops, Info.memVT,
+ MachinePointerInfo(Info.ptrVal, Info.offset),
+ Info.align, Info.vol,
+ Info.readMem, Info.writeMem, Info.size);
+ } else if (!HasChain) {
+ Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
+ } else if (!I.getType()->isVoidTy()) {
+ Result = DAG.getNode(ISD::INTRINSIC_W_CHAIN, getCurSDLoc(), VTs, Ops);
+ } else {
+ Result = DAG.getNode(ISD::INTRINSIC_VOID, getCurSDLoc(), VTs, Ops);
+ }
+
+ if (HasChain) {
+ SDValue Chain = Result.getValue(Result.getNode()->getNumValues()-1);
+ if (OnlyLoad)
+ PendingLoads.push_back(Chain);
+ else
+ DAG.setRoot(Chain);
+ }
+
+ if (!I.getType()->isVoidTy()) {
+ if (VectorType *PTy = dyn_cast<VectorType>(I.getType())) {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), PTy);
+ Result = DAG.getNode(ISD::BITCAST, getCurSDLoc(), VT, Result);
+ } else
+ Result = lowerRangeToAssertZExt(DAG, I, Result);
+
+ setValue(&I, Result);
+ }
+}
+
+/// GetSignificand - Get the significand and build it into a floating-point
+/// number with an exponent of 1:
+///
+/// Op = (Op & 0x007fffff) | 0x3f800000;
+///
+/// where Op is the i32 bit pattern of the floating-point value.
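+///
+/// For example, for Op = 0x40490FDB (3.14159274f) this yields 0x3FC90FDB,
+/// i.e. the significand 1.57079637f.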
+static SDValue GetSignificand(SelectionDAG &DAG, SDValue Op, const SDLoc &dl) {
+ SDValue t1 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x007fffff, dl, MVT::i32));
+ SDValue t2 = DAG.getNode(ISD::OR, dl, MVT::i32, t1,
+ DAG.getConstant(0x3f800000, dl, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32, t2);
+}
+
+/// GetExponent - Get the exponent:
+///
+/// (float)(int)(((Op & 0x7f800000) >> 23) - 127);
+///
+/// where Op is the i32 bit pattern of the floating-point value.
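+///
+/// For example, for Op = 0x40490FDB (3.14159274f) the biased exponent field
+/// is 128, so this returns (float)(128 - 127) = 1.0f; indeed
+/// 3.14159274f = 1.57079637f * 2^1.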
+static SDValue GetExponent(SelectionDAG &DAG, SDValue Op,
+ const TargetLowering &TLI, const SDLoc &dl) {
+ SDValue t0 = DAG.getNode(ISD::AND, dl, MVT::i32, Op,
+ DAG.getConstant(0x7f800000, dl, MVT::i32));
+ SDValue t1 = DAG.getNode(
+ ISD::SRL, dl, MVT::i32, t0,
+ DAG.getConstant(23, dl, TLI.getPointerTy(DAG.getDataLayout())));
+ SDValue t2 = DAG.getNode(ISD::SUB, dl, MVT::i32, t1,
+ DAG.getConstant(127, dl, MVT::i32));
+ return DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, t2);
+}
+
+/// getF32Constant - Get 32-bit floating point constant.
+static SDValue getF32Constant(SelectionDAG &DAG, unsigned Flt,
+ const SDLoc &dl) {
+ return DAG.getConstantFP(APFloat(APFloat::IEEEsingle, APInt(32, Flt)), dl,
+ MVT::f32);
+}
+
+static SDValue getLimitedPrecisionExp2(SDValue t0, const SDLoc &dl,
+ SelectionDAG &DAG) {
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+  // IntegerPartOfX = (int32_t)t0;
+ SDValue IntegerPartOfX = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, t0);
+
+ // FractionalPartOfX = t0 - (float)IntegerPartOfX;
+ SDValue t1 = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, IntegerPartOfX);
+ SDValue X = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0, t1);
+
+ // IntegerPartOfX <<= 23;
+ IntegerPartOfX = DAG.getNode(
+ ISD::SHL, dl, MVT::i32, IntegerPartOfX,
+ DAG.getConstant(23, dl, DAG.getTargetLoweringInfo().getPointerTy(
+ DAG.getDataLayout())));
+
+ SDValue TwoToFractionalPartOfX;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.997535578f +
+ // (0.735607626f + 0.252464424f * x) * x;
+ //
+ // error 0.0144103317, which is 6 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3e814304, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f3c50c8, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f7f5e7e, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999892986f +
+ // (0.696457318f +
+ // (0.224338339f + 0.792043434e-1f * x) * x) * x;
+ //
+ // error 0.000107046256, which is 13 to 14 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3da235e3, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3e65b8f3, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f324b07, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3f7ff8fd, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // TwoToFractionalPartOfX =
+ // 0.999999982f +
+ // (0.693148872f +
+ // (0.240227044f +
+ // (0.554906021e-1f +
+ // (0.961591928e-2f +
+ // (0.136028312e-2f + 0.157059148e-3f *x)*x)*x)*x)*x)*x;
+ // error 2.47208000*10^(-7), which is better than 18 bits
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3924b03e, dl));
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3ab24b87, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3c1d8c17, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3d634a1d, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3e75fe14, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ SDValue t11 = DAG.getNode(ISD::FADD, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x3f317234, dl));
+ SDValue t12 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t11, X);
+ TwoToFractionalPartOfX = DAG.getNode(ISD::FADD, dl, MVT::f32, t12,
+ getF32Constant(DAG, 0x3f800000, dl));
+ }
+
+ // Add the exponent into the result in integer domain.
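+  // Adding IntegerPartOfX << 23 to the i32 bit pattern of
+  // TwoToFractionalPartOfX bumps its IEEE-754 exponent field by
+  // IntegerPartOfX, i.e. multiplies the value by 2^IntegerPartOfX, giving
+  // 2^t0 = 2^IntegerPartOfX * 2^FractionalPartOfX.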
+ SDValue t13 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, TwoToFractionalPartOfX);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::f32,
+ DAG.getNode(ISD::ADD, dl, MVT::i32, t13, IntegerPartOfX));
+}
+
+/// expandExp - Lower an exp intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // #define LOG2OFe 1.4426950f
+ // t0 = Op * LOG2OFe
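+    //
+    // Since exp(x) = 2^(x * log2(e)), the limited-precision exp2 expansion
+    // below handles the rest.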
+
+ // TODO: What fast-math-flags should be set here?
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, Op,
+ getF32Constant(DAG, 0x3fb8aa3b, dl));
+ return getLimitedPrecisionExp2(t0, dl, DAG);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FEXP, dl, Op.getValueType(), Op);
+}
+
+/// expandLog - Lower a log intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Scale the exponent by log(2) [0.69314718f].
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ getF32Constant(DAG, 0x3f317218, dl));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
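+    // Writing Op = X * 2^E with 1 <= X < 2, log(Op) = E * log(2) + log(X);
+    // the polynomials below approximate log(X) over [1,2).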
+ SDValue LogOfMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // LogofMantissa =
+ // -1.1609546f +
+ // (1.4034025f - 0.23903021f * x) * x;
+ //
+ // error 0.0034276066, which is better than 8 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbe74c456, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3fb3a2b1, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f949a29, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // LogOfMantissa =
+ // -1.7417939f +
+ // (2.8212026f +
+ // (-1.4699568f +
+ // (0.44717955f - 0.56570851e-1f * x) * x) * x) * x;
+ //
+ // error 0.000061011436, which is 14 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbd67b6d6, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ee4f4b8, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fbc278b, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40348e95, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fdef31a, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // LogOfMantissa =
+ // -2.1072184f +
+ // (4.2372794f +
+ // (-3.7029485f +
+ // (2.2781945f +
+ // (-0.87823314f +
+ // (0.19073739f - 0.17809712e-1f * x) * x) * x) * x) * x)*x;
+ //
+ // error 0.0000023660568, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbc91e5ac, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e4350aa, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f60d3e3, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x4011cdf0, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x406cfd1c, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x408797cb, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ LogOfMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4006dcab, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, LogOfMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG, dl, Op.getValueType(), Op);
+}
+
+/// expandLog2 - Lower a log2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Get the exponent.
+ SDValue LogOfExponent = GetExponent(DAG, Op1, TLI, dl);
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+    // Different possible minimax approximations of the significand in
+    // floating-point for various degrees of accuracy over [1,2].
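+    // Writing Op = X * 2^E with 1 <= X < 2, log2(Op) = E + log2(X); the
+    // polynomials below approximate log2(X) over [1,2).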
+ SDValue Log2ofMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log2ofMantissa = -1.6749035f + (2.0246817f - .34484768f * x) * x;
+ //
+ // error 0.0049451742, which is more than 7 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbeb08fe0, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x40019463, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fd6633d, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log2ofMantissa =
+ // -2.51285454f +
+ // (4.07009056f +
+ // (-2.12067489f +
+ // (.645142248f - 0.816157886e-1f * x) * x) * x) * x;
+ //
+ // error 0.0000876136000, which is better than 13 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbda7262e, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f25280b, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x4007b923, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40823e2f, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x4020d29c, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log2ofMantissa =
+ // -3.0400495f +
+ // (6.1129976f +
+ // (-5.3420409f +
+ // (3.2865683f +
+ // (-1.2669343f +
+ // (0.27515199f -
+ // 0.25691327e-1f * x) * x) * x) * x) * x) * x;
+ //
+ // error 0.0000018516, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbcd2769e, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e8ce0b9, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3fa22ae7, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FADD, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x40525723, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x40aaf200, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ SDValue t9 = DAG.getNode(ISD::FADD, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x40c39dad, dl));
+ SDValue t10 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t9, X);
+ Log2ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t10,
+ getF32Constant(DAG, 0x4042902c, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log2ofMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG2, dl, Op.getValueType(), Op);
+}
+
+/// expandLog10 - Lower a log10 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandLog10(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+
+ // TODO: What fast-math-flags should be set on the floating-point nodes?
+
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ SDValue Op1 = DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op);
+
+ // Scale the exponent by log10(2) [0.30102999f].
+ SDValue Exp = GetExponent(DAG, Op1, TLI, dl);
+ SDValue LogOfExponent = DAG.getNode(ISD::FMUL, dl, MVT::f32, Exp,
+ getF32Constant(DAG, 0x3e9a209a, dl));
+
+ // Get the significand and build it into a floating-point number with
+ // exponent of 1.
+ SDValue X = GetSignificand(DAG, Op1, dl);
+
+ SDValue Log10ofMantissa;
+ if (LimitFloatPrecision <= 6) {
+ // For floating-point precision of 6:
+ //
+ // Log10ofMantissa =
+ // -0.50419619f +
+ // (0.60948995f - 0.10380950f * x) * x;
+ //
+ // error 0.0014886165, which is 6 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0xbdd49a13, dl));
+ SDValue t1 = DAG.getNode(ISD::FADD, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3f1c0789, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f011300, dl));
+ } else if (LimitFloatPrecision <= 12) {
+ // For floating-point precision of 12:
+ //
+ // Log10ofMantissa =
+ // -0.64831180f +
+ // (0.91751397f +
+ // (-0.31664806f + 0.47637168e-1f * x) * x) * x;
+ //
+ // error 0.00019228036, which is better than 12 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3d431f31, dl));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3ea21fb2, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3f6ae232, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f25f7c3, dl));
+ } else { // LimitFloatPrecision <= 18
+ // For floating-point precision of 18:
+ //
+ // Log10ofMantissa =
+ // -0.84299375f +
+ // (1.5327582f +
+ // (-1.0688956f +
+ // (0.49102474f +
+ // (-0.12539807f + 0.13508273e-1f * x) * x) * x) * x) * x;
+ //
+ // error 0.0000037995730, which is better than 18 bits
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, X,
+ getF32Constant(DAG, 0x3c5d51ce, dl));
+ SDValue t1 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t0,
+ getF32Constant(DAG, 0x3e00685a, dl));
+ SDValue t2 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t1, X);
+ SDValue t3 = DAG.getNode(ISD::FADD, dl, MVT::f32, t2,
+ getF32Constant(DAG, 0x3efb6798, dl));
+ SDValue t4 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t3, X);
+ SDValue t5 = DAG.getNode(ISD::FSUB, dl, MVT::f32, t4,
+ getF32Constant(DAG, 0x3f88d192, dl));
+ SDValue t6 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t5, X);
+ SDValue t7 = DAG.getNode(ISD::FADD, dl, MVT::f32, t6,
+ getF32Constant(DAG, 0x3fc4316c, dl));
+ SDValue t8 = DAG.getNode(ISD::FMUL, dl, MVT::f32, t7, X);
+ Log10ofMantissa = DAG.getNode(ISD::FSUB, dl, MVT::f32, t8,
+ getF32Constant(DAG, 0x3f57ce70, dl));
+ }
+
+ return DAG.getNode(ISD::FADD, dl, MVT::f32, LogOfExponent, Log10ofMantissa);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FLOG10, dl, Op.getValueType(), Op);
+}
+
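+// Note (illustrative annotation, not part of the original code): each
+// getF32Constant argument above is the raw IEEE-754 bit pattern of an f32
+// polynomial coefficient. For example, 0x3e9a209a has sign 0, biased exponent
+// 0x7d (i.e. 2^-2) and mantissa 0x1a209a, so its value is
+// (1 + 0x1a209a / 2^23) * 2^-2 ~= 0.30103, which is log10(2) as the comment
+// above notes. A host-side sketch (hypothetical, not part of this file) to
+// decode such a constant:
+//
+//   #include <cstdint>
+//   #include <cstdio>
+//   #include <cstring>
+//   int main() {
+//     uint32_t Bits = 0x3e9a209a;        // bit pattern from the code above
+//     float F;
+//     std::memcpy(&F, &Bits, sizeof F);  // reinterpret as IEEE-754 binary32
+//     std::printf("%.8f\n", F);          // prints ~0.30103, log10(2)
+//   }
+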
+/// expandExp2 - Lower an exp2 intrinsic. Handles the special sequences for
+/// limited-precision mode.
+static SDValue expandExp2(const SDLoc &dl, SDValue Op, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (Op.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18)
+ return getLimitedPrecisionExp2(Op, dl, DAG);
+
+ // No special expansion.
+ return DAG.getNode(ISD::FEXP2, dl, Op.getValueType(), Op);
+}
+
+/// expandPow - Lower a pow intrinsic. Handles the special sequences for
+/// limited-precision mode when the base is 10.0f.
+static SDValue expandPow(const SDLoc &dl, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG, const TargetLowering &TLI) {
+ bool IsExp10 = false;
+ if (LHS.getValueType() == MVT::f32 && RHS.getValueType() == MVT::f32 &&
+ LimitFloatPrecision > 0 && LimitFloatPrecision <= 18) {
+ if (ConstantFPSDNode *LHSC = dyn_cast<ConstantFPSDNode>(LHS)) {
+ APFloat Ten(10.0f);
+ IsExp10 = LHSC->isExactlyValue(Ten);
+ }
+ }
+
+ // TODO: What fast-math-flags should be set on the FMUL node?
+ if (IsExp10) {
+ // Put the exponent in the right bit position for later addition to the
+ // final result:
+ //
+ // #define LOG2OF10 3.3219281f
+ // t0 = RHS * LOG2OF10;
+ SDValue t0 = DAG.getNode(ISD::FMUL, dl, MVT::f32, RHS,
+ getF32Constant(DAG, 0x40549a78, dl));
+ return getLimitedPrecisionExp2(t0, dl, DAG);
+ }
+
+ // No special expansion.
+ return DAG.getNode(ISD::FPOW, dl, LHS.getValueType(), LHS, RHS);
+}
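+
+// Note (illustrative annotation, not part of the original code): the
+// expansion above uses the identity pow(10.0f, x) == exp2(x * log2(10)).
+// The constant 0x40549a78 is the IEEE-754 f32 bit pattern of ~3.3219281,
+// i.e. log2(10), so t0 = RHS * log2(10) feeds the shared
+// getLimitedPrecisionExp2 expansion.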
+
+/// ExpandPowI - Expand a llvm.powi intrinsic.
+static SDValue ExpandPowI(const SDLoc &DL, SDValue LHS, SDValue RHS,
+ SelectionDAG &DAG) {
+ // If RHS is a constant, we can expand this out to a multiplication tree;
+ // otherwise we end up lowering to a call to __powidf2 (for example). When
+ // not optimizing for size we always do the full expansion; when optimizing
+ // for size we only expand if it would produce a small number of multiplies.
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS)) {
+ // Get the exponent as a positive value.
+ unsigned Val = RHSC->getSExtValue();
+ if ((int)Val < 0) Val = -Val;
+
+ // powi(x, 0) -> 1.0
+ if (Val == 0)
+ return DAG.getConstantFP(1.0, DL, LHS.getValueType());
+
+ const Function *F = DAG.getMachineFunction().getFunction();
+ if (!F->optForSize() ||
+ // If optimizing for size, don't insert too many multiplies.
+ // This inserts up to 5 multiplies.
+ countPopulation(Val) + Log2_32(Val) < 7) {
+ // We use the simple binary decomposition method to generate the multiply
+ // sequence. There are more optimal ways to do this (for example,
+ // powi(x,15) generates one more multiply than it should), but this has
+ // the benefit of being both really simple and much better than a libcall.
+ SDValue Res; // Logically starts equal to 1.0
+ SDValue CurSquare = LHS;
+ // TODO: Intrinsics should have fast-math-flags that propagate to these
+ // nodes.
+ while (Val) {
+ if (Val & 1) {
+ if (Res.getNode())
+ Res = DAG.getNode(ISD::FMUL, DL, Res.getValueType(), Res, CurSquare);
+ else
+ Res = CurSquare; // 1.0*CurSquare.
+ }
+
+ CurSquare = DAG.getNode(ISD::FMUL, DL, CurSquare.getValueType(),
+ CurSquare, CurSquare);
+ Val >>= 1;
+ }
+
+ // If the original was negative, invert the result, producing 1/(x*x*x).
+ if (RHSC->getSExtValue() < 0)
+ Res = DAG.getNode(ISD::FDIV, DL, LHS.getValueType(),
+ DAG.getConstantFP(1.0, DL, LHS.getValueType()), Res);
+ return Res;
+ }
+ }
+
+ // Otherwise, expand to a libcall.
+ return DAG.getNode(ISD::FPOWI, DL, LHS.getValueType(), LHS, RHS);
+}
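+
+// Note (illustrative annotation, not part of the original code): the loop
+// above is plain binary exponentiation. For powi(x, 13), 13 = 0b1101, so Res
+// accumulates x, then x * x^4, then x * x^4 * x^8 while CurSquare walks
+// x, x^2, x^4, x^8; the opt-for-size gate countPopulation(13) + Log2_32(13)
+// == 3 + 3 == 6 < 7 still permits the expansion. A scalar reference of the
+// same decomposition (hypothetical helper, for intuition only):
+//
+//   double PowiRef(double X, unsigned Val) {
+//     double Res = 1.0, CurSquare = X;
+//     while (Val) {
+//       if (Val & 1) Res *= CurSquare; // fold in this power-of-two of X
+//       CurSquare *= CurSquare;        // X, X^2, X^4, X^8, ...
+//       Val >>= 1;
+//     }
+//     return Res;
+//   }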
+
+// getUnderlyingArgReg - Find underlying register used for a truncated or
+// bitcasted argument.
+static unsigned getUnderlyingArgReg(const SDValue &N) {
+ switch (N.getOpcode()) {
+ case ISD::CopyFromReg:
+ return cast<RegisterSDNode>(N.getOperand(1))->getReg();
+ case ISD::BITCAST:
+ case ISD::AssertZext:
+ case ISD::AssertSext:
+ case ISD::TRUNCATE:
+ return getUnderlyingArgReg(N.getOperand(0));
+ default:
+ return 0;
+ }
+}
+
+/// EmitFuncArgumentDbgValue - If the DbgValueInst is a dbg_value of a function
+/// argument, create the corresponding DBG_VALUE machine instruction for it now.
+/// At the end of instruction selection, they will be inserted to the entry BB.
+bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
+ const Value *V, DILocalVariable *Variable, DIExpression *Expr,
+ DILocation *DL, int64_t Offset, bool IsIndirect, const SDValue &N) {
+ const Argument *Arg = dyn_cast<Argument>(V);
+ if (!Arg)
+ return false;
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+ // Ignore inlined function arguments here.
+ //
+ // FIXME: Should we be checking DL->inlinedAt() to determine this?
+ if (!Variable->getScope()->getSubprogram()->describes(MF.getFunction()))
+ return false;
+
+ Optional<MachineOperand> Op;
+ // Some arguments' frame index is recorded during argument lowering.
+ if (int FI = FuncInfo.getArgumentFrameIndex(Arg))
+ Op = MachineOperand::CreateFI(FI);
+
+ if (!Op && N.getNode()) {
+ unsigned Reg = getUnderlyingArgReg(N);
+ if (Reg && TargetRegisterInfo::isVirtualRegister(Reg)) {
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ unsigned PR = RegInfo.getLiveInPhysReg(Reg);
+ if (PR)
+ Reg = PR;
+ }
+ if (Reg)
+ Op = MachineOperand::CreateReg(Reg, false);
+ }
+
+ if (!Op) {
+ // Check if ValueMap has reg number.
+ DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+ if (VMI != FuncInfo.ValueMap.end())
+ Op = MachineOperand::CreateReg(VMI->second, false);
+ }
+
+ if (!Op && N.getNode())
+ // Check if frame index is available.
+ if (LoadSDNode *LNode = dyn_cast<LoadSDNode>(N.getNode()))
+ if (FrameIndexSDNode *FINode =
+ dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+ Op = MachineOperand::CreateFI(FINode->getIndex());
+
+ if (!Op)
+ return false;
+
+ assert(Variable->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ if (Op->isReg())
+ FuncInfo.ArgDbgValues.push_back(
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+ Op->getReg(), Offset, Variable, Expr));
+ else
+ FuncInfo.ArgDbgValues.push_back(
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE))
+ .addOperand(*Op)
+ .addImm(Offset)
+ .addMetadata(Variable)
+ .addMetadata(Expr));
+
+ return true;
+}
+
+// VisualStudio defines setjmp as _setjmp
+#if defined(_MSC_VER) && defined(setjmp) && \
+ !defined(setjmp_undefined_for_msvc)
+# pragma push_macro("setjmp")
+# undef setjmp
+# define setjmp_undefined_for_msvc
+#endif
+
+/// visitIntrinsicCall - Lower the call to the specified intrinsic function. If
+/// we want to emit this as a call to a named external function, return the
+/// name; otherwise, lower it and return null.
+const char *
+SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDLoc sdl = getCurSDLoc();
+ DebugLoc dl = getCurDebugLoc();
+ SDValue Res;
+
+ switch (Intrinsic) {
+ default:
+ // By default, turn this into a target intrinsic node.
+ visitTargetIntrinsic(I, Intrinsic);
+ return nullptr;
+ case Intrinsic::vastart: visitVAStart(I); return nullptr;
+ case Intrinsic::vaend: visitVAEnd(I); return nullptr;
+ case Intrinsic::vacopy: visitVACopy(I); return nullptr;
+ case Intrinsic::returnaddress:
+ setValue(&I, DAG.getNode(ISD::RETURNADDR, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ case Intrinsic::frameaddress:
+ setValue(&I, DAG.getNode(ISD::FRAMEADDR, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ case Intrinsic::read_register: {
+ Value *Reg = I.getArgOperand(0);
+ SDValue Chain = getRoot();
+ SDValue RegName =
+ DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ Res = DAG.getNode(ISD::READ_REGISTER, sdl,
+ DAG.getVTList(VT, MVT::Other), Chain, RegName);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return nullptr;
+ }
+ case Intrinsic::write_register: {
+ Value *Reg = I.getArgOperand(0);
+ Value *RegValue = I.getArgOperand(1);
+ SDValue Chain = getRoot();
+ SDValue RegName =
+ DAG.getMDNode(cast<MDNode>(cast<MetadataAsValue>(Reg)->getMetadata()));
+ DAG.setRoot(DAG.getNode(ISD::WRITE_REGISTER, sdl, MVT::Other, Chain,
+ RegName, getValue(RegValue)));
+ return nullptr;
+ }
+ case Intrinsic::setjmp:
+ return &"_setjmp"[!TLI.usesUnderscoreSetJmp()];
+ case Intrinsic::longjmp:
+ return &"_longjmp"[!TLI.usesUnderscoreLongJmp()];
+ case Intrinsic::memcpy: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+ if (!Align)
+ Align = 1; // @llvm.memcpy defines 0 and 1 to both mean no alignment.
+ bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
+ false, isTC,
+ MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)));
+ updateDAGForMaybeTailCall(MC);
+ return nullptr;
+ }
+ case Intrinsic::memset: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+ if (!Align)
+ Align = 1; // @llvm.memset defines 0 and 1 to both mean no alignment.
+ bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MS = DAG.getMemset(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
+ isTC, MachinePointerInfo(I.getArgOperand(0)));
+ updateDAGForMaybeTailCall(MS);
+ return nullptr;
+ }
+ case Intrinsic::memmove: {
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+ SDValue Op3 = getValue(I.getArgOperand(2));
+ unsigned Align = cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+ if (!Align)
+ Align = 1; // @llvm.memmove defines 0 and 1 to both mean no alignment.
+ bool isVol = cast<ConstantInt>(I.getArgOperand(4))->getZExtValue();
+ bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ SDValue MM = DAG.getMemmove(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
+ isTC, MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)));
+ updateDAGForMaybeTailCall(MM);
+ return nullptr;
+ }
+ case Intrinsic::dbg_declare: {
+ const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
+ DILocalVariable *Variable = DI.getVariable();
+ DIExpression *Expression = DI.getExpression();
+ const Value *Address = DI.getAddress();
+ assert(Variable && "Missing variable");
+ if (!Address) {
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ return nullptr;
+ }
+
+ // Check if address has undef value.
+ if (isa<UndefValue>(Address) ||
+ (Address->use_empty() && !isa<Argument>(Address))) {
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ return nullptr;
+ }
+
+ SDValue &N = NodeMap[Address];
+ if (!N.getNode() && isa<Argument>(Address))
+ // Check unused arguments map.
+ N = UnusedArgNodeMap[Address];
+ SDDbgValue *SDV;
+ if (N.getNode()) {
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(Address))
+ Address = BCI->getOperand(0);
+ // Parameters are handled specially.
+ bool isParameter = Variable->isParameter() || isa<Argument>(Address);
+ auto FINode = dyn_cast<FrameIndexSDNode>(N.getNode());
+ if (isParameter && FINode) {
+ // Byval parameter. We have a frame index at this point.
+ SDV = DAG.getFrameIndexDbgValue(Variable, Expression,
+ FINode->getIndex(), 0, dl, SDNodeOrder);
+ } else if (isa<Argument>(Address)) {
+ // Address is an argument, so try to emit its dbg value using
+ // virtual register info from the FuncInfo.ValueMap.
+ EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
+ N);
+ return nullptr;
+ } else {
+ SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
+ true, 0, dl, SDNodeOrder);
+ }
+ DAG.AddDbgValue(SDV, N.getNode(), isParameter);
+ } else {
+ // If Address is an argument then try to emit its dbg value using
+ // virtual register info from the FuncInfo.ValueMap.
+ if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, false,
+ N)) {
+ // If the variable is pinned by an alloca in a dominating basic block,
+ // use StaticAllocaMap.
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(Address)) {
+ if (AI->getParent() != DI.getParent()) {
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI != FuncInfo.StaticAllocaMap.end()) {
+ SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second,
+ 0, dl, SDNodeOrder);
+ DAG.AddDbgValue(SDV, nullptr, false);
+ return nullptr;
+ }
+ }
+ }
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ }
+ }
+ return nullptr;
+ }
+ case Intrinsic::dbg_value: {
+ const DbgValueInst &DI = cast<DbgValueInst>(I);
+ assert(DI.getVariable() && "Missing variable");
+
+ DILocalVariable *Variable = DI.getVariable();
+ DIExpression *Expression = DI.getExpression();
+ uint64_t Offset = DI.getOffset();
+ const Value *V = DI.getValue();
+ if (!V)
+ return nullptr;
+
+ SDDbgValue *SDV;
+ if (isa<ConstantInt>(V) || isa<ConstantFP>(V) || isa<UndefValue>(V)) {
+ SDV = DAG.getConstantDbgValue(Variable, Expression, V, Offset, dl,
+ SDNodeOrder);
+ DAG.AddDbgValue(SDV, nullptr, false);
+ } else {
+ // Do not use getValue() in here; we don't want to generate code at
+ // this point if it hasn't been done yet.
+ SDValue N = NodeMap[V];
+ if (!N.getNode() && isa<Argument>(V))
+ // Check unused arguments map.
+ N = UnusedArgNodeMap[V];
+ if (N.getNode()) {
+ if (!EmitFuncArgumentDbgValue(V, Variable, Expression, dl, Offset,
+ false, N)) {
+ SDV = DAG.getDbgValue(Variable, Expression, N.getNode(), N.getResNo(),
+ false, Offset, dl, SDNodeOrder);
+ DAG.AddDbgValue(SDV, N.getNode(), false);
+ }
+ } else if (!V->use_empty()) {
+ // Do not call getValue(V) yet, as we don't want to generate code.
+ // Remember it for later.
+ DanglingDebugInfo DDI(&DI, dl, SDNodeOrder);
+ DanglingDebugInfoMap[V] = DDI;
+ } else {
+ // We may expand this to cover more cases. One case where we have no
+ // data available is an unreferenced parameter.
+ DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ }
+ }
+
+ // Build a debug info table entry.
+ if (const BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+ V = BCI->getOperand(0);
+ const AllocaInst *AI = dyn_cast<AllocaInst>(V);
+ // Don't handle byval struct arguments or VLAs, for example.
+ if (!AI) {
+ DEBUG(dbgs() << "Dropping debug location info for:\n " << DI << "\n");
+ DEBUG(dbgs() << " Last seen at:\n " << *V << "\n");
+ return nullptr;
+ }
+ DenseMap<const AllocaInst*, int>::iterator SI =
+ FuncInfo.StaticAllocaMap.find(AI);
+ if (SI == FuncInfo.StaticAllocaMap.end())
+ return nullptr; // VLAs.
+ return nullptr;
+ }
+
+ case Intrinsic::eh_typeid_for: {
+ // Find the type id for the given typeinfo.
+ GlobalValue *GV = ExtractTypeInfo(I.getArgOperand(0));
+ unsigned TypeID = DAG.getMachineFunction().getMMI().getTypeIDFor(GV);
+ Res = DAG.getConstant(TypeID, sdl, MVT::i32);
+ setValue(&I, Res);
+ return nullptr;
+ }
+
+ case Intrinsic::eh_return_i32:
+ case Intrinsic::eh_return_i64:
+ DAG.getMachineFunction().getMMI().setCallsEHReturn(true);
+ DAG.setRoot(DAG.getNode(ISD::EH_RETURN, sdl,
+ MVT::Other,
+ getControlRoot(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return nullptr;
+ case Intrinsic::eh_unwind_init:
+ DAG.getMachineFunction().getMMI().setCallsUnwindInit(true);
+ return nullptr;
+ case Intrinsic::eh_dwarf_cfa: {
+ SDValue CfaArg = DAG.getSExtOrTrunc(getValue(I.getArgOperand(0)), sdl,
+ TLI.getPointerTy(DAG.getDataLayout()));
+ SDValue Offset = DAG.getNode(ISD::ADD, sdl,
+ CfaArg.getValueType(),
+ DAG.getNode(ISD::FRAME_TO_ARGS_OFFSET, sdl,
+ CfaArg.getValueType()),
+ CfaArg);
+ SDValue FA = DAG.getNode(
+ ISD::FRAMEADDR, sdl, TLI.getPointerTy(DAG.getDataLayout()),
+ DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout())));
+ setValue(&I, DAG.getNode(ISD::ADD, sdl, FA.getValueType(),
+ FA, Offset));
+ return nullptr;
+ }
+ case Intrinsic::eh_sjlj_callsite: {
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(0));
+ assert(CI && "Non-constant call site value in eh.sjlj.callsite!");
+ assert(MMI.getCurrentCallSite() == 0 && "Overlapping call sites!");
+
+ MMI.setCurrentCallSite(CI->getZExtValue());
+ return nullptr;
+ }
+ case Intrinsic::eh_sjlj_functioncontext: {
+ // Get and store the index of the function context.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ AllocaInst *FnCtx =
+ cast<AllocaInst>(I.getArgOperand(0)->stripPointerCasts());
+ int FI = FuncInfo.StaticAllocaMap[FnCtx];
+ MFI->setFunctionContextIndex(FI);
+ return nullptr;
+ }
+ case Intrinsic::eh_sjlj_setjmp: {
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ SDValue Op = DAG.getNode(ISD::EH_SJLJ_SETJMP, sdl,
+ DAG.getVTList(MVT::i32, MVT::Other), Ops);
+ setValue(&I, Op.getValue(0));
+ DAG.setRoot(Op.getValue(1));
+ return nullptr;
+ }
+ case Intrinsic::eh_sjlj_longjmp: {
+ DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_LONGJMP, sdl, MVT::Other,
+ getRoot(), getValue(I.getArgOperand(0))));
+ return nullptr;
+ }
+ case Intrinsic::eh_sjlj_setup_dispatch: {
+ DAG.setRoot(DAG.getNode(ISD::EH_SJLJ_SETUP_DISPATCH, sdl, MVT::Other,
+ getRoot()));
+ return nullptr;
+ }
+
+ case Intrinsic::masked_gather:
+ visitMaskedGather(I);
+ return nullptr;
+ case Intrinsic::masked_load:
+ visitMaskedLoad(I);
+ return nullptr;
+ case Intrinsic::masked_scatter:
+ visitMaskedScatter(I);
+ return nullptr;
+ case Intrinsic::masked_store:
+ visitMaskedStore(I);
+ return nullptr;
+ case Intrinsic::x86_mmx_pslli_w:
+ case Intrinsic::x86_mmx_pslli_d:
+ case Intrinsic::x86_mmx_pslli_q:
+ case Intrinsic::x86_mmx_psrli_w:
+ case Intrinsic::x86_mmx_psrli_d:
+ case Intrinsic::x86_mmx_psrli_q:
+ case Intrinsic::x86_mmx_psrai_w:
+ case Intrinsic::x86_mmx_psrai_d: {
+ SDValue ShAmt = getValue(I.getArgOperand(1));
+ if (isa<ConstantSDNode>(ShAmt)) {
+ visitTargetIntrinsic(I, Intrinsic);
+ return nullptr;
+ }
+ unsigned NewIntrinsic = 0;
+ EVT ShAmtVT = MVT::v2i32;
+ switch (Intrinsic) {
+ case Intrinsic::x86_mmx_pslli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_w;
+ break;
+ case Intrinsic::x86_mmx_pslli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_d;
+ break;
+ case Intrinsic::x86_mmx_pslli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psll_q;
+ break;
+ case Intrinsic::x86_mmx_psrli_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_w;
+ break;
+ case Intrinsic::x86_mmx_psrli_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_d;
+ break;
+ case Intrinsic::x86_mmx_psrli_q:
+ NewIntrinsic = Intrinsic::x86_mmx_psrl_q;
+ break;
+ case Intrinsic::x86_mmx_psrai_w:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_w;
+ break;
+ case Intrinsic::x86_mmx_psrai_d:
+ NewIntrinsic = Intrinsic::x86_mmx_psra_d;
+ break;
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ }
+
+ // The vector shift intrinsics with scalars use 32-bit shift amounts, but
+ // the SSE2/MMX shift instructions read 64 bits. Set the upper 32 bits
+ // to zero.
+ // We must do this early because v2i32 is not a legal type.
+ SDValue ShOps[2];
+ ShOps[0] = ShAmt;
+ ShOps[1] = DAG.getConstant(0, sdl, MVT::i32);
+ ShAmt = DAG.getNode(ISD::BUILD_VECTOR, sdl, ShAmtVT, ShOps);
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ ShAmt = DAG.getNode(ISD::BITCAST, sdl, DestVT, ShAmt);
+ Res = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, sdl, DestVT,
+ DAG.getConstant(NewIntrinsic, sdl, MVT::i32),
+ getValue(I.getArgOperand(0)), ShAmt);
+ setValue(&I, Res);
+ return nullptr;
+ }
+ case Intrinsic::convertff:
+ case Intrinsic::convertfsi:
+ case Intrinsic::convertfui:
+ case Intrinsic::convertsif:
+ case Intrinsic::convertuif:
+ case Intrinsic::convertss:
+ case Intrinsic::convertsu:
+ case Intrinsic::convertus:
+ case Intrinsic::convertuu: {
+ ISD::CvtCode Code = ISD::CVT_INVALID;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::convertff: Code = ISD::CVT_FF; break;
+ case Intrinsic::convertfsi: Code = ISD::CVT_FS; break;
+ case Intrinsic::convertfui: Code = ISD::CVT_FU; break;
+ case Intrinsic::convertsif: Code = ISD::CVT_SF; break;
+ case Intrinsic::convertuif: Code = ISD::CVT_UF; break;
+ case Intrinsic::convertss: Code = ISD::CVT_SS; break;
+ case Intrinsic::convertsu: Code = ISD::CVT_SU; break;
+ case Intrinsic::convertus: Code = ISD::CVT_US; break;
+ case Intrinsic::convertuu: Code = ISD::CVT_UU; break;
+ }
+ EVT DestVT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ const Value *Op1 = I.getArgOperand(0);
+ Res = DAG.getConvertRndSat(DestVT, sdl, getValue(Op1),
+ DAG.getValueType(DestVT),
+ DAG.getValueType(getValue(Op1).getValueType()),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2)),
+ Code);
+ setValue(&I, Res);
+ return nullptr;
+ }
+ case Intrinsic::powi:
+ setValue(&I, ExpandPowI(sdl, getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), DAG));
+ return nullptr;
+ case Intrinsic::log:
+ setValue(&I, expandLog(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::log2:
+ setValue(&I, expandLog2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::log10:
+ setValue(&I, expandLog10(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::exp:
+ setValue(&I, expandExp(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::exp2:
+ setValue(&I, expandExp2(sdl, getValue(I.getArgOperand(0)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::pow:
+ setValue(&I, expandPow(sdl, getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)), DAG, TLI));
+ return nullptr;
+ case Intrinsic::sqrt:
+ case Intrinsic::fabs:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::round:
+ case Intrinsic::canonicalize: {
+ unsigned Opcode;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
+ case Intrinsic::fabs: Opcode = ISD::FABS; break;
+ case Intrinsic::sin: Opcode = ISD::FSIN; break;
+ case Intrinsic::cos: Opcode = ISD::FCOS; break;
+ case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
+ case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
+ case Intrinsic::trunc: Opcode = ISD::FTRUNC; break;
+ case Intrinsic::rint: Opcode = ISD::FRINT; break;
+ case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
+ case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
+ }
+
+ setValue(&I, DAG.getNode(Opcode, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ }
+ case Intrinsic::minnum: {
+ auto VT = getValue(I.getArgOperand(0)).getValueType();
+ unsigned Opc =
+ I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMINNAN, VT)
+ ? ISD::FMINNAN
+ : ISD::FMINNUM;
+ setValue(&I, DAG.getNode(Opc, sdl, VT,
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return nullptr;
+ }
+ case Intrinsic::maxnum: {
+ auto VT = getValue(I.getArgOperand(0)).getValueType();
+ unsigned Opc =
+ I.hasNoNaNs() && TLI.isOperationLegalOrCustom(ISD::FMAXNAN, VT)
+ ? ISD::FMAXNAN
+ : ISD::FMAXNUM;
+ setValue(&I, DAG.getNode(Opc, sdl, VT,
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return nullptr;
+ }
+ case Intrinsic::copysign:
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1))));
+ return nullptr;
+ case Intrinsic::fma:
+ setValue(&I, DAG.getNode(ISD::FMA, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2))));
+ return nullptr;
+ case Intrinsic::fmuladd: {
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ if (TM.Options.AllowFPOpFusion != FPOpFusion::Strict &&
+ TLI.isFMAFasterThanFMulAndFAdd(VT)) {
+ setValue(&I, DAG.getNode(ISD::FMA, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ getValue(I.getArgOperand(2))));
+ } else {
+ // TODO: Intrinsic calls should have fast-math-flags.
+ SDValue Mul = DAG.getNode(ISD::FMUL, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)));
+ SDValue Add = DAG.getNode(ISD::FADD, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ Mul,
+ getValue(I.getArgOperand(2)));
+ setValue(&I, Add);
+ }
+ return nullptr;
+ }
+ case Intrinsic::convert_to_fp16:
+ setValue(&I, DAG.getNode(ISD::BITCAST, sdl, MVT::i16,
+ DAG.getNode(ISD::FP_ROUND, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)),
+ DAG.getTargetConstant(0, sdl,
+ MVT::i32))));
+ return nullptr;
+ case Intrinsic::convert_from_fp16:
+ setValue(&I, DAG.getNode(ISD::FP_EXTEND, sdl,
+ TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ DAG.getNode(ISD::BITCAST, sdl, MVT::f16,
+ getValue(I.getArgOperand(0)))));
+ return nullptr;
+ case Intrinsic::pcmarker: {
+ SDValue Tmp = getValue(I.getArgOperand(0));
+ DAG.setRoot(DAG.getNode(ISD::PCMARKER, sdl, MVT::Other, getRoot(), Tmp));
+ return nullptr;
+ }
+ case Intrinsic::readcyclecounter: {
+ SDValue Op = getRoot();
+ Res = DAG.getNode(ISD::READCYCLECOUNTER, sdl,
+ DAG.getVTList(MVT::i64, MVT::Other), Op);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return nullptr;
+ }
+ case Intrinsic::bitreverse:
+ setValue(&I, DAG.getNode(ISD::BITREVERSE, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ case Intrinsic::bswap:
+ setValue(&I, DAG.getNode(ISD::BSWAP, sdl,
+ getValue(I.getArgOperand(0)).getValueType(),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ case Intrinsic::cttz: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+ EVT Ty = Arg.getValueType();
+ setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTTZ : ISD::CTTZ_ZERO_UNDEF,
+ sdl, Ty, Arg));
+ return nullptr;
+ }
+ case Intrinsic::ctlz: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ ConstantInt *CI = cast<ConstantInt>(I.getArgOperand(1));
+ EVT Ty = Arg.getValueType();
+ setValue(&I, DAG.getNode(CI->isZero() ? ISD::CTLZ : ISD::CTLZ_ZERO_UNDEF,
+ sdl, Ty, Arg));
+ return nullptr;
+ }
+ case Intrinsic::ctpop: {
+ SDValue Arg = getValue(I.getArgOperand(0));
+ EVT Ty = Arg.getValueType();
+ setValue(&I, DAG.getNode(ISD::CTPOP, sdl, Ty, Arg));
+ return nullptr;
+ }
+ case Intrinsic::stacksave: {
+ SDValue Op = getRoot();
+ Res = DAG.getNode(
+ ISD::STACKSAVE, sdl,
+ DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return nullptr;
+ }
+ case Intrinsic::stackrestore: {
+ Res = getValue(I.getArgOperand(0));
+ DAG.setRoot(DAG.getNode(ISD::STACKRESTORE, sdl, MVT::Other, getRoot(), Res));
+ return nullptr;
+ }
+ case Intrinsic::get_dynamic_area_offset: {
+ SDValue Op = getRoot();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ // The result type of @llvm.get.dynamic.area.offset must match the
+ // target's pointer type.
+ if (PtrTy != ResTy)
+ report_fatal_error("Wrong result type for @llvm.get.dynamic.area.offset"
+ " intrinsic!");
+ Res = DAG.getNode(ISD::GET_DYNAMIC_AREA_OFFSET, sdl, DAG.getVTList(ResTy),
+ Op);
+ DAG.setRoot(Op);
+ setValue(&I, Res);
+ return nullptr;
+ }
+ case Intrinsic::stackguard: {
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Module &M = *MF.getFunction()->getParent();
+ SDValue Chain = getRoot();
+ if (TLI.useLoadStackGuardNode()) {
+ Res = getLoadStackGuard(DAG, sdl, Chain);
+ } else {
+ const Value *Global = TLI.getSDagStackGuard(M);
+ unsigned Align = DL->getPrefTypeAlignment(Global->getType());
+ Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
+ MachinePointerInfo(Global, 0), Align,
+ MachineMemOperand::MOVolatile);
+ }
+ DAG.setRoot(Chain);
+ setValue(&I, Res);
+ return nullptr;
+ }
+ case Intrinsic::stackprotector: {
+ // Emit code into the DAG to store the stack guard onto the stack.
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ SDValue Src, Chain = getRoot();
+
+ if (TLI.useLoadStackGuardNode())
+ Src = getLoadStackGuard(DAG, sdl, Chain);
+ else
+ Src = getValue(I.getArgOperand(0)); // The guard's value.
+
+ AllocaInst *Slot = cast<AllocaInst>(I.getArgOperand(1));
+
+ int FI = FuncInfo.StaticAllocaMap[Slot];
+ MFI->setStackProtectorIndex(FI);
+
+ SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
+
+ // Store the stack protector onto the stack.
+ Res = DAG.getStore(Chain, sdl, Src, FIN, MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(), FI),
+ /* Alignment = */ 0, MachineMemOperand::MOVolatile);
+ setValue(&I, Res);
+ DAG.setRoot(Res);
+ return nullptr;
+ }
+ case Intrinsic::objectsize: {
+ // If we don't know by now, we're never going to know.
+ ConstantInt *CI = dyn_cast<ConstantInt>(I.getArgOperand(1));
+
+ assert(CI && "Non-constant type in __builtin_object_size?");
+
+ SDValue Arg = getValue(I.getCalledValue());
+ EVT Ty = Arg.getValueType();
+
+ if (CI->isZero())
+ Res = DAG.getConstant(-1ULL, sdl, Ty);
+ else
+ Res = DAG.getConstant(0, sdl, Ty);
+
+ setValue(&I, Res);
+ return nullptr;
+ }
+ case Intrinsic::annotation:
+ case Intrinsic::ptr_annotation:
+ // Drop the intrinsic, but forward the value
+ setValue(&I, getValue(I.getOperand(0)));
+ return nullptr;
+ case Intrinsic::assume:
+ case Intrinsic::var_annotation:
+ // Discard annotate attributes and assumptions
+ return nullptr;
+
+ case Intrinsic::init_trampoline: {
+ const Function *F = cast<Function>(I.getArgOperand(1)->stripPointerCasts());
+
+ SDValue Ops[6];
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ Ops[2] = getValue(I.getArgOperand(1));
+ Ops[3] = getValue(I.getArgOperand(2));
+ Ops[4] = DAG.getSrcValue(I.getArgOperand(0));
+ Ops[5] = DAG.getSrcValue(F);
+
+ Res = DAG.getNode(ISD::INIT_TRAMPOLINE, sdl, MVT::Other, Ops);
+
+ DAG.setRoot(Res);
+ return nullptr;
+ }
+ case Intrinsic::adjust_trampoline: {
+ setValue(&I, DAG.getNode(ISD::ADJUST_TRAMPOLINE, sdl,
+ TLI.getPointerTy(DAG.getDataLayout()),
+ getValue(I.getArgOperand(0))));
+ return nullptr;
+ }
+ case Intrinsic::gcroot: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const Function *F = MF.getFunction();
+ (void)F;
+ assert(F->hasGC() &&
+ "only valid in functions with gc specified, enforced by Verifier");
+ assert(GFI && "implied by previous");
+ const Value *Alloca = I.getArgOperand(0)->stripPointerCasts();
+ const Constant *TypeMap = cast<Constant>(I.getArgOperand(1));
+
+ FrameIndexSDNode *FI = cast<FrameIndexSDNode>(getValue(Alloca).getNode());
+ GFI->addStackRoot(FI->getIndex(), TypeMap);
+ return nullptr;
+ }
+ case Intrinsic::gcread:
+ case Intrinsic::gcwrite:
+ llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
+ case Intrinsic::flt_rounds:
+ setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
+ return nullptr;
+
+ case Intrinsic::expect: {
+ // Just replace __builtin_expect(exp, c) with EXP.
+ setValue(&I, getValue(I.getArgOperand(0)));
+ return nullptr;
+ }
+
+ case Intrinsic::debugtrap:
+ case Intrinsic::trap: {
+ StringRef TrapFuncName =
+ I.getAttributes()
+ .getAttribute(AttributeSet::FunctionIndex, "trap-func-name")
+ .getValueAsString();
+ if (TrapFuncName.empty()) {
+ ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
+ ISD::TRAP : ISD::DEBUGTRAP;
+ DAG.setRoot(DAG.getNode(Op, sdl, MVT::Other, getRoot()));
+ return nullptr;
+ }
+ TargetLowering::ArgListTy Args;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee(
+ CallingConv::C, I.getType(),
+ DAG.getExternalSymbol(TrapFuncName.data(),
+ TLI.getPointerTy(DAG.getDataLayout())),
+ std::move(Args));
+
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+ DAG.setRoot(Result.second);
+ return nullptr;
+ }
+
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow: {
+ ISD::NodeType Op;
+ switch (Intrinsic) {
+ default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
+ case Intrinsic::uadd_with_overflow: Op = ISD::UADDO; break;
+ case Intrinsic::sadd_with_overflow: Op = ISD::SADDO; break;
+ case Intrinsic::usub_with_overflow: Op = ISD::USUBO; break;
+ case Intrinsic::ssub_with_overflow: Op = ISD::SSUBO; break;
+ case Intrinsic::umul_with_overflow: Op = ISD::UMULO; break;
+ case Intrinsic::smul_with_overflow: Op = ISD::SMULO; break;
+ }
+ SDValue Op1 = getValue(I.getArgOperand(0));
+ SDValue Op2 = getValue(I.getArgOperand(1));
+
+ SDVTList VTs = DAG.getVTList(Op1.getValueType(), MVT::i1);
+ setValue(&I, DAG.getNode(Op, sdl, VTs, Op1, Op2));
+ return nullptr;
+ }
+ case Intrinsic::prefetch: {
+ SDValue Ops[5];
+ unsigned rw = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Ops[0] = getRoot();
+ Ops[1] = getValue(I.getArgOperand(0));
+ Ops[2] = getValue(I.getArgOperand(1));
+ Ops[3] = getValue(I.getArgOperand(2));
+ Ops[4] = getValue(I.getArgOperand(3));
+ DAG.setRoot(DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
+ DAG.getVTList(MVT::Other), Ops,
+ EVT::getIntegerVT(*Context, 8),
+ MachinePointerInfo(I.getArgOperand(0)),
+ 0, /* align */
+ false, /* volatile */
+ rw==0, /* read */
+ rw==1)); /* write */
+ return nullptr;
+ }
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end: {
+ bool IsStart = (Intrinsic == Intrinsic::lifetime_start);
+ // Stack coloring is not enabled at -O0; discard region information.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return nullptr;
+
+ SmallVector<Value *, 4> Allocas;
+ GetUnderlyingObjects(I.getArgOperand(1), Allocas, *DL);
+
+ for (SmallVectorImpl<Value*>::iterator Object = Allocas.begin(),
+ E = Allocas.end(); Object != E; ++Object) {
+ AllocaInst *LifetimeObject = dyn_cast_or_null<AllocaInst>(*Object);
+
+ // Could not find an Alloca.
+ if (!LifetimeObject)
+ continue;
+
+ // First check that the Alloca is static; otherwise it won't have a
+ // valid frame index.
+ auto SI = FuncInfo.StaticAllocaMap.find(LifetimeObject);
+ if (SI == FuncInfo.StaticAllocaMap.end())
+ return nullptr;
+
+ int FI = SI->second;
+
+ SDValue Ops[2];
+ Ops[0] = getRoot();
+ Ops[1] =
+ DAG.getFrameIndex(FI, TLI.getPointerTy(DAG.getDataLayout()), true);
+ unsigned Opcode = (IsStart ? ISD::LIFETIME_START : ISD::LIFETIME_END);
+
+ Res = DAG.getNode(Opcode, sdl, MVT::Other, Ops);
+ DAG.setRoot(Res);
+ }
+ return nullptr;
+ }
+ case Intrinsic::invariant_start:
+ // Discard region information.
+ setValue(&I, DAG.getUNDEF(TLI.getPointerTy(DAG.getDataLayout())));
+ return nullptr;
+ case Intrinsic::invariant_end:
+ // Discard region information.
+ return nullptr;
+ case Intrinsic::clear_cache:
+ return TLI.getClearCacheBuiltinName();
+ case Intrinsic::donothing:
+ // ignore
+ return nullptr;
+ case Intrinsic::experimental_stackmap: {
+ visitStackmap(I);
+ return nullptr;
+ }
+ case Intrinsic::experimental_patchpoint_void:
+ case Intrinsic::experimental_patchpoint_i64: {
+ visitPatchpoint(&I);
+ return nullptr;
+ }
+ case Intrinsic::experimental_gc_statepoint: {
+ LowerStatepoint(ImmutableStatepoint(&I));
+ return nullptr;
+ }
+ case Intrinsic::experimental_gc_result: {
+ visitGCResult(cast<GCResultInst>(I));
+ return nullptr;
+ }
+ case Intrinsic::experimental_gc_relocate: {
+ visitGCRelocate(cast<GCRelocateInst>(I));
+ return nullptr;
+ }
+ case Intrinsic::instrprof_increment:
+ llvm_unreachable("instrprof failed to lower an increment");
+ case Intrinsic::instrprof_value_profile:
+ llvm_unreachable("instrprof failed to lower a value profiling call");
+ case Intrinsic::localescape: {
+ MachineFunction &MF = DAG.getMachineFunction();
+ const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+
+ // Directly emit some LOCAL_ESCAPE machine instrs. Label assignment emission
+ // is the same on all targets.
+ for (unsigned Idx = 0, E = I.getNumArgOperands(); Idx < E; ++Idx) {
+ Value *Arg = I.getArgOperand(Idx)->stripPointerCasts();
+ if (isa<ConstantPointerNull>(Arg))
+ continue; // Skip null pointers. They represent a hole in index space.
+ AllocaInst *Slot = cast<AllocaInst>(Arg);
+ assert(FuncInfo.StaticAllocaMap.count(Slot) &&
+ "can only escape static allocas");
+ int FI = FuncInfo.StaticAllocaMap[Slot];
+ MCSymbol *FrameAllocSym =
+ MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
+ GlobalValue::getRealLinkageName(MF.getName()), Idx);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl,
+ TII->get(TargetOpcode::LOCAL_ESCAPE))
+ .addSym(FrameAllocSym)
+ .addFrameIndex(FI);
+ }
+
+ return nullptr;
+ }
+
+ case Intrinsic::localrecover: {
+ // i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
+ MachineFunction &MF = DAG.getMachineFunction();
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);
+
+ // Get the symbol that defines the frame offset.
+ auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
+ auto *Idx = cast<ConstantInt>(I.getArgOperand(2));
+ unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX));
+ MCSymbol *FrameAllocSym =
+ MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
+ GlobalValue::getRealLinkageName(Fn->getName()), IdxVal);
+
+ // Create an MCSymbol for the label to avoid any target lowering
+ // that would make this PC-relative.
+ SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
+ SDValue OffsetVal =
+ DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);
+
+ // Add the offset to the FP.
+ Value *FP = I.getArgOperand(1);
+ SDValue FPVal = getValue(FP);
+ SDValue Add = DAG.getNode(ISD::ADD, sdl, PtrVT, FPVal, OffsetVal);
+ setValue(&I, Add);
+
+ return nullptr;
+ }
+
+ case Intrinsic::eh_exceptionpointer:
+ case Intrinsic::eh_exceptioncode: {
+ // Get the exception pointer vreg, copy from it, and resize it to fit.
+ const auto *CPI = cast<CatchPadInst>(I.getArgOperand(0));
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ const TargetRegisterClass *PtrRC = TLI.getRegClassFor(PtrVT);
+ unsigned VReg = FuncInfo.getCatchPadExceptionPointerVReg(CPI, PtrRC);
+ SDValue N =
+ DAG.getCopyFromReg(DAG.getEntryNode(), getCurSDLoc(), VReg, PtrVT);
+ if (Intrinsic == Intrinsic::eh_exceptioncode)
+ N = DAG.getZExtOrTrunc(N, getCurSDLoc(), MVT::i32);
+ setValue(&I, N);
+ return nullptr;
+ }
+
+ case Intrinsic::experimental_deoptimize:
+ LowerDeoptimizeCall(&I);
+ return nullptr;
+ }
+}
+
+std::pair<SDValue, SDValue>
+SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+ const BasicBlock *EHPadBB) {
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ MCSymbol *BeginLabel = nullptr;
+
+ if (EHPadBB) {
+ // Insert a label before the invoke call to mark the try range. This can be
+ // used to detect deletion of the invoke via the MachineModuleInfo.
+ BeginLabel = MMI.getContext().createTempSymbol();
+
+ // For SjLj, keep track of which landing pads go with which invokes
+ // so as to maintain the ordering of pads in the LSDA.
+ unsigned CallSiteIndex = MMI.getCurrentCallSite();
+ if (CallSiteIndex) {
+ MMI.setCallSiteBeginLabel(BeginLabel, CallSiteIndex);
+ LPadToCallSiteMap[FuncInfo.MBBMap[EHPadBB]].push_back(CallSiteIndex);
+
+ // Now that the call site is handled, stop tracking it.
+ MMI.setCurrentCallSite(0);
+ }
+
+ // Both PendingLoads and PendingExports must be flushed here;
+ // this call might not return.
+ (void)getRoot();
+ DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getControlRoot(), BeginLabel));
+
+ CLI.setChain(getRoot());
+ }
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ std::pair<SDValue, SDValue> Result = TLI.LowerCallTo(CLI);
+
+ assert((CLI.IsTailCall || Result.second.getNode()) &&
+ "Non-null chain expected with non-tail call!");
+ assert((Result.second.getNode() || !Result.first.getNode()) &&
+ "Null value expected with tail call!");
+
+ if (!Result.second.getNode()) {
+ // As a special case, a null chain means that a tail call has been emitted
+ // and the DAG root is already updated.
+ HasTailCall = true;
+
+ // Since there's no actual continuation from this block, nothing can be
+ // relying on us setting vregs for them.
+ PendingExports.clear();
+ } else {
+ DAG.setRoot(Result.second);
+ }
+
+ if (EHPadBB) {
+ // Insert a label at the end of the invoke call to mark the try range. This
+ // can be used to detect deletion of the invoke via the MachineModuleInfo.
+ MCSymbol *EndLabel = MMI.getContext().createTempSymbol();
+ DAG.setRoot(DAG.getEHLabel(getCurSDLoc(), getRoot(), EndLabel));
+
+ // Inform MachineModuleInfo of range.
+ if (MMI.hasEHFunclets()) {
+ assert(CLI.CS);
+ WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
+ EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS->getInstruction()),
+ BeginLabel, EndLabel);
+ } else {
+ MMI.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
+ }
+ }
+
+ return Result;
+}
+
+void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
+ bool isTailCall,
+ const BasicBlock *EHPadBB) {
+ auto &DL = DAG.getDataLayout();
+ FunctionType *FTy = CS.getFunctionType();
+ Type *RetTy = CS.getType();
+
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Args.reserve(CS.arg_size());
+
+ const Value *SwiftErrorVal = nullptr;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ i != e; ++i) {
+ const Value *V = *i;
+
+ // Skip empty types
+ if (V->getType()->isEmptyTy())
+ continue;
+
+ SDValue ArgNode = getValue(V);
+ Entry.Node = ArgNode; Entry.Ty = V->getType();
+
+ // Skip the first return-type Attribute to get to params.
+ Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
+
+ // Use swifterror virtual register as input to the call.
+ if (Entry.isSwiftError && TLI.supportSwiftError()) {
+ SwiftErrorVal = V;
+ // Find the virtual register for the actual swifterror argument and use
+ // it in the call instead of the Value.
+ Entry.Node = DAG.getRegister(
+ FuncInfo.findSwiftErrorVReg(FuncInfo.MBB, V),
+ EVT(TLI.getPointerTy(DL)));
+ }
+
+ Args.push_back(Entry);
+
+ // If we have an explicit sret argument that is an Instruction (i.e., it
+ // might point to function-local memory), we can't meaningfully tail-call.
+ if (Entry.isSRet && isa<Instruction>(V))
+ isTailCall = false;
+ }
+
+ // Check if target-independent constraints permit a tail call here.
+ // Target-dependent constraints are checked within TLI->LowerCallTo.
+ if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
+ isTailCall = false;
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
+ .setTailCall(isTailCall)
+ .setConvergent(CS.isConvergent());
+ std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
+
+ if (Result.first.getNode()) {
+ const Instruction *Inst = CS.getInstruction();
+ Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first);
+ setValue(Inst, Result.first);
+ }
+
+ // The last element of CLI.InVals has the SDValue for swifterror return.
+ // Here we copy it to a virtual register and update SwiftErrorMap for
+ // book-keeping.
+ if (SwiftErrorVal && TLI.supportSwiftError()) {
+ // Get the last element of InVals.
+ SDValue Src = CLI.InVals.back();
+ const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
+ unsigned VReg = FuncInfo.MF->getRegInfo().createVirtualRegister(RC);
+ SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
+ // We update the virtual register for the actual swifterror argument.
+ FuncInfo.setSwiftErrorVReg(FuncInfo.MBB, SwiftErrorVal, VReg);
+ DAG.setRoot(CopyNode);
+ }
+}
+
+/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
+ for (const User *U : V->users()) {
+ if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
+ if (IC->isEquality())
+ if (const Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+ if (C->isNullValue())
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
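+
+// Note (illustrative annotation, not part of the original code): the check
+// above accepts a value whose every user is an equality compare against a
+// null constant, e.g. (hypothetical IR)
+//
+//   %r = call i32 @memcmp(i8* %p, i8* %q, i64 4)
+//   %z = icmp eq i32 %r, 0   ; accepted
+//
+// whereas any other user (an add, a store, an ordered compare) makes it
+// return false.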
+
+static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
+ Type *LoadTy,
+ SelectionDAGBuilder &Builder) {
+
+ // Check to see if this load can be trivially constant folded, e.g. if the
+ // input is from a string literal.
+ if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
+ // Cast pointer to the type we really want to load.
+ LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
+ PointerType::getUnqual(LoadTy));
+
+ if (const Constant *LoadCst = ConstantFoldLoadFromConstPtr(
+ const_cast<Constant *>(LoadInput), LoadTy, *Builder.DL))
+ return Builder.getValue(LoadCst);
+ }
+
+ // Otherwise, we have to emit the load. If the pointer is to unfoldable but
+ // still constant memory, the input chain can be the entry node.
+ SDValue Root;
+ bool ConstantMemory = false;
+
+ // Do not serialize (non-volatile) loads of constant memory with anything.
+ if (Builder.AA->pointsToConstantMemory(PtrVal)) {
+ Root = Builder.DAG.getEntryNode();
+ ConstantMemory = true;
+ } else {
+ // Do not serialize non-volatile loads against each other.
+ Root = Builder.DAG.getRoot();
+ }
+
+ SDValue Ptr = Builder.getValue(PtrVal);
+ SDValue LoadVal = Builder.DAG.getLoad(LoadVT, Builder.getCurSDLoc(), Root,
+ Ptr, MachinePointerInfo(PtrVal),
+ /* Alignment = */ 1);
+
+ if (!ConstantMemory)
+ Builder.PendingLoads.push_back(LoadVal.getValue(1));
+ return LoadVal;
+}
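+
+// Note (illustrative annotation, not part of the original code): when the
+// pointer is itself a Constant (e.g. the string-literal side of a call such
+// as memcmp(p, "ab", 2)), ConstantFoldLoadFromConstPtr folds the load to a
+// constant and no load node is emitted; otherwise a plain align-1 load is
+// built, chained to the entry node when AA proves the memory is constant so
+// it does not serialize with other loads.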
+
+/// processIntegerCallValue - Record the value for an instruction that
+/// produces an integer result, converting the type where necessary.
+void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
+ SDValue Value,
+ bool IsSigned) {
+ EVT VT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
+ if (IsSigned)
+ Value = DAG.getSExtOrTrunc(Value, getCurSDLoc(), VT);
+ else
+ Value = DAG.getZExtOrTrunc(Value, getCurSDLoc(), VT);
+ setValue(&I, Value);
+}
+
+/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
+ // Verify that the prototype makes sense. int memcmp(void*,void*,size_t)
+ if (I.getNumArgOperands() != 3)
+ return false;
+
+ const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
+ if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
+ !I.getArgOperand(2)->getType()->isIntegerTy() ||
+ !I.getType()->isIntegerTy())
+ return false;
+
+ const Value *Size = I.getArgOperand(2);
+ const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
+ if (CSize && CSize->getZExtValue() == 0) {
+ EVT CallVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
+ I.getType(), true);
+ setValue(&I, DAG.getConstant(0, getCurSDLoc(), CallVT));
+ return true;
+ }
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(LHS), getValue(RHS), getValue(Size),
+ MachinePointerInfo(LHS),
+ MachinePointerInfo(RHS));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS) != 0
+ // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS) != 0
+ if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
+ bool ActuallyDoIt = true;
+ MVT LoadVT;
+ Type *LoadTy;
+ switch (CSize->getZExtValue()) {
+ default:
+ LoadVT = MVT::Other;
+ LoadTy = nullptr;
+ ActuallyDoIt = false;
+ break;
+ case 2:
+ LoadVT = MVT::i16;
+ LoadTy = Type::getInt16Ty(CSize->getContext());
+ break;
+ case 4:
+ LoadVT = MVT::i32;
+ LoadTy = Type::getInt32Ty(CSize->getContext());
+ break;
+ case 8:
+ LoadVT = MVT::i64;
+ LoadTy = Type::getInt64Ty(CSize->getContext());
+ break;
+ /*
+ case 16:
+ LoadVT = MVT::v4i32;
+ LoadTy = Type::getInt32Ty(CSize->getContext());
+ LoadTy = VectorType::get(LoadTy, 4);
+ break;
+ */
+ }
+
+ // This turns into unaligned loads. We only do this if the target natively
+ // supports the MVT we'll be loading or if it is small enough (<= 4) that
+ // we'll only produce a small number of byte loads.
+
+ // Require that we can find a legal MVT, and only do this if the target
+ // supports unaligned loads of that type. Expanding into byte loads would
+ // bloat the code.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (ActuallyDoIt && CSize->getZExtValue() > 4) {
+ unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+ unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+ // TODO: Handle 5 byte compare as 4-byte + 1 byte.
+ // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
+ // TODO: Check alignment of src and dest ptrs.
+ if (!TLI.isTypeLegal(LoadVT) ||
+ !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
+ !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
+ ActuallyDoIt = false;
+ }
+
+ if (ActuallyDoIt) {
+ SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
+ SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+
+ SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
+ ISD::SETNE);
+ processIntegerCallValue(I, Res, false);
+ return true;
+ }
+ }
+
+ return false;
+}
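+
+// Note (illustrative annotation, not part of the original code): with a
+// constant size whose result is only tested for equality against zero, e.g.
+//
+//   if (memcmp(p, q, 4) == 0) { ... }
+//
+// the call is replaced by two loads of the matching width and a SETNE; for
+// the 8-byte case this additionally requires that MVT::i64 be legal and that
+// the target allow misaligned loads of that type.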
+
+/// visitMemChrCall -- See if we can lower a memchr call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
+ // Verify that the prototype makes sense. void *memchr(void *, int, size_t)
+ if (I.getNumArgOperands() != 3)
+ return false;
+
+ const Value *Src = I.getArgOperand(0);
+ const Value *Char = I.getArgOperand(1);
+ const Value *Length = I.getArgOperand(2);
+ if (!Src->getType()->isPointerTy() ||
+ !Char->getType()->isIntegerTy() ||
+ !Length->getType()->isIntegerTy() ||
+ !I.getType()->isPointerTy())
+ return false;
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForMemchr(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Src), getValue(Char), getValue(Length),
+ MachinePointerInfo(Src));
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
+/// optimized form. If so, return true and lower it, otherwise return false
+/// and it will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
+ // Verify that the prototype makes sense. char *strcpy(char *, char *)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isPointerTy() ||
+ !I.getType()->isPointerTy())
+ return false;
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcpy(DAG, getCurSDLoc(), getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1), isStpcpy);
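+ // Unlike the read-only string routines handled here, strcpy/stpcpy write
+ // memory, so the returned chain becomes the new DAG root instead of being
+ // queued as a pending load.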
+ if (Res.first.getNode()) {
+ setValue(&I, Res.first);
+ DAG.setRoot(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
+/// If so, return true and lower it, otherwise return false and it will be
+/// lowered like a normal call.
+bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
+ // Verify that the prototype makes sense. int strcmp(char*,char*)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isPointerTy() ||
+ !I.getType()->isIntegerTy())
+ return false;
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrcmp(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0),
+ MachinePointerInfo(Arg1));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, true);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrLenCall -- See if we can lower a strlen call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
+ // Verify that the prototype makes sense. size_t strlen(char *)
+ if (I.getNumArgOperands() != 1)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0);
+ if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
+ return false;
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
+/// form. If so, return true and lower it, otherwise return false and it
+/// will be lowered like a normal call.
+bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
+ // Verify that the prototype makes sense. size_t strnlen(char *, size_t)
+ if (I.getNumArgOperands() != 2)
+ return false;
+
+ const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
+ if (!Arg0->getType()->isPointerTy() ||
+ !Arg1->getType()->isIntegerTy() ||
+ !I.getType()->isIntegerTy())
+ return false;
+
+ const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
+ std::pair<SDValue, SDValue> Res =
+ TSI.EmitTargetCodeForStrnlen(DAG, getCurSDLoc(), DAG.getRoot(),
+ getValue(Arg0), getValue(Arg1),
+ MachinePointerInfo(Arg0));
+ if (Res.first.getNode()) {
+ processIntegerCallValue(I, Res.first, false);
+ PendingLoads.push_back(Res.second);
+ return true;
+ }
+
+ return false;
+}
+
+/// visitUnaryFloatCall - If a call instruction is a unary floating-point
+/// operation (as expected), translate it to an SDNode with the specified opcode
+/// and return true.
+bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
+ unsigned Opcode) {
+ // Sanity check that it really is a unary floating-point call.
+ if (I.getNumArgOperands() != 1 ||
+ !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+ I.getType() != I.getArgOperand(0)->getType() ||
+ !I.onlyReadsMemory())
+ return false;
+
+ SDValue Tmp = getValue(I.getArgOperand(0));
+ setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), Tmp.getValueType(), Tmp));
+ return true;
+}
+
+/// visitBinaryFloatCall - If a call instruction is a binary floating-point
+/// operation (as expected), translate it to an SDNode with the specified opcode
+/// and return true.
+bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
+ unsigned Opcode) {
+ // Sanity check that it really is a binary floating-point call.
+ if (I.getNumArgOperands() != 2 ||
+ !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
+ I.getType() != I.getArgOperand(0)->getType() ||
+ I.getType() != I.getArgOperand(1)->getType() ||
+ !I.onlyReadsMemory())
+ return false;
+
+ SDValue Tmp0 = getValue(I.getArgOperand(0));
+ SDValue Tmp1 = getValue(I.getArgOperand(1));
+ EVT VT = Tmp0.getValueType();
+ setValue(&I, DAG.getNode(Opcode, getCurSDLoc(), VT, Tmp0, Tmp1));
+ return true;
+}
+
+void SelectionDAGBuilder::visitCall(const CallInst &I) {
+ // Handle inline assembly differently.
+ if (isa<InlineAsm>(I.getCalledValue())) {
+ visitInlineAsm(&I);
+ return;
+ }
+
+ MachineModuleInfo &MMI = DAG.getMachineFunction().getMMI();
+ ComputeUsesVAFloatArgument(I, &MMI);
+
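+ // visitIntrinsicCall returns null when it has completely lowered the
+ // intrinsic; otherwise it returns the name of a library call to emit in its
+ // place, which is materialized as an external symbol below.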
+ const char *RenameFn = nullptr;
+ if (Function *F = I.getCalledFunction()) {
+ if (F->isDeclaration()) {
+ if (const TargetIntrinsicInfo *II = TM.getIntrinsicInfo()) {
+ if (unsigned IID = II->getIntrinsicID(F)) {
+ RenameFn = visitIntrinsicCall(I, IID);
+ if (!RenameFn)
+ return;
+ }
+ }
+ if (Intrinsic::ID IID = F->getIntrinsicID()) {
+ RenameFn = visitIntrinsicCall(I, IID);
+ if (!RenameFn)
+ return;
+ }
+ }
+
+ // Check for well-known libc/libm calls. If the function is internal, it
+ // can't be a library call. Don't do the check if marked as nobuiltin for
+ // some reason.
+ LibFunc::Func Func;
+ if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
+ LibInfo->getLibFunc(F->getName(), Func) &&
+ LibInfo->hasOptimizedCodeGen(Func)) {
+ switch (Func) {
+ default: break;
+ case LibFunc::copysign:
+ case LibFunc::copysignf:
+ case LibFunc::copysignl:
+ if (I.getNumArgOperands() == 2 && // Basic sanity checks.
+ I.getArgOperand(0)->getType()->isFloatingPointTy() &&
+ I.getType() == I.getArgOperand(0)->getType() &&
+ I.getType() == I.getArgOperand(1)->getType() &&
+ I.onlyReadsMemory()) {
+ SDValue LHS = getValue(I.getArgOperand(0));
+ SDValue RHS = getValue(I.getArgOperand(1));
+ setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
+ LHS.getValueType(), LHS, RHS));
+ return;
+ }
+ break;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ if (visitUnaryFloatCall(I, ISD::FABS))
+ return;
+ break;
+ case LibFunc::fmin:
+ case LibFunc::fminf:
+ case LibFunc::fminl:
+ if (visitBinaryFloatCall(I, ISD::FMINNUM))
+ return;
+ break;
+ case LibFunc::fmax:
+ case LibFunc::fmaxf:
+ case LibFunc::fmaxl:
+ if (visitBinaryFloatCall(I, ISD::FMAXNUM))
+ return;
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ if (visitUnaryFloatCall(I, ISD::FSIN))
+ return;
+ break;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ if (visitUnaryFloatCall(I, ISD::FCOS))
+ return;
+ break;
+ case LibFunc::sqrt:
+ case LibFunc::sqrtf:
+ case LibFunc::sqrtl:
+ case LibFunc::sqrt_finite:
+ case LibFunc::sqrtf_finite:
+ case LibFunc::sqrtl_finite:
+ if (visitUnaryFloatCall(I, ISD::FSQRT))
+ return;
+ break;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ if (visitUnaryFloatCall(I, ISD::FFLOOR))
+ return;
+ break;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ if (visitUnaryFloatCall(I, ISD::FNEARBYINT))
+ return;
+ break;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ if (visitUnaryFloatCall(I, ISD::FCEIL))
+ return;
+ break;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ if (visitUnaryFloatCall(I, ISD::FRINT))
+ return;
+ break;
+ case LibFunc::round:
+ case LibFunc::roundf:
+ case LibFunc::roundl:
+ if (visitUnaryFloatCall(I, ISD::FROUND))
+ return;
+ break;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ if (visitUnaryFloatCall(I, ISD::FTRUNC))
+ return;
+ break;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ if (visitUnaryFloatCall(I, ISD::FLOG2))
+ return;
+ break;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ if (visitUnaryFloatCall(I, ISD::FEXP2))
+ return;
+ break;
+ case LibFunc::memcmp:
+ if (visitMemCmpCall(I))
+ return;
+ break;
+ case LibFunc::memchr:
+ if (visitMemChrCall(I))
+ return;
+ break;
+ case LibFunc::strcpy:
+ if (visitStrCpyCall(I, false))
+ return;
+ break;
+ case LibFunc::stpcpy:
+ if (visitStrCpyCall(I, true))
+ return;
+ break;
+ case LibFunc::strcmp:
+ if (visitStrCmpCall(I))
+ return;
+ break;
+ case LibFunc::strlen:
+ if (visitStrLenCall(I))
+ return;
+ break;
+ case LibFunc::strnlen:
+ if (visitStrNLenCall(I))
+ return;
+ break;
+ }
+ }
+ }
+
+ SDValue Callee;
+ if (!RenameFn)
+ Callee = getValue(I.getCalledValue());
+ else
+ Callee = DAG.getExternalSymbol(
+ RenameFn,
+ DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
+
+ // Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
+ // have to do anything here to lower funclet bundles.
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
+ "Cannot lower calls with arbitrary operand bundles!");
+
+ if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
+ LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
+ else
+ // Check if we can potentially perform a tail call. More detailed checking
+ // is done within LowerCallTo, after more information about the call is
+ // known.
+ LowerCallTo(&I, Callee, I.isTailCall());
+}
+
+namespace {
+
+/// AsmOperandInfo - This contains information for each constraint that we are
+/// lowering.
+class SDISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
+public:
+ /// CallOperand - If this is the result output operand or a clobber
+ /// this is null, otherwise it is the incoming operand to the CallInst.
+ /// This gets modified as the asm is processed.
+ SDValue CallOperand;
+
+ /// AssignedRegs - If this is a register or register class operand, this
+ /// contains the set of registers corresponding to the operand.
+ RegsForValue AssignedRegs;
+
+ explicit SDISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &info)
+ : TargetLowering::AsmOperandInfo(info), CallOperand(nullptr,0) {
+ }
+
+ /// getCallOperandValEVT - Return the EVT of the Value* that this operand
+ /// corresponds to. If there is no Value* for this operand, it returns
+ /// MVT::Other.
+ EVT getCallOperandValEVT(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL) const {
+ if (!CallOperandVal) return MVT::Other;
+
+ if (isa<BasicBlock>(CallOperandVal))
+ return TLI.getPointerTy(DL);
+
+ llvm::Type *OpTy = CallOperandVal->getType();
+
+ // FIXME: code duplicated from TargetLowering::ParseConstraints().
+ // If this is an indirect operand, the operand is a pointer to the
+ // accessed type.
+ if (isIndirect) {
+ llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+ if (!PtrTy)
+ report_fatal_error("Indirect operand for inline asm not a pointer!");
+ OpTy = PtrTy->getElementType();
+ }
+
+ // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+ if (StructType *STy = dyn_cast<StructType>(OpTy))
+ if (STy->getNumElements() == 1)
+ OpTy = STy->getElementType(0);
+
+ // If OpTy is not a single value, it may be a struct/union that we
+ // can tile with integers.
+ if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+ unsigned BitSize = DL.getTypeSizeInBits(OpTy);
+ switch (BitSize) {
+ default: break;
+ case 1:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ OpTy = IntegerType::get(Context, BitSize);
+ break;
+ }
+ }
+
+ return TLI.getValueType(DL, OpTy, true);
+ }
+};
+
+typedef SmallVector<SDISelAsmOperandInfo,16> SDISelAsmOperandInfoVector;
+
+} // end anonymous namespace
+
+/// GetRegistersForValue - Assign registers (virtual or physical) for the
+/// specified operand. We prefer to assign virtual registers, to allow the
+/// register allocator to handle the assignment process. However, if the asm
+/// uses features that we can't model on machineinstrs, we have SDISel do the
+/// allocation. This produces generally horrible, but correct, code.
+///
+/// OpInfo describes the operand.
+///
+static void GetRegistersForValue(SelectionDAG &DAG, const TargetLowering &TLI,
+ const SDLoc &DL,
+ SDISelAsmOperandInfo &OpInfo) {
+ LLVMContext &Context = *DAG.getContext();
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ SmallVector<unsigned, 4> Regs;
+
+ // If this is a constraint for a single physreg, or a constraint for a
+ // register class, find it.
+ std::pair<unsigned, const TargetRegisterClass *> PhysReg =
+ TLI.getRegForInlineAsmConstraint(MF.getSubtarget().getRegisterInfo(),
+ OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
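+ // PhysReg.first names a specific physical register (or is zero), and
+ // PhysReg.second is the register class to allocate from, if any.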
+
+ unsigned NumRegs = 1;
+ if (OpInfo.ConstraintVT != MVT::Other) {
+ // If this is an FP input in an integer register (or vice versa), insert a bit
+ // cast of the input value. More generally, handle any case where the input
+ // value disagrees with the register class we plan to stick this in.
+ if (OpInfo.Type == InlineAsm::isInput &&
+ PhysReg.second && !PhysReg.second->hasType(OpInfo.ConstraintVT)) {
+ // Try to convert to the first EVT that the reg class contains. If the
+ // types are identical size, use a bitcast to convert (e.g. two differing
+ // vector types).
+ MVT RegVT = *PhysReg.second->vt_begin();
+ if (RegVT.getSizeInBits() == OpInfo.CallOperand.getValueSizeInBits()) {
+ OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
+ RegVT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = RegVT;
+ } else if (RegVT.isInteger() && OpInfo.ConstraintVT.isFloatingPoint()) {
+ // If the input is an FP value and we want it in integer registers, do a
+ // bitcast to the corresponding integer type. This turns an f64 value
+ // into i64, which can be passed with two i32 values on a 32-bit
+ // machine.
+ RegVT = MVT::getIntegerVT(OpInfo.ConstraintVT.getSizeInBits());
+ OpInfo.CallOperand = DAG.getNode(ISD::BITCAST, DL,
+ RegVT, OpInfo.CallOperand);
+ OpInfo.ConstraintVT = RegVT;
+ }
+ }
+
+ NumRegs = TLI.getNumRegisters(Context, OpInfo.ConstraintVT);
+ }
+
+ MVT RegVT;
+ EVT ValueVT = OpInfo.ConstraintVT;
+
+ // If this is a constraint for a specific physical register, like {r17},
+ // assign it now.
+ if (unsigned AssignedReg = PhysReg.first) {
+ const TargetRegisterClass *RC = PhysReg.second;
+ if (OpInfo.ConstraintVT == MVT::Other)
+ ValueVT = *RC->vt_begin();
+
+ // Get the actual register value type. This is important, because the user
+ // may have asked for (e.g.) the AX register in i32 type. We need to
+ // remember that AX is actually i16 to get the right extension.
+ RegVT = *RC->vt_begin();
+
+ // This is an explicit reference to a physical register.
+ Regs.push_back(AssignedReg);
+
+ // If this is an expanded reference, add the rest of the regs to Regs.
+ if (NumRegs != 1) {
+ TargetRegisterClass::iterator I = RC->begin();
+ for (; *I != AssignedReg; ++I)
+ assert(I != RC->end() && "Didn't find reg!");
+
+ // Already added the first reg.
+ --NumRegs; ++I;
+ for (; NumRegs; --NumRegs, ++I) {
+ assert(I != RC->end() && "Ran out of registers to allocate!");
+ Regs.push_back(*I);
+ }
+ }
+
+ OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+ return;
+ }
+
+ // Otherwise, if this was a reference to an LLVM register class, create vregs
+ // for this reference.
+ if (const TargetRegisterClass *RC = PhysReg.second) {
+ RegVT = *RC->vt_begin();
+ if (OpInfo.ConstraintVT == MVT::Other)
+ ValueVT = RegVT;
+
+ // Create the appropriate number of virtual registers.
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+ for (; NumRegs; --NumRegs)
+ Regs.push_back(RegInfo.createVirtualRegister(RC));
+
+ OpInfo.AssignedRegs = RegsForValue(Regs, RegVT, ValueVT);
+ return;
+ }
+
+ // Otherwise, we couldn't allocate enough registers for this.
+}
+
+/// visitInlineAsm - Handle a call to an InlineAsm object.
+///
+void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
+ const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+
+ /// ConstraintOperands - Information about all of the constraints.
+ SDISelAsmOperandInfoVector ConstraintOperands;
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
+ DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);
+
+ bool hasMemory = false;
+
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
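+ // First pass over the constraints: compute the value type of each operand
+ // and record whether any constraint touches memory.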
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ ConstraintOperands.push_back(SDISelAsmOperandInfo(TargetConstraints[i]));
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ MVT OpVT = MVT::Other;
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ // Indirect outputs just consume an argument.
+ if (OpInfo.isIndirect) {
+ OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ break;
+ }
+
+ // The return value of the call is this value. As such, there is no
+ // corresponding argument.
+ assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ OpVT = TLI.getSimpleValueType(DAG.getDataLayout(),
+ STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpVT = TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
+ }
+ ++ResNo;
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ break;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ // If this is an input or an indirect output, process the call argument.
+ // BasicBlocks are labels, currently appearing only in asm's.
+ if (OpInfo.CallOperandVal) {
+ if (const BasicBlock *BB = dyn_cast<BasicBlock>(OpInfo.CallOperandVal)) {
+ OpInfo.CallOperand = DAG.getBasicBlock(FuncInfo.MBBMap[BB]);
+ } else {
+ OpInfo.CallOperand = getValue(OpInfo.CallOperandVal);
+ }
+
+ OpVT = OpInfo.getCallOperandValEVT(*DAG.getContext(), TLI,
+ DAG.getDataLayout()).getSimpleVT();
+ }
+
+ OpInfo.ConstraintVT = OpVT;
+
+ // Indirect operands access memory.
+ if (OpInfo.isIndirect)
+ hasMemory = true;
+ else {
+ for (unsigned j = 0, ee = OpInfo.Codes.size(); j != ee; ++j) {
+ TargetLowering::ConstraintType
+ CType = TLI.getConstraintType(OpInfo.Codes[j]);
+ if (CType == TargetLowering::C_Memory) {
+ hasMemory = true;
+ break;
+ }
+ }
+ }
+ }
+
+ SDValue Chain, Flag;
+
+ // We won't need to flush pending loads if this asm doesn't touch
+ // memory and is nonvolatile.
+ if (hasMemory || IA->hasSideEffects())
+ Chain = getRoot();
+ else
+ Chain = DAG.getRoot();
+
+ // Second pass over the constraints: compute which constraint option to use
+ // and assign registers to constraints that want a specific physreg.
+ for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ // If this is an output operand with a matching input operand, look up the
+ // matching input. If their types mismatch, e.g. one is an integer, the
+ // other is floating point, or their sizes are different, flag it as an
+ // error.
+ if (OpInfo.hasMatchingInput()) {
+ SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ const TargetRegisterInfo *TRI = DAG.getSubtarget().getRegisterInfo();
+ std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+ TLI.getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass *> InputRC =
+ TLI.getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+ Input.ConstraintVT);
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (MatchRC.second != InputRC.second)) {
+ report_fatal_error("Unsupported asm: input constraint"
+ " with a matching output constraint of"
+ " incompatible type!");
+ }
+ Input.ConstraintVT = OpInfo.ConstraintVT;
+ }
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, OpInfo.CallOperand, &DAG);
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ OpInfo.Type == InlineAsm::isClobber)
+ continue;
+
+ // If this is a memory input, and if the operand is not indirect, do what
+ // we need to in order to provide an address for the memory input.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory &&
+ !OpInfo.isIndirect) {
+ assert((OpInfo.isMultipleAlternative ||
+ (OpInfo.Type == InlineAsm::isInput)) &&
+ "Can only indirectify direct input operands!");
+
+ // Memory operands really want the address of the value. If we don't have
+ // an indirect input, put it in the constpool if we can, otherwise spill
+ // it to a stack slot.
+ // TODO: This isn't quite right. We need to handle these according to
+ // the addressing mode that the constraint wants. Also, this may take
+ // an additional register for the computation and we don't want that
+ // either.
+
+ // If the operand is a float, integer, or vector constant, spill to a
+ // constant pool entry to get its address.
+ const Value *OpVal = OpInfo.CallOperandVal;
+ if (isa<ConstantFP>(OpVal) || isa<ConstantInt>(OpVal) ||
+ isa<ConstantVector>(OpVal) || isa<ConstantDataVector>(OpVal)) {
+ OpInfo.CallOperand = DAG.getConstantPool(
+ cast<Constant>(OpVal), TLI.getPointerTy(DAG.getDataLayout()));
+ } else {
+ // Otherwise, create a stack slot and emit a store to it before the
+ // asm.
+ Type *Ty = OpVal->getType();
+ auto &DL = DAG.getDataLayout();
+ uint64_t TySize = DL.getTypeAllocSize(Ty);
+ unsigned Align = DL.getPrefTypeAlignment(Ty);
+ MachineFunction &MF = DAG.getMachineFunction();
+ int SSFI = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+ SDValue StackSlot =
+ DAG.getFrameIndex(SSFI, TLI.getPointerTy(DAG.getDataLayout()));
+ Chain = DAG.getStore(
+ Chain, getCurSDLoc(), OpInfo.CallOperand, StackSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+ OpInfo.CallOperand = StackSlot;
+ }
+
+ // There is no longer a Value* corresponding to this operand.
+ OpInfo.CallOperandVal = nullptr;
+
+ // It is now an indirect operand.
+ OpInfo.isIndirect = true;
+ }
+
+ // If this constraint is for a specific register, allocate it before
+ // anything else.
+ if (OpInfo.ConstraintType == TargetLowering::C_Register)
+ GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
+ }
+
+ // Third pass - Loop over all of the operands, assigning virtual or physregs
+ // to register class operands.
+ for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ // C_Register operands have already been allocated, Other/Memory don't need
+ // to be.
+ if (OpInfo.ConstraintType == TargetLowering::C_RegisterClass)
+ GetRegistersForValue(DAG, TLI, getCurSDLoc(), OpInfo);
+ }
+
+ // AsmNodeOperands - The operands for the ISD::INLINEASM node.
+ std::vector<SDValue> AsmNodeOperands;
+ AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
+ AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
+ IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));
+
+ // If we have a !srcloc metadata node associated with it, we want to attach
+ // this to the ultimately generated inline asm machineinstr. To do this, we
+ // pass in the third operand as this (potentially null) inline asm MDNode.
+ const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
+ AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
+
+ // Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
+ // bits as operand 3.
+ unsigned ExtraInfo = 0;
+ if (IA->hasSideEffects())
+ ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ if (CS.isConvergent())
+ ExtraInfo |= InlineAsm::Extra_IsConvergent;
+ // Set the asm dialect.
+ ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+
+ // Determine if this InlineAsm MayLoad or MayStore based on the constraints.
+ for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
+ TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
+
+ // Compute the constraint code and ConstraintType to use.
+ TLI.ComputeConstraintToUse(OpInfo, SDValue());
+
+ // Ideally, we would only check against memory constraints. However, the
+ // meaning of an 'other' constraint can be target-specific and we can't
+ // easily reason about it. Therefore, be conservative and set
+ // MayLoad/MayStore for 'other' constraints as well.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.Type == InlineAsm::isInput)
+ ExtraInfo |= InlineAsm::Extra_MayLoad;
+ else if (OpInfo.Type == InlineAsm::isOutput)
+ ExtraInfo |= InlineAsm::Extra_MayStore;
+ else if (OpInfo.Type == InlineAsm::isClobber)
+ ExtraInfo |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
+ }
+ }
+
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ExtraInfo, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+
+ // Loop over all of the inputs, copying the operand values into the
+ // appropriate registers and processing the output regs.
+ RegsForValue RetValRegs;
+
+ // IndirectStoresToEmit - The set of stores to emit after the inline asm node.
+ std::vector<std::pair<RegsForValue, Value*> > IndirectStoresToEmit;
+
+ for (unsigned i = 0, e = ConstraintOperands.size(); i != e; ++i) {
+ SDISelAsmOperandInfo &OpInfo = ConstraintOperands[i];
+
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput: {
+ if (OpInfo.ConstraintType != TargetLowering::C_RegisterClass &&
+ OpInfo.ConstraintType != TargetLowering::C_Register) {
+ // Memory output, or 'other' output (e.g. 'X' constraint).
+ assert(OpInfo.isIndirect && "Memory output must be indirect operand");
+
+ unsigned ConstraintID =
+ TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM node to know about this output.
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(OpFlags, getCurSDLoc(),
+ MVT::i32));
+ AsmNodeOperands.push_back(OpInfo.CallOperand);
+ break;
+ }
+
+ // Otherwise, this is a register or register class output.
+
+ // Copy the output from the appropriate register. Find a register that
+ // we can use.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+ emitInlineAsmError(
+ CS, "couldn't allocate output register for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ // If this is an indirect operand, store through the pointer after the
+ // asm.
+ if (OpInfo.isIndirect) {
+ IndirectStoresToEmit.push_back(std::make_pair(OpInfo.AssignedRegs,
+ OpInfo.CallOperandVal));
+ } else {
+ // This is the result value of the call.
+ assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+ // Concatenate this output onto the outputs list.
+ RetValRegs.append(OpInfo.AssignedRegs);
+ }
+
+ // Add information to the INLINEASM node to know that this register is
+ // set.
+ OpInfo.AssignedRegs
+ .AddInlineAsmOperands(OpInfo.isEarlyClobber
+ ? InlineAsm::Kind_RegDefEarlyClobber
+ : InlineAsm::Kind_RegDef,
+ false, 0, getCurSDLoc(), DAG, AsmNodeOperands);
+ break;
+ }
+ case InlineAsm::isInput: {
+ SDValue InOperandVal = OpInfo.CallOperand;
+
+ if (OpInfo.isMatchingInputConstraint()) { // Matching constraint?
+ // If this is required to match an output register we have already set,
+ // just use its register.
+ unsigned OperandNo = OpInfo.getMatchedOperand();
+
+ // Scan until we find the definition we already emitted of this operand.
+ // When we find it, create a RegsForValue operand.
+ unsigned CurOp = InlineAsm::Op_FirstOperand;
+ for (; OperandNo; --OperandNo) {
+ // Advance to the next operand.
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ assert((InlineAsm::isRegDefKind(OpFlag) ||
+ InlineAsm::isRegDefEarlyClobberKind(OpFlag) ||
+ InlineAsm::isMemKind(OpFlag)) && "Skipped past definitions?");
+ CurOp += InlineAsm::getNumOperandRegisters(OpFlag)+1;
+ }
+
+ unsigned OpFlag =
+ cast<ConstantSDNode>(AsmNodeOperands[CurOp])->getZExtValue();
+ if (InlineAsm::isRegDefKind(OpFlag) ||
+ InlineAsm::isRegDefEarlyClobberKind(OpFlag)) {
+ // Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
+ if (OpInfo.isIndirect) {
+ // This happens on gcc/testsuite/gcc.dg/pr8788-1.c
+ emitInlineAsmError(CS, "inline asm not supported yet:"
+ " don't know how to handle tied "
+ "indirect register inputs");
+ return;
+ }
+
+ RegsForValue MatchedRegs;
+ MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType());
+ MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType();
+ MatchedRegs.RegVTs.push_back(RegVT);
+ MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo();
+ for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag);
+ i != e; ++i) {
+ if (const TargetRegisterClass *RC = TLI.getRegClassFor(RegVT))
+ MatchedRegs.Regs.push_back(RegInfo.createVirtualRegister(RC));
+ else {
+ emitInlineAsmError(
+ CS, "inline asm error: This value"
+ " type register class is not natively supported!");
+ return;
+ }
+ }
+ SDLoc dl = getCurSDLoc();
+ // Use the produced MatchedRegs object to copy the input value into the
+ // matched registers.
+ MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl,
+ Chain, &Flag, CS.getInstruction());
+ MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
+ true, OpInfo.getMatchedOperand(), dl,
+ DAG, AsmNodeOperands);
+ break;
+ }
+
+ assert(InlineAsm::isMemKind(OpFlag) && "Unknown matching constraint!");
+ assert(InlineAsm::getNumOperandRegisters(OpFlag) == 1 &&
+ "Unexpected number of operands");
+ // Add information to the INLINEASM node to know about this input.
+ // See InlineAsm.h isUseOperandTiedToDef.
+ OpFlag = InlineAsm::convertMemFlagWordToMatchingFlagWord(OpFlag);
+ OpFlag = InlineAsm::getFlagWordForMatchingOp(OpFlag,
+ OpInfo.getMatchedOperand());
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ OpFlag, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+ AsmNodeOperands.push_back(AsmNodeOperands[CurOp+1]);
+ break;
+ }
+
+ // Treat indirect 'X' constraint as memory.
+ if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ OpInfo.isIndirect)
+ OpInfo.ConstraintType = TargetLowering::C_Memory;
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Other) {
+ std::vector<SDValue> Ops;
+ TLI.LowerAsmOperandForConstraint(InOperandVal, OpInfo.ConstraintCode,
+ Ops, DAG);
+ if (Ops.empty()) {
+ emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
+ AsmNodeOperands.push_back(DAG.getTargetConstant(
+ ResOpType, getCurSDLoc(), TLI.getPointerTy(DAG.getDataLayout())));
+ AsmNodeOperands.insert(AsmNodeOperands.end(), Ops.begin(), Ops.end());
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
+ assert(InOperandVal.getValueType() ==
+ TLI.getPointerTy(DAG.getDataLayout()) &&
+ "Memory operands expect pointer values");
+
+ unsigned ConstraintID =
+ TLI.getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned ResOpType = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ ResOpType = InlineAsm::getFlagWordForMem(ResOpType, ConstraintID);
+ AsmNodeOperands.push_back(DAG.getTargetConstant(ResOpType,
+ getCurSDLoc(),
+ MVT::i32));
+ AsmNodeOperands.push_back(InOperandVal);
+ break;
+ }
+
+ assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ "Unknown constraint type!");
+
+ // TODO: Support this.
+ if (OpInfo.isIndirect) {
+ emitInlineAsmError(
+ CS, "Don't know how to handle indirect register inputs yet "
+ "for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ // Copy the input into the appropriate registers.
+ if (OpInfo.AssignedRegs.Regs.empty()) {
+ emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
+ return;
+ }
+
+ SDLoc dl = getCurSDLoc();
+
+ OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl,
+ Chain, &Flag, CS.getInstruction());
+
+ OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
+ dl, DAG, AsmNodeOperands);
+ break;
+ }
+ case InlineAsm::isClobber: {
+ // Add the clobbered value to the operand list, so that the register
+ // allocator is aware that the physreg got clobbered.
+ if (!OpInfo.AssignedRegs.Regs.empty())
+ OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_Clobber,
+ false, 0, getCurSDLoc(), DAG,
+ AsmNodeOperands);
+ break;
+ }
+ }
+ }
+
+ // Finish up input operands. Set the input chain and add the flag last.
+ AsmNodeOperands[InlineAsm::Op_InputChain] = Chain;
+ if (Flag.getNode()) AsmNodeOperands.push_back(Flag);
+
+ Chain = DAG.getNode(ISD::INLINEASM, getCurSDLoc(),
+ DAG.getVTList(MVT::Other, MVT::Glue), AsmNodeOperands);
+ Flag = Chain.getValue(1);
+
+ // If this asm returns a register value, copy the result from that register
+ // and set it as the value of the call.
+ if (!RetValRegs.Regs.empty()) {
+ SDValue Val = RetValRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
+ Chain, &Flag, CS.getInstruction());
+
+ // FIXME: Why don't we do this for inline asms with MRVs?
+ if (CS.getType()->isSingleValueType() && CS.getType()->isSized()) {
+ EVT ResultType = TLI.getValueType(DAG.getDataLayout(), CS.getType());
+
+ // If any of the results of the inline asm is a vector, it may have the
+ // wrong width/num elts. This can happen for register classes that can
+ // contain multiple different value types. The preg or vreg allocated may
+ // not have the same VT as was expected. Convert it to the right type
+ // with bit_convert.
+ if (ResultType != Val.getValueType() && Val.getValueType().isVector()) {
+ Val = DAG.getNode(ISD::BITCAST, getCurSDLoc(),
+ ResultType, Val);
+
+ } else if (ResultType != Val.getValueType() &&
+ ResultType.isInteger() && Val.getValueType().isInteger()) {
+ // If a result value was tied to an input value, the computed result may
+ // have a wider width than the expected result. Extract the relevant
+ // portion.
+ Val = DAG.getNode(ISD::TRUNCATE, getCurSDLoc(), ResultType, Val);
+ }
+
+ assert(ResultType == Val.getValueType() && "Asm result value mismatch!");
+ }
+
+ setValue(CS.getInstruction(), Val);
+ // Don't need to use this as a chain in this case.
+ if (!IA->hasSideEffects() && !hasMemory && IndirectStoresToEmit.empty())
+ return;
+ }
+
+ std::vector<std::pair<SDValue, const Value *> > StoresToEmit;
+
+ // Process indirect outputs, first output all of the flagged copies out of
+ // physregs.
+ for (unsigned i = 0, e = IndirectStoresToEmit.size(); i != e; ++i) {
+ RegsForValue &OutRegs = IndirectStoresToEmit[i].first;
+ const Value *Ptr = IndirectStoresToEmit[i].second;
+ SDValue OutVal = OutRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
+ Chain, &Flag, IA);
+ StoresToEmit.push_back(std::make_pair(OutVal, Ptr));
+ }
+
+ // Emit the non-flagged stores from the physregs.
+ SmallVector<SDValue, 8> OutChains;
+ for (unsigned i = 0, e = StoresToEmit.size(); i != e; ++i) {
+ SDValue Val = DAG.getStore(Chain, getCurSDLoc(), StoresToEmit[i].first,
+ getValue(StoresToEmit[i].second),
+ MachinePointerInfo(StoresToEmit[i].second));
+ OutChains.push_back(Val);
+ }
+
+ if (!OutChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(), MVT::Other, OutChains);
+
+ DAG.setRoot(Chain);
+}
+
+void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
+ const Twine &Message) {
+ LLVMContext &Ctx = *DAG.getContext();
+ Ctx.emitError(CS.getInstruction(), Message);
+
+ // Make sure we leave the DAG in a valid state
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto VT = TLI.getValueType(DAG.getDataLayout(), CS.getType());
+ setValue(CS.getInstruction(), DAG.getUNDEF(VT));
+}
+
+void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VASTART, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
+ SDValue V = DAG.getVAArg(TLI.getValueType(DAG.getDataLayout(), I.getType()),
+ getCurSDLoc(), getRoot(), getValue(I.getOperand(0)),
+ DAG.getSrcValue(I.getOperand(0)),
+ DL.getABITypeAlignment(I.getType()));
+ setValue(&I, V);
+ DAG.setRoot(V.getValue(1));
+}
+
+void SelectionDAGBuilder::visitVAEnd(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VAEND, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(0))));
+}
+
+void SelectionDAGBuilder::visitVACopy(const CallInst &I) {
+ DAG.setRoot(DAG.getNode(ISD::VACOPY, getCurSDLoc(),
+ MVT::Other, getRoot(),
+ getValue(I.getArgOperand(0)),
+ getValue(I.getArgOperand(1)),
+ DAG.getSrcValue(I.getArgOperand(0)),
+ DAG.getSrcValue(I.getArgOperand(1))));
+}
+
+SDValue SelectionDAGBuilder::lowerRangeToAssertZExt(SelectionDAG &DAG,
+ const Instruction &I,
+ SDValue Op) {
+ const MDNode *Range = I.getMetadata(LLVMContext::MD_range);
+ if (!Range)
+ return Op;
+
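+ // This lowering only handles ranges that start at zero, which map directly
+ // to an AssertZext on the low bits; give up on anything else.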
+ Constant *Lo = cast<ConstantAsMetadata>(Range->getOperand(0))->getValue();
+ if (!Lo->isNullValue())
+ return Op;
+
+ Constant *Hi = cast<ConstantAsMetadata>(Range->getOperand(1))->getValue();
+ unsigned Bits = cast<ConstantInt>(Hi)->getValue().logBase2();
+
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), Bits);
+
+ SDLoc SL = getCurSDLoc();
+
+ SDValue ZExt = DAG.getNode(ISD::AssertZext, SL, Op.getValueType(),
+ Op, DAG.getValueType(SmallVT));
+ unsigned NumVals = Op.getNode()->getNumValues();
+ if (NumVals == 1)
+ return ZExt;
+
+ SmallVector<SDValue, 4> Ops;
+
+ Ops.push_back(ZExt);
+ for (unsigned I = 1; I != NumVals; ++I)
+ Ops.push_back(Op.getValue(I));
+
+ return DAG.getMergeValues(Ops, SL);
+}
+
+/// \brief Populate a CallLoweringInfo (into \p CLI) based on the properties of
+/// the call being lowered.
+///
+/// This is a helper for lowering intrinsics that follow a target calling
+/// convention or require stack pointer adjustment. Only a subset of the
+/// intrinsic's operands need to participate in the calling convention.
+void SelectionDAGBuilder::populateCallLoweringInfo(
+ TargetLowering::CallLoweringInfo &CLI, ImmutableCallSite CS,
+ unsigned ArgIdx, unsigned NumArgs, SDValue Callee, Type *ReturnTy,
+ bool IsPatchPoint) {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(NumArgs);
+
+ // Populate the argument list.
+ // Attributes for args start at offset 1, after the return attribute.
+ for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs, AttrI = ArgIdx + 1;
+ ArgI != ArgE; ++ArgI) {
+ const Value *V = CS->getOperand(ArgI);
+
+ assert(!V->getType()->isEmptyTy() && "Empty type passed to intrinsic.");
+
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = getValue(V);
+ Entry.Ty = V->getType();
+ Entry.setAttributes(&CS, AttrI);
+ Args.push_back(Entry);
+ }
+
+ CLI.setDebugLoc(getCurSDLoc())
+ .setChain(getRoot())
+ .setCallee(CS.getCallingConv(), ReturnTy, Callee, std::move(Args))
+ .setDiscardResult(CS->use_empty())
+ .setIsPatchPoint(IsPatchPoint);
+}
+
+/// \brief Add a stack map intrinsic call's live variable operands to a stackmap
+/// or patchpoint target node's operand list.
+///
+/// Constants are converted to TargetConstants purely as an optimization to
+/// avoid constant materialization and register allocation.
+///
+/// FrameIndex operands are converted to TargetFrameIndex so that ISEL does not
+/// generate address computation nodes, and so ExpandISelPseudo can convert the
+/// TargetFrameIndex into a DirectMemRefOp StackMap location. This avoids
+/// address materialization and register allocation, but may also be required
+/// for correctness. If a StackMap (or PatchPoint) intrinsic directly uses an
+/// alloca in the entry block, then the runtime may assume that the alloca's
+/// StackMap location can be read immediately after compilation and that the
+/// location is valid at any point during execution (this is similar to the
+/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
+/// only available in a register, then the runtime would need to trap when
+/// execution reaches the StackMap in order to read the alloca's location.
+static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
+ const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
+ SelectionDAGBuilder &Builder) {
+ for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) {
+ SDValue OpVal = Builder.getValue(CS.getArgument(i));
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
+ Ops.push_back(
+ Builder.DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
+ Ops.push_back(
+ Builder.DAG.getTargetConstant(C->getSExtValue(), DL, MVT::i64));
+ } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(OpVal)) {
+ const TargetLowering &TLI = Builder.DAG.getTargetLoweringInfo();
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(
+ FI->getIndex(), TLI.getPointerTy(Builder.DAG.getDataLayout())));
+ } else
+ Ops.push_back(OpVal);
+ }
+}
+
+/// \brief Lower llvm.experimental.stackmap directly to its target opcode.
+void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
+ // void @llvm.experimental.stackmap(i32 <id>, i32 <numShadowBytes>,
+ // [live variables...])
+
+ assert(CI.getType()->isVoidTy() && "Stackmap cannot return a value.");
+
+ SDValue Chain, InFlag, Callee, NullPtr;
+ SmallVector<SDValue, 32> Ops;
+
+ SDLoc DL = getCurSDLoc();
+ Callee = getValue(CI.getCalledValue());
+ NullPtr = DAG.getIntPtrConstant(0, DL, true);
+
+ // The stackmap intrinsic only records the live variables (the arguments
+ // passed to it) and emits NOPs (if requested). Unlike the patchpoint
+ // intrinsic, this won't be lowered to a function call. This means we don't
+ // have to worry about calling conventions and target specific lowering code.
+ // Instead we perform the call lowering right here.
+ //
+ // chain, flag = CALLSEQ_START(chain, 0)
+ // chain, flag = STACKMAP(id, nbytes, ..., chain, flag)
+ // chain, flag = CALLSEQ_END(chain, 0, 0, flag)
+ //
+ Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL);
+ InFlag = Chain.getValue(1);
+
+ // Add the <id> and <numBytes> constants.
+ SDValue IDVal = getValue(CI.getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(IDVal)->getZExtValue(), DL, MVT::i64));
+ SDValue NBytesVal = getValue(CI.getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(NBytesVal)->getZExtValue(), DL,
+ MVT::i32));
+
+ // Push live variables for the stack map.
+ addStackMapLiveVars(&CI, 2, DL, Ops, *this);
+
+ // We are not pushing any register mask info here on the operands list,
+ // because the stackmap doesn't clobber anything.
+
+ // Push the chain and the glue flag.
+ Ops.push_back(Chain);
+ Ops.push_back(InFlag);
+
+ // Create the STACKMAP node.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ SDNode *SM = DAG.getMachineNode(TargetOpcode::STACKMAP, DL, NodeTys, Ops);
+ Chain = SDValue(SM, 0);
+ InFlag = Chain.getValue(1);
+
+ Chain = DAG.getCALLSEQ_END(Chain, NullPtr, NullPtr, InFlag, DL);
+
+ // Stackmaps don't generate values, so nothing goes into the NodeMap.
+
+ // Set the root to the target-lowered call chain.
+ DAG.setRoot(Chain);
+
+ // Inform the Frame Information that we have a stackmap in this function.
+ FuncInfo.MF->getFrameInfo()->setHasStackMap();
+}
+
+/// \brief Lower llvm.experimental.patchpoint directly to its target opcode.
+void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
+ const BasicBlock *EHPadBB) {
+ // void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
+ // i32 <numBytes>,
+ // i8* <target>,
+ // i32 <numArgs>,
+ // [Args...],
+ // [live variables...])
+
+ CallingConv::ID CC = CS.getCallingConv();
+ bool IsAnyRegCC = CC == CallingConv::AnyReg;
+ bool HasDef = !CS->getType()->isVoidTy();
+ SDLoc dl = getCurSDLoc();
+ SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos));
+
+ // Handle immediate and symbolic callees.
+ if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
+ Callee = DAG.getIntPtrConstant(ConstCallee->getZExtValue(), dl,
+ /*isTarget=*/true);
+ else if (auto* SymbolicCallee = dyn_cast<GlobalAddressSDNode>(Callee))
+ Callee = DAG.getTargetGlobalAddress(SymbolicCallee->getGlobal(),
+ SDLoc(SymbolicCallee),
+ SymbolicCallee->getValueType(0));
+
+ // Get the real number of arguments participating in the call <numArgs>
+ SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
+ unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();
+
+ // Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
+ // Intrinsics include all meta-operands up to but not including CC.
+ unsigned NumMetaOpers = PatchPointOpers::CCPos;
+ assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
+ "Not enough arguments provided to the patchpoint intrinsic");
+
+ // For AnyRegCC the arguments are lowered later on manually.
+ unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
+ Type *ReturnTy =
+ IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ populateCallLoweringInfo(CLI, CS, NumMetaOpers, NumCallArgs, Callee, ReturnTy,
+ true);
+ std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
+
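+ // The lowered call ends in a CALLSEQ_END node; when the call produces a
+ // value, a CopyFromReg may sit on top of it, so peel that off first.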
+ SDNode *CallEnd = Result.second.getNode();
+ if (HasDef && (CallEnd->getOpcode() == ISD::CopyFromReg))
+ CallEnd = CallEnd->getOperand(0).getNode();
+
+ // Get a call instruction from the call sequence chain. Tail calls are not
+ // allowed.
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END &&
+ "Expected a callseq node.");
+ SDNode *Call = CallEnd->getOperand(0).getNode();
+ bool HasGlue = Call->getGluedNode();
+
+ // Replace the target specific call node with the patchable intrinsic.
+ SmallVector<SDValue, 8> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
+ SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
+ Ops.push_back(DAG.getTargetConstant(
+ cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
+ MVT::i32));
+
+ // Add the callee.
+ Ops.push_back(Callee);
+
+ // Adjust <numArgs> to account for any arguments that have been passed on the
+ // stack instead.
+ // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+ unsigned NumCallRegArgs = Call->getNumOperands() - (HasGlue ? 4 : 3);
+ NumCallRegArgs = IsAnyRegCC ? NumArgs : NumCallRegArgs;
+ Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, dl, MVT::i32));
+
+ // Add the calling convention
+ Ops.push_back(DAG.getTargetConstant((unsigned)CC, dl, MVT::i32));
+
+ // Add the arguments we omitted previously. The register allocator should
+ // place these in any free register.
+ if (IsAnyRegCC)
+ for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
+ Ops.push_back(getValue(CS.getArgument(i)));
+
+ // Push the arguments from the call instruction up to the register mask.
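+ // Operand 0 of the call is the chain and operand 1 is the callee; both are
+ // added to Ops separately, so copying starts at op_begin() + 2 and stops
+ // before the register mask (and glue, if present), which are appended below.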
+ SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
+ Ops.append(Call->op_begin() + 2, e);
+
+ // Push live variables for the stack map.
+ addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this);
+
+ // Push the register mask info.
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end()-2));
+ else
+ Ops.push_back(*(Call->op_end()-1));
+
+ // Push the chain (this is originally the first operand of the call, but
+ // becomes now the last or second to last operand).
+ Ops.push_back(*(Call->op_begin()));
+
+ // Push the glue flag (last operand).
+ if (HasGlue)
+ Ops.push_back(*(Call->op_end()-1));
+
+ SDVTList NodeTys;
+ if (IsAnyRegCC && HasDef) {
+ // Create the return types based on the intrinsic definition
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SmallVector<EVT, 3> ValueVTs;
+ ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
+ assert(ValueVTs.size() == 1 && "Expected only one return value type.");
+
+ // There is always a chain and a glue type at the end
+ ValueVTs.push_back(MVT::Other);
+ ValueVTs.push_back(MVT::Glue);
+ NodeTys = DAG.getVTList(ValueVTs);
+ } else
+ NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ // Replace the target specific call node with a PATCHPOINT node.
+ MachineSDNode *MN = DAG.getMachineNode(TargetOpcode::PATCHPOINT,
+ dl, NodeTys, Ops);
+
+ // Update the NodeMap.
+ if (HasDef) {
+ if (IsAnyRegCC)
+ setValue(CS.getInstruction(), SDValue(MN, 0));
+ else
+ setValue(CS.getInstruction(), Result.first);
+ }
+
+ // Fixup the consumers of the intrinsic. The chain and glue may be used in the
+ // call sequence. Furthermore the location of the chain and glue can change
+ // when the AnyReg calling convention is used and the intrinsic returns a
+ // value.
+ if (IsAnyRegCC && HasDef) {
+ SDValue From[] = {SDValue(Call, 0), SDValue(Call, 1)};
+ SDValue To[] = {SDValue(MN, 1), SDValue(MN, 2)};
+ DAG.ReplaceAllUsesOfValuesWith(From, To, 2);
+ } else
+ DAG.ReplaceAllUsesWith(Call, MN);
+ DAG.DeleteNode(Call);
+
+ // Inform the Frame Information that we have a patchpoint in this function.
+ FuncInfo.MF->getFrameInfo()->setHasPatchPoint();
+}
+
+/// Returns an AttributeSet representing the attributes applied to the return
+/// value of the given call.
+static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+ SmallVector<Attribute::AttrKind, 2> Attrs;
+ if (CLI.RetSExt)
+ Attrs.push_back(Attribute::SExt);
+ if (CLI.RetZExt)
+ Attrs.push_back(Attribute::ZExt);
+ if (CLI.IsInReg)
+ Attrs.push_back(Attribute::InReg);
+
+ return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
+ Attrs);
+}
+
+/// TargetLowering::LowerCallTo - This is the default LowerCallTo
+/// implementation, which just calls LowerCall.
+/// FIXME: When all targets are
+/// migrated to using LowerCall, this hook should be integrated into SDISel.
+std::pair<SDValue, SDValue>
+TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
+ // Handle the incoming return values from the call.
+ CLI.Ins.clear();
+ Type *OrigRetTy = CLI.RetTy;
+ SmallVector<EVT, 4> RetTys;
+ SmallVector<uint64_t, 4> Offsets;
+ auto &DL = CLI.DAG.getDataLayout();
+ ComputeValueVTs(*this, DL, CLI.RetTy, RetTys, &Offsets);
+
+ SmallVector<ISD::OutputArg, 4> Outs;
+ GetReturnInfo(CLI.RetTy, getReturnAttrs(CLI), Outs, *this, DL);
+
+ bool CanLowerReturn =
+ this->CanLowerReturn(CLI.CallConv, CLI.DAG.getMachineFunction(),
+ CLI.IsVarArg, Outs, CLI.RetTy->getContext());
+
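+ // If the target cannot return this value directly, demote the return to an
+ // sret-style hidden pointer argument backed by a stack slot; the result is
+ // loaded back out of that slot after the call.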
+ SDValue DemoteStackSlot;
+ int DemoteStackIdx = -100;
+ if (!CanLowerReturn) {
+ // FIXME: equivalent assert?
+ // assert(!CS.hasInAllocaArgument() &&
+ // "sret demotion is incompatible with inalloca");
+ uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
+ unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
+ MachineFunction &MF = CLI.DAG.getMachineFunction();
+ DemoteStackIdx = MF.getFrameInfo()->CreateStackObject(TySize, Align, false);
+ Type *StackSlotPtrType = PointerType::getUnqual(CLI.RetTy);
+
+ DemoteStackSlot = CLI.DAG.getFrameIndex(DemoteStackIdx, getPointerTy(DL));
+ ArgListEntry Entry;
+ Entry.Node = DemoteStackSlot;
+ Entry.Ty = StackSlotPtrType;
+ Entry.isSExt = false;
+ Entry.isZExt = false;
+ Entry.isInReg = false;
+ Entry.isSRet = true;
+ Entry.isNest = false;
+ Entry.isByVal = false;
+ Entry.isReturned = false;
+ Entry.isSwiftSelf = false;
+ Entry.isSwiftError = false;
+ Entry.Alignment = Align;
+ CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
+ CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
+
+ // sret demotion isn't compatible with tail-calls, since the sret argument
+ // points into the callers stack frame.
+ CLI.IsTailCall = false;
+ } else {
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = RegisterVT;
+ MyFlags.ArgVT = VT;
+ MyFlags.Used = CLI.IsReturnValueUsed;
+ if (CLI.RetSExt)
+ MyFlags.Flags.setSExt();
+ if (CLI.RetZExt)
+ MyFlags.Flags.setZExt();
+ if (CLI.IsInReg)
+ MyFlags.Flags.setInReg();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+ }
+
+ // We push in swifterror return as the last element of CLI.Ins.
+ ArgListTy &Args = CLI.getArgs();
+ if (supportSwiftError()) {
+ for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+ if (Args[i].isSwiftError) {
+ ISD::InputArg MyFlags;
+ MyFlags.VT = getPointerTy(DL);
+ MyFlags.ArgVT = EVT(getPointerTy(DL));
+ MyFlags.Flags.setSwiftError();
+ CLI.Ins.push_back(MyFlags);
+ }
+ }
+ }
+
+ // Handle all of the outgoing arguments.
+ CLI.Outs.clear();
+ CLI.OutVals.clear();
+ for (unsigned i = 0, e = Args.size(); i != e; ++i) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
+ Type *FinalType = Args[i].Ty;
+ if (Args[i].isByVal)
+ FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
+ bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
+ FinalType, CLI.CallConv, CLI.IsVarArg);
+ for (unsigned Value = 0, NumValues = ValueVTs.size(); Value != NumValues;
+ ++Value) {
+ EVT VT = ValueVTs[Value];
+ Type *ArgTy = VT.getTypeForEVT(CLI.RetTy->getContext());
+ SDValue Op = SDValue(Args[i].Node.getNode(),
+ Args[i].Node.getResNo() + Value);
+ ISD::ArgFlagsTy Flags;
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+
+ if (Args[i].isZExt)
+ Flags.setZExt();
+ if (Args[i].isSExt)
+ Flags.setSExt();
+ if (Args[i].isInReg)
+ Flags.setInReg();
+ if (Args[i].isSRet)
+ Flags.setSRet();
+ if (Args[i].isSwiftSelf)
+ Flags.setSwiftSelf();
+ if (Args[i].isSwiftError)
+ Flags.setSwiftError();
+ if (Args[i].isByVal)
+ Flags.setByVal();
+ if (Args[i].isInAlloca) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling
+ // in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Args[i].isByVal || Args[i].isInAlloca) {
+ PointerType *Ty = cast<PointerType>(Args[i].Ty);
+ Type *ElementTy = Ty->getElementType();
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
+ // For ByVal, the alignment should come from the frontend. The backend will
+ // guess if this info is not there, but there are cases it cannot get right.
+ unsigned FrameAlign;
+ if (Args[i].Alignment)
+ FrameAlign = Args[i].Alignment;
+ else
+ FrameAlign = getByValTypeAlignment(ElementTy, DL);
+ Flags.setByValAlign(FrameAlign);
+ }
+ if (Args[i].isNest)
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+ Flags.setOrigAlign(OriginalAlignment);
+
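+      // Determine how this value is split into target registers; for example,
+      // an i64 argument on a typical 32-bit target becomes two i32 parts.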
+ MVT PartVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumParts = getNumRegisters(CLI.RetTy->getContext(), VT);
+ SmallVector<SDValue, 4> Parts(NumParts);
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ if (Args[i].isSExt)
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (Args[i].isZExt)
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ // Conservatively only handle 'returned' on non-vectors for now
+ if (Args[i].isReturned && !Op.getValueType().isVector()) {
+ assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
+ "unexpected use of 'returned'");
+ // Before passing 'returned' to the target lowering code, ensure that
+ // either the register MVT and the actual EVT are the same size or that
+ // the return value and argument are extended in the same way; in these
+ // cases it's safe to pass the argument register value unchanged as the
+ // return register value (although it's at the target's option whether
+ // to do so)
+ // TODO: allow code generation to take advantage of partially preserved
+ // registers rather than clobbering the entire register when the
+ // parameter extension method is not compatible with the return
+ // extension method
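+        // For example, an i32 argument returned through a single i32 register
+        // trivially satisfies the size check below, whereas an i1 argument
+        // that is zero-extended for the call but sign-extended for the return
+        // does not, so 'returned' is not propagated in that case.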
+ if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
+ (ExtendKind != ISD::ANY_EXTEND &&
+ CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt))
+ Flags.setReturned();
+ }
+
+ getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
+ CLI.CS ? CLI.CS->getInstruction() : nullptr, ExtendKind);
+
+ for (unsigned j = 0; j != NumParts; ++j) {
+        // If it isn't the first piece, the alignment must be 1.
+ ISD::OutputArg MyFlags(Flags, Parts[j].getValueType(), VT,
+ i < CLI.NumFixedArgs,
+ i, j*Parts[j].getValueType().getStoreSize());
+ if (NumParts > 1 && j == 0)
+ MyFlags.Flags.setSplit();
+ else if (j != 0) {
+ MyFlags.Flags.setOrigAlign(1);
+ if (j == NumParts - 1)
+ MyFlags.Flags.setSplitEnd();
+ }
+
+ CLI.Outs.push_back(MyFlags);
+ CLI.OutVals.push_back(Parts[j]);
+ }
+
+ if (NeedsRegBlock && Value == NumValues - 1)
+ CLI.Outs[CLI.Outs.size() - 1].Flags.setInConsecutiveRegsLast();
+ }
+ }
+
+ SmallVector<SDValue, 4> InVals;
+ CLI.Chain = LowerCall(CLI, InVals);
+
+  // Update CLI.InVals for use outside of this function.
+ CLI.InVals = InVals;
+
+ // Verify that the target's LowerCall behaved as expected.
+ assert(CLI.Chain.getNode() && CLI.Chain.getValueType() == MVT::Other &&
+ "LowerCall didn't return a valid chain!");
+ assert((!CLI.IsTailCall || InVals.empty()) &&
+ "LowerCall emitted a return value for a tail call!");
+ assert((CLI.IsTailCall || InVals.size() == CLI.Ins.size()) &&
+ "LowerCall didn't emit the correct number of values!");
+
+ // For a tail call, the return value is merely live-out and there aren't
+ // any nodes in the DAG representing it. Return a special value to
+ // indicate that a tail call has been emitted and no more Instructions
+ // should be processed in the current block.
+ if (CLI.IsTailCall) {
+ CLI.DAG.setRoot(CLI.Chain);
+ return std::make_pair(SDValue(), SDValue());
+ }
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = CLI.Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() && "LowerCall emitted a null value!");
+ assert(EVT(CLI.Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerCall emitted a value with the wrong type!");
+ }
+#endif
+
+ SmallVector<SDValue, 4> ReturnValues;
+ if (!CanLowerReturn) {
+ // The instruction result is the result of loading from the
+ // hidden sret parameter.
+ SmallVector<EVT, 1> PVTs;
+ Type *PtrRetTy = PointerType::getUnqual(OrigRetTy);
+
+ ComputeValueVTs(*this, DL, PtrRetTy, PVTs);
+ assert(PVTs.size() == 1 && "Pointers should fit in one register");
+ EVT PtrVT = PVTs[0];
+
+ unsigned NumValues = RetTys.size();
+ ReturnValues.resize(NumValues);
+ SmallVector<SDValue, 4> Chains(NumValues);
+
+ // An aggregate return value cannot wrap around the address space, so
+ // offsets to its parts don't wrap either.
+ SDNodeFlags Flags;
+ Flags.setNoUnsignedWrap(true);
+
+ for (unsigned i = 0; i < NumValues; ++i) {
+ SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
+ CLI.DAG.getConstant(Offsets[i], CLI.DL,
+ PtrVT), &Flags);
+ SDValue L = CLI.DAG.getLoad(
+ RetTys[i], CLI.DL, CLI.Chain, Add,
+ MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
+ DemoteStackIdx, Offsets[i]),
+ /* Alignment = */ 1);
+ ReturnValues[i] = L;
+ Chains[i] = L.getValue(1);
+ }
+
+ CLI.Chain = CLI.DAG.getNode(ISD::TokenFactor, CLI.DL, MVT::Other, Chains);
+ } else {
+ // Collect the legal value parts into potentially illegal values
+ // that correspond to the original function's return values.
+ Optional<ISD::NodeType> AssertOp;
+ if (CLI.RetSExt)
+ AssertOp = ISD::AssertSext;
+ else if (CLI.RetZExt)
+ AssertOp = ISD::AssertZext;
+ unsigned CurReg = 0;
+ for (unsigned I = 0, E = RetTys.size(); I != E; ++I) {
+ EVT VT = RetTys[I];
+ MVT RegisterVT = getRegisterType(CLI.RetTy->getContext(), VT);
+ unsigned NumRegs = getNumRegisters(CLI.RetTy->getContext(), VT);
+
+ ReturnValues.push_back(getCopyFromParts(CLI.DAG, CLI.DL, &InVals[CurReg],
+ NumRegs, RegisterVT, VT, nullptr,
+ AssertOp));
+ CurReg += NumRegs;
+ }
+
+    // For a function returning void, there is no return value. We can't create
+    // such a node, so we just return a null return value; nothing will
+    // actually look at it.
+ if (ReturnValues.empty())
+ return std::make_pair(SDValue(), CLI.Chain);
+ }
+
+ SDValue Res = CLI.DAG.getNode(ISD::MERGE_VALUES, CLI.DL,
+ CLI.DAG.getVTList(RetTys), ReturnValues);
+ return std::make_pair(Res, CLI.Chain);
+}
+
+void TargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ if (SDValue Res = LowerOperation(SDValue(N, 0), DAG))
+ Results.push_back(Res);
+}
+
+SDValue TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ llvm_unreachable("LowerOperation not implemented for this target!");
+}
+
+void
+SelectionDAGBuilder::CopyValueToVirtualRegister(const Value *V, unsigned Reg) {
+ SDValue Op = getNonRegisterValue(V);
+ assert((Op.getOpcode() != ISD::CopyFromReg ||
+ cast<RegisterSDNode>(Op.getOperand(1))->getReg() != Reg) &&
+ "Copy from a reg to the same reg!");
+ assert(!TargetRegisterInfo::isPhysicalRegister(Reg) && "Is a physreg");
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
+ V->getType());
+ SDValue Chain = DAG.getEntryNode();
+
+ ISD::NodeType ExtendType = (FuncInfo.PreferredExtendType.find(V) ==
+ FuncInfo.PreferredExtendType.end())
+ ? ISD::ANY_EXTEND
+ : FuncInfo.PreferredExtendType[V];
+ RFV.getCopyToRegs(Op, DAG, getCurSDLoc(), Chain, nullptr, V, ExtendType);
+ PendingExports.push_back(Chain);
+}
+
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+/// isOnlyUsedInEntryBlock - If the specified argument is only used in the
+/// entry block, return true. Uses in a switch do not qualify, since the
+/// switch may expand into multiple basic blocks.
+static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
+ // With FastISel active, we may be splitting blocks, so force creation
+ // of virtual registers for all non-dead arguments.
+ if (FastISel)
+ return A->use_empty();
+
+ const BasicBlock &Entry = A->getParent()->front();
+ for (const User *U : A->users())
+ if (cast<Instruction>(U)->getParent() != &Entry || isa<SwitchInst>(U))
+ return false; // Use not in entry block.
+
+ return true;
+}
+
+void SelectionDAGISel::LowerArguments(const Function &F) {
+ SelectionDAG &DAG = SDB->DAG;
+ SDLoc dl = SDB->getCurSDLoc();
+ const DataLayout &DL = DAG.getDataLayout();
+ SmallVector<ISD::InputArg, 16> Ins;
+
+ if (!FuncInfo->CanLowerReturn) {
+ // Put in an sret pointer parameter before all the other parameters.
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::getUnqual(F.getReturnType()), ValueVTs);
+
+ // NOTE: Assuming that a pointer will never break down to more than one VT
+ // or one register.
+ ISD::ArgFlagsTy Flags;
+ Flags.setSRet();
+ MVT RegisterVT = TLI->getRegisterType(*DAG.getContext(), ValueVTs[0]);
+ ISD::InputArg RetArg(Flags, RegisterVT, ValueVTs[0], true,
+ ISD::InputArg::NoArgIndex, 0);
+ Ins.push_back(RetArg);
+ }
+
+ // Set up the incoming argument description vector.
+ unsigned Idx = 1;
+ for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
+ I != E; ++I, ++Idx) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+ bool isArgValueUsed = !I->use_empty();
+ unsigned PartBase = 0;
+ Type *FinalType = I->getType();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
+ FinalType = cast<PointerType>(FinalType)->getElementType();
+ bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
+ FinalType, F.getCallingConv(), F.isVarArg());
+ for (unsigned Value = 0, NumValues = ValueVTs.size();
+ Value != NumValues; ++Value) {
+ EVT VT = ValueVTs[Value];
+ Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
+ ISD::ArgFlagsTy Flags;
+ unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
+
+ if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+ Flags.setZExt();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+ Flags.setSExt();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::InReg))
+ Flags.setInReg();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::StructRet))
+ Flags.setSRet();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftSelf))
+ Flags.setSwiftSelf();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SwiftError))
+ Flags.setSwiftError();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
+ Flags.setByVal();
+ if (F.getAttributes().hasAttribute(Idx, Attribute::InAlloca)) {
+ Flags.setInAlloca();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // inalloca. This way we can know how many bytes we should've allocated
+ // and how many bytes a callee cleanup function will pop. If we port
+ // inalloca to more targets, we'll have to add custom inalloca handling
+ // in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (F.getCallingConv() == CallingConv::X86_INTR) {
+        // The x86 interrupt calling convention passes the frame (first
+        // parameter) by value on the stack.
+ if (Idx == 1)
+ Flags.setByVal();
+ }
+ if (Flags.isByVal() || Flags.isInAlloca()) {
+ PointerType *Ty = cast<PointerType>(I->getType());
+ Type *ElementTy = Ty->getElementType();
+ Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
+ // For ByVal, alignment should be passed from FE. BE will guess if
+ // this info is not there but there are cases it cannot get right.
+ unsigned FrameAlign;
+ if (F.getParamAlignment(Idx))
+ FrameAlign = F.getParamAlignment(Idx);
+ else
+ FrameAlign = TLI->getByValTypeAlignment(ElementTy, DL);
+ Flags.setByValAlign(FrameAlign);
+ }
+ if (F.getAttributes().hasAttribute(Idx, Attribute::Nest))
+ Flags.setNest();
+ if (NeedsRegBlock)
+ Flags.setInConsecutiveRegs();
+ Flags.setOrigAlign(OriginalAlignment);
+
+ MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+ unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+ for (unsigned i = 0; i != NumRegs; ++i) {
+ ISD::InputArg MyFlags(Flags, RegisterVT, VT, isArgValueUsed,
+ Idx-1, PartBase+i*RegisterVT.getStoreSize());
+ if (NumRegs > 1 && i == 0)
+ MyFlags.Flags.setSplit();
+        // If it isn't the first piece, the alignment must be 1.
+ else if (i > 0) {
+ MyFlags.Flags.setOrigAlign(1);
+ if (i == NumRegs - 1)
+ MyFlags.Flags.setSplitEnd();
+ }
+ Ins.push_back(MyFlags);
+ }
+ if (NeedsRegBlock && Value == NumValues - 1)
+ Ins[Ins.size() - 1].Flags.setInConsecutiveRegsLast();
+ PartBase += VT.getStoreSize();
+ }
+ }
+
+ // Call the target to set up the argument values.
+ SmallVector<SDValue, 8> InVals;
+ SDValue NewRoot = TLI->LowerFormalArguments(
+ DAG.getRoot(), F.getCallingConv(), F.isVarArg(), Ins, dl, DAG, InVals);
+
+ // Verify that the target's LowerFormalArguments behaved as expected.
+ assert(NewRoot.getNode() && NewRoot.getValueType() == MVT::Other &&
+ "LowerFormalArguments didn't return a valid chain!");
+ assert(InVals.size() == Ins.size() &&
+ "LowerFormalArguments didn't emit the correct number of values!");
+ DEBUG({
+ for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+ assert(InVals[i].getNode() &&
+ "LowerFormalArguments emitted a null value!");
+ assert(EVT(Ins[i].VT) == InVals[i].getValueType() &&
+ "LowerFormalArguments emitted a value with the wrong type!");
+ }
+ });
+
+ // Update the DAG with the new chain value resulting from argument lowering.
+ DAG.setRoot(NewRoot);
+
+ // Set up the argument values.
+ unsigned i = 0;
+ Idx = 1;
+ if (!FuncInfo->CanLowerReturn) {
+ // Create a virtual register for the sret pointer, and put in a copy
+ // from the sret argument into it.
+ SmallVector<EVT, 1> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(),
+ PointerType::getUnqual(F.getReturnType()), ValueVTs);
+ MVT VT = ValueVTs[0].getSimpleVT();
+ MVT RegVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+ Optional<ISD::NodeType> AssertOp = None;
+ SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1,
+ RegVT, VT, nullptr, AssertOp);
+
+ MachineFunction& MF = SDB->DAG.getMachineFunction();
+ MachineRegisterInfo& RegInfo = MF.getRegInfo();
+ unsigned SRetReg = RegInfo.createVirtualRegister(TLI->getRegClassFor(RegVT));
+ FuncInfo->DemoteRegister = SRetReg;
+ NewRoot =
+ SDB->DAG.getCopyToReg(NewRoot, SDB->getCurSDLoc(), SRetReg, ArgValue);
+ DAG.setRoot(NewRoot);
+
+ // i indexes lowered arguments. Bump it past the hidden sret argument.
+ // Idx indexes LLVM arguments. Don't touch it.
+ ++i;
+ }
+
+ for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
+ ++I, ++Idx) {
+ SmallVector<SDValue, 4> ArgValues;
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+
+    // If this argument is unused, remember its value. It is used to generate
+    // debugging information.
+ if (I->use_empty() && NumValues) {
+ SDB->setUnusedArgValue(&*I, InVals[i]);
+
+ // Also remember any frame index for use in FastISel.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
+ FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ }
+
+ for (unsigned Val = 0; Val != NumValues; ++Val) {
+ EVT VT = ValueVTs[Val];
+ MVT PartVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
+ unsigned NumParts = TLI->getNumRegisters(*CurDAG->getContext(), VT);
+
+ if (!I->use_empty()) {
+ Optional<ISD::NodeType> AssertOp;
+ if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
+ AssertOp = ISD::AssertSext;
+ else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
+ AssertOp = ISD::AssertZext;
+
+ ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
+ NumParts, PartVT, VT,
+ nullptr, AssertOp));
+ }
+
+ i += NumParts;
+ }
+
+ // We don't need to do anything else for unused arguments.
+ if (ArgValues.empty())
+ continue;
+
+ // Note down frame index.
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
+ FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+
+ SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
+ SDB->getCurSDLoc());
+
+ SDB->setValue(&*I, Res);
+ if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
+ if (LoadSDNode *LNode =
+ dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
+ if (FrameIndexSDNode *FI =
+ dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
+ FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+ }
+
+ // Update SwiftErrorMap.
+ if (Res.getOpcode() == ISD::CopyFromReg && TLI->supportSwiftError() &&
+ F.getAttributes().hasAttribute(Idx, Attribute::SwiftError)) {
+ unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg))
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB][0] = Reg;
+ }
+
+ // If this argument is live outside of the entry block, insert a copy from
+ // wherever we got it to the vreg that other BB's will reference it as.
+ if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::CopyFromReg) {
+ // If we can, though, try to skip creating an unnecessary vreg.
+ // FIXME: This isn't very clean... it would be nice to make this more
+ // general. It's also subtly incompatible with the hacks FastISel
+ // uses with vregs.
+ unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ FuncInfo->ValueMap[&*I] = Reg;
+ continue;
+ }
+ }
+ if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) {
+ FuncInfo->InitializeRegForValue(&*I);
+ SDB->CopyToExportRegsIfNeeded(&*I);
+ }
+ }
+
+ assert(i == InVals.size() && "Argument register count mismatch!");
+
+ // Finally, if the target has anything special to do, allow it to do so.
+ EmitFunctionEntryCode();
+}
+
+/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
+/// ensure constants are generated when needed. Remember the virtual registers
+/// that need to be added to the Machine PHI nodes as input. We cannot just
+/// directly add them, because expansion might result in multiple MBB's for one
+/// BB. As such, the start of the BB might correspond to a different MBB than
+/// the end.
+///
+void
+SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
+ const TerminatorInst *TI = LLVMBB->getTerminator();
+
+ SmallPtrSet<MachineBasicBlock *, 4> SuccsHandled;
+
+ // Check PHI nodes in successors that expect a value to be available from this
+ // block.
+ for (unsigned succ = 0, e = TI->getNumSuccessors(); succ != e; ++succ) {
+ const BasicBlock *SuccBB = TI->getSuccessor(succ);
+ if (!isa<PHINode>(SuccBB->begin())) continue;
+ MachineBasicBlock *SuccMBB = FuncInfo.MBBMap[SuccBB];
+
+ // If this terminator has multiple identical successors (common for
+ // switches), only handle each succ once.
+ if (!SuccsHandled.insert(SuccMBB).second)
+ continue;
+
+ MachineBasicBlock::iterator MBBI = SuccMBB->begin();
+
+ // At this point we know that there is a 1-1 correspondence between LLVM PHI
+ // nodes and Machine PHI nodes, but the incoming operands have not been
+ // emitted yet.
+ for (BasicBlock::const_iterator I = SuccBB->begin();
+ const PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      // Ignore dead PHIs.
+ if (PN->use_empty()) continue;
+
+ // Skip empty types
+ if (PN->getType()->isEmptyTy())
+ continue;
+
+ unsigned Reg;
+ const Value *PHIOp = PN->getIncomingValueForBlock(LLVMBB);
+
+ if (const Constant *C = dyn_cast<Constant>(PHIOp)) {
+ unsigned &RegOut = ConstantsOut[C];
+ if (RegOut == 0) {
+ RegOut = FuncInfo.CreateRegs(C->getType());
+ CopyValueToVirtualRegister(C, RegOut);
+ }
+ Reg = RegOut;
+ } else {
+ DenseMap<const Value *, unsigned>::iterator I =
+ FuncInfo.ValueMap.find(PHIOp);
+ if (I != FuncInfo.ValueMap.end())
+ Reg = I->second;
+ else {
+ assert(isa<AllocaInst>(PHIOp) &&
+ FuncInfo.StaticAllocaMap.count(cast<AllocaInst>(PHIOp)) &&
+ "Didn't codegen value into a register!??");
+ Reg = FuncInfo.CreateRegs(PHIOp->getType());
+ CopyValueToVirtualRegister(PHIOp, Reg);
+ }
+ }
+
+      // Remember that this register needs to be added to the machine PHI node
+      // as the input for this MBB.
+ SmallVector<EVT, 4> ValueVTs;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ ComputeValueVTs(TLI, DAG.getDataLayout(), PN->getType(), ValueVTs);
+ for (unsigned vti = 0, vte = ValueVTs.size(); vti != vte; ++vti) {
+ EVT VT = ValueVTs[vti];
+ unsigned NumRegisters = TLI.getNumRegisters(*DAG.getContext(), VT);
+ for (unsigned i = 0, e = NumRegisters; i != e; ++i)
+ FuncInfo.PHINodesToUpdate.push_back(
+ std::make_pair(&*MBBI++, Reg + i));
+ Reg += NumRegisters;
+ }
+ }
+ }
+
+ ConstantsOut.clear();
+}
+
+/// Add a successor MBB to ParentMBB, creating a new MachineBasicBlock for BB
+/// if SuccMBB is null.
+MachineBasicBlock *
+SelectionDAGBuilder::StackProtectorDescriptor::
+AddSuccessorMBB(const BasicBlock *BB,
+ MachineBasicBlock *ParentMBB,
+ bool IsLikely,
+ MachineBasicBlock *SuccMBB) {
+ // If SuccBB has not been created yet, create it.
+ if (!SuccMBB) {
+ MachineFunction *MF = ParentMBB->getParent();
+ MachineFunction::iterator BBI(ParentMBB);
+ SuccMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(++BBI, SuccMBB);
+ }
+ // Add it as a successor of ParentMBB.
+ ParentMBB->addSuccessor(
+ SuccMBB, BranchProbabilityInfo::getBranchProbStackProtector(IsLikely));
+ return SuccMBB;
+}
+
+MachineBasicBlock *SelectionDAGBuilder::NextBlock(MachineBasicBlock *MBB) {
+ MachineFunction::iterator I(MBB);
+ if (++I == FuncInfo.MF->end())
+ return nullptr;
+ return &*I;
+}
+
+/// During lowering new call nodes can be created (such as memset, etc.).
+/// Those will become new roots of the current DAG, but complications arise
+/// when they are tail calls. In such cases, the call lowering will update
+/// the root, but the builder still needs to know that a tail call has been
+/// lowered in order to avoid generating an additional return.
+void SelectionDAGBuilder::updateDAGForMaybeTailCall(SDValue MaybeTC) {
+  // If the node is null, a tail call was emitted and the DAG root has already
+  // been updated by the call lowering.
+ if (MaybeTC.getNode() != nullptr)
+ DAG.setRoot(MaybeTC);
+ else
+ HasTailCall = true;
+}
+
+bool SelectionDAGBuilder::isDense(const CaseClusterVector &Clusters,
+ unsigned *TotalCases, unsigned First,
+ unsigned Last,
+ unsigned Density) {
+ assert(Last >= First);
+ assert(TotalCases[Last] >= TotalCases[First]);
+
+ APInt LowCase = Clusters[First].Low->getValue();
+ APInt HighCase = Clusters[Last].High->getValue();
+ assert(LowCase.getBitWidth() == HighCase.getBitWidth());
+
+ // FIXME: A range of consecutive cases has 100% density, but only requires one
+ // comparison to lower. We should discriminate against such consecutive ranges
+ // in jump tables.
+
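+  // The difference is clamped so that the Range * Density product below stays
+  // within uint64_t for reasonable density percentages.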
+ uint64_t Diff = (HighCase - LowCase).getLimitedValue((UINT64_MAX - 1) / 100);
+ uint64_t Range = Diff + 1;
+
+ uint64_t NumCases =
+ TotalCases[Last] - (First == 0 ? 0 : TotalCases[First - 1]);
+
+ assert(NumCases < UINT64_MAX / 100);
+ assert(Range >= NumCases);
+
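+  // E.g. with Density = 40, four cases spread over a value range of 9 satisfy
+  // 4 * 100 >= 9 * 40 and are considered dense enough.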
+ return NumCases * 100 >= Range * Density;
+}
+
+static inline bool areJTsAllowed(const TargetLowering &TLI,
+ const SwitchInst *SI) {
+ const Function *Fn = SI->getParent()->getParent();
+ if (Fn->getFnAttribute("no-jump-tables").getValueAsString() == "true")
+ return false;
+
+ return TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+ TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
+}
+
+bool SelectionDAGBuilder::buildJumpTable(CaseClusterVector &Clusters,
+ unsigned First, unsigned Last,
+ const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB,
+ CaseCluster &JTCluster) {
+ assert(First <= Last);
+
+ auto Prob = BranchProbability::getZero();
+ unsigned NumCmps = 0;
+ std::vector<MachineBasicBlock*> Table;
+ DenseMap<MachineBasicBlock*, BranchProbability> JTProbs;
+
+ // Initialize probabilities in JTProbs.
+ for (unsigned I = First; I <= Last; ++I)
+ JTProbs[Clusters[I].MBB] = BranchProbability::getZero();
+
+ for (unsigned I = First; I <= Last; ++I) {
+ assert(Clusters[I].Kind == CC_Range);
+ Prob += Clusters[I].Prob;
+ APInt Low = Clusters[I].Low->getValue();
+ APInt High = Clusters[I].High->getValue();
+ NumCmps += (Low == High) ? 1 : 2;
+ if (I != First) {
+ // Fill the gap between this and the previous cluster.
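+      // E.g. a previous cluster ending at 2 followed by one starting at 5
+      // leaves a gap of two entries (3 and 4) that are routed to DefaultMBB.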
+ APInt PreviousHigh = Clusters[I - 1].High->getValue();
+ assert(PreviousHigh.slt(Low));
+ uint64_t Gap = (Low - PreviousHigh).getLimitedValue() - 1;
+ for (uint64_t J = 0; J < Gap; J++)
+ Table.push_back(DefaultMBB);
+ }
+ uint64_t ClusterSize = (High - Low).getLimitedValue() + 1;
+ for (uint64_t J = 0; J < ClusterSize; ++J)
+ Table.push_back(Clusters[I].MBB);
+ JTProbs[Clusters[I].MBB] += Clusters[I].Prob;
+ }
+
+ unsigned NumDests = JTProbs.size();
+ if (isSuitableForBitTests(NumDests, NumCmps,
+ Clusters[First].Low->getValue(),
+ Clusters[Last].High->getValue())) {
+ // Clusters[First..Last] should be lowered as bit tests instead.
+ return false;
+ }
+
+ // Create the MBB that will load from and jump through the table.
+ // Note: We create it here, but it's not inserted into the function yet.
+ MachineFunction *CurMF = FuncInfo.MF;
+ MachineBasicBlock *JumpTableMBB =
+ CurMF->CreateMachineBasicBlock(SI->getParent());
+
+ // Add successors. Note: use table order for determinism.
+ SmallPtrSet<MachineBasicBlock *, 8> Done;
+ for (MachineBasicBlock *Succ : Table) {
+ if (Done.count(Succ))
+ continue;
+ addSuccessorWithProb(JumpTableMBB, Succ, JTProbs[Succ]);
+ Done.insert(Succ);
+ }
+ JumpTableMBB->normalizeSuccProbs();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned JTI = CurMF->getOrCreateJumpTableInfo(TLI.getJumpTableEncoding())
+ ->createJumpTableIndex(Table);
+
+ // Set up the jump table info.
+ JumpTable JT(-1U, JTI, JumpTableMBB, nullptr);
+ JumpTableHeader JTH(Clusters[First].Low->getValue(),
+ Clusters[Last].High->getValue(), SI->getCondition(),
+ nullptr, false);
+ JTCases.emplace_back(std::move(JTH), std::move(JT));
+
+ JTCluster = CaseCluster::jumpTable(Clusters[First].Low, Clusters[Last].High,
+ JTCases.size() - 1, Prob);
+ return true;
+}
+
+void SelectionDAGBuilder::findJumpTables(CaseClusterVector &Clusters,
+ const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB) {
+#ifndef NDEBUG
+ // Clusters must be non-empty, sorted, and only contain Range clusters.
+ assert(!Clusters.empty());
+ for (CaseCluster &C : Clusters)
+ assert(C.Kind == CC_Range);
+ for (unsigned i = 1, e = Clusters.size(); i < e; ++i)
+ assert(Clusters[i - 1].High->getValue().slt(Clusters[i].Low->getValue()));
+#endif
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!areJTsAllowed(TLI, SI))
+ return;
+
+ const int64_t N = Clusters.size();
+ const unsigned MinJumpTableSize = TLI.getMinimumJumpTableEntries();
+
+ // TotalCases[i]: Total nbr of cases in Clusters[0..i].
+ SmallVector<unsigned, 8> TotalCases(N);
+
+ for (unsigned i = 0; i < N; ++i) {
+ APInt Hi = Clusters[i].High->getValue();
+ APInt Lo = Clusters[i].Low->getValue();
+ TotalCases[i] = (Hi - Lo).getLimitedValue() + 1;
+ if (i != 0)
+ TotalCases[i] += TotalCases[i - 1];
+ }
+
+ unsigned MinDensity = JumpTableDensity;
+ if (DefaultMBB->getParent()->getFunction()->optForSize())
+ MinDensity = OptsizeJumpTableDensity;
+ if (N >= MinJumpTableSize
+ && isDense(Clusters, &TotalCases[0], 0, N - 1, MinDensity)) {
+ // Cheap case: the whole range might be suitable for jump table.
+ CaseCluster JTCluster;
+ if (buildJumpTable(Clusters, 0, N - 1, SI, DefaultMBB, JTCluster)) {
+ Clusters[0] = JTCluster;
+ Clusters.resize(1);
+ return;
+ }
+ }
+
+ // The algorithm below is not suitable for -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // Split Clusters into minimum number of dense partitions. The algorithm uses
+ // the same idea as Kannan & Proebsting "Correction to 'Producing Good Code
+ // for the Case Statement'" (1994), but builds the MinPartitions array in
+ // reverse order to make it easier to reconstruct the partitions in ascending
+ // order. In the choice between two optimal partitionings, it picks the one
+ // which yields more jump tables.
+
+ // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
+ SmallVector<unsigned, 8> MinPartitions(N);
+ // LastElement[i] is the last element of the partition starting at i.
+ SmallVector<unsigned, 8> LastElement(N);
+ // NumTables[i]: nbr of >= MinJumpTableSize partitions from Clusters[i..N-1].
+ SmallVector<unsigned, 8> NumTables(N);
+
+ // Base case: There is only one way to partition Clusters[N-1].
+ MinPartitions[N - 1] = 1;
+ LastElement[N - 1] = N - 1;
+ assert(MinJumpTableSize > 1);
+ NumTables[N - 1] = 0;
+
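+  // The recurrence below treats every dense range Clusters[i..j] as a
+  // candidate first partition and combines it with the best solution already
+  // computed for Clusters[j+1..N-1].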
+ // Note: loop indexes are signed to avoid underflow.
+ for (int64_t i = N - 2; i >= 0; i--) {
+ // Find optimal partitioning of Clusters[i..N-1].
+ // Baseline: Put Clusters[i] into a partition on its own.
+ MinPartitions[i] = MinPartitions[i + 1] + 1;
+ LastElement[i] = i;
+ NumTables[i] = NumTables[i + 1];
+
+ // Search for a solution that results in fewer partitions.
+ for (int64_t j = N - 1; j > i; j--) {
+ // Try building a partition from Clusters[i..j].
+ if (isDense(Clusters, &TotalCases[0], i, j, MinDensity)) {
+ unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
+ bool IsTable = j - i + 1 >= MinJumpTableSize;
+ unsigned Tables = IsTable + (j == N - 1 ? 0 : NumTables[j + 1]);
+
+ // If this j leads to fewer partitions, or same number of partitions
+ // with more lookup tables, it is a better partitioning.
+ if (NumPartitions < MinPartitions[i] ||
+ (NumPartitions == MinPartitions[i] && Tables > NumTables[i])) {
+ MinPartitions[i] = NumPartitions;
+ LastElement[i] = j;
+ NumTables[i] = Tables;
+ }
+ }
+ }
+ }
+
+ // Iterate over the partitions, replacing some with jump tables in-place.
+ unsigned DstIndex = 0;
+ for (unsigned First = 0, Last; First < N; First = Last + 1) {
+ Last = LastElement[First];
+ assert(Last >= First);
+ assert(DstIndex <= First);
+ unsigned NumClusters = Last - First + 1;
+
+ CaseCluster JTCluster;
+ if (NumClusters >= MinJumpTableSize &&
+ buildJumpTable(Clusters, First, Last, SI, DefaultMBB, JTCluster)) {
+ Clusters[DstIndex++] = JTCluster;
+ } else {
+ for (unsigned I = First; I <= Last; ++I)
+ std::memmove(&Clusters[DstIndex++], &Clusters[I], sizeof(Clusters[I]));
+ }
+ }
+ Clusters.resize(DstIndex);
+}
+
+bool SelectionDAGBuilder::rangeFitsInWord(const APInt &Low, const APInt &High) {
+ // FIXME: Using the pointer type doesn't seem ideal.
+ uint64_t BW = DAG.getDataLayout().getPointerSizeInBits();
+ uint64_t Range = (High - Low).getLimitedValue(UINT64_MAX - 1) + 1;
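+  // E.g. with 64-bit pointers, case values 10 and 73 give Range == 64, which
+  // still fits in a machine word.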
+ return Range <= BW;
+}
+
+bool SelectionDAGBuilder::isSuitableForBitTests(unsigned NumDests,
+ unsigned NumCmps,
+ const APInt &Low,
+ const APInt &High) {
+ // FIXME: I don't think NumCmps is the correct metric: a single case and a
+ // range of cases both require only one branch to lower. Just looking at the
+ // number of clusters and destinations should be enough to decide whether to
+ // build bit tests.
+
+ // To lower a range with bit tests, the range must fit the bitwidth of a
+ // machine word.
+ if (!rangeFitsInWord(Low, High))
+ return false;
+
+ // Decide whether it's profitable to lower this range with bit tests. Each
+ // destination requires a bit test and branch, and there is an overall range
+ // check branch. For a small number of clusters, separate comparisons might be
+ // cheaper, and for many destinations, splitting the range might be better.
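+  // For example, three distinct case values that all branch to the same block
+  // (NumDests == 1, NumCmps == 3) are worth a single bit test, provided the
+  // values span no more than a machine word.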
+ return (NumDests == 1 && NumCmps >= 3) ||
+ (NumDests == 2 && NumCmps >= 5) ||
+ (NumDests == 3 && NumCmps >= 6);
+}
+
+bool SelectionDAGBuilder::buildBitTests(CaseClusterVector &Clusters,
+ unsigned First, unsigned Last,
+ const SwitchInst *SI,
+ CaseCluster &BTCluster) {
+ assert(First <= Last);
+ if (First == Last)
+ return false;
+
+ BitVector Dests(FuncInfo.MF->getNumBlockIDs());
+ unsigned NumCmps = 0;
+ for (int64_t I = First; I <= Last; ++I) {
+ assert(Clusters[I].Kind == CC_Range);
+ Dests.set(Clusters[I].MBB->getNumber());
+ NumCmps += (Clusters[I].Low == Clusters[I].High) ? 1 : 2;
+ }
+ unsigned NumDests = Dests.count();
+
+ APInt Low = Clusters[First].Low->getValue();
+ APInt High = Clusters[Last].High->getValue();
+ assert(Low.slt(High));
+
+ if (!isSuitableForBitTests(NumDests, NumCmps, Low, High))
+ return false;
+
+ APInt LowBound;
+ APInt CmpRange;
+
+ const int BitWidth = DAG.getTargetLoweringInfo()
+ .getPointerTy(DAG.getDataLayout())
+ .getSizeInBits();
+ assert(rangeFitsInWord(Low, High) && "Case range must fit in bit mask!");
+
+ // Check if the clusters cover a contiguous range such that no value in the
+ // range will jump to the default statement.
+ bool ContiguousRange = true;
+ for (int64_t I = First + 1; I <= Last; ++I) {
+ if (Clusters[I].Low->getValue() != Clusters[I - 1].High->getValue() + 1) {
+ ContiguousRange = false;
+ break;
+ }
+ }
+
+ if (Low.isStrictlyPositive() && High.slt(BitWidth)) {
+    // Optimize the case where all the case values fit in a word without
+    // having to subtract minValue; the subtraction can be omitted entirely.
+ LowBound = APInt::getNullValue(Low.getBitWidth());
+ CmpRange = High;
+ ContiguousRange = false;
+ } else {
+ LowBound = Low;
+ CmpRange = High - Low;
+ }
+
+ CaseBitsVector CBV;
+ auto TotalProb = BranchProbability::getZero();
+ for (unsigned i = First; i <= Last; ++i) {
+ // Find the CaseBits for this destination.
+ unsigned j;
+ for (j = 0; j < CBV.size(); ++j)
+ if (CBV[j].BB == Clusters[i].MBB)
+ break;
+ if (j == CBV.size())
+ CBV.push_back(
+ CaseBits(0, Clusters[i].MBB, 0, BranchProbability::getZero()));
+ CaseBits *CB = &CBV[j];
+
+ // Update Mask, Bits and ExtraProb.
+ uint64_t Lo = (Clusters[i].Low->getValue() - LowBound).getZExtValue();
+ uint64_t Hi = (Clusters[i].High->getValue() - LowBound).getZExtValue();
+ assert(Hi >= Lo && Hi < 64 && "Invalid bit case!");
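+    // Build a mask with bits [Lo, Hi] set; e.g. Lo == 2 and Hi == 4 produce
+    // 0b11100.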
+ CB->Mask |= (-1ULL >> (63 - (Hi - Lo))) << Lo;
+ CB->Bits += Hi - Lo + 1;
+ CB->ExtraProb += Clusters[i].Prob;
+ TotalProb += Clusters[i].Prob;
+ }
+
+ BitTestInfo BTI;
+ std::sort(CBV.begin(), CBV.end(), [](const CaseBits &a, const CaseBits &b) {
+ // Sort by probability first, number of bits second.
+ if (a.ExtraProb != b.ExtraProb)
+ return a.ExtraProb > b.ExtraProb;
+ return a.Bits > b.Bits;
+ });
+
+ for (auto &CB : CBV) {
+ MachineBasicBlock *BitTestBB =
+ FuncInfo.MF->CreateMachineBasicBlock(SI->getParent());
+ BTI.push_back(BitTestCase(CB.Mask, BitTestBB, CB.BB, CB.ExtraProb));
+ }
+ BitTestCases.emplace_back(std::move(LowBound), std::move(CmpRange),
+ SI->getCondition(), -1U, MVT::Other, false,
+ ContiguousRange, nullptr, nullptr, std::move(BTI),
+ TotalProb);
+
+ BTCluster = CaseCluster::bitTests(Clusters[First].Low, Clusters[Last].High,
+ BitTestCases.size() - 1, TotalProb);
+ return true;
+}
+
+void SelectionDAGBuilder::findBitTestClusters(CaseClusterVector &Clusters,
+ const SwitchInst *SI) {
+// Partition Clusters into as few subsets as possible, where each subset has a
+// range that fits in a machine word and has <= 3 unique destinations.
+
+#ifndef NDEBUG
+ // Clusters must be sorted and contain Range or JumpTable clusters.
+ assert(!Clusters.empty());
+ assert(Clusters[0].Kind == CC_Range || Clusters[0].Kind == CC_JumpTable);
+ for (const CaseCluster &C : Clusters)
+ assert(C.Kind == CC_Range || C.Kind == CC_JumpTable);
+ for (unsigned i = 1; i < Clusters.size(); ++i)
+ assert(Clusters[i-1].High->getValue().slt(Clusters[i].Low->getValue()));
+#endif
+
+ // The algorithm below is not suitable for -O0.
+ if (TM.getOptLevel() == CodeGenOpt::None)
+ return;
+
+ // If target does not have legal shift left, do not emit bit tests at all.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ EVT PTy = TLI.getPointerTy(DAG.getDataLayout());
+ if (!TLI.isOperationLegal(ISD::SHL, PTy))
+ return;
+
+ int BitWidth = PTy.getSizeInBits();
+ const int64_t N = Clusters.size();
+
+ // MinPartitions[i] is the minimum nbr of partitions of Clusters[i..N-1].
+ SmallVector<unsigned, 8> MinPartitions(N);
+ // LastElement[i] is the last element of the partition starting at i.
+ SmallVector<unsigned, 8> LastElement(N);
+
+ // FIXME: This might not be the best algorithm for finding bit test clusters.
+
+ // Base case: There is only one way to partition Clusters[N-1].
+ MinPartitions[N - 1] = 1;
+ LastElement[N - 1] = N - 1;
+
+ // Note: loop indexes are signed to avoid underflow.
+ for (int64_t i = N - 2; i >= 0; --i) {
+ // Find optimal partitioning of Clusters[i..N-1].
+ // Baseline: Put Clusters[i] into a partition on its own.
+ MinPartitions[i] = MinPartitions[i + 1] + 1;
+ LastElement[i] = i;
+
+ // Search for a solution that results in fewer partitions.
+ // Note: the search is limited by BitWidth, reducing time complexity.
+ for (int64_t j = std::min(N - 1, i + BitWidth - 1); j > i; --j) {
+ // Try building a partition from Clusters[i..j].
+
+ // Check the range.
+ if (!rangeFitsInWord(Clusters[i].Low->getValue(),
+ Clusters[j].High->getValue()))
+ continue;
+
+ // Check nbr of destinations and cluster types.
+ // FIXME: This works, but doesn't seem very efficient.
+ bool RangesOnly = true;
+ BitVector Dests(FuncInfo.MF->getNumBlockIDs());
+ for (int64_t k = i; k <= j; k++) {
+ if (Clusters[k].Kind != CC_Range) {
+ RangesOnly = false;
+ break;
+ }
+ Dests.set(Clusters[k].MBB->getNumber());
+ }
+ if (!RangesOnly || Dests.count() > 3)
+ break;
+
+ // Check if it's a better partition.
+ unsigned NumPartitions = 1 + (j == N - 1 ? 0 : MinPartitions[j + 1]);
+ if (NumPartitions < MinPartitions[i]) {
+ // Found a better partition.
+ MinPartitions[i] = NumPartitions;
+ LastElement[i] = j;
+ }
+ }
+ }
+
+ // Iterate over the partitions, replacing with bit-test clusters in-place.
+ unsigned DstIndex = 0;
+ for (unsigned First = 0, Last; First < N; First = Last + 1) {
+ Last = LastElement[First];
+ assert(First <= Last);
+ assert(DstIndex <= First);
+
+ CaseCluster BitTestCluster;
+ if (buildBitTests(Clusters, First, Last, SI, BitTestCluster)) {
+ Clusters[DstIndex++] = BitTestCluster;
+ } else {
+ size_t NumClusters = Last - First + 1;
+ std::memmove(&Clusters[DstIndex], &Clusters[First],
+ sizeof(Clusters[0]) * NumClusters);
+ DstIndex += NumClusters;
+ }
+ }
+ Clusters.resize(DstIndex);
+}
+
+void SelectionDAGBuilder::lowerWorkItem(SwitchWorkListItem W, Value *Cond,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *DefaultMBB) {
+ MachineFunction *CurMF = FuncInfo.MF;
+ MachineBasicBlock *NextMBB = nullptr;
+ MachineFunction::iterator BBI(W.MBB);
+ if (++BBI != FuncInfo.MF->end())
+ NextMBB = &*BBI;
+
+ unsigned Size = W.LastCluster - W.FirstCluster + 1;
+
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+
+ if (Size == 2 && W.MBB == SwitchMBB) {
+    // If two of the cases have the same destination and their values differ
+    // in exactly one bit, use bit manipulation to do both compares at once.
+    // For example:
+    // "if (X == 6 || X == 4)" -> "if ((X|2) == 6)"
+ // TODO: This could be extended to merge any 2 cases in switches with 3
+ // cases.
+ // TODO: Handle cases where W.CaseBB != SwitchBB.
+ CaseCluster &Small = *W.FirstCluster;
+ CaseCluster &Big = *W.LastCluster;
+
+ if (Small.Low == Small.High && Big.Low == Big.High &&
+ Small.MBB == Big.MBB) {
+ const APInt &SmallValue = Small.Low->getValue();
+ const APInt &BigValue = Big.Low->getValue();
+
+ // Check that there is only one bit different.
+ APInt CommonBit = BigValue ^ SmallValue;
+ if (CommonBit.isPowerOf2()) {
+ SDValue CondLHS = getValue(Cond);
+ EVT VT = CondLHS.getValueType();
+ SDLoc DL = getCurSDLoc();
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, VT, CondLHS,
+ DAG.getConstant(CommonBit, DL, VT));
+ SDValue Cond = DAG.getSetCC(
+ DL, MVT::i1, Or, DAG.getConstant(BigValue | SmallValue, DL, VT),
+ ISD::SETEQ);
+
+ // Update successor info.
+ // Both Small and Big will jump to Small.BB, so we sum up the
+ // probabilities.
+ addSuccessorWithProb(SwitchMBB, Small.MBB, Small.Prob + Big.Prob);
+ if (BPI)
+ addSuccessorWithProb(
+ SwitchMBB, DefaultMBB,
+ // The default destination is the first successor in IR.
+ BPI->getEdgeProbability(SwitchMBB->getBasicBlock(), (unsigned)0));
+ else
+ addSuccessorWithProb(SwitchMBB, DefaultMBB);
+
+ // Insert the true branch.
+ SDValue BrCond =
+ DAG.getNode(ISD::BRCOND, DL, MVT::Other, getControlRoot(), Cond,
+ DAG.getBasicBlock(Small.MBB));
+ // Insert the false branch.
+ BrCond = DAG.getNode(ISD::BR, DL, MVT::Other, BrCond,
+ DAG.getBasicBlock(DefaultMBB));
+
+ DAG.setRoot(BrCond);
+ return;
+ }
+ }
+ }
+
+ if (TM.getOptLevel() != CodeGenOpt::None) {
+ // Order cases by probability so the most likely case will be checked first.
+ std::sort(W.FirstCluster, W.LastCluster + 1,
+ [](const CaseCluster &a, const CaseCluster &b) {
+ return a.Prob > b.Prob;
+ });
+
+ // Rearrange the case blocks so that the last one falls through if possible
+    // without changing the order of probabilities.
+ for (CaseClusterIt I = W.LastCluster; I > W.FirstCluster; ) {
+ --I;
+ if (I->Prob > W.LastCluster->Prob)
+ break;
+ if (I->Kind == CC_Range && I->MBB == NextMBB) {
+ std::swap(*I, *W.LastCluster);
+ break;
+ }
+ }
+ }
+
+ // Compute total probability.
+ BranchProbability DefaultProb = W.DefaultProb;
+ BranchProbability UnhandledProbs = DefaultProb;
+ for (CaseClusterIt I = W.FirstCluster; I <= W.LastCluster; ++I)
+ UnhandledProbs += I->Prob;
+
+ MachineBasicBlock *CurMBB = W.MBB;
+ for (CaseClusterIt I = W.FirstCluster, E = W.LastCluster; I <= E; ++I) {
+ MachineBasicBlock *Fallthrough;
+ if (I == W.LastCluster) {
+ // For the last cluster, fall through to the default destination.
+ Fallthrough = DefaultMBB;
+ } else {
+ Fallthrough = CurMF->CreateMachineBasicBlock(CurMBB->getBasicBlock());
+ CurMF->insert(BBI, Fallthrough);
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+ UnhandledProbs -= I->Prob;
+
+ switch (I->Kind) {
+ case CC_JumpTable: {
+ // FIXME: Optimize away range check based on pivot comparisons.
+ JumpTableHeader *JTH = &JTCases[I->JTCasesIndex].first;
+ JumpTable *JT = &JTCases[I->JTCasesIndex].second;
+
+ // The jump block hasn't been inserted yet; insert it here.
+ MachineBasicBlock *JumpMBB = JT->MBB;
+ CurMF->insert(BBI, JumpMBB);
+
+ auto JumpProb = I->Prob;
+ auto FallthroughProb = UnhandledProbs;
+
+ // If the default statement is a target of the jump table, we evenly
+ // distribute the default probability to successors of CurMBB. Also
+ // update the probability on the edge from JumpMBB to Fallthrough.
+ for (MachineBasicBlock::succ_iterator SI = JumpMBB->succ_begin(),
+ SE = JumpMBB->succ_end();
+ SI != SE; ++SI) {
+ if (*SI == DefaultMBB) {
+ JumpProb += DefaultProb / 2;
+ FallthroughProb -= DefaultProb / 2;
+ JumpMBB->setSuccProbability(SI, DefaultProb / 2);
+ JumpMBB->normalizeSuccProbs();
+ break;
+ }
+ }
+
+ addSuccessorWithProb(CurMBB, Fallthrough, FallthroughProb);
+ addSuccessorWithProb(CurMBB, JumpMBB, JumpProb);
+ CurMBB->normalizeSuccProbs();
+
+ // The jump table header will be inserted in our current block, do the
+ // range check, and fall through to our fallthrough block.
+ JTH->HeaderBB = CurMBB;
+ JT->Default = Fallthrough; // FIXME: Move Default to JumpTableHeader.
+
+ // If we're in the right place, emit the jump table header right now.
+ if (CurMBB == SwitchMBB) {
+ visitJumpTableHeader(*JT, *JTH, SwitchMBB);
+ JTH->Emitted = true;
+ }
+ break;
+ }
+ case CC_BitTests: {
+ // FIXME: Optimize away range check based on pivot comparisons.
+ BitTestBlock *BTB = &BitTestCases[I->BTCasesIndex];
+
+ // The bit test blocks haven't been inserted yet; insert them here.
+ for (BitTestCase &BTC : BTB->Cases)
+ CurMF->insert(BBI, BTC.ThisBB);
+
+ // Fill in fields of the BitTestBlock.
+ BTB->Parent = CurMBB;
+ BTB->Default = Fallthrough;
+
+ BTB->DefaultProb = UnhandledProbs;
+      // If the cases in the bit test don't form a contiguous range, we evenly
+      // distribute the probability on the edge to Fallthrough to the two
+      // successors of CurMBB.
+ if (!BTB->ContiguousRange) {
+ BTB->Prob += DefaultProb / 2;
+ BTB->DefaultProb -= DefaultProb / 2;
+ }
+
+ // If we're in the right place, emit the bit test header right now.
+ if (CurMBB == SwitchMBB) {
+ visitBitTestHeader(*BTB, SwitchMBB);
+ BTB->Emitted = true;
+ }
+ break;
+ }
+ case CC_Range: {
+ const Value *RHS, *LHS, *MHS;
+ ISD::CondCode CC;
+ if (I->Low == I->High) {
+ // Check Cond == I->Low.
+ CC = ISD::SETEQ;
+ LHS = Cond;
+        RHS = I->Low;
+ MHS = nullptr;
+ } else {
+ // Check I->Low <= Cond <= I->High.
+ CC = ISD::SETLE;
+ LHS = I->Low;
+ MHS = Cond;
+ RHS = I->High;
+ }
+
+ // The false probability is the sum of all unhandled cases.
+ CaseBlock CB(CC, LHS, RHS, MHS, I->MBB, Fallthrough, CurMBB, I->Prob,
+ UnhandledProbs);
+
+ if (CurMBB == SwitchMBB)
+ visitSwitchCase(CB, SwitchMBB);
+ else
+ SwitchCases.push_back(CB);
+
+ break;
+ }
+ }
+ CurMBB = Fallthrough;
+ }
+}
+
+unsigned SelectionDAGBuilder::caseClusterRank(const CaseCluster &CC,
+ CaseClusterIt First,
+ CaseClusterIt Last) {
+ return std::count_if(First, Last + 1, [&](const CaseCluster &X) {
+ if (X.Prob != CC.Prob)
+ return X.Prob > CC.Prob;
+
+ // Ties are broken by comparing the case value.
+ return X.Low->getValue().slt(CC.Low->getValue());
+ });
+}
+
+void SelectionDAGBuilder::splitWorkItem(SwitchWorkList &WorkList,
+ const SwitchWorkListItem &W,
+ Value *Cond,
+ MachineBasicBlock *SwitchMBB) {
+ assert(W.FirstCluster->Low->getValue().slt(W.LastCluster->Low->getValue()) &&
+ "Clusters not sorted?");
+
+ assert(W.LastCluster - W.FirstCluster + 1 >= 2 && "Too small to split!");
+
+ // Balance the tree based on branch probabilities to create a near-optimal (in
+ // terms of search time given key frequency) binary search tree. See e.g. Kurt
+ // Mehlhorn "Nearly Optimal Binary Search Trees" (1975).
+ CaseClusterIt LastLeft = W.FirstCluster;
+ CaseClusterIt FirstRight = W.LastCluster;
+ auto LeftProb = LastLeft->Prob + W.DefaultProb / 2;
+ auto RightProb = FirstRight->Prob + W.DefaultProb / 2;
+
+ // Move LastLeft and FirstRight towards each other from opposite directions to
+ // find a partitioning of the clusters which balances the probability on both
+ // sides. If LeftProb and RightProb are equal, alternate which side is
+ // taken to ensure 0-probability nodes are distributed evenly.
+ unsigned I = 0;
+ while (LastLeft + 1 < FirstRight) {
+ if (LeftProb < RightProb || (LeftProb == RightProb && (I & 1)))
+ LeftProb += (++LastLeft)->Prob;
+ else
+ RightProb += (--FirstRight)->Prob;
+ I++;
+ }
+
+ for (;;) {
+ // Our binary search tree differs from a typical BST in that ours can have up
+ // to three values in each leaf. The pivot selection above doesn't take that
+ // into account, which means the tree might require more nodes and be less
+ // efficient. We compensate for this here.
+
+ unsigned NumLeft = LastLeft - W.FirstCluster + 1;
+ unsigned NumRight = W.LastCluster - FirstRight + 1;
+
+ if (std::min(NumLeft, NumRight) < 3 && std::max(NumLeft, NumRight) > 3) {
+      // If one side has fewer than 3 clusters and the other has more than 3,
+      // consider taking a cluster from the larger side.
+
+ if (NumLeft < NumRight) {
+ // Consider moving the first cluster on the right to the left side.
+ CaseCluster &CC = *FirstRight;
+ unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
+ unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
+ if (LeftSideRank <= RightSideRank) {
+ // Moving the cluster to the left does not demote it.
+ ++LastLeft;
+ ++FirstRight;
+ continue;
+ }
+ } else {
+ assert(NumRight < NumLeft);
+ // Consider moving the last element on the left to the right side.
+ CaseCluster &CC = *LastLeft;
+ unsigned LeftSideRank = caseClusterRank(CC, W.FirstCluster, LastLeft);
+ unsigned RightSideRank = caseClusterRank(CC, FirstRight, W.LastCluster);
+ if (RightSideRank <= LeftSideRank) {
+          // Moving the cluster to the right does not demote it.
+ --LastLeft;
+ --FirstRight;
+ continue;
+ }
+ }
+ }
+ break;
+ }
+
+ assert(LastLeft + 1 == FirstRight);
+ assert(LastLeft >= W.FirstCluster);
+ assert(FirstRight <= W.LastCluster);
+
+ // Use the first element on the right as pivot since we will make less-than
+ // comparisons against it.
+ CaseClusterIt PivotCluster = FirstRight;
+ assert(PivotCluster > W.FirstCluster);
+ assert(PivotCluster <= W.LastCluster);
+
+ CaseClusterIt FirstLeft = W.FirstCluster;
+ CaseClusterIt LastRight = W.LastCluster;
+
+ const ConstantInt *Pivot = PivotCluster->Low;
+
+ // New blocks will be inserted immediately after the current one.
+ MachineFunction::iterator BBI(W.MBB);
+ ++BBI;
+
+ // We will branch to the LHS if Value < Pivot. If LHS is a single cluster,
+ // we can branch to its destination directly if it's squeezed exactly in
+ // between the known lower bound and Pivot - 1.
+ MachineBasicBlock *LeftMBB;
+ if (FirstLeft == LastLeft && FirstLeft->Kind == CC_Range &&
+ FirstLeft->Low == W.GE &&
+ (FirstLeft->High->getValue() + 1LL) == Pivot->getValue()) {
+ LeftMBB = FirstLeft->MBB;
+ } else {
+ LeftMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
+ FuncInfo.MF->insert(BBI, LeftMBB);
+ WorkList.push_back(
+ {LeftMBB, FirstLeft, LastLeft, W.GE, Pivot, W.DefaultProb / 2});
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+
+ // Similarly, we will branch to the RHS if Value >= Pivot. If RHS is a
+ // single cluster, RHS.Low == Pivot, and we can branch to its destination
+ // directly if RHS.High equals the current upper bound.
+ MachineBasicBlock *RightMBB;
+ if (FirstRight == LastRight && FirstRight->Kind == CC_Range &&
+ W.LT && (FirstRight->High->getValue() + 1ULL) == W.LT->getValue()) {
+ RightMBB = FirstRight->MBB;
+ } else {
+ RightMBB = FuncInfo.MF->CreateMachineBasicBlock(W.MBB->getBasicBlock());
+ FuncInfo.MF->insert(BBI, RightMBB);
+ WorkList.push_back(
+ {RightMBB, FirstRight, LastRight, Pivot, W.LT, W.DefaultProb / 2});
+ // Put Cond in a virtual register to make it available from the new blocks.
+ ExportFromCurrentBlock(Cond);
+ }
+
+ // Create the CaseBlock record that will be used to lower the branch.
+ CaseBlock CB(ISD::SETLT, Cond, Pivot, nullptr, LeftMBB, RightMBB, W.MBB,
+ LeftProb, RightProb);
+
+ if (W.MBB == SwitchMBB)
+ visitSwitchCase(CB, SwitchMBB);
+ else
+ SwitchCases.push_back(CB);
+}
+
+void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
+ // Extract cases from the switch.
+ BranchProbabilityInfo *BPI = FuncInfo.BPI;
+ CaseClusterVector Clusters;
+ Clusters.reserve(SI.getNumCases());
+ for (auto I : SI.cases()) {
+ MachineBasicBlock *Succ = FuncInfo.MBBMap[I.getCaseSuccessor()];
+ const ConstantInt *CaseVal = I.getCaseValue();
+ BranchProbability Prob =
+ BPI ? BPI->getEdgeProbability(SI.getParent(), I.getSuccessorIndex())
+ : BranchProbability(1, SI.getNumCases() + 1);
+ Clusters.push_back(CaseCluster::range(CaseVal, CaseVal, Succ, Prob));
+ }
+
+ MachineBasicBlock *DefaultMBB = FuncInfo.MBBMap[SI.getDefaultDest()];
+
+ // Cluster adjacent cases with the same destination. We do this at all
+ // optimization levels because it's cheap to do and will make codegen faster
+ // if there are many clusters.
+ sortAndRangeify(Clusters);
+
+ if (TM.getOptLevel() != CodeGenOpt::None) {
+ // Replace an unreachable default with the most popular destination.
+ // FIXME: Exploit unreachable default more aggressively.
+ bool UnreachableDefault =
+ isa<UnreachableInst>(SI.getDefaultDest()->getFirstNonPHIOrDbg());
+ if (UnreachableDefault && !Clusters.empty()) {
+ DenseMap<const BasicBlock *, unsigned> Popularity;
+ unsigned MaxPop = 0;
+ const BasicBlock *MaxBB = nullptr;
+ for (auto I : SI.cases()) {
+ const BasicBlock *BB = I.getCaseSuccessor();
+ if (++Popularity[BB] > MaxPop) {
+ MaxPop = Popularity[BB];
+ MaxBB = BB;
+ }
+ }
+ // Set new default.
+ assert(MaxPop > 0 && MaxBB);
+ DefaultMBB = FuncInfo.MBBMap[MaxBB];
+
+ // Remove cases that were pointing to the destination that is now the
+ // default.
+ CaseClusterVector New;
+ New.reserve(Clusters.size());
+ for (CaseCluster &CC : Clusters) {
+ if (CC.MBB != DefaultMBB)
+ New.push_back(CC);
+ }
+ Clusters = std::move(New);
+ }
+ }
+
+ // If there is only the default destination, jump there directly.
+ MachineBasicBlock *SwitchMBB = FuncInfo.MBB;
+ if (Clusters.empty()) {
+ SwitchMBB->addSuccessor(DefaultMBB);
+ if (DefaultMBB != NextBlock(SwitchMBB)) {
+ DAG.setRoot(DAG.getNode(ISD::BR, getCurSDLoc(), MVT::Other,
+ getControlRoot(), DAG.getBasicBlock(DefaultMBB)));
+ }
+ return;
+ }
+
+ findJumpTables(Clusters, &SI, DefaultMBB);
+ findBitTestClusters(Clusters, &SI);
+
+ DEBUG({
+ dbgs() << "Case clusters: ";
+ for (const CaseCluster &C : Clusters) {
+ if (C.Kind == CC_JumpTable) dbgs() << "JT:";
+ if (C.Kind == CC_BitTests) dbgs() << "BT:";
+
+ C.Low->getValue().print(dbgs(), true);
+ if (C.Low != C.High) {
+ dbgs() << '-';
+ C.High->getValue().print(dbgs(), true);
+ }
+ dbgs() << ' ';
+ }
+ dbgs() << '\n';
+ });
+
+ assert(!Clusters.empty());
+ SwitchWorkList WorkList;
+ CaseClusterIt First = Clusters.begin();
+ CaseClusterIt Last = Clusters.end() - 1;
+ auto DefaultProb = getEdgeProbability(SwitchMBB, DefaultMBB);
+ WorkList.push_back({SwitchMBB, First, Last, nullptr, nullptr, DefaultProb});
+
+ while (!WorkList.empty()) {
+ SwitchWorkListItem W = WorkList.back();
+ WorkList.pop_back();
+ unsigned NumClusters = W.LastCluster - W.FirstCluster + 1;
+
+ if (NumClusters > 3 && TM.getOptLevel() != CodeGenOpt::None) {
+ // For optimized builds, lower large range as a balanced binary tree.
+ splitWorkItem(WorkList, W, SI.getCondition(), SwitchMBB);
+ continue;
+ }
+
+ lowerWorkItem(W, SI.getCondition(), SwitchMBB, DefaultMBB);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
new file mode 100644
index 000000000000..b9888ae87639
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -0,0 +1,1019 @@
+//===-- SelectionDAGBuilder.h - Selection-DAG building --------*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements routines for translating from LLVM IR into SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
+
+#include "StatepointLowering.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetLowering.h"
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+class AddrSpaceCastInst;
+class AllocaInst;
+class BasicBlock;
+class BitCastInst;
+class BranchInst;
+class CallInst;
+class DbgValueInst;
+class ExtractElementInst;
+class ExtractValueInst;
+class FCmpInst;
+class FPExtInst;
+class FPToSIInst;
+class FPToUIInst;
+class FPTruncInst;
+class Function;
+class FunctionLoweringInfo;
+class GetElementPtrInst;
+class GCFunctionInfo;
+class ICmpInst;
+class IntToPtrInst;
+class IndirectBrInst;
+class InvokeInst;
+class InsertElementInst;
+class InsertValueInst;
+class Instruction;
+class LoadInst;
+class MachineBasicBlock;
+class MachineInstr;
+class MachineRegisterInfo;
+class MDNode;
+class MVT;
+class PHINode;
+class PtrToIntInst;
+class ReturnInst;
+class SDDbgValue;
+class SExtInst;
+class SelectInst;
+class ShuffleVectorInst;
+class SIToFPInst;
+class StoreInst;
+class SwitchInst;
+class DataLayout;
+class TargetLibraryInfo;
+class TargetLowering;
+class TruncInst;
+class UIToFPInst;
+class UnreachableInst;
+class VAArgInst;
+class ZExtInst;
+
+//===----------------------------------------------------------------------===//
+/// SelectionDAGBuilder - This is the common target-independent lowering
+/// implementation that is parameterized by a TargetLowering object.
+///
+class SelectionDAGBuilder {
+ /// CurInst - The current instruction being visited
+ const Instruction *CurInst;
+
+ DenseMap<const Value*, SDValue> NodeMap;
+
+  /// UnusedArgNodeMap - Maps argument values for unused arguments. This is used
+ /// to preserve debug information for incoming arguments.
+ DenseMap<const Value*, SDValue> UnusedArgNodeMap;
+
+ /// DanglingDebugInfo - Helper type for DanglingDebugInfoMap.
+ class DanglingDebugInfo {
+ const DbgValueInst* DI;
+ DebugLoc dl;
+ unsigned SDNodeOrder;
+ public:
+ DanglingDebugInfo() : DI(nullptr), dl(DebugLoc()), SDNodeOrder(0) { }
+ DanglingDebugInfo(const DbgValueInst *di, DebugLoc DL, unsigned SDNO)
+ : DI(di), dl(std::move(DL)), SDNodeOrder(SDNO) {}
+ const DbgValueInst* getDI() { return DI; }
+ DebugLoc getdl() { return dl; }
+ unsigned getSDNodeOrder() { return SDNodeOrder; }
+ };
+
+ /// DanglingDebugInfoMap - Keeps track of dbg_values for which we have not
+ /// yet seen the referent. We defer handling these until we do see it.
+ DenseMap<const Value*, DanglingDebugInfo> DanglingDebugInfoMap;
+
+public:
+ /// PendingLoads - Loads are not emitted to the program immediately. We bunch
+ /// them up and then emit token factor nodes when possible. This allows us to
+ /// get simple disambiguation between loads without worrying about alias
+ /// analysis.
+ SmallVector<SDValue, 8> PendingLoads;
+
+ /// State used while lowering a statepoint sequence (gc_statepoint,
+ /// gc_relocate, and gc_result). See StatepointLowering.hpp/cpp for details.
+ StatepointLoweringState StatepointLowering;
+private:
+
+ /// PendingExports - CopyToReg nodes that copy values to virtual registers
+ /// for export to other blocks need to be emitted before any terminator
+ /// instruction, but they have no other ordering requirements. We bunch them
+  /// up and then emit a single token factor for them just before terminator
+ /// instructions.
+ SmallVector<SDValue, 8> PendingExports;
+
+ /// SDNodeOrder - A unique monotonically increasing number used to order the
+ /// SDNodes we create.
+ unsigned SDNodeOrder;
+
+ enum CaseClusterKind {
+ /// A cluster of adjacent case labels with the same destination, or just one
+ /// case.
+ CC_Range,
+ /// A cluster of cases suitable for jump table lowering.
+ CC_JumpTable,
+ /// A cluster of cases suitable for bit test lowering.
+ CC_BitTests
+ };
+
+ /// A cluster of case labels.
+ struct CaseCluster {
+ CaseClusterKind Kind;
+ const ConstantInt *Low, *High;
+ union {
+ MachineBasicBlock *MBB;
+ unsigned JTCasesIndex;
+ unsigned BTCasesIndex;
+ };
+ BranchProbability Prob;
+
+ static CaseCluster range(const ConstantInt *Low, const ConstantInt *High,
+ MachineBasicBlock *MBB, BranchProbability Prob) {
+ CaseCluster C;
+ C.Kind = CC_Range;
+ C.Low = Low;
+ C.High = High;
+ C.MBB = MBB;
+ C.Prob = Prob;
+ return C;
+ }
+
+ static CaseCluster jumpTable(const ConstantInt *Low,
+ const ConstantInt *High, unsigned JTCasesIndex,
+ BranchProbability Prob) {
+ CaseCluster C;
+ C.Kind = CC_JumpTable;
+ C.Low = Low;
+ C.High = High;
+ C.JTCasesIndex = JTCasesIndex;
+ C.Prob = Prob;
+ return C;
+ }
+
+ static CaseCluster bitTests(const ConstantInt *Low, const ConstantInt *High,
+ unsigned BTCasesIndex, BranchProbability Prob) {
+ CaseCluster C;
+ C.Kind = CC_BitTests;
+ C.Low = Low;
+ C.High = High;
+ C.BTCasesIndex = BTCasesIndex;
+ C.Prob = Prob;
+ return C;
+ }
+ };
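+
+  // A minimal sketch of building a cluster (CI1, CI4 and TargetMBB are
+  // hypothetical): a contiguous run of cases such as 1..4 that all branch to
+  // the same block would typically end up as a single range cluster.
+  //
+  //   CaseCluster C = CaseCluster::range(CI1, CI4, TargetMBB,
+  //                                      BranchProbability::getUnknown());
+  //
+  // Clusters later found profitable for a jump table or for bit tests are
+  // rebuilt via jumpTable()/bitTests(), which record an index into JTCases or
+  // BitTestCases instead of a destination MBB.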
+
+ typedef std::vector<CaseCluster> CaseClusterVector;
+ typedef CaseClusterVector::iterator CaseClusterIt;
+
+ struct CaseBits {
+ uint64_t Mask;
+ MachineBasicBlock* BB;
+ unsigned Bits;
+ BranchProbability ExtraProb;
+
+ CaseBits(uint64_t mask, MachineBasicBlock* bb, unsigned bits,
+ BranchProbability Prob):
+ Mask(mask), BB(bb), Bits(bits), ExtraProb(Prob) { }
+
+ CaseBits() : Mask(0), BB(nullptr), Bits(0) {}
+ };
+
+ typedef std::vector<CaseBits> CaseBitsVector;
+
+ /// Sort Clusters and merge adjacent cases.
+ void sortAndRangeify(CaseClusterVector &Clusters);
+
+ /// CaseBlock - This structure is used to communicate between
+ /// SelectionDAGBuilder and SDISel for the code generation of additional basic
+ /// blocks needed by multi-case switch statements.
+ struct CaseBlock {
+ CaseBlock(ISD::CondCode cc, const Value *cmplhs, const Value *cmprhs,
+ const Value *cmpmiddle, MachineBasicBlock *truebb,
+ MachineBasicBlock *falsebb, MachineBasicBlock *me,
+ BranchProbability trueprob = BranchProbability::getUnknown(),
+ BranchProbability falseprob = BranchProbability::getUnknown())
+ : CC(cc), CmpLHS(cmplhs), CmpMHS(cmpmiddle), CmpRHS(cmprhs),
+ TrueBB(truebb), FalseBB(falsebb), ThisBB(me), TrueProb(trueprob),
+ FalseProb(falseprob) {}
+
+ // CC - the condition code to use for the case block's setcc node
+ ISD::CondCode CC;
+
+ // CmpLHS/CmpRHS/CmpMHS - The LHS/MHS/RHS of the comparison to emit.
+ // Emit by default LHS op RHS. MHS is used for range comparisons:
+ // If MHS is not null: (LHS <= MHS) and (MHS <= RHS).
+ const Value *CmpLHS, *CmpMHS, *CmpRHS;
+
+ // TrueBB/FalseBB - the block to branch to if the setcc is true/false.
+ MachineBasicBlock *TrueBB, *FalseBB;
+
+ // ThisBB - the block into which to emit the code for the setcc and branches
+ MachineBasicBlock *ThisBB;
+
+    // TrueProb/FalseProb - branch probabilities.
+ BranchProbability TrueProb, FalseProb;
+ };
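+
+  // A minimal sketch (X, FortyTwo and the MBB names are hypothetical): for a
+  // simple equality case such as "x == 42", CC is ISD::SETEQ, CmpLHS is the
+  // value of x, CmpRHS is the constant 42, and CmpMHS stays null because no
+  // range comparison is involved.
+  //
+  //   CaseBlock CB(ISD::SETEQ, X, FortyTwo, nullptr,
+  //                CaseMBB, NextTestMBB, CurMBB);
+  //   SwitchCases.push_back(CB);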
+
+ struct JumpTable {
+ JumpTable(unsigned R, unsigned J, MachineBasicBlock *M,
+ MachineBasicBlock *D): Reg(R), JTI(J), MBB(M), Default(D) {}
+
+ /// Reg - the virtual register containing the index of the jump table entry
+    /// to jump to.
+ unsigned Reg;
+ /// JTI - the JumpTableIndex for this jump table in the function.
+ unsigned JTI;
+ /// MBB - the MBB into which to emit the code for the indirect jump.
+ MachineBasicBlock *MBB;
+ /// Default - the MBB of the default bb, which is a successor of the range
+    /// check MBB. This is used when updating PHI nodes in successors.
+ MachineBasicBlock *Default;
+ };
+ struct JumpTableHeader {
+ JumpTableHeader(APInt F, APInt L, const Value *SV, MachineBasicBlock *H,
+ bool E = false)
+ : First(std::move(F)), Last(std::move(L)), SValue(SV), HeaderBB(H),
+ Emitted(E) {}
+ APInt First;
+ APInt Last;
+ const Value *SValue;
+ MachineBasicBlock *HeaderBB;
+ bool Emitted;
+ };
+ typedef std::pair<JumpTableHeader, JumpTable> JumpTableBlock;
+
+ struct BitTestCase {
+ BitTestCase(uint64_t M, MachineBasicBlock* T, MachineBasicBlock* Tr,
+ BranchProbability Prob):
+ Mask(M), ThisBB(T), TargetBB(Tr), ExtraProb(Prob) { }
+ uint64_t Mask;
+ MachineBasicBlock *ThisBB;
+ MachineBasicBlock *TargetBB;
+ BranchProbability ExtraProb;
+ };
+
+ typedef SmallVector<BitTestCase, 3> BitTestInfo;
+
+ struct BitTestBlock {
+ BitTestBlock(APInt F, APInt R, const Value *SV, unsigned Rg, MVT RgVT,
+ bool E, bool CR, MachineBasicBlock *P, MachineBasicBlock *D,
+ BitTestInfo C, BranchProbability Pr)
+ : First(std::move(F)), Range(std::move(R)), SValue(SV), Reg(Rg),
+ RegVT(RgVT), Emitted(E), ContiguousRange(CR), Parent(P), Default(D),
+ Cases(std::move(C)), Prob(Pr) {}
+ APInt First;
+ APInt Range;
+ const Value *SValue;
+ unsigned Reg;
+ MVT RegVT;
+ bool Emitted;
+ bool ContiguousRange;
+ MachineBasicBlock *Parent;
+ MachineBasicBlock *Default;
+ BitTestInfo Cases;
+ BranchProbability Prob;
+ BranchProbability DefaultProb;
+ };
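+
+  // A rough sketch of what these structures describe: when all case values fit
+  // in one machine word, the header block range-checks (x - First) against
+  // Range, where x is the hypothetical switch operand, and each bit test block
+  // then tests a precomputed mask, roughly:
+  //
+  //   if (((uint64_t)1 << (x - First)) & Mask)  // one bit set per case value
+  //     goto TargetBB;
+  //
+  // First and Range live in BitTestBlock; Mask and TargetBB live in each
+  // BitTestCase.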
+
+ /// Check whether a range of clusters is dense enough for a jump table.
+ bool isDense(const CaseClusterVector &Clusters, unsigned *TotalCases,
+ unsigned First, unsigned Last, unsigned MinDensity);
+
+ /// Build a jump table cluster from Clusters[First..Last]. Returns false if it
+ /// decides it's not a good idea.
+ bool buildJumpTable(CaseClusterVector &Clusters, unsigned First,
+ unsigned Last, const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB, CaseCluster &JTCluster);
+
+ /// Find clusters of cases suitable for jump table lowering.
+ void findJumpTables(CaseClusterVector &Clusters, const SwitchInst *SI,
+ MachineBasicBlock *DefaultMBB);
+
+ /// Check whether the range [Low,High] fits in a machine word.
+ bool rangeFitsInWord(const APInt &Low, const APInt &High);
+
+ /// Check whether these clusters are suitable for lowering with bit tests based
+ /// on the number of destinations, comparison metric, and range.
+ bool isSuitableForBitTests(unsigned NumDests, unsigned NumCmps,
+ const APInt &Low, const APInt &High);
+
+ /// Build a bit test cluster from Clusters[First..Last]. Returns false if it
+ /// decides it's not a good idea.
+ bool buildBitTests(CaseClusterVector &Clusters, unsigned First, unsigned Last,
+ const SwitchInst *SI, CaseCluster &BTCluster);
+
+ /// Find clusters of cases suitable for bit test lowering.
+ void findBitTestClusters(CaseClusterVector &Clusters, const SwitchInst *SI);
+
+ struct SwitchWorkListItem {
+ MachineBasicBlock *MBB;
+ CaseClusterIt FirstCluster;
+ CaseClusterIt LastCluster;
+ const ConstantInt *GE;
+ const ConstantInt *LT;
+ BranchProbability DefaultProb;
+ };
+ typedef SmallVector<SwitchWorkListItem, 4> SwitchWorkList;
+
+ /// Determine the rank by weight of CC in [First,Last]. If CC has more weight
+ /// than each cluster in the range, its rank is 0.
+ static unsigned caseClusterRank(const CaseCluster &CC, CaseClusterIt First,
+ CaseClusterIt Last);
+
+ /// Emit comparison and split W into two subtrees.
+ void splitWorkItem(SwitchWorkList &WorkList, const SwitchWorkListItem &W,
+ Value *Cond, MachineBasicBlock *SwitchMBB);
+
+ /// Lower W.
+ void lowerWorkItem(SwitchWorkListItem W, Value *Cond,
+ MachineBasicBlock *SwitchMBB,
+ MachineBasicBlock *DefaultMBB);
+
+
+ /// A class which encapsulates all of the information needed to generate a
+ /// stack protector check and signals to isel via its state being initialized
+ /// that a stack protector needs to be generated.
+ ///
+ /// *NOTE* The following is a high level documentation of SelectionDAG Stack
+ /// Protector Generation. The reason that it is placed here is for a lack of
+ /// other good places to stick it.
+ ///
+ /// High Level Overview of SelectionDAG Stack Protector Generation:
+ ///
+ /// Previously, generation of stack protectors was done exclusively in the
+ /// pre-SelectionDAG Codegen LLVM IR Pass "Stack Protector". This necessitated
+ /// splitting basic blocks at the IR level to create the success/failure basic
+ /// blocks in the tail of the basic block in question. As a result of this,
+ /// calls that would have qualified for the sibling call optimization were no
+ /// longer eligible for optimization since said calls were no longer right in
+ /// the "tail position" (i.e. the immediate predecessor of a ReturnInst
+ /// instruction).
+ ///
+ /// Then it was noticed that since the sibling call optimization causes the
+ /// callee to reuse the caller's stack, if we could delay the generation of
+ /// the stack protector check until later in CodeGen after the sibling call
+ /// decision was made, we get both the tail call optimization and the stack
+ /// protector check!
+ ///
+ /// A few goals in solving this problem were:
+ ///
+ /// 1. Preserve the architecture independence of stack protector generation.
+ ///
+ /// 2. Preserve the normal IR level stack protector check for platforms like
+ /// OpenBSD for which we support platform-specific stack protector
+ /// generation.
+ ///
+ /// The main problem that guided the present solution is that one can not
+ /// solve this problem in an architecture independent manner at the IR level
+ /// only. This is because:
+ ///
+ /// 1. The decision on whether or not to perform a sibling call on certain
+ /// platforms (for instance i386) requires lower level information
+ /// related to available registers that can not be known at the IR level.
+ ///
+ /// 2. Even if the previous point were not true, the decision on whether to
+ /// perform a tail call is done in LowerCallTo in SelectionDAG which
+ /// occurs after the Stack Protector Pass. As a result, one would need to
+ /// put the relevant callinst into the stack protector check success
+ /// basic block (where the return inst is placed) and then move it back
+ /// later at SelectionDAG/MI time before the stack protector check if the
+ /// tail call optimization failed. The MI level option was nixed
+ /// immediately since it would require platform-specific pattern
+ /// matching. The SelectionDAG level option was nixed because
+ /// SelectionDAG only processes one IR level basic block at a time
+ /// implying one could not create a DAG Combine to move the callinst.
+ ///
+ /// To get around this problem a few things were realized:
+ ///
+ /// 1. While one can not handle multiple IR level basic blocks at the
+ /// SelectionDAG Level, one can generate multiple machine basic blocks
+ /// for one IR level basic block. This is how we handle bit tests and
+ /// switches.
+ ///
+ /// 2. At the MI level, tail calls are represented via a special return
+ /// MIInst called "tcreturn". Thus if we know the basic block in which we
+ /// wish to insert the stack protector check, we get the correct behavior
+ /// by always inserting the stack protector check right before the return
+ /// statement. This is a "magical transformation" since no matter where
+ /// the stack protector check intrinsic is, we always insert the stack
+ /// protector check code at the end of the BB.
+ ///
+ /// Given the aforementioned constraints, the following solution was devised:
+ ///
+ /// 1. On platforms that do not support SelectionDAG stack protector check
+ /// generation, allow for the normal IR level stack protector check
+ /// generation to continue.
+ ///
+ /// 2. On platforms that do support SelectionDAG stack protector check
+ /// generation:
+ ///
+ /// a. Use the IR level stack protector pass to decide if a stack
+ /// protector is required/which BB we insert the stack protector check
+ /// in by reusing the logic already therein. If we wish to generate a
+ /// stack protector check in a basic block, we place a special IR
+ /// intrinsic called llvm.stackprotectorcheck right before the BB's
+ /// returninst or if there is a callinst that could potentially be
+ /// sibling call optimized, before the call inst.
+ ///
+ /// b. Then when a BB with said intrinsic is processed, we codegen the BB
+ /// normally via SelectBasicBlock. In said process, when we visit the
+ /// stack protector check, we do not actually emit anything into the
+ /// BB. Instead, we just initialize the stack protector descriptor
+ /// class (which involves stashing information/creating the success
+  ///        mbb and the failure mbb if we have not created one for this
+ /// function yet) and export the guard variable that we are going to
+ /// compare.
+ ///
+ /// c. After we finish selecting the basic block, in FinishBasicBlock if
+ /// the StackProtectorDescriptor attached to the SelectionDAGBuilder is
+ /// initialized, we produce the validation code with one of these
+ /// techniques:
+ /// 1) with a call to a guard check function
+ /// 2) with inlined instrumentation
+ ///
+ /// 1) We insert a call to the check function before the terminator.
+ ///
+ /// 2) We first find a splice point in the parent basic block
+ /// before the terminator and then splice the terminator of said basic
+ /// block into the success basic block. Then we code-gen a new tail for
+ /// the parent basic block consisting of the two loads, the comparison,
+ /// and finally two branches to the success/failure basic blocks. We
+ /// conclude by code-gening the failure basic block if we have not
+ /// code-gened it already (all stack protector checks we generate in
+  ///        the same function use the same failure basic block).
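+  ///
+  /// As a rough sketch (using the member names of the class below), the
+  /// inlined instrumentation variant leaves the machine CFG looking
+  /// approximately like:
+  ///
+  ///   ParentMBB:              ; original terminator spliced out
+  ///     <load the stack guard and the on-stack canary>
+  ///     <compare them and branch to SuccessMBB or FailureMBB>
+  ///   SuccessMBB:             ; now ends with ParentMBB's old terminator
+  ///     ...
+  ///   FailureMBB:             ; shared by all checks in this function
+  ///     <call __stack_chk_fail>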
+ class StackProtectorDescriptor {
+ public:
+ StackProtectorDescriptor()
+ : ParentMBB(nullptr), SuccessMBB(nullptr), FailureMBB(nullptr) {}
+
+ /// Returns true if all fields of the stack protector descriptor are
+    /// initialized, implying that we are ready to emit a stack protector.
+ bool shouldEmitStackProtector() const {
+ return ParentMBB && SuccessMBB && FailureMBB;
+ }
+
+ bool shouldEmitFunctionBasedCheckStackProtector() const {
+ return ParentMBB && !SuccessMBB && !FailureMBB;
+ }
+
+ /// Initialize the stack protector descriptor structure for a new basic
+ /// block.
+ void initialize(const BasicBlock *BB, MachineBasicBlock *MBB,
+ bool FunctionBasedInstrumentation) {
+ // Make sure we are not initialized yet.
+ assert(!shouldEmitStackProtector() && "Stack Protector Descriptor is "
+ "already initialized!");
+ ParentMBB = MBB;
+ if (!FunctionBasedInstrumentation) {
+ SuccessMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ true);
+ FailureMBB = AddSuccessorMBB(BB, MBB, /* IsLikely */ false, FailureMBB);
+ }
+ }
+
+ /// Reset state that changes when we handle different basic blocks.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. The specific basic block we are generating a
+ /// stack protector for (ParentMBB).
+ ///
+ /// 2. The successor machine basic block that will contain the tail of
+ /// parent mbb after we create the stack protector check (SuccessMBB). This
+ /// BB is visited only on stack protector check success.
+ void resetPerBBState() {
+ ParentMBB = nullptr;
+ SuccessMBB = nullptr;
+ }
+
+ /// Reset state that only changes when we switch functions.
+ ///
+ /// This currently includes:
+ ///
+ /// 1. FailureMBB since we reuse the failure code path for all stack
+ /// protector checks created in an individual function.
+ ///
+    /// 2. The guard variable, since the guard variable we are checking against is
+ /// always the same.
+ void resetPerFunctionState() {
+ FailureMBB = nullptr;
+ }
+
+ MachineBasicBlock *getParentMBB() { return ParentMBB; }
+ MachineBasicBlock *getSuccessMBB() { return SuccessMBB; }
+ MachineBasicBlock *getFailureMBB() { return FailureMBB; }
+
+ private:
+ /// The basic block for which we are generating the stack protector.
+ ///
+ /// As a result of stack protector generation, we will splice the
+ /// terminators of this basic block into the successor mbb SuccessMBB and
+ /// replace it with a compare/branch to the successor mbbs
+ /// SuccessMBB/FailureMBB depending on whether or not the stack protector
+ /// was violated.
+ MachineBasicBlock *ParentMBB;
+
+ /// A basic block visited on stack protector check success that contains the
+ /// terminators of ParentMBB.
+ MachineBasicBlock *SuccessMBB;
+
+    /// The basic block visited on stack protector check failure; it will
+    /// contain a call to __stack_chk_fail().
+ MachineBasicBlock *FailureMBB;
+
+ /// Add a successor machine basic block to ParentMBB. If the successor mbb
+    /// has not been created yet (i.e. if SuccMBB is null), then the machine basic
+ /// block will be created. Assign a large weight if IsLikely is true.
+ MachineBasicBlock *AddSuccessorMBB(const BasicBlock *BB,
+ MachineBasicBlock *ParentMBB,
+ bool IsLikely,
+ MachineBasicBlock *SuccMBB = nullptr);
+ };
+
+private:
+ const TargetMachine &TM;
+public:
+ /// Lowest valid SDNodeOrder. The special case 0 is reserved for scheduling
+ /// nodes without a corresponding SDNode.
+ static const unsigned LowestSDNodeOrder = 1;
+
+ SelectionDAG &DAG;
+ const DataLayout *DL;
+ AliasAnalysis *AA;
+ const TargetLibraryInfo *LibInfo;
+
+ /// SwitchCases - Vector of CaseBlock structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<CaseBlock> SwitchCases;
+ /// JTCases - Vector of JumpTable structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<JumpTableBlock> JTCases;
+ /// BitTestCases - Vector of BitTestBlock structures used to communicate
+ /// SwitchInst code generation information.
+ std::vector<BitTestBlock> BitTestCases;
+ /// A StackProtectorDescriptor structure used to communicate stack protector
+ /// information in between SelectBasicBlock and FinishBasicBlock.
+ StackProtectorDescriptor SPDescriptor;
+
+ // Emit PHI-node-operand constants only once even if used by multiple
+ // PHI nodes.
+ DenseMap<const Constant *, unsigned> ConstantsOut;
+
+ /// FuncInfo - Information about the function as a whole.
+ ///
+ FunctionLoweringInfo &FuncInfo;
+
+ /// GFI - Garbage collection metadata for the function.
+ GCFunctionInfo *GFI;
+
+ /// LPadToCallSiteMap - Map a landing pad to the call site indexes.
+ DenseMap<MachineBasicBlock*, SmallVector<unsigned, 4> > LPadToCallSiteMap;
+
+ /// HasTailCall - This is set to true if a call in the current
+ /// block has been translated as a tail call. In this case,
+ /// no subsequent DAG nodes should be created.
+ ///
+ bool HasTailCall;
+
+ LLVMContext *Context;
+
+ SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo,
+ CodeGenOpt::Level ol)
+ : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()),
+ DAG(dag), FuncInfo(funcinfo),
+ HasTailCall(false) {
+ }
+
+ void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
+ const TargetLibraryInfo *li);
+
+ /// clear - Clear out the current SelectionDAG and the associated
+ /// state and prepare this SelectionDAGBuilder object to be used
+ /// for a new block. This doesn't clear out information about
+ /// additional blocks that are needed to complete switch lowering
+ /// or PHI node updating; that information is cleared out as it is
+ /// consumed.
+ void clear();
+
+ /// clearDanglingDebugInfo - Clear the dangling debug information
+ /// map. This function is separated from the clear so that debug
+ /// information that is dangling in a basic block can be properly
+ /// resolved in a different basic block. This allows the
+ /// SelectionDAG to resolve dangling debug information attached
+ /// to PHI nodes.
+ void clearDanglingDebugInfo();
+
+ /// getRoot - Return the current virtual root of the Selection DAG,
+ /// flushing any PendingLoad items. This must be done before emitting
+ /// a store or any other node that may need to be ordered after any
+ /// prior load instructions.
+ ///
+ SDValue getRoot();
+
+ /// getControlRoot - Similar to getRoot, but instead of flushing all the
+ /// PendingLoad items, flush all the PendingExports items. It is necessary
+ /// to do this before emitting a terminator instruction.
+ ///
+ SDValue getControlRoot();
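+
+  // A minimal usage sketch (Val and Ptr are hypothetical SDValues): lowering a
+  // store flushes PendingLoads via getRoot() so the store is ordered after any
+  // prior loads, whereas a terminator would use getControlRoot() instead.
+  //
+  //   SDValue Chain = getRoot();
+  //   SDValue St = DAG.getStore(Chain, getCurSDLoc(), Val, Ptr,
+  //                             MachinePointerInfo());
+  //   DAG.setRoot(St);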
+
+ SDLoc getCurSDLoc() const {
+ return SDLoc(CurInst, SDNodeOrder);
+ }
+
+ DebugLoc getCurDebugLoc() const {
+ return CurInst ? CurInst->getDebugLoc() : DebugLoc();
+ }
+
+ unsigned getSDNodeOrder() const { return SDNodeOrder; }
+
+ void CopyValueToVirtualRegister(const Value *V, unsigned Reg);
+
+ void visit(const Instruction &I);
+
+ void visit(unsigned Opcode, const User &I);
+
+  /// getCopyFromRegs - If there was a virtual register allocated for the value
+  /// V, emit a CopyFromReg of the specified type Ty. Return an empty SDValue()
+  /// otherwise.
+ SDValue getCopyFromRegs(const Value *V, Type *Ty);
+
+ // resolveDanglingDebugInfo - if we saw an earlier dbg_value referring to V,
+ // generate the debug data structures now that we've seen its definition.
+ void resolveDanglingDebugInfo(const Value *V, SDValue Val);
+ SDValue getValue(const Value *V);
+ bool findValue(const Value *V) const;
+
+ SDValue getNonRegisterValue(const Value *V);
+ SDValue getValueImpl(const Value *V);
+
+ void setValue(const Value *V, SDValue NewN) {
+ SDValue &N = NodeMap[V];
+ assert(!N.getNode() && "Already set a value for this node!");
+ N = NewN;
+ }
+
+ void setUnusedArgValue(const Value *V, SDValue NewN) {
+ SDValue &N = UnusedArgNodeMap[V];
+ assert(!N.getNode() && "Already set a value for this node!");
+ N = NewN;
+ }
+
+ void FindMergedConditions(const Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ Instruction::BinaryOps Opc, BranchProbability TW,
+ BranchProbability FW);
+ void EmitBranchForMergedCondition(const Value *Cond, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ MachineBasicBlock *CurBB,
+ MachineBasicBlock *SwitchBB,
+ BranchProbability TW, BranchProbability FW);
+ bool ShouldEmitAsBranches(const std::vector<CaseBlock> &Cases);
+ bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
+ void CopyToExportRegsIfNeeded(const Value *V);
+ void ExportFromCurrentBlock(const Value *V);
+ void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
+ const BasicBlock *EHPadBB = nullptr);
+
+  // Lower range metadata from 0 to N into an assert zext to an integer of the
+  // nearest floor power of two.
+ SDValue lowerRangeToAssertZExt(SelectionDAG &DAG, const Instruction &I,
+ SDValue Op);
+
+ void populateCallLoweringInfo(TargetLowering::CallLoweringInfo &CLI,
+ ImmutableCallSite CS, unsigned ArgIdx,
+ unsigned NumArgs, SDValue Callee,
+ Type *ReturnTy, bool IsPatchPoint);
+
+ std::pair<SDValue, SDValue>
+ lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
+ const BasicBlock *EHPadBB = nullptr);
+
+ /// UpdateSplitBlock - When an MBB was split during scheduling, update the
+ /// references that need to refer to the last resulting block.
+ void UpdateSplitBlock(MachineBasicBlock *First, MachineBasicBlock *Last);
+
+  /// Describes a gc.statepoint or a gc.statepoint-like call for the purposes
+ /// of lowering into a STATEPOINT node.
+ struct StatepointLoweringInfo {
+ /// Bases[i] is the base pointer for Ptrs[i]. Together they denote the set
+ /// of gc pointers this STATEPOINT has to relocate.
+ SmallVector<const Value *, 16> Bases;
+ SmallVector<const Value *, 16> Ptrs;
+
+ /// The set of gc.relocate calls associated with this gc.statepoint.
+ SmallVector<const GCRelocateInst *, 16> GCRelocates;
+
+ /// The full list of gc arguments to the gc.statepoint being lowered.
+ ArrayRef<const Use> GCArgs;
+
+ /// The gc.statepoint instruction.
+ const Instruction *StatepointInstr = nullptr;
+
+ /// The list of gc transition arguments present in the gc.statepoint being
+ /// lowered.
+ ArrayRef<const Use> GCTransitionArgs;
+
+ /// The ID that the resulting STATEPOINT instruction has to report.
+ unsigned ID = -1;
+
+ /// Information regarding the underlying call instruction.
+ TargetLowering::CallLoweringInfo CLI;
+
+ /// The deoptimization state associated with this gc.statepoint call, if
+ /// any.
+ ArrayRef<const Use> DeoptState;
+
+ /// Flags associated with the meta arguments being lowered.
+ uint64_t StatepointFlags = -1;
+
+ /// The number of patchable bytes the call needs to get lowered into.
+ unsigned NumPatchBytes = -1;
+
+ /// The exception handling unwind destination, in case this represents an
+ /// invoke of gc.statepoint.
+ const BasicBlock *EHPadBB = nullptr;
+
+ explicit StatepointLoweringInfo(SelectionDAG &DAG) : CLI(DAG) {}
+ };
+
+ /// Lower \p SLI into a STATEPOINT instruction.
+ SDValue LowerAsSTATEPOINT(StatepointLoweringInfo &SLI);
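+
+  // A rough sketch of how a caller might drive this (BaseV, DerivedV and
+  // StatepointID are hypothetical; the call information in SLI.CLI and the
+  // statepoint flags would also need to be filled in):
+  //
+  //   StatepointLoweringInfo SLI(DAG);
+  //   SLI.Bases.push_back(BaseV);
+  //   SLI.Ptrs.push_back(DerivedV);
+  //   SLI.ID = StatepointID;
+  //   SLI.NumPatchBytes = 0;
+  //   SDValue ActualCallResult = LowerAsSTATEPOINT(SLI);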
+
+ // This function is responsible for the whole statepoint lowering process.
+ // It uniformly handles invoke and call statepoints.
+ void LowerStatepoint(ImmutableStatepoint Statepoint,
+ const BasicBlock *EHPadBB = nullptr);
+
+ void LowerCallSiteWithDeoptBundle(ImmutableCallSite CS, SDValue Callee,
+ const BasicBlock *EHPadBB);
+
+ void LowerDeoptimizeCall(const CallInst *CI);
+ void LowerDeoptimizingReturn();
+
+ void LowerCallSiteWithDeoptBundleImpl(ImmutableCallSite CS, SDValue Callee,
+ const BasicBlock *EHPadBB,
+ bool VarArgDisallowed,
+ bool ForceVoidReturnTy);
+
+private:
+ // Terminator instructions.
+ void visitRet(const ReturnInst &I);
+ void visitBr(const BranchInst &I);
+ void visitSwitch(const SwitchInst &I);
+ void visitIndirectBr(const IndirectBrInst &I);
+ void visitUnreachable(const UnreachableInst &I);
+ void visitCleanupRet(const CleanupReturnInst &I);
+ void visitCatchSwitch(const CatchSwitchInst &I);
+ void visitCatchRet(const CatchReturnInst &I);
+ void visitCatchPad(const CatchPadInst &I);
+ void visitCleanupPad(const CleanupPadInst &CPI);
+
+ BranchProbability getEdgeProbability(const MachineBasicBlock *Src,
+ const MachineBasicBlock *Dst) const;
+ void addSuccessorWithProb(
+ MachineBasicBlock *Src, MachineBasicBlock *Dst,
+ BranchProbability Prob = BranchProbability::getUnknown());
+
+public:
+ void visitSwitchCase(CaseBlock &CB,
+ MachineBasicBlock *SwitchBB);
+ void visitSPDescriptorParent(StackProtectorDescriptor &SPD,
+ MachineBasicBlock *ParentBB);
+ void visitSPDescriptorFailure(StackProtectorDescriptor &SPD);
+ void visitBitTestHeader(BitTestBlock &B, MachineBasicBlock *SwitchBB);
+ void visitBitTestCase(BitTestBlock &BB,
+ MachineBasicBlock* NextMBB,
+ BranchProbability BranchProbToNext,
+ unsigned Reg,
+ BitTestCase &B,
+ MachineBasicBlock *SwitchBB);
+ void visitJumpTable(JumpTable &JT);
+ void visitJumpTableHeader(JumpTable &JT, JumpTableHeader &JTH,
+ MachineBasicBlock *SwitchBB);
+
+private:
+ // These all get lowered before this pass.
+ void visitInvoke(const InvokeInst &I);
+ void visitResume(const ResumeInst &I);
+
+ void visitBinary(const User &I, unsigned OpCode);
+ void visitShift(const User &I, unsigned Opcode);
+ void visitAdd(const User &I) { visitBinary(I, ISD::ADD); }
+ void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); }
+ void visitSub(const User &I) { visitBinary(I, ISD::SUB); }
+ void visitFSub(const User &I);
+ void visitMul(const User &I) { visitBinary(I, ISD::MUL); }
+ void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); }
+ void visitURem(const User &I) { visitBinary(I, ISD::UREM); }
+ void visitSRem(const User &I) { visitBinary(I, ISD::SREM); }
+ void visitFRem(const User &I) { visitBinary(I, ISD::FREM); }
+ void visitUDiv(const User &I) { visitBinary(I, ISD::UDIV); }
+ void visitSDiv(const User &I);
+ void visitFDiv(const User &I) { visitBinary(I, ISD::FDIV); }
+ void visitAnd (const User &I) { visitBinary(I, ISD::AND); }
+ void visitOr (const User &I) { visitBinary(I, ISD::OR); }
+ void visitXor (const User &I) { visitBinary(I, ISD::XOR); }
+ void visitShl (const User &I) { visitShift(I, ISD::SHL); }
+ void visitLShr(const User &I) { visitShift(I, ISD::SRL); }
+ void visitAShr(const User &I) { visitShift(I, ISD::SRA); }
+ void visitICmp(const User &I);
+ void visitFCmp(const User &I);
+ // Visit the conversion instructions
+ void visitTrunc(const User &I);
+ void visitZExt(const User &I);
+ void visitSExt(const User &I);
+ void visitFPTrunc(const User &I);
+ void visitFPExt(const User &I);
+ void visitFPToUI(const User &I);
+ void visitFPToSI(const User &I);
+ void visitUIToFP(const User &I);
+ void visitSIToFP(const User &I);
+ void visitPtrToInt(const User &I);
+ void visitIntToPtr(const User &I);
+ void visitBitCast(const User &I);
+ void visitAddrSpaceCast(const User &I);
+
+ void visitExtractElement(const User &I);
+ void visitInsertElement(const User &I);
+ void visitShuffleVector(const User &I);
+
+ void visitExtractValue(const ExtractValueInst &I);
+ void visitInsertValue(const InsertValueInst &I);
+ void visitLandingPad(const LandingPadInst &I);
+
+ void visitGetElementPtr(const User &I);
+ void visitSelect(const User &I);
+
+ void visitAlloca(const AllocaInst &I);
+ void visitLoad(const LoadInst &I);
+ void visitStore(const StoreInst &I);
+ void visitMaskedLoad(const CallInst &I);
+ void visitMaskedStore(const CallInst &I);
+ void visitMaskedGather(const CallInst &I);
+ void visitMaskedScatter(const CallInst &I);
+ void visitAtomicCmpXchg(const AtomicCmpXchgInst &I);
+ void visitAtomicRMW(const AtomicRMWInst &I);
+ void visitFence(const FenceInst &I);
+ void visitPHI(const PHINode &I);
+ void visitCall(const CallInst &I);
+ bool visitMemCmpCall(const CallInst &I);
+ bool visitMemChrCall(const CallInst &I);
+ bool visitStrCpyCall(const CallInst &I, bool isStpcpy);
+ bool visitStrCmpCall(const CallInst &I);
+ bool visitStrLenCall(const CallInst &I);
+ bool visitStrNLenCall(const CallInst &I);
+ bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode);
+ bool visitBinaryFloatCall(const CallInst &I, unsigned Opcode);
+ void visitAtomicLoad(const LoadInst &I);
+ void visitAtomicStore(const StoreInst &I);
+ void visitLoadFromSwiftError(const LoadInst &I);
+ void visitStoreToSwiftError(const StoreInst &I);
+
+ void visitInlineAsm(ImmutableCallSite CS);
+ const char *visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
+ void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
+
+ void visitVAStart(const CallInst &I);
+ void visitVAArg(const VAArgInst &I);
+ void visitVAEnd(const CallInst &I);
+ void visitVACopy(const CallInst &I);
+ void visitStackmap(const CallInst &I);
+ void visitPatchpoint(ImmutableCallSite CS,
+ const BasicBlock *EHPadBB = nullptr);
+
+ // These two are implemented in StatepointLowering.cpp
+ void visitGCRelocate(const GCRelocateInst &I);
+ void visitGCResult(const GCResultInst &I);
+
+ void visitUserOp1(const Instruction &I) {
+ llvm_unreachable("UserOp1 should not exist at instruction selection time!");
+ }
+ void visitUserOp2(const Instruction &I) {
+ llvm_unreachable("UserOp2 should not exist at instruction selection time!");
+ }
+
+ void processIntegerCallValue(const Instruction &I,
+ SDValue Value, bool IsSigned);
+
+ void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
+
+ void emitInlineAsmError(ImmutableCallSite CS, const Twine &Message);
+
+  /// EmitFuncArgumentDbgValue - If V is a function argument then create the
+  /// corresponding DBG_VALUE machine instruction for it now. At the end of
+  /// instruction selection, these will be inserted into the entry BB.
+ bool EmitFuncArgumentDbgValue(const Value *V, DILocalVariable *Variable,
+ DIExpression *Expr, DILocation *DL,
+ int64_t Offset, bool IsIndirect,
+ const SDValue &N);
+
+ /// Return the next block after MBB, or nullptr if there is none.
+ MachineBasicBlock *NextBlock(MachineBasicBlock *MBB);
+
+ /// Update the DAG and DAG builder with the relevant information after
+ /// a new root node has been created which could be a tail call.
+ void updateDAGForMaybeTailCall(SDValue MaybeTC);
+};
+
+/// RegsForValue - This struct represents the registers (physical or virtual)
+/// that a particular set of values is assigned, and the type information about
+/// the value. The most common situation is to represent one value at a time,
+/// but struct or array values are handled element-wise as multiple values. The
+/// splitting of aggregates is performed recursively, so that we never have
+/// aggregate-typed registers. The values at this point do not necessarily have
+/// legal types, so each value may require one or more registers of some legal
+/// type.
+///
+struct RegsForValue {
+ /// ValueVTs - The value types of the values, which may not be legal, and
+  /// may need to be promoted or synthesized from one or more registers.
+ ///
+ SmallVector<EVT, 4> ValueVTs;
+
+ /// RegVTs - The value types of the registers. This is the same size as
+ /// ValueVTs and it records, for each value, what the type of the assigned
+ /// register or registers are. (Individual values are never synthesized
+ /// from more than one type of register.)
+ ///
+  /// With virtual registers, the contents of RegVTs are redundant with TLI's
+  /// getRegisterType member function. However, with physical registers it is
+  /// necessary to have a separate record of the types.
+ ///
+ SmallVector<MVT, 4> RegVTs;
+
+ /// Regs - This list holds the registers assigned to the values.
+ /// Each legal or promoted value requires one register, and each
+ /// expanded value requires multiple registers.
+ ///
+ SmallVector<unsigned, 4> Regs;
+
+ RegsForValue();
+
+ RegsForValue(const SmallVector<unsigned, 4> &regs, MVT regvt, EVT valuevt);
+
+ RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
+ const DataLayout &DL, unsigned Reg, Type *Ty);
+
+ /// append - Add the specified values to this one.
+ void append(const RegsForValue &RHS) {
+ ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
+ RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
+ Regs.append(RHS.Regs.begin(), RHS.Regs.end());
+ }
+
+  /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copy from this
+  /// value and return the result as a ValueVTs value. This uses Chain/Flag as
+  /// the input and updates them for the output Chain/Flag. If the Flag pointer
+  /// is nullptr, no flag is used.
+ SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
+ const SDLoc &dl, SDValue &Chain, SDValue *Flag,
+ const Value *V = nullptr) const;
+
+  /// getCopyToRegs - Emit a series of CopyToReg nodes that copy the specified
+ /// value into the registers specified by this object. This uses Chain/Flag
+ /// as the input and updates them for the output Chain/Flag. If the Flag
+ /// pointer is nullptr, no flag is used. If V is not nullptr, then it is used
+ /// in printing better diagnostic messages on error.
+ void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
+ SDValue &Chain, SDValue *Flag, const Value *V = nullptr,
+ ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
+
+ /// AddInlineAsmOperands - Add this value to the specified inlineasm node
+ /// operand list. This adds the code marker, matching input operand index
+ /// (if applicable), and includes the number of values added into it.
+ void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
+ unsigned MatchingIdx, const SDLoc &dl,
+ SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
+};
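+
+// A minimal usage sketch (Op, V, Reg, dl, DAG and TLI come from the
+// surrounding lowering code and are hypothetical here): copy an
+// already-lowered value Op into the virtual registers assigned to V.
+//
+//   RegsForValue RFV(V->getContext(), TLI, DAG.getDataLayout(), Reg,
+//                    V->getType());
+//   SDValue Chain = DAG.getEntryNode();
+//   RFV.getCopyToRegs(Op, DAG, dl, Chain, nullptr, V);
+//   // Chain now carries the CopyToReg nodes and must be kept live, for
+//   // example by making it (part of) the DAG root or a pending export.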
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
new file mode 100644
index 000000000000..401da059dedc
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -0,0 +1,719 @@
+//===-- SelectionDAGDumper.cpp - Implement SelectionDAG::dump() -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::dump method and friends.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/Printable.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+static cl::opt<bool>
+VerboseDAGDumping("dag-dump-verbose", cl::Hidden,
+ cl::desc("Display more information when dumping selection "
+ "DAG nodes."));
+
+std::string SDNode::getOperationName(const SelectionDAG *G) const {
+ switch (getOpcode()) {
+ default:
+ if (getOpcode() < ISD::BUILTIN_OP_END)
+ return "<<Unknown DAG Node>>";
+ if (isMachineOpcode()) {
+ if (G)
+ if (const TargetInstrInfo *TII = G->getSubtarget().getInstrInfo())
+ if (getMachineOpcode() < TII->getNumOpcodes())
+ return TII->getName(getMachineOpcode());
+ return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
+ }
+ if (G) {
+ const TargetLowering &TLI = G->getTargetLoweringInfo();
+ const char *Name = TLI.getTargetNodeName(getOpcode());
+ if (Name) return Name;
+ return "<<Unknown Target Node #" + utostr(getOpcode()) + ">>";
+ }
+ return "<<Unknown Node #" + utostr(getOpcode()) + ">>";
+
+#ifndef NDEBUG
+ case ISD::DELETED_NODE: return "<<Deleted Node!>>";
+#endif
+ case ISD::PREFETCH: return "Prefetch";
+ case ISD::ATOMIC_FENCE: return "AtomicFence";
+ case ISD::ATOMIC_CMP_SWAP: return "AtomicCmpSwap";
+ case ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS: return "AtomicCmpSwapWithSuccess";
+ case ISD::ATOMIC_SWAP: return "AtomicSwap";
+ case ISD::ATOMIC_LOAD_ADD: return "AtomicLoadAdd";
+ case ISD::ATOMIC_LOAD_SUB: return "AtomicLoadSub";
+ case ISD::ATOMIC_LOAD_AND: return "AtomicLoadAnd";
+ case ISD::ATOMIC_LOAD_OR: return "AtomicLoadOr";
+ case ISD::ATOMIC_LOAD_XOR: return "AtomicLoadXor";
+ case ISD::ATOMIC_LOAD_NAND: return "AtomicLoadNand";
+ case ISD::ATOMIC_LOAD_MIN: return "AtomicLoadMin";
+ case ISD::ATOMIC_LOAD_MAX: return "AtomicLoadMax";
+ case ISD::ATOMIC_LOAD_UMIN: return "AtomicLoadUMin";
+ case ISD::ATOMIC_LOAD_UMAX: return "AtomicLoadUMax";
+ case ISD::ATOMIC_LOAD: return "AtomicLoad";
+ case ISD::ATOMIC_STORE: return "AtomicStore";
+ case ISD::PCMARKER: return "PCMarker";
+ case ISD::READCYCLECOUNTER: return "ReadCycleCounter";
+ case ISD::SRCVALUE: return "SrcValue";
+ case ISD::MDNODE_SDNODE: return "MDNode";
+ case ISD::EntryToken: return "EntryToken";
+ case ISD::TokenFactor: return "TokenFactor";
+ case ISD::AssertSext: return "AssertSext";
+ case ISD::AssertZext: return "AssertZext";
+
+ case ISD::BasicBlock: return "BasicBlock";
+ case ISD::VALUETYPE: return "ValueType";
+ case ISD::Register: return "Register";
+ case ISD::RegisterMask: return "RegisterMask";
+ case ISD::Constant:
+ if (cast<ConstantSDNode>(this)->isOpaque())
+ return "OpaqueConstant";
+ return "Constant";
+ case ISD::ConstantFP: return "ConstantFP";
+ case ISD::GlobalAddress: return "GlobalAddress";
+ case ISD::GlobalTLSAddress: return "GlobalTLSAddress";
+ case ISD::FrameIndex: return "FrameIndex";
+ case ISD::JumpTable: return "JumpTable";
+ case ISD::GLOBAL_OFFSET_TABLE: return "GLOBAL_OFFSET_TABLE";
+ case ISD::RETURNADDR: return "RETURNADDR";
+ case ISD::FRAMEADDR: return "FRAMEADDR";
+ case ISD::LOCAL_RECOVER: return "LOCAL_RECOVER";
+ case ISD::READ_REGISTER: return "READ_REGISTER";
+ case ISD::WRITE_REGISTER: return "WRITE_REGISTER";
+ case ISD::FRAME_TO_ARGS_OFFSET: return "FRAME_TO_ARGS_OFFSET";
+ case ISD::EH_RETURN: return "EH_RETURN";
+ case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP";
+ case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP";
+ case ISD::EH_SJLJ_SETUP_DISPATCH: return "EH_SJLJ_SETUP_DISPATCH";
+ case ISD::ConstantPool: return "ConstantPool";
+ case ISD::TargetIndex: return "TargetIndex";
+ case ISD::ExternalSymbol: return "ExternalSymbol";
+ case ISD::BlockAddress: return "BlockAddress";
+ case ISD::INTRINSIC_WO_CHAIN:
+ case ISD::INTRINSIC_VOID:
+ case ISD::INTRINSIC_W_CHAIN: {
+ unsigned OpNo = getOpcode() == ISD::INTRINSIC_WO_CHAIN ? 0 : 1;
+ unsigned IID = cast<ConstantSDNode>(getOperand(OpNo))->getZExtValue();
+ if (IID < Intrinsic::num_intrinsics)
+ return Intrinsic::getName((Intrinsic::ID)IID);
+ else if (const TargetIntrinsicInfo *TII = G->getTarget().getIntrinsicInfo())
+ return TII->getName(IID);
+ llvm_unreachable("Invalid intrinsic ID");
+ }
+
+ case ISD::BUILD_VECTOR: return "BUILD_VECTOR";
+ case ISD::TargetConstant:
+ if (cast<ConstantSDNode>(this)->isOpaque())
+ return "OpaqueTargetConstant";
+ return "TargetConstant";
+ case ISD::TargetConstantFP: return "TargetConstantFP";
+ case ISD::TargetGlobalAddress: return "TargetGlobalAddress";
+ case ISD::TargetGlobalTLSAddress: return "TargetGlobalTLSAddress";
+ case ISD::TargetFrameIndex: return "TargetFrameIndex";
+ case ISD::TargetJumpTable: return "TargetJumpTable";
+ case ISD::TargetConstantPool: return "TargetConstantPool";
+ case ISD::TargetExternalSymbol: return "TargetExternalSymbol";
+ case ISD::MCSymbol: return "MCSymbol";
+ case ISD::TargetBlockAddress: return "TargetBlockAddress";
+
+ case ISD::CopyToReg: return "CopyToReg";
+ case ISD::CopyFromReg: return "CopyFromReg";
+ case ISD::UNDEF: return "undef";
+ case ISD::MERGE_VALUES: return "merge_values";
+ case ISD::INLINEASM: return "inlineasm";
+ case ISD::EH_LABEL: return "eh_label";
+ case ISD::HANDLENODE: return "handlenode";
+
+ // Unary operators
+ case ISD::FABS: return "fabs";
+ case ISD::FMINNUM: return "fminnum";
+ case ISD::FMAXNUM: return "fmaxnum";
+ case ISD::FMINNAN: return "fminnan";
+ case ISD::FMAXNAN: return "fmaxnan";
+ case ISD::FNEG: return "fneg";
+ case ISD::FSQRT: return "fsqrt";
+ case ISD::FSIN: return "fsin";
+ case ISD::FCOS: return "fcos";
+ case ISD::FSINCOS: return "fsincos";
+ case ISD::FTRUNC: return "ftrunc";
+ case ISD::FFLOOR: return "ffloor";
+ case ISD::FCEIL: return "fceil";
+ case ISD::FRINT: return "frint";
+ case ISD::FNEARBYINT: return "fnearbyint";
+ case ISD::FROUND: return "fround";
+ case ISD::FEXP: return "fexp";
+ case ISD::FEXP2: return "fexp2";
+ case ISD::FLOG: return "flog";
+ case ISD::FLOG2: return "flog2";
+ case ISD::FLOG10: return "flog10";
+
+ // Binary operators
+ case ISD::ADD: return "add";
+ case ISD::SUB: return "sub";
+ case ISD::MUL: return "mul";
+ case ISD::MULHU: return "mulhu";
+ case ISD::MULHS: return "mulhs";
+ case ISD::SDIV: return "sdiv";
+ case ISD::UDIV: return "udiv";
+ case ISD::SREM: return "srem";
+ case ISD::UREM: return "urem";
+ case ISD::SMUL_LOHI: return "smul_lohi";
+ case ISD::UMUL_LOHI: return "umul_lohi";
+ case ISD::SDIVREM: return "sdivrem";
+ case ISD::UDIVREM: return "udivrem";
+ case ISD::AND: return "and";
+ case ISD::OR: return "or";
+ case ISD::XOR: return "xor";
+ case ISD::SHL: return "shl";
+ case ISD::SRA: return "sra";
+ case ISD::SRL: return "srl";
+ case ISD::ROTL: return "rotl";
+ case ISD::ROTR: return "rotr";
+ case ISD::FADD: return "fadd";
+ case ISD::FSUB: return "fsub";
+ case ISD::FMUL: return "fmul";
+ case ISD::FDIV: return "fdiv";
+ case ISD::FMA: return "fma";
+ case ISD::FMAD: return "fmad";
+ case ISD::FREM: return "frem";
+ case ISD::FCOPYSIGN: return "fcopysign";
+ case ISD::FGETSIGN: return "fgetsign";
+ case ISD::FCANONICALIZE: return "fcanonicalize";
+ case ISD::FPOW: return "fpow";
+ case ISD::SMIN: return "smin";
+ case ISD::SMAX: return "smax";
+ case ISD::UMIN: return "umin";
+ case ISD::UMAX: return "umax";
+
+ case ISD::FPOWI: return "fpowi";
+ case ISD::SETCC: return "setcc";
+ case ISD::SETCCE: return "setcce";
+ case ISD::SELECT: return "select";
+ case ISD::VSELECT: return "vselect";
+ case ISD::SELECT_CC: return "select_cc";
+ case ISD::INSERT_VECTOR_ELT: return "insert_vector_elt";
+ case ISD::EXTRACT_VECTOR_ELT: return "extract_vector_elt";
+ case ISD::CONCAT_VECTORS: return "concat_vectors";
+ case ISD::INSERT_SUBVECTOR: return "insert_subvector";
+ case ISD::EXTRACT_SUBVECTOR: return "extract_subvector";
+ case ISD::SCALAR_TO_VECTOR: return "scalar_to_vector";
+ case ISD::VECTOR_SHUFFLE: return "vector_shuffle";
+ case ISD::CARRY_FALSE: return "carry_false";
+ case ISD::ADDC: return "addc";
+ case ISD::ADDE: return "adde";
+ case ISD::SADDO: return "saddo";
+ case ISD::UADDO: return "uaddo";
+ case ISD::SSUBO: return "ssubo";
+ case ISD::USUBO: return "usubo";
+ case ISD::SMULO: return "smulo";
+ case ISD::UMULO: return "umulo";
+ case ISD::SUBC: return "subc";
+ case ISD::SUBE: return "sube";
+ case ISD::SHL_PARTS: return "shl_parts";
+ case ISD::SRA_PARTS: return "sra_parts";
+ case ISD::SRL_PARTS: return "srl_parts";
+
+ // Conversion operators.
+ case ISD::SIGN_EXTEND: return "sign_extend";
+ case ISD::ZERO_EXTEND: return "zero_extend";
+ case ISD::ANY_EXTEND: return "any_extend";
+ case ISD::SIGN_EXTEND_INREG: return "sign_extend_inreg";
+ case ISD::ANY_EXTEND_VECTOR_INREG: return "any_extend_vector_inreg";
+ case ISD::SIGN_EXTEND_VECTOR_INREG: return "sign_extend_vector_inreg";
+ case ISD::ZERO_EXTEND_VECTOR_INREG: return "zero_extend_vector_inreg";
+ case ISD::TRUNCATE: return "truncate";
+ case ISD::FP_ROUND: return "fp_round";
+ case ISD::FLT_ROUNDS_: return "flt_rounds";
+ case ISD::FP_ROUND_INREG: return "fp_round_inreg";
+ case ISD::FP_EXTEND: return "fp_extend";
+
+ case ISD::SINT_TO_FP: return "sint_to_fp";
+ case ISD::UINT_TO_FP: return "uint_to_fp";
+ case ISD::FP_TO_SINT: return "fp_to_sint";
+ case ISD::FP_TO_UINT: return "fp_to_uint";
+ case ISD::BITCAST: return "bitcast";
+ case ISD::ADDRSPACECAST: return "addrspacecast";
+ case ISD::FP16_TO_FP: return "fp16_to_fp";
+ case ISD::FP_TO_FP16: return "fp_to_fp16";
+
+ case ISD::CONVERT_RNDSAT: {
+ switch (cast<CvtRndSatSDNode>(this)->getCvtCode()) {
+ default: llvm_unreachable("Unknown cvt code!");
+ case ISD::CVT_FF: return "cvt_ff";
+ case ISD::CVT_FS: return "cvt_fs";
+ case ISD::CVT_FU: return "cvt_fu";
+ case ISD::CVT_SF: return "cvt_sf";
+ case ISD::CVT_UF: return "cvt_uf";
+ case ISD::CVT_SS: return "cvt_ss";
+ case ISD::CVT_SU: return "cvt_su";
+ case ISD::CVT_US: return "cvt_us";
+ case ISD::CVT_UU: return "cvt_uu";
+ }
+ }
+
+ // Control flow instructions
+ case ISD::BR: return "br";
+ case ISD::BRIND: return "brind";
+ case ISD::BR_JT: return "br_jt";
+ case ISD::BRCOND: return "brcond";
+ case ISD::BR_CC: return "br_cc";
+ case ISD::CALLSEQ_START: return "callseq_start";
+ case ISD::CALLSEQ_END: return "callseq_end";
+
+ // EH instructions
+ case ISD::CATCHRET: return "catchret";
+ case ISD::CLEANUPRET: return "cleanupret";
+
+ // Other operators
+ case ISD::LOAD: return "load";
+ case ISD::STORE: return "store";
+ case ISD::MLOAD: return "masked_load";
+ case ISD::MSTORE: return "masked_store";
+ case ISD::MGATHER: return "masked_gather";
+ case ISD::MSCATTER: return "masked_scatter";
+ case ISD::VAARG: return "vaarg";
+ case ISD::VACOPY: return "vacopy";
+ case ISD::VAEND: return "vaend";
+ case ISD::VASTART: return "vastart";
+ case ISD::DYNAMIC_STACKALLOC: return "dynamic_stackalloc";
+ case ISD::EXTRACT_ELEMENT: return "extract_element";
+ case ISD::BUILD_PAIR: return "build_pair";
+ case ISD::STACKSAVE: return "stacksave";
+ case ISD::STACKRESTORE: return "stackrestore";
+ case ISD::TRAP: return "trap";
+ case ISD::DEBUGTRAP: return "debugtrap";
+ case ISD::LIFETIME_START: return "lifetime.start";
+ case ISD::LIFETIME_END: return "lifetime.end";
+ case ISD::GC_TRANSITION_START: return "gc_transition.start";
+ case ISD::GC_TRANSITION_END: return "gc_transition.end";
+ case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset";
+
+ // Bit manipulation
+ case ISD::BITREVERSE: return "bitreverse";
+ case ISD::BSWAP: return "bswap";
+ case ISD::CTPOP: return "ctpop";
+ case ISD::CTTZ: return "cttz";
+ case ISD::CTTZ_ZERO_UNDEF: return "cttz_zero_undef";
+ case ISD::CTLZ: return "ctlz";
+ case ISD::CTLZ_ZERO_UNDEF: return "ctlz_zero_undef";
+
+ // Trampolines
+ case ISD::INIT_TRAMPOLINE: return "init_trampoline";
+ case ISD::ADJUST_TRAMPOLINE: return "adjust_trampoline";
+
+ case ISD::CONDCODE:
+ switch (cast<CondCodeSDNode>(this)->get()) {
+ default: llvm_unreachable("Unknown setcc condition!");
+ case ISD::SETOEQ: return "setoeq";
+ case ISD::SETOGT: return "setogt";
+ case ISD::SETOGE: return "setoge";
+ case ISD::SETOLT: return "setolt";
+ case ISD::SETOLE: return "setole";
+ case ISD::SETONE: return "setone";
+
+ case ISD::SETO: return "seto";
+ case ISD::SETUO: return "setuo";
+ case ISD::SETUEQ: return "setueq";
+ case ISD::SETUGT: return "setugt";
+ case ISD::SETUGE: return "setuge";
+ case ISD::SETULT: return "setult";
+ case ISD::SETULE: return "setule";
+ case ISD::SETUNE: return "setune";
+
+ case ISD::SETEQ: return "seteq";
+ case ISD::SETGT: return "setgt";
+ case ISD::SETGE: return "setge";
+ case ISD::SETLT: return "setlt";
+ case ISD::SETLE: return "setle";
+ case ISD::SETNE: return "setne";
+
+ case ISD::SETTRUE: return "settrue";
+ case ISD::SETTRUE2: return "settrue2";
+ case ISD::SETFALSE: return "setfalse";
+ case ISD::SETFALSE2: return "setfalse2";
+ }
+ }
+}
+
+const char *SDNode::getIndexedModeName(ISD::MemIndexedMode AM) {
+ switch (AM) {
+ default: return "";
+ case ISD::PRE_INC: return "<pre-inc>";
+ case ISD::PRE_DEC: return "<pre-dec>";
+ case ISD::POST_INC: return "<post-inc>";
+ case ISD::POST_DEC: return "<post-dec>";
+ }
+}
+
+static Printable PrintNodeId(const SDNode &Node) {
+ return Printable([&Node](raw_ostream &OS) {
+#ifndef NDEBUG
+ OS << 't' << Node.PersistentId;
+#else
+ OS << (const void*)&Node;
+#endif
+ });
+}
+
+LLVM_DUMP_METHOD void SDNode::dump() const { dump(nullptr); }
+void SDNode::dump(const SelectionDAG *G) const {
+ print(dbgs(), G);
+ dbgs() << '\n';
+}
+
+void SDNode::print_types(raw_ostream &OS, const SelectionDAG *G) const {
+ for (unsigned i = 0, e = getNumValues(); i != e; ++i) {
+ if (i) OS << ",";
+ if (getValueType(i) == MVT::Other)
+ OS << "ch";
+ else
+ OS << getValueType(i).getEVTString();
+ }
+}
+
+void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
+ if (const MachineSDNode *MN = dyn_cast<MachineSDNode>(this)) {
+ if (!MN->memoperands_empty()) {
+ OS << "<";
+ OS << "Mem:";
+ for (MachineSDNode::mmo_iterator i = MN->memoperands_begin(),
+ e = MN->memoperands_end(); i != e; ++i) {
+ OS << **i;
+ if (std::next(i) != e)
+ OS << " ";
+ }
+ OS << ">";
+ }
+ } else if (const ShuffleVectorSDNode *SVN =
+ dyn_cast<ShuffleVectorSDNode>(this)) {
+ OS << "<";
+ for (unsigned i = 0, e = ValueList[0].getVectorNumElements(); i != e; ++i) {
+ int Idx = SVN->getMaskElt(i);
+ if (i) OS << ",";
+ if (Idx < 0)
+ OS << "u";
+ else
+ OS << Idx;
+ }
+ OS << ">";
+ } else if (const ConstantSDNode *CSDN = dyn_cast<ConstantSDNode>(this)) {
+ OS << '<' << CSDN->getAPIntValue() << '>';
+ } else if (const ConstantFPSDNode *CSDN = dyn_cast<ConstantFPSDNode>(this)) {
+ if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEsingle)
+ OS << '<' << CSDN->getValueAPF().convertToFloat() << '>';
+ else if (&CSDN->getValueAPF().getSemantics()==&APFloat::IEEEdouble)
+ OS << '<' << CSDN->getValueAPF().convertToDouble() << '>';
+ else {
+ OS << "<APFloat(";
+ CSDN->getValueAPF().bitcastToAPInt().dump();
+ OS << ")>";
+ }
+ } else if (const GlobalAddressSDNode *GADN =
+ dyn_cast<GlobalAddressSDNode>(this)) {
+ int64_t offset = GADN->getOffset();
+ OS << '<';
+ GADN->getGlobal()->printAsOperand(OS);
+ OS << '>';
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = GADN->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const FrameIndexSDNode *FIDN = dyn_cast<FrameIndexSDNode>(this)) {
+ OS << "<" << FIDN->getIndex() << ">";
+ } else if (const JumpTableSDNode *JTDN = dyn_cast<JumpTableSDNode>(this)) {
+ OS << "<" << JTDN->getIndex() << ">";
+ if (unsigned int TF = JTDN->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(this)){
+ int offset = CP->getOffset();
+ if (CP->isMachineConstantPoolEntry())
+ OS << "<" << *CP->getMachineCPVal() << ">";
+ else
+ OS << "<" << *CP->getConstVal() << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = CP->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(this)) {
+ OS << "<" << TI->getIndex() << '+' << TI->getOffset() << ">";
+ if (unsigned TF = TI->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) {
+ OS << "<";
+ const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock();
+ if (LBB)
+ OS << LBB->getName() << " ";
+ OS << (const void*)BBDN->getBasicBlock() << ">";
+ } else if (const RegisterSDNode *R = dyn_cast<RegisterSDNode>(this)) {
+ OS << ' ' << PrintReg(R->getReg(),
+ G ? G->getSubtarget().getRegisterInfo() : nullptr);
+ } else if (const ExternalSymbolSDNode *ES =
+ dyn_cast<ExternalSymbolSDNode>(this)) {
+ OS << "'" << ES->getSymbol() << "'";
+ if (unsigned int TF = ES->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const SrcValueSDNode *M = dyn_cast<SrcValueSDNode>(this)) {
+ if (M->getValue())
+ OS << "<" << M->getValue() << ">";
+ else
+ OS << "<null>";
+ } else if (const MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(this)) {
+ if (MD->getMD())
+ OS << "<" << MD->getMD() << ">";
+ else
+ OS << "<null>";
+ } else if (const VTSDNode *N = dyn_cast<VTSDNode>(this)) {
+ OS << ":" << N->getVT().getEVTString();
+ }
+ else if (const LoadSDNode *LD = dyn_cast<LoadSDNode>(this)) {
+ OS << "<" << *LD->getMemOperand();
+
+ bool doExt = true;
+ switch (LD->getExtensionType()) {
+ default: doExt = false; break;
+ case ISD::EXTLOAD: OS << ", anyext"; break;
+ case ISD::SEXTLOAD: OS << ", sext"; break;
+ case ISD::ZEXTLOAD: OS << ", zext"; break;
+ }
+ if (doExt)
+ OS << " from " << LD->getMemoryVT().getEVTString();
+
+ const char *AM = getIndexedModeName(LD->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ OS << ">";
+ } else if (const StoreSDNode *ST = dyn_cast<StoreSDNode>(this)) {
+ OS << "<" << *ST->getMemOperand();
+
+ if (ST->isTruncatingStore())
+ OS << ", trunc to " << ST->getMemoryVT().getEVTString();
+
+ const char *AM = getIndexedModeName(ST->getAddressingMode());
+ if (*AM)
+ OS << ", " << AM;
+
+ OS << ">";
+ } else if (const MemSDNode* M = dyn_cast<MemSDNode>(this)) {
+ OS << "<" << *M->getMemOperand() << ">";
+ } else if (const BlockAddressSDNode *BA =
+ dyn_cast<BlockAddressSDNode>(this)) {
+ int64_t offset = BA->getOffset();
+ OS << "<";
+ BA->getBlockAddress()->getFunction()->printAsOperand(OS, false);
+ OS << ", ";
+ BA->getBlockAddress()->getBasicBlock()->printAsOperand(OS, false);
+ OS << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
+ if (unsigned int TF = BA->getTargetFlags())
+ OS << " [TF=" << TF << ']';
+ } else if (const AddrSpaceCastSDNode *ASC =
+ dyn_cast<AddrSpaceCastSDNode>(this)) {
+ OS << '['
+ << ASC->getSrcAddressSpace()
+ << " -> "
+ << ASC->getDestAddressSpace()
+ << ']';
+ }
+
+ if (VerboseDAGDumping) {
+ if (unsigned Order = getIROrder())
+ OS << " [ORD=" << Order << ']';
+
+ if (getNodeId() != -1)
+ OS << " [ID=" << getNodeId() << ']';
+
+ if (!G)
+ return;
+
+ DILocation *L = getDebugLoc();
+ if (!L)
+ return;
+
+ if (auto *Scope = L->getScope())
+ OS << Scope->getFilename();
+ else
+ OS << "<unknown>";
+ OS << ':' << L->getLine();
+ if (unsigned C = L->getColumn())
+ OS << ':' << C;
+ }
+}
+
+/// Return true if this node is so simple that we should just print it inline
+/// if it appears as an operand.
+static bool shouldPrintInline(const SDNode &Node) {
+ if (Node.getOpcode() == ISD::EntryToken)
+ return false;
+ return Node.getNumOperands() == 0;
+}
+
+static void DumpNodes(const SDNode *N, unsigned indent, const SelectionDAG *G) {
+ for (const SDValue &Op : N->op_values()) {
+ if (shouldPrintInline(*Op.getNode()))
+ continue;
+ if (Op.getNode()->hasOneUse())
+ DumpNodes(Op.getNode(), indent+2, G);
+ }
+
+ dbgs().indent(indent);
+ N->dump(G);
+}
+
+LLVM_DUMP_METHOD void SelectionDAG::dump() const {
+ dbgs() << "SelectionDAG has " << AllNodes.size() << " nodes:\n";
+
+ for (allnodes_const_iterator I = allnodes_begin(), E = allnodes_end();
+ I != E; ++I) {
+ const SDNode *N = &*I;
+ if (!N->hasOneUse() && N != getRoot().getNode() &&
+ (!shouldPrintInline(*N) || N->use_empty()))
+ DumpNodes(N, 2, this);
+ }
+
+ if (getRoot().getNode()) DumpNodes(getRoot().getNode(), 2, this);
+ dbgs() << "\n\n";
+}
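+
+// Illustrative usage (editor's sketch, not part of the imported source): these
+// dump helpers are normally driven from DEBUG() blocks while bringing up a
+// target, for example:
+//
+//   DEBUG(CurDAG->dump());               // print the whole DAG for the block
+//   DEBUG(SomeNode->dumprFull(CurDAG));  // one node plus operands, depth-capped
+//
+// 'CurDAG' and 'SomeNode' stand for whatever SelectionDAG/SDNode the caller has
+// in scope; DEBUG() only fires in assert-enabled builds run with -debug.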
+
+void SDNode::printr(raw_ostream &OS, const SelectionDAG *G) const {
+ OS << PrintNodeId(*this) << ": ";
+ print_types(OS, G);
+ OS << " = " << getOperationName(G);
+ print_details(OS, G);
+}
+
+static bool printOperand(raw_ostream &OS, const SelectionDAG *G,
+ const SDValue Value) {
+ if (!Value.getNode()) {
+ OS << "<null>";
+ return false;
+ } else if (shouldPrintInline(*Value.getNode())) {
+ OS << Value->getOperationName(G) << ':';
+ Value->print_types(OS, G);
+ Value->print_details(OS, G);
+ return true;
+ } else {
+ OS << PrintNodeId(*Value.getNode());
+ if (unsigned RN = Value.getResNo())
+ OS << ':' << RN;
+ return false;
+ }
+}
+
+typedef SmallPtrSet<const SDNode *, 32> VisitedSDNodeSet;
+static void DumpNodesr(raw_ostream &OS, const SDNode *N, unsigned indent,
+ const SelectionDAG *G, VisitedSDNodeSet &once) {
+ if (!once.insert(N).second) // If we've been here before, return now.
+ return;
+
+ // Dump the current SDNode, but don't end the line yet.
+ OS.indent(indent);
+ N->printr(OS, G);
+
+ // Having printed this SDNode, walk the children:
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (i) OS << ",";
+ OS << " ";
+
+ const SDValue Op = N->getOperand(i);
+ bool printedInline = printOperand(OS, G, Op);
+ if (printedInline)
+ once.insert(Op.getNode());
+ }
+
+ OS << "\n";
+
+ // Dump children that have grandchildren on their own line(s).
+ for (const SDValue &Op : N->op_values())
+ DumpNodesr(OS, Op.getNode(), indent+2, G, once);
+}
+
+void SDNode::dumpr() const {
+ VisitedSDNodeSet once;
+ DumpNodesr(dbgs(), this, 0, nullptr, once);
+}
+
+void SDNode::dumpr(const SelectionDAG *G) const {
+ VisitedSDNodeSet once;
+ DumpNodesr(dbgs(), this, 0, G, once);
+}
+
+static void printrWithDepthHelper(raw_ostream &OS, const SDNode *N,
+ const SelectionDAG *G, unsigned depth,
+ unsigned indent) {
+ if (depth == 0)
+ return;
+
+ OS.indent(indent);
+
+ N->print(OS, G);
+
+ if (depth < 1)
+ return;
+
+ for (const SDValue &Op : N->op_values()) {
+ // Don't follow chain operands.
+ if (Op.getValueType() == MVT::Other)
+ continue;
+ OS << '\n';
+ printrWithDepthHelper(OS, Op.getNode(), G, depth-1, indent+2);
+ }
+}
+
+void SDNode::printrWithDepth(raw_ostream &OS, const SelectionDAG *G,
+ unsigned depth) const {
+ printrWithDepthHelper(OS, this, G, depth, 0);
+}
+
+void SDNode::printrFull(raw_ostream &OS, const SelectionDAG *G) const {
+ // Don't print impossibly deep things.
+ printrWithDepth(OS, G, 10);
+}
+
+void SDNode::dumprWithDepth(const SelectionDAG *G, unsigned depth) const {
+ printrWithDepth(dbgs(), G, depth);
+}
+
+void SDNode::dumprFull(const SelectionDAG *G) const {
+ // Don't print impossibly deep things.
+ dumprWithDepth(G, 10);
+}
+
+void SDNode::print(raw_ostream &OS, const SelectionDAG *G) const {
+ printr(OS, G);
+ for (unsigned i = 0, e = getNumOperands(); i != e; ++i) {
+ if (i) OS << ", "; else OS << " ";
+ printOperand(OS, G, getOperand(i));
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
new file mode 100644
index 000000000000..1d61657194c5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -0,0 +1,3591 @@
+//===-- SelectionDAGISel.cpp - Implement the SelectionDAGISel class -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGISel class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "ScheduleDAGSDNodes.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/CodeGen/FastISel.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Timer.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "isel"
+
+STATISTIC(NumFastIselFailures, "Number of instructions fast isel failed on");
+STATISTIC(NumFastIselSuccess, "Number of instructions fast isel selected");
+STATISTIC(NumFastIselBlocks, "Number of blocks selected entirely by fast isel");
+STATISTIC(NumDAGBlocks, "Number of blocks selected using DAG");
+STATISTIC(NumDAGIselRetries,"Number of times dag isel has to try another path");
+STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
+STATISTIC(NumFastIselFailLowerArguments,
+ "Number of entry blocks where fast isel failed to lower arguments");
+
+#ifndef NDEBUG
+static cl::opt<bool>
+EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden,
+ cl::desc("Enable extra verbose messages in the \"fast\" "
+ "instruction selector"));
+
+ // Terminators
+STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret");
+STATISTIC(NumFastIselFailBr,"Fast isel fails on Br");
+STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch");
+STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr");
+STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke");
+STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume");
+STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable");
+
+ // Standard binary operators...
+STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add");
+STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd");
+STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub");
+STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub");
+STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul");
+STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul");
+STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv");
+STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv");
+STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv");
+STATISTIC(NumFastIselFailURem,"Fast isel fails on URem");
+STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem");
+STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem");
+
+ // Logical operators...
+STATISTIC(NumFastIselFailAnd,"Fast isel fails on And");
+STATISTIC(NumFastIselFailOr,"Fast isel fails on Or");
+STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor");
+
+ // Memory instructions...
+STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca");
+STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load");
+STATISTIC(NumFastIselFailStore,"Fast isel fails on Store");
+STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg");
+STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRMW");
+STATISTIC(NumFastIselFailFence,"Fast isel fails on Fence");
+STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr");
+
+ // Convert instructions...
+STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc");
+STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt");
+STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt");
+STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc");
+STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt");
+STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI");
+STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI");
+STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP");
+STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP");
+STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr");
+STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt");
+STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast");
+
+ // Other instructions...
+STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp");
+STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp");
+STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI");
+STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select");
+STATISTIC(NumFastIselFailCall,"Fast isel fails on Call");
+STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl");
+STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr");
+STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr");
+STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg");
+STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement");
+STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement");
+STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
+STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
+STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
+STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
+
+// Intrinsic instructions...
+STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call");
+STATISTIC(NumFastIselFailSAddWithOverflow,
+ "Fast isel fails on sadd.with.overflow");
+STATISTIC(NumFastIselFailUAddWithOverflow,
+ "Fast isel fails on uadd.with.overflow");
+STATISTIC(NumFastIselFailSSubWithOverflow,
+ "Fast isel fails on ssub.with.overflow");
+STATISTIC(NumFastIselFailUSubWithOverflow,
+ "Fast isel fails on usub.with.overflow");
+STATISTIC(NumFastIselFailSMulWithOverflow,
+ "Fast isel fails on smul.with.overflow");
+STATISTIC(NumFastIselFailUMulWithOverflow,
+ "Fast isel fails on umul.with.overflow");
+STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress");
+STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call");
+STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call");
+STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call");
+#endif
+
+static cl::opt<bool>
+EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
+ cl::desc("Enable verbose messages in the \"fast\" "
+ "instruction selector"));
+static cl::opt<int> EnableFastISelAbort(
+ "fast-isel-abort", cl::Hidden,
+ cl::desc("Enable abort calls when \"fast\" instruction selection "
+ "fails to lower an instruction: 0 disable the abort, 1 will "
+ "abort but for args, calls and terminators, 2 will also "
+ "abort for argument lowering, and 3 will never fallback "
+ "to SelectionDAG."));
+
+static cl::opt<bool>
+UseMBPI("use-mbpi",
+ cl::desc("use Machine Branch Probability Info"),
+ cl::init(true), cl::Hidden);
+
+#ifndef NDEBUG
+static cl::opt<std::string>
+FilterDAGBasicBlockName("filter-view-dags", cl::Hidden,
+ cl::desc("Only display the basic block whose name "
+ "matches this for all view-*-dags options"));
+static cl::opt<bool>
+ViewDAGCombine1("view-dag-combine1-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the first "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewLegalizeTypesDAGs("view-legalize-types-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize types"));
+static cl::opt<bool>
+ViewLegalizeDAGs("view-legalize-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before legalize"));
+static cl::opt<bool>
+ViewDAGCombine2("view-dag-combine2-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the second "
+ "dag combine pass"));
+static cl::opt<bool>
+ViewDAGCombineLT("view-dag-combine-lt-dags", cl::Hidden,
+ cl::desc("Pop up a window to show dags before the post legalize types"
+ " dag combine pass"));
+static cl::opt<bool>
+ViewISelDAGs("view-isel-dags", cl::Hidden,
+ cl::desc("Pop up a window to show isel dags as they are selected"));
+static cl::opt<bool>
+ViewSchedDAGs("view-sched-dags", cl::Hidden,
+ cl::desc("Pop up a window to show sched dags as they are processed"));
+static cl::opt<bool>
+ViewSUnitDAGs("view-sunit-dags", cl::Hidden,
+ cl::desc("Pop up a window to show SUnit dags after they are processed"));
+#else
+static const bool ViewDAGCombine1 = false,
+ ViewLegalizeTypesDAGs = false, ViewLegalizeDAGs = false,
+ ViewDAGCombine2 = false,
+ ViewDAGCombineLT = false,
+ ViewISelDAGs = false, ViewSchedDAGs = false,
+ ViewSUnitDAGs = false;
+#endif
+
+//===---------------------------------------------------------------------===//
+///
+/// RegisterScheduler class - Track the registration of instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+MachinePassRegistry RegisterScheduler::Registry;
+
+//===---------------------------------------------------------------------===//
+///
+/// ISHeuristic command line option for instruction schedulers.
+///
+//===---------------------------------------------------------------------===//
+static cl::opt<RegisterScheduler::FunctionPassCtor, false,
+ RegisterPassParser<RegisterScheduler> >
+ISHeuristic("pre-RA-sched",
+ cl::init(&createDefaultScheduler), cl::Hidden,
+ cl::desc("Instruction schedulers available (before register"
+ " allocation):"));
+
+static RegisterScheduler
+defaultListDAGScheduler("default", "Best scheduler for the target",
+ createDefaultScheduler);
+
+namespace llvm {
+ //===--------------------------------------------------------------------===//
+ /// \brief This class is used by SelectionDAGISel to temporarily override
+ /// the optimization level on a per-function basis.
+ class OptLevelChanger {
+ SelectionDAGISel &IS;
+ CodeGenOpt::Level SavedOptLevel;
+ bool SavedFastISel;
+
+ public:
+ OptLevelChanger(SelectionDAGISel &ISel,
+ CodeGenOpt::Level NewOptLevel) : IS(ISel) {
+ SavedOptLevel = IS.OptLevel;
+ if (NewOptLevel == SavedOptLevel)
+ return;
+ IS.OptLevel = NewOptLevel;
+ IS.TM.setOptLevel(NewOptLevel);
+ DEBUG(dbgs() << "\nChanging optimization level for Function "
+ << IS.MF->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel
+ << " ; After: -O" << NewOptLevel << "\n");
+ SavedFastISel = IS.TM.Options.EnableFastISel;
+ if (NewOptLevel == CodeGenOpt::None) {
+ IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
+ DEBUG(dbgs() << "\tFastISel is "
+ << (IS.TM.Options.EnableFastISel ? "enabled" : "disabled")
+ << "\n");
+ }
+ }
+
+ ~OptLevelChanger() {
+ if (IS.OptLevel == SavedOptLevel)
+ return;
+ DEBUG(dbgs() << "\nRestoring optimization level for Function "
+ << IS.MF->getFunction()->getName() << "\n");
+ DEBUG(dbgs() << "\tBefore: -O" << IS.OptLevel
+ << " ; After: -O" << SavedOptLevel << "\n");
+ IS.OptLevel = SavedOptLevel;
+ IS.TM.setOptLevel(SavedOptLevel);
+ IS.TM.setFastISel(SavedFastISel);
+ }
+ };
+
+ //===--------------------------------------------------------------------===//
+ /// createDefaultScheduler - This creates an instruction scheduler appropriate
+ /// for the target.
+ ScheduleDAGSDNodes* createDefaultScheduler(SelectionDAGISel *IS,
+ CodeGenOpt::Level OptLevel) {
+ const TargetLowering *TLI = IS->TLI;
+ const TargetSubtargetInfo &ST = IS->MF->getSubtarget();
+
+ // Try first to see if the Target has its own way of selecting a scheduler
+ if (auto *SchedulerCtor = ST.getDAGScheduler(OptLevel)) {
+ return SchedulerCtor(IS, OptLevel);
+ }
+
+ if (OptLevel == CodeGenOpt::None ||
+ (ST.enableMachineScheduler() && ST.enableMachineSchedDefaultSched()) ||
+ TLI->getSchedulingPreference() == Sched::Source)
+ return createSourceListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::RegPressure)
+ return createBURRListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::Hybrid)
+ return createHybridListDAGScheduler(IS, OptLevel);
+ if (TLI->getSchedulingPreference() == Sched::VLIW)
+ return createVLIWDAGScheduler(IS, OptLevel);
+ assert(TLI->getSchedulingPreference() == Sched::ILP &&
+ "Unknown sched type!");
+ return createILPListDAGScheduler(IS, OptLevel);
+ }
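+
+  // Editor's note (illustrative, not from the commit): targets usually steer
+  // this choice from their TargetLowering constructor, e.g.
+  //
+  //   setSchedulingPreference(Sched::RegPressure);  // picks the BURR list scheduler above
+  //
+  // or, less commonly, by returning their own ScheduleDAGSDNodes factory from
+  // the subtarget's getDAGScheduler() hook checked first in this function.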
+} // end namespace llvm
+
+// EmitInstrWithCustomInserter - This method should be implemented by targets
+// that mark instructions with the 'usesCustomInserter' flag. These
+// instructions are special in various ways, which require special support to
+// insert. The specified MachineInstr is created but not inserted into any
+// basic blocks, and this method is called to expand it into a sequence of
+// instructions, potentially also creating new basic blocks and control flow.
+// When new basic blocks are inserted and the edges from MBB to its successors
+// are modified, the method should insert pairs of <OldSucc, NewSucc> into the
+// DenseMap.
+MachineBasicBlock *
+TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+#ifndef NDEBUG
+ dbgs() << "If a target marks an instruction with "
+ "'usesCustomInserter', it must implement "
+ "TargetLowering::EmitInstrWithCustomInserter!";
+#endif
+ llvm_unreachable(nullptr);
+}
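+
+// Editor's sketch of the override the message above asks for (hypothetical
+// target; FooTargetLowering, Foo::SELECT_PSEUDO and emitSelectPseudo are
+// made-up names, not part of the imported source):
+//
+//   MachineBasicBlock *
+//   FooTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
+//                                                  MachineBasicBlock *MBB) const {
+//     switch (MI.getOpcode()) {
+//     case Foo::SELECT_PSEUDO:
+//       // May split MBB and create new control flow for the select lowering.
+//       return emitSelectPseudo(MI, MBB);
+//     default:
+//       llvm_unreachable("unexpected custom-inserter instruction");
+//     }
+//   }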
+
+void TargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
+ SDNode *Node) const {
+ assert(!MI.hasPostISelHook() &&
+ "If a target marks an instruction with 'hasPostISelHook', "
+ "it must implement TargetLowering::AdjustInstrPostInstrSelection!");
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAGISel code
+//===----------------------------------------------------------------------===//
+
+SelectionDAGISel::SelectionDAGISel(TargetMachine &tm,
+ CodeGenOpt::Level OL) :
+ MachineFunctionPass(ID), TM(tm),
+ FuncInfo(new FunctionLoweringInfo()),
+ CurDAG(new SelectionDAG(tm, OL)),
+ SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)),
+ GFI(),
+ OptLevel(OL),
+ DAGSize(0) {
+ initializeGCModuleInfoPass(*PassRegistry::getPassRegistry());
+ initializeBranchProbabilityInfoWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+ initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
+ initializeTargetLibraryInfoWrapperPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+SelectionDAGISel::~SelectionDAGISel() {
+ delete SDB;
+ delete CurDAG;
+ delete FuncInfo;
+}
+
+void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<GCModuleInfo>();
+ AU.addRequired<StackProtector>();
+ AU.addPreserved<StackProtector>();
+ AU.addPreserved<GCModuleInfo>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ if (UseMBPI && OptLevel != CodeGenOpt::None)
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// SplitCriticalSideEffectEdges - Look for critical edges with a PHI incoming
+/// value that may trap. In this case we have to split the edge so that the path
+/// through the predecessor block that doesn't go to the phi block doesn't
+/// execute the possibly trapping instruction.
+///
+/// This is required for correctness, so it must be done at -O0.
+///
+static void SplitCriticalSideEffectEdges(Function &Fn) {
+ // Loop for blocks with phi nodes.
+ for (BasicBlock &BB : Fn) {
+ PHINode *PN = dyn_cast<PHINode>(BB.begin());
+ if (!PN) continue;
+
+ ReprocessBlock:
+ // For each block with a PHI node, check to see if any of the input values
+ // are potentially trapping constant expressions. Constant expressions are
+ // the only potentially trapping value that can occur as the argument to a
+ // PHI.
+ for (BasicBlock::iterator I = BB.begin(); (PN = dyn_cast<PHINode>(I)); ++I)
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+ ConstantExpr *CE = dyn_cast<ConstantExpr>(PN->getIncomingValue(i));
+ if (!CE || !CE->canTrap()) continue;
+
+ // The only case we have to worry about is when the edge is critical.
+ // Since this block has a PHI Node, we assume it has multiple input
+ // edges: check to see if the pred has multiple successors.
+ BasicBlock *Pred = PN->getIncomingBlock(i);
+ if (Pred->getTerminator()->getNumSuccessors() == 1)
+ continue;
+
+ // Okay, we have to split this edge.
+ SplitCriticalEdge(
+ Pred->getTerminator(), GetSuccessorNumber(Pred, &BB),
+ CriticalEdgeSplittingOptions().setMergeIdenticalEdges());
+ goto ReprocessBlock;
+ }
+ }
+}
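+
+// Worked example (editor's illustration, not from the commit): an incoming PHI
+// value such as
+//
+//   %p = phi i32 [ sdiv (i32 1, i32 ptrtoint (i32* @g to i32)), %pred ], ...
+//
+// is a constant expression that canTrap(); if %pred has multiple successors,
+// the edge %pred -> phi block is split so the division is only materialized on
+// the path that actually reaches the PHI. @g, %p and %pred are placeholder
+// names for the sketch.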
+
+bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
+ // Do some sanity-checking on the command-line options.
+ assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
+ "-fast-isel-verbose requires -fast-isel");
+ assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
+ "-fast-isel-abort > 0 requires -fast-isel");
+
+ const Function &Fn = *mf.getFunction();
+ MF = &mf;
+
+ // Reset the target options before resetting the optimization
+ // level below.
+ // FIXME: This is a horrible hack and should be processed via
+ // codegen looking at the optimization level explicitly when
+ // it wants to look at it.
+ TM.resetTargetOptions(Fn);
+ // Reset OptLevel to None for optnone functions.
+ CodeGenOpt::Level NewOptLevel = OptLevel;
+ if (OptLevel != CodeGenOpt::None && skipFunction(Fn))
+ NewOptLevel = CodeGenOpt::None;
+ OptLevelChanger OLC(*this, NewOptLevel);
+
+ TII = MF->getSubtarget().getInstrInfo();
+ TLI = MF->getSubtarget().getTargetLowering();
+ RegInfo = &MF->getRegInfo();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+ GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+
+ DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
+
+ SplitCriticalSideEffectEdges(const_cast<Function &>(Fn));
+
+ CurDAG->init(*MF);
+ FuncInfo->set(Fn, *MF, CurDAG);
+
+ if (UseMBPI && OptLevel != CodeGenOpt::None)
+ FuncInfo->BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ else
+ FuncInfo->BPI = nullptr;
+
+ SDB->init(GFI, *AA, LibInfo);
+
+ MF->setHasInlineAsm(false);
+
+ FuncInfo->SplitCSR = false;
+
+ // We split CSR if the target supports it for the given function
+ // and the function has only return exits.
+ if (OptLevel != CodeGenOpt::None && TLI->supportSplitCSR(MF)) {
+ FuncInfo->SplitCSR = true;
+
+ // Collect all the return blocks.
+ for (const BasicBlock &BB : Fn) {
+ if (!succ_empty(&BB))
+ continue;
+
+ const TerminatorInst *Term = BB.getTerminator();
+ if (isa<UnreachableInst>(Term) || isa<ReturnInst>(Term))
+ continue;
+
+ // Bail out if the exit block is not Return nor Unreachable.
+ FuncInfo->SplitCSR = false;
+ break;
+ }
+ }
+
+ MachineBasicBlock *EntryMBB = &MF->front();
+ if (FuncInfo->SplitCSR)
+ // This performs initialization so lowering for SplitCSR will be correct.
+ TLI->initializeSplitCSR(EntryMBB);
+
+ SelectAllBasicBlocks(Fn);
+
+ // If the first basic block in the function has live ins that need to be
+ // copied into vregs, emit the copies into the top of the block before
+ // emitting the code for the block.
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ RegInfo->EmitLiveInCopies(EntryMBB, TRI, *TII);
+
+ // Insert copies in the entry block and the return blocks.
+ if (FuncInfo->SplitCSR) {
+ SmallVector<MachineBasicBlock*, 4> Returns;
+ // Collect all the return blocks.
+ for (MachineBasicBlock &MBB : mf) {
+ if (!MBB.succ_empty())
+ continue;
+
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+ if (Term != MBB.end() && Term->isReturn()) {
+ Returns.push_back(&MBB);
+ continue;
+ }
+ }
+ TLI->insertCopiesSplitCSR(EntryMBB, Returns);
+ }
+
+ DenseMap<unsigned, unsigned> LiveInMap;
+ if (!FuncInfo->ArgDbgValues.empty())
+ for (MachineRegisterInfo::livein_iterator LI = RegInfo->livein_begin(),
+ E = RegInfo->livein_end(); LI != E; ++LI)
+ if (LI->second)
+ LiveInMap.insert(std::make_pair(LI->first, LI->second));
+
+ // Insert DBG_VALUE instructions for function arguments to the entry block.
+ for (unsigned i = 0, e = FuncInfo->ArgDbgValues.size(); i != e; ++i) {
+ MachineInstr *MI = FuncInfo->ArgDbgValues[e-i-1];
+ bool hasFI = MI->getOperand(0).isFI();
+ unsigned Reg =
+ hasFI ? TRI.getFrameRegister(*MF) : MI->getOperand(0).getReg();
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ EntryMBB->insert(EntryMBB->begin(), MI);
+ else {
+ MachineInstr *Def = RegInfo->getVRegDef(Reg);
+ if (Def) {
+ MachineBasicBlock::iterator InsertPos = Def;
+ // FIXME: VR def may not be in entry block.
+ Def->getParent()->insert(std::next(InsertPos), MI);
+ } else
+        DEBUG(dbgs() << "Dropping debug info for dead vreg "
+ << TargetRegisterInfo::virtReg2Index(Reg) << "\n");
+ }
+
+ // If Reg is live-in then update debug info to track its copy in a vreg.
+ DenseMap<unsigned, unsigned>::iterator LDI = LiveInMap.find(Reg);
+ if (LDI != LiveInMap.end()) {
+ assert(!hasFI && "There's no handling of frame pointer updating here yet "
+ "- add if needed");
+ MachineInstr *Def = RegInfo->getVRegDef(LDI->second);
+ MachineBasicBlock::iterator InsertPos = Def;
+ const MDNode *Variable = MI->getDebugVariable();
+ const MDNode *Expr = MI->getDebugExpression();
+ DebugLoc DL = MI->getDebugLoc();
+ bool IsIndirect = MI->isIndirectDebugValue();
+ unsigned Offset = IsIndirect ? MI->getOperand(1).getImm() : 0;
+ assert(cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(DL) &&
+ "Expected inlined-at fields to agree");
+ // Def is never a terminator here, so it is ok to increment InsertPos.
+ BuildMI(*EntryMBB, ++InsertPos, DL, TII->get(TargetOpcode::DBG_VALUE),
+ IsIndirect, LDI->second, Offset, Variable, Expr);
+
+ // If this vreg is directly copied into an exported register then
+      // that COPY instruction also needs a DBG_VALUE, if it is the only
+ // user of LDI->second.
+ MachineInstr *CopyUseMI = nullptr;
+ for (MachineRegisterInfo::use_instr_iterator
+ UI = RegInfo->use_instr_begin(LDI->second),
+ E = RegInfo->use_instr_end(); UI != E; ) {
+ MachineInstr *UseMI = &*(UI++);
+ if (UseMI->isDebugValue()) continue;
+ if (UseMI->isCopy() && !CopyUseMI && UseMI->getParent() == EntryMBB) {
+ CopyUseMI = UseMI; continue;
+ }
+ // Otherwise this is another use or second copy use.
+ CopyUseMI = nullptr; break;
+ }
+ if (CopyUseMI) {
+ // Use MI's debug location, which describes where Variable was
+ // declared, rather than whatever is attached to CopyUseMI.
+ MachineInstr *NewMI =
+ BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
+ CopyUseMI->getOperand(0).getReg(), Offset, Variable, Expr);
+ MachineBasicBlock::iterator Pos = CopyUseMI;
+ EntryMBB->insertAfter(Pos, NewMI);
+ }
+ }
+ }
+
+ // Determine if there are any calls in this machine function.
+ MachineFrameInfo *MFI = MF->getFrameInfo();
+ for (const auto &MBB : *MF) {
+ if (MFI->hasCalls() && MF->hasInlineAsm())
+ break;
+
+ for (const auto &MI : MBB) {
+ const MCInstrDesc &MCID = TII->get(MI.getOpcode());
+ if ((MCID.isCall() && !MCID.isReturn()) ||
+ MI.isStackAligningInlineAsm()) {
+ MFI->setHasCalls(true);
+ }
+ if (MI.isInlineAsm()) {
+ MF->setHasInlineAsm(true);
+ }
+ }
+ }
+
+ // Determine if there is a call to setjmp in the machine function.
+ MF->setExposesReturnsTwice(Fn.callsFunctionThatReturnsTwice());
+
+ // Replace forward-declared registers with the registers containing
+ // the desired value.
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ for (DenseMap<unsigned, unsigned>::iterator
+ I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end();
+ I != E; ++I) {
+ unsigned From = I->first;
+ unsigned To = I->second;
+ // If To is also scheduled to be replaced, find what its ultimate
+ // replacement is.
+ for (;;) {
+ DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
+ if (J == E) break;
+ To = J->second;
+ }
+ // Make sure the new register has a sufficiently constrained register class.
+ if (TargetRegisterInfo::isVirtualRegister(From) &&
+ TargetRegisterInfo::isVirtualRegister(To))
+ MRI.constrainRegClass(To, MRI.getRegClass(From));
+    // Replacing one register with another won't touch the kill flags.
+    // We need to conservatively clear the kill flags as a kill on the old
+    // register might dominate existing uses of the new register.
+    if (!MRI.use_empty(To))
+      MRI.clearKillFlags(From);
+
+    // Replace it.
+    MRI.replaceRegWith(From, To);
+ }
+
+ if (TLI->hasCopyImplyingStackAdjustment(MF))
+ MFI->setHasCopyImplyingStackAdjustment(true);
+
+ // Freeze the set of reserved registers now that MachineFrameInfo has been
+ // set up. All the information required by getReservedRegs() should be
+ // available now.
+ MRI.freezeReservedRegs(*MF);
+
+ // Release function-specific state. SDB and CurDAG are already cleared
+ // at this point.
+ FuncInfo->clear();
+
+ DEBUG(dbgs() << "*** MachineFunction at end of ISel ***\n");
+ DEBUG(MF->print(dbgs()));
+
+ return true;
+}
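+
+// Editor's sketch (not part of the imported source) of the fixup chasing in the
+// RegFixups loop above, spelled out on a plain map from old to new vregs:
+//
+//   unsigned To = Fixups.lookup(From);
+//   while (unsigned Next = Fixups.lookup(To))  // lookup() returns 0 when absent
+//     To = Next;
+//   // ... then constrain the register class and call replaceRegWith(From, To).
+//
+// With fixups {1 -> 2, 2 -> 3}, From = 1 resolves to 3 before any replacement,
+// so From is rewritten directly to its ultimate replacement rather than to an
+// intermediate vreg that is itself about to be replaced.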
+
+void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
+ BasicBlock::const_iterator End,
+ bool &HadTailCall) {
+ // Lower the instructions. If a call is emitted as a tail call, cease emitting
+ // nodes for this block.
+ for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
+ SDB->visit(*I);
+
+ // Make sure the root of the DAG is up-to-date.
+ CurDAG->setRoot(SDB->getControlRoot());
+ HadTailCall = SDB->HasTailCall;
+ SDB->clear();
+
+ // Final step, emit the lowered DAG as machine code.
+ CodeGenAndEmitDAG();
+}
+
+void SelectionDAGISel::ComputeLiveOutVRegInfo() {
+ SmallPtrSet<SDNode*, 16> VisitedNodes;
+ SmallVector<SDNode*, 128> Worklist;
+
+ Worklist.push_back(CurDAG->getRoot().getNode());
+
+ APInt KnownZero;
+ APInt KnownOne;
+
+ do {
+ SDNode *N = Worklist.pop_back_val();
+
+ // If we've already seen this node, ignore it.
+ if (!VisitedNodes.insert(N).second)
+ continue;
+
+ // Otherwise, add all chain operands to the worklist.
+ for (const SDValue &Op : N->op_values())
+ if (Op.getValueType() == MVT::Other)
+ Worklist.push_back(Op.getNode());
+
+ // If this is a CopyToReg with a vreg dest, process it.
+ if (N->getOpcode() != ISD::CopyToReg)
+ continue;
+
+ unsigned DestReg = cast<RegisterSDNode>(N->getOperand(1))->getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DestReg))
+ continue;
+
+ // Ignore non-scalar or non-integer values.
+ SDValue Src = N->getOperand(2);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isInteger() || SrcVT.isVector())
+ continue;
+
+ unsigned NumSignBits = CurDAG->ComputeNumSignBits(Src);
+ CurDAG->computeKnownBits(Src, KnownZero, KnownOne);
+ FuncInfo->AddLiveOutRegInfo(DestReg, NumSignBits, KnownZero, KnownOne);
+ } while (!Worklist.empty());
+}
+
+void SelectionDAGISel::CodeGenAndEmitDAG() {
+ std::string GroupName;
+ if (TimePassesIsEnabled)
+ GroupName = "Instruction Selection and Scheduling";
+ std::string BlockName;
+ int BlockNumber = -1;
+ (void)BlockNumber;
+ bool MatchFilterBB = false; (void)MatchFilterBB;
+#ifndef NDEBUG
+ MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
+ FilterDAGBasicBlockName ==
+ FuncInfo->MBB->getBasicBlock()->getName().str());
+#endif
+#ifdef NDEBUG
+ if (ViewDAGCombine1 || ViewLegalizeTypesDAGs || ViewLegalizeDAGs ||
+ ViewDAGCombine2 || ViewDAGCombineLT || ViewISelDAGs || ViewSchedDAGs ||
+ ViewSUnitDAGs)
+#endif
+ {
+ BlockNumber = FuncInfo->MBB->getNumber();
+ BlockName =
+ (MF->getName() + ":" + FuncInfo->MBB->getBasicBlock()->getName()).str();
+ }
+ DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ if (ViewDAGCombine1 && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine1 input for " + BlockName);
+
+ // Run the DAG combiner in pre-legalize mode.
+ {
+ NamedRegionTimer T("DAG Combining 1", GroupName, TimePassesIsEnabled);
+ CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel);
+ }
+
+ DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ // Second step, hack on the DAG until it only uses operations and types that
+ // the target supports.
+ if (ViewLegalizeTypesDAGs && MatchFilterBB)
+ CurDAG->viewGraph("legalize-types input for " + BlockName);
+
+ bool Changed;
+ {
+ NamedRegionTimer T("Type Legalization", GroupName, TimePassesIsEnabled);
+ Changed = CurDAG->LegalizeTypes();
+ }
+
+ DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ CurDAG->NewNodesMustHaveLegalTypes = true;
+
+ if (Changed) {
+ if (ViewDAGCombineLT && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine-lt input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ {
+ NamedRegionTimer T("DAG Combining after legalize types", GroupName,
+ TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel);
+ }
+
+ DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ }
+
+ {
+ NamedRegionTimer T("Vector Legalization", GroupName, TimePassesIsEnabled);
+ Changed = CurDAG->LegalizeVectors();
+ }
+
+ if (Changed) {
+ {
+ NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled);
+ CurDAG->LegalizeTypes();
+ }
+
+ if (ViewDAGCombineLT && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
+
+ // Run the DAG combiner in post-type-legalize mode.
+ {
+ NamedRegionTimer T("DAG Combining after legalize vectors", GroupName,
+ TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel);
+ }
+
+ DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#"
+ << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump());
+ }
+
+ if (ViewLegalizeDAGs && MatchFilterBB)
+ CurDAG->viewGraph("legalize input for " + BlockName);
+
+ {
+ NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled);
+ CurDAG->Legalize();
+ }
+
+ DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ if (ViewDAGCombine2 && MatchFilterBB)
+ CurDAG->viewGraph("dag-combine2 input for " + BlockName);
+
+ // Run the DAG combiner in post-legalize mode.
+ {
+ NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled);
+ CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel);
+ }
+
+ DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ if (OptLevel != CodeGenOpt::None)
+ ComputeLiveOutVRegInfo();
+
+ if (ViewISelDAGs && MatchFilterBB)
+ CurDAG->viewGraph("isel input for " + BlockName);
+
+ // Third, instruction select all of the operations to machine code, adding the
+ // code to the MachineBasicBlock.
+ {
+ NamedRegionTimer T("Instruction Selection", GroupName, TimePassesIsEnabled);
+ DoInstructionSelection();
+ }
+
+ DEBUG(dbgs() << "Selected selection DAG: BB#" << BlockNumber
+ << " '" << BlockName << "'\n"; CurDAG->dump());
+
+ if (ViewSchedDAGs && MatchFilterBB)
+ CurDAG->viewGraph("scheduler input for " + BlockName);
+
+ // Schedule machine code.
+ ScheduleDAGSDNodes *Scheduler = CreateScheduler();
+ {
+ NamedRegionTimer T("Instruction Scheduling", GroupName,
+ TimePassesIsEnabled);
+ Scheduler->Run(CurDAG, FuncInfo->MBB);
+ }
+
+ if (ViewSUnitDAGs && MatchFilterBB)
+ Scheduler->viewGraph();
+
+ // Emit machine code to BB. This can change 'BB' to the last block being
+ // inserted into.
+ MachineBasicBlock *FirstMBB = FuncInfo->MBB, *LastMBB;
+ {
+ NamedRegionTimer T("Instruction Creation", GroupName, TimePassesIsEnabled);
+
+ // FuncInfo->InsertPt is passed by reference and set to the end of the
+ // scheduled instructions.
+ LastMBB = FuncInfo->MBB = Scheduler->EmitSchedule(FuncInfo->InsertPt);
+ }
+
+ // If the block was split, make sure we update any references that are used to
+ // update PHI nodes later on.
+ if (FirstMBB != LastMBB)
+ SDB->UpdateSplitBlock(FirstMBB, LastMBB);
+
+ // Free the scheduler state.
+ {
+ NamedRegionTimer T("Instruction Scheduling Cleanup", GroupName,
+ TimePassesIsEnabled);
+ delete Scheduler;
+ }
+
+ // Free the SelectionDAG state, now that we're finished with it.
+ CurDAG->clear();
+}
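+
+// Editor's note (illustrative, not from the commit): the viewGraph() calls above
+// are gated by the -view-*-dags flags declared at the top of this file, e.g.
+//
+//   llc -view-isel-dags -filter-view-dags=entry foo.ll
+//
+// pops up only the instruction-selection input DAG for the block named "entry";
+// "entry" and foo.ll are placeholders, and the flags exist only in
+// assert-enabled builds.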
+
+namespace {
+/// ISelUpdater - helper class to handle updates of the instruction selection
+/// graph.
+class ISelUpdater : public SelectionDAG::DAGUpdateListener {
+ SelectionDAG::allnodes_iterator &ISelPosition;
+public:
+ ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
+ : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
+
+ /// NodeDeleted - Handle nodes deleted from the graph. If the node being
+ /// deleted is the current ISelPosition node, update ISelPosition.
+ ///
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ if (ISelPosition == SelectionDAG::allnodes_iterator(N))
+ ++ISelPosition;
+ }
+};
+} // end anonymous namespace
+
+void SelectionDAGISel::DoInstructionSelection() {
+ DEBUG(dbgs() << "===== Instruction selection begins: BB#"
+ << FuncInfo->MBB->getNumber()
+ << " '" << FuncInfo->MBB->getName() << "'\n");
+
+ PreprocessISelDAG();
+
+ // Select target instructions for the DAG.
+ {
+ // Number all nodes with a topological order and set DAGSize.
+ DAGSize = CurDAG->AssignTopologicalOrder();
+
+ // Create a dummy node (which is not added to allnodes), that adds
+ // a reference to the root node, preventing it from being deleted,
+ // and tracking any changes of the root.
+ HandleSDNode Dummy(CurDAG->getRoot());
+ SelectionDAG::allnodes_iterator ISelPosition (CurDAG->getRoot().getNode());
+ ++ISelPosition;
+
+ // Make sure that ISelPosition gets properly updated when nodes are deleted
+ // in calls made from this function.
+ ISelUpdater ISU(*CurDAG, ISelPosition);
+
+    // The AllNodes list is now topologically sorted. Visit the
+    // nodes by starting at the end of the list (the root of the
+    // graph) and proceeding back toward the beginning (the entry
+ // node).
+ while (ISelPosition != CurDAG->allnodes_begin()) {
+ SDNode *Node = &*--ISelPosition;
+ // Skip dead nodes. DAGCombiner is expected to eliminate all dead nodes,
+ // but there are currently some corner cases that it misses. Also, this
+ // makes it theoretically possible to disable the DAGCombiner.
+ if (Node->use_empty())
+ continue;
+
+ Select(Node);
+ }
+
+ CurDAG->setRoot(Dummy.getValue());
+ }
+
+ DEBUG(dbgs() << "===== Instruction selection ends:\n");
+
+ PostprocessISelDAG();
+}
+
+static bool hasExceptionPointerOrCodeUser(const CatchPadInst *CPI) {
+ for (const User *U : CPI->users()) {
+ if (const IntrinsicInst *EHPtrCall = dyn_cast<IntrinsicInst>(U)) {
+ Intrinsic::ID IID = EHPtrCall->getIntrinsicID();
+ if (IID == Intrinsic::eh_exceptionpointer ||
+ IID == Intrinsic::eh_exceptioncode)
+ return true;
+ }
+ }
+ return false;
+}
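+
+// Illustrative IR (editor's example, not from the commit): the users this helper
+// looks for are calls such as
+//
+//   %cp  = catchpad within %cs [...]
+//   %ptr = call i8* @llvm.eh.exceptionpointer.p0i8(token %cp)
+//   %sel = call i32 @llvm.eh.exceptioncode(token %cp)
+//
+// where %cs, %cp, %ptr and %sel are placeholder names.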
+
+/// PrepareEHLandingPad - Emit an EH_LABEL, set up live-in registers, and
+/// do other setup for EH landing-pad blocks.
+bool SelectionDAGISel::PrepareEHLandingPad() {
+ MachineBasicBlock *MBB = FuncInfo->MBB;
+ const Constant *PersonalityFn = FuncInfo->Fn->getPersonalityFn();
+ const BasicBlock *LLVMBB = MBB->getBasicBlock();
+ const TargetRegisterClass *PtrRC =
+ TLI->getRegClassFor(TLI->getPointerTy(CurDAG->getDataLayout()));
+
+ // Catchpads have one live-in register, which typically holds the exception
+ // pointer or code.
+ if (const auto *CPI = dyn_cast<CatchPadInst>(LLVMBB->getFirstNonPHI())) {
+ if (hasExceptionPointerOrCodeUser(CPI)) {
+ // Get or create the virtual register to hold the pointer or code. Mark
+ // the live in physreg and copy into the vreg.
+ MCPhysReg EHPhysReg = TLI->getExceptionPointerRegister(PersonalityFn);
+ assert(EHPhysReg && "target lacks exception pointer register");
+ MBB->addLiveIn(EHPhysReg);
+ unsigned VReg = FuncInfo->getCatchPadExceptionPointerVReg(CPI, PtrRC);
+ BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::COPY), VReg)
+ .addReg(EHPhysReg, RegState::Kill);
+ }
+ return true;
+ }
+
+ if (!LLVMBB->isLandingPad())
+ return true;
+
+ // Add a label to mark the beginning of the landing pad. Deletion of the
+ // landing pad can thus be detected via the MachineModuleInfo.
+ MCSymbol *Label = MF->getMMI().addLandingPad(MBB);
+
+ // Assign the call site to the landing pad's begin label.
+ MF->getMMI().setCallSiteLandingPad(Label, SDB->LPadToCallSiteMap[MBB]);
+
+ const MCInstrDesc &II = TII->get(TargetOpcode::EH_LABEL);
+ BuildMI(*MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(), II)
+ .addSym(Label);
+
+ // Mark exception register as live in.
+ if (unsigned Reg = TLI->getExceptionPointerRegister(PersonalityFn))
+ FuncInfo->ExceptionPointerVirtReg = MBB->addLiveIn(Reg, PtrRC);
+
+ // Mark exception selector register as live in.
+ if (unsigned Reg = TLI->getExceptionSelectorRegister(PersonalityFn))
+ FuncInfo->ExceptionSelectorVirtReg = MBB->addLiveIn(Reg, PtrRC);
+
+ return true;
+}
+
+/// isFoldedOrDeadInstruction - Return true if the specified instruction is
+/// side-effect free and is either dead or folded into a generated instruction.
+/// Return false if it needs to be emitted.
+static bool isFoldedOrDeadInstruction(const Instruction *I,
+ FunctionLoweringInfo *FuncInfo) {
+ return !I->mayWriteToMemory() && // Side-effecting instructions aren't folded.
+ !isa<TerminatorInst>(I) && // Terminators aren't folded.
+ !isa<DbgInfoIntrinsic>(I) && // Debug instructions aren't folded.
+ !I->isEHPad() && // EH pad instructions aren't folded.
+ !FuncInfo->isExportedInst(I); // Exported instrs must be computed.
+}
+
+#ifndef NDEBUG
+// Collect per-instruction statistics for fast-isel misses. Only those
+// instructions that cause the bail are accounted for. It does not account for
+// instructions higher in the block. Thus, summing the per-instruction stats
+// will not add up to what is reported by NumFastIselFailures.
+static void collectFailStats(const Instruction *I) {
+ switch (I->getOpcode()) {
+ default: assert (0 && "<Invalid operator> ");
+
+ // Terminators
+ case Instruction::Ret: NumFastIselFailRet++; return;
+ case Instruction::Br: NumFastIselFailBr++; return;
+ case Instruction::Switch: NumFastIselFailSwitch++; return;
+ case Instruction::IndirectBr: NumFastIselFailIndirectBr++; return;
+ case Instruction::Invoke: NumFastIselFailInvoke++; return;
+ case Instruction::Resume: NumFastIselFailResume++; return;
+ case Instruction::Unreachable: NumFastIselFailUnreachable++; return;
+
+ // Standard binary operators...
+ case Instruction::Add: NumFastIselFailAdd++; return;
+ case Instruction::FAdd: NumFastIselFailFAdd++; return;
+ case Instruction::Sub: NumFastIselFailSub++; return;
+ case Instruction::FSub: NumFastIselFailFSub++; return;
+ case Instruction::Mul: NumFastIselFailMul++; return;
+ case Instruction::FMul: NumFastIselFailFMul++; return;
+ case Instruction::UDiv: NumFastIselFailUDiv++; return;
+ case Instruction::SDiv: NumFastIselFailSDiv++; return;
+ case Instruction::FDiv: NumFastIselFailFDiv++; return;
+ case Instruction::URem: NumFastIselFailURem++; return;
+ case Instruction::SRem: NumFastIselFailSRem++; return;
+ case Instruction::FRem: NumFastIselFailFRem++; return;
+
+ // Logical operators...
+ case Instruction::And: NumFastIselFailAnd++; return;
+ case Instruction::Or: NumFastIselFailOr++; return;
+ case Instruction::Xor: NumFastIselFailXor++; return;
+
+ // Memory instructions...
+ case Instruction::Alloca: NumFastIselFailAlloca++; return;
+ case Instruction::Load: NumFastIselFailLoad++; return;
+ case Instruction::Store: NumFastIselFailStore++; return;
+ case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return;
+ case Instruction::AtomicRMW: NumFastIselFailAtomicRMW++; return;
+ case Instruction::Fence: NumFastIselFailFence++; return;
+ case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return;
+
+ // Convert instructions...
+ case Instruction::Trunc: NumFastIselFailTrunc++; return;
+ case Instruction::ZExt: NumFastIselFailZExt++; return;
+ case Instruction::SExt: NumFastIselFailSExt++; return;
+ case Instruction::FPTrunc: NumFastIselFailFPTrunc++; return;
+ case Instruction::FPExt: NumFastIselFailFPExt++; return;
+ case Instruction::FPToUI: NumFastIselFailFPToUI++; return;
+ case Instruction::FPToSI: NumFastIselFailFPToSI++; return;
+ case Instruction::UIToFP: NumFastIselFailUIToFP++; return;
+ case Instruction::SIToFP: NumFastIselFailSIToFP++; return;
+ case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return;
+ case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return;
+ case Instruction::BitCast: NumFastIselFailBitCast++; return;
+
+ // Other instructions...
+ case Instruction::ICmp: NumFastIselFailICmp++; return;
+ case Instruction::FCmp: NumFastIselFailFCmp++; return;
+ case Instruction::PHI: NumFastIselFailPHI++; return;
+ case Instruction::Select: NumFastIselFailSelect++; return;
+ case Instruction::Call: {
+ if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ NumFastIselFailIntrinsicCall++; return;
+ case Intrinsic::sadd_with_overflow:
+ NumFastIselFailSAddWithOverflow++; return;
+ case Intrinsic::uadd_with_overflow:
+ NumFastIselFailUAddWithOverflow++; return;
+ case Intrinsic::ssub_with_overflow:
+ NumFastIselFailSSubWithOverflow++; return;
+ case Intrinsic::usub_with_overflow:
+ NumFastIselFailUSubWithOverflow++; return;
+ case Intrinsic::smul_with_overflow:
+ NumFastIselFailSMulWithOverflow++; return;
+ case Intrinsic::umul_with_overflow:
+ NumFastIselFailUMulWithOverflow++; return;
+ case Intrinsic::frameaddress:
+ NumFastIselFailFrameaddress++; return;
+ case Intrinsic::sqrt:
+ NumFastIselFailSqrt++; return;
+ case Intrinsic::experimental_stackmap:
+ NumFastIselFailStackMap++; return;
+ case Intrinsic::experimental_patchpoint_void: // fall-through
+ case Intrinsic::experimental_patchpoint_i64:
+ NumFastIselFailPatchPoint++; return;
+ }
+ }
+ NumFastIselFailCall++;
+ return;
+ }
+ case Instruction::Shl: NumFastIselFailShl++; return;
+ case Instruction::LShr: NumFastIselFailLShr++; return;
+ case Instruction::AShr: NumFastIselFailAShr++; return;
+ case Instruction::VAArg: NumFastIselFailVAArg++; return;
+ case Instruction::ExtractElement: NumFastIselFailExtractElement++; return;
+ case Instruction::InsertElement: NumFastIselFailInsertElement++; return;
+ case Instruction::ShuffleVector: NumFastIselFailShuffleVector++; return;
+ case Instruction::ExtractValue: NumFastIselFailExtractValue++; return;
+ case Instruction::InsertValue: NumFastIselFailInsertValue++; return;
+ case Instruction::LandingPad: NumFastIselFailLandingPad++; return;
+ }
+}
+#endif // NDEBUG
+
+/// Set up SwiftErrorVals by going through the function. If the function has a
+/// swifterror argument, it will be the first entry.
+static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
+ FunctionLoweringInfo *FuncInfo) {
+ if (!TLI->supportSwiftError())
+ return;
+
+ FuncInfo->SwiftErrorVals.clear();
+ FuncInfo->SwiftErrorMap.clear();
+ FuncInfo->SwiftErrorWorklist.clear();
+
+ // Check if function has a swifterror argument.
+ for (Function::const_arg_iterator AI = Fn.arg_begin(), AE = Fn.arg_end();
+ AI != AE; ++AI)
+ if (AI->hasSwiftErrorAttr())
+ FuncInfo->SwiftErrorVals.push_back(&*AI);
+
+ for (const auto &LLVMBB : Fn)
+ for (const auto &Inst : LLVMBB) {
+ if (const AllocaInst *Alloca = dyn_cast<AllocaInst>(&Inst))
+ if (Alloca->isSwiftError())
+ FuncInfo->SwiftErrorVals.push_back(Alloca);
+ }
+}
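+
+// Illustrative IR (editor's example, not part of the imported source): the two
+// sources of swifterror values collected above look like
+//
+//   define swiftcc void @f(%swift.error** swifterror %err)  ; argument form
+//   %e = alloca swifterror %swift.error*                    ; alloca form
+//
+// with @f, %err, %e and %swift.error standing in for real names.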
+
+/// For each basic block, merge incoming swifterror values or simply propagate
+/// them. The merged results will be saved in SwiftErrorMap. For predecessors
+/// that are not yet visited, we create virtual registers to hold the swifterror
+/// values and save them in SwiftErrorWorklist.
+static void mergeIncomingSwiftErrors(FunctionLoweringInfo *FuncInfo,
+ const TargetLowering *TLI,
+ const TargetInstrInfo *TII,
+ const BasicBlock *LLVMBB,
+ SelectionDAGBuilder *SDB) {
+ if (!TLI->supportSwiftError())
+ return;
+
+  // We should only do this when we have a swifterror parameter or a swifterror
+  // alloca.
+ if (FuncInfo->SwiftErrorVals.empty())
+ return;
+
+ // At beginning of a basic block, insert PHI nodes or get the virtual
+ // register from the only predecessor, and update SwiftErrorMap; if one
+ // of the predecessors is not visited, update SwiftErrorWorklist.
+ // At end of a basic block, if a block is in SwiftErrorWorklist, insert copy
+ // to sync up the virtual register assignment.
+
+ // Always create a virtual register for each swifterror value in entry block.
+ auto &DL = SDB->DAG.getDataLayout();
+ const TargetRegisterClass *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+ if (pred_begin(LLVMBB) == pred_end(LLVMBB)) {
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // Assign Undef to Vreg. We construct MI directly to make sure it works
+ // with FastISel.
+ BuildMI(*FuncInfo->MBB, FuncInfo->InsertPt, SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::IMPLICIT_DEF), VReg);
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+ }
+ return;
+ }
+
+ if (auto *UniquePred = LLVMBB->getUniquePredecessor()) {
+ auto *UniquePredMBB = FuncInfo->MBBMap[UniquePred];
+ if (!FuncInfo->SwiftErrorMap.count(UniquePredMBB)) {
+ // Update SwiftErrorWorklist with a new virtual register.
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ FuncInfo->SwiftErrorWorklist[UniquePredMBB].push_back(VReg);
+ // Propagate the information from the single predecessor.
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+ }
+ return;
+ }
+ // Propagate the information from the single predecessor.
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB] =
+ FuncInfo->SwiftErrorMap[UniquePredMBB];
+ return;
+ }
+
+ // For the case of multiple predecessors, update SwiftErrorWorklist.
+ // Handle the case where we have two or more predecessors being the same.
+ for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+ PI != PE; ++PI) {
+ auto *PredMBB = FuncInfo->MBBMap[*PI];
+ if (!FuncInfo->SwiftErrorMap.count(PredMBB) &&
+ !FuncInfo->SwiftErrorWorklist.count(PredMBB)) {
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ // When we actually visit the basic block PredMBB, we will materialize
+ // the virtual register assignment in copySwiftErrorsToFinalVRegs.
+ FuncInfo->SwiftErrorWorklist[PredMBB].push_back(VReg);
+ }
+ }
+ }
+
+ // For the case of multiple predecessors, create a virtual register for
+ // each swifterror value and generate Phi node.
+ for (unsigned I = 0, E = FuncInfo->SwiftErrorVals.size(); I < E; I++) {
+ unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+ FuncInfo->SwiftErrorMap[FuncInfo->MBB].push_back(VReg);
+
+ MachineInstrBuilder SwiftErrorPHI = BuildMI(*FuncInfo->MBB,
+ FuncInfo->MBB->begin(), SDB->getCurDebugLoc(),
+ TII->get(TargetOpcode::PHI), VReg);
+ for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+ PI != PE; ++PI) {
+ auto *PredMBB = FuncInfo->MBBMap[*PI];
+ unsigned SwiftErrorReg = FuncInfo->SwiftErrorMap.count(PredMBB) ?
+ FuncInfo->SwiftErrorMap[PredMBB][I] :
+ FuncInfo->SwiftErrorWorklist[PredMBB][I];
+ SwiftErrorPHI.addReg(SwiftErrorReg)
+ .addMBB(PredMBB);
+ }
+ }
+}
+
+void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+ // Initialize the Fast-ISel state, if needed.
+ FastISel *FastIS = nullptr;
+ if (TM.Options.EnableFastISel)
+ FastIS = TLI->createFastISel(*FuncInfo, LibInfo);
+
+ setupSwiftErrorVals(Fn, TLI, FuncInfo);
+
+ // Iterate over all basic blocks in the function.
+ ReversePostOrderTraversal<const Function*> RPOT(&Fn);
+ for (ReversePostOrderTraversal<const Function*>::rpo_iterator
+ I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
+ const BasicBlock *LLVMBB = *I;
+
+ if (OptLevel != CodeGenOpt::None) {
+ bool AllPredsVisited = true;
+ for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
+ PI != PE; ++PI) {
+ if (!FuncInfo->VisitedBBs.count(*PI)) {
+ AllPredsVisited = false;
+ break;
+ }
+ }
+
+ if (AllPredsVisited) {
+ for (BasicBlock::const_iterator I = LLVMBB->begin();
+ const PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ FuncInfo->ComputePHILiveOutRegInfo(PN);
+ } else {
+ for (BasicBlock::const_iterator I = LLVMBB->begin();
+ const PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ FuncInfo->InvalidatePHILiveOutRegInfo(PN);
+ }
+
+ FuncInfo->VisitedBBs.insert(LLVMBB);
+ }
+
+ BasicBlock::const_iterator const Begin =
+ LLVMBB->getFirstNonPHI()->getIterator();
+ BasicBlock::const_iterator const End = LLVMBB->end();
+ BasicBlock::const_iterator BI = End;
+
+ FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
+ if (!FuncInfo->MBB)
+ continue; // Some blocks like catchpads have no code or MBB.
+ FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
+ mergeIncomingSwiftErrors(FuncInfo, TLI, TII, LLVMBB, SDB);
+
+ // Setup an EH landing-pad block.
+ FuncInfo->ExceptionPointerVirtReg = 0;
+ FuncInfo->ExceptionSelectorVirtReg = 0;
+ if (LLVMBB->isEHPad())
+ if (!PrepareEHLandingPad())
+ continue;
+
+ // Before doing SelectionDAG ISel, see if FastISel has been requested.
+ if (FastIS) {
+ FastIS->startNewBlock();
+
+ // Emit code for any incoming arguments. This must happen before
+ // beginning FastISel on the entry block.
+ if (LLVMBB == &Fn.getEntryBlock()) {
+ ++NumEntryBlocks;
+
+ // Lower any arguments needed in this block if this is the entry block.
+ if (!FastIS->lowerArguments()) {
+ // Fast isel failed to lower these arguments
+ ++NumFastIselFailLowerArguments;
+ if (EnableFastISelAbort > 1)
+ report_fatal_error("FastISel didn't lower all arguments");
+
+ // Use SelectionDAG argument lowering
+ LowerArguments(Fn);
+ CurDAG->setRoot(SDB->getControlRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // If we inserted any instructions at the beginning, make a note of
+ // where they are, so we can be sure to emit subsequent instructions
+ // after them.
+ if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+ FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+ else
+ FastIS->setLastLocalValue(nullptr);
+ }
+
+ unsigned NumFastIselRemaining = std::distance(Begin, End);
+ // Do FastISel on as many instructions as possible.
+ for (; BI != Begin; --BI) {
+ const Instruction *Inst = &*std::prev(BI);
+
+ // If we no longer require this instruction, skip it.
+ if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+ --NumFastIselRemaining;
+ continue;
+ }
+
+ // Bottom-up: reset the insert pos at the top, after any local-value
+ // instructions.
+ FastIS->recomputeInsertPt();
+
+ // Try to select the instruction with FastISel.
+ if (FastIS->selectInstruction(Inst)) {
+ --NumFastIselRemaining;
+ ++NumFastIselSuccess;
+ // If fast isel succeeded, skip over all the folded instructions, and
+ // then see if there is a load right before the selected instructions.
+ // Try to fold the load if so.
+ const Instruction *BeforeInst = Inst;
+ while (BeforeInst != &*Begin) {
+ BeforeInst = &*std::prev(BasicBlock::const_iterator(BeforeInst));
+ if (!isFoldedOrDeadInstruction(BeforeInst, FuncInfo))
+ break;
+ }
+ if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) &&
+ BeforeInst->hasOneUse() &&
+ FastIS->tryToFoldLoad(cast<LoadInst>(BeforeInst), Inst)) {
+ // If we succeeded, don't re-select the load.
+ BI = std::next(BasicBlock::const_iterator(BeforeInst));
+ --NumFastIselRemaining;
+ ++NumFastIselSuccess;
+ }
+ continue;
+ }
+
+#ifndef NDEBUG
+ if (EnableFastISelVerbose2)
+ collectFailStats(Inst);
+#endif
+
+ // Then handle certain instructions as single-LLVM-Instruction blocks.
+ if (isa<CallInst>(Inst)) {
+
+ if (EnableFastISelVerbose || EnableFastISelAbort) {
+ dbgs() << "FastISel missed call: ";
+ Inst->dump();
+ }
+ if (EnableFastISelAbort > 2)
+ // FastISel selector couldn't handle something and bailed.
+ // For the purpose of debugging, just abort.
+ report_fatal_error("FastISel didn't select the entire block");
+
+ if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
+ !Inst->use_empty()) {
+ unsigned &R = FuncInfo->ValueMap[Inst];
+ if (!R)
+ R = FuncInfo->CreateRegs(Inst->getType());
+ }
+
+ bool HadTailCall = false;
+ MachineBasicBlock::iterator SavedInsertPt = FuncInfo->InsertPt;
+ SelectBasicBlock(Inst->getIterator(), BI, HadTailCall);
+
+ // If the call was emitted as a tail call, we're done with the block.
+ // We also need to delete any previously emitted instructions.
+ if (HadTailCall) {
+ FastIS->removeDeadCode(SavedInsertPt, FuncInfo->MBB->end());
+ --BI;
+ break;
+ }
+
+ // Recompute NumFastIselRemaining as Selection DAG instruction
+ // selection may have handled the call, input args, etc.
+ unsigned RemainingNow = std::distance(Begin, BI);
+ NumFastIselFailures += NumFastIselRemaining - RemainingNow;
+ NumFastIselRemaining = RemainingNow;
+ continue;
+ }
+
+ bool ShouldAbort = EnableFastISelAbort;
+ if (EnableFastISelVerbose || EnableFastISelAbort) {
+ if (isa<TerminatorInst>(Inst)) {
+ // Use a different message for terminator misses.
+ dbgs() << "FastISel missed terminator: ";
+          // Don't abort for a terminator miss unless the abort level is
+          // really high.
+ ShouldAbort = (EnableFastISelAbort > 2);
+ } else {
+ dbgs() << "FastISel miss: ";
+ }
+ Inst->dump();
+ }
+ if (ShouldAbort)
+ // FastISel selector couldn't handle something and bailed.
+ // For the purpose of debugging, just abort.
+ report_fatal_error("FastISel didn't select the entire block");
+
+ NumFastIselFailures += NumFastIselRemaining;
+ break;
+ }
+
+ FastIS->recomputeInsertPt();
+ } else {
+ // Lower any arguments needed in this block if this is the entry block.
+ if (LLVMBB == &Fn.getEntryBlock()) {
+ ++NumEntryBlocks;
+ LowerArguments(Fn);
+ }
+ }
+ if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
+ bool FunctionBasedInstrumentation =
+ TLI->getSSPStackGuardCheck(*Fn.getParent());
+ SDB->SPDescriptor.initialize(LLVMBB, FuncInfo->MBBMap[LLVMBB],
+ FunctionBasedInstrumentation);
+ }
+
+ if (Begin != BI)
+ ++NumDAGBlocks;
+ else
+ ++NumFastIselBlocks;
+
+ if (Begin != BI) {
+ // Run SelectionDAG instruction selection on the remainder of the block
+ // not handled by FastISel. If FastISel is not run, this is the entire
+ // block.
+ bool HadTailCall;
+ SelectBasicBlock(Begin, BI, HadTailCall);
+ }
+
+ FinishBasicBlock();
+ FuncInfo->PHINodesToUpdate.clear();
+ }
+
+ delete FastIS;
+ SDB->clearDanglingDebugInfo();
+ SDB->SPDescriptor.resetPerFunctionState();
+}
+
+/// Given that the input MI is before a partial terminator sequence TSeq, return
+/// true if MI + TSeq is also a partial terminator sequence.
+///
+/// A terminator sequence is a sequence of MachineInstrs which at this point in
+/// lowering copy vregs into physical registers, which are then passed into
+/// terminator instructions so we can satisfy ABI constraints. A partial
+/// terminator sequence is an improper subset of a terminator sequence (i.e. it
+/// may be the whole terminator sequence).
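+///
+/// For example (illustrative only; exact opcodes and register names depend on
+/// the target), a return might be preceded by copies such as
+///   %EAX<def> = COPY %vreg5
+///   RETQ %EAX<imp-use>
+/// where the COPY belongs to the terminator sequence feeding the RETQ.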
+static bool MIIsInTerminatorSequence(const MachineInstr &MI) {
+ // If we do not have a copy or an implicit def, we return true if and only if
+ // MI is a debug value.
+ if (!MI.isCopy() && !MI.isImplicitDef())
+    // Sometimes DBG_VALUE MIs sneak in between the copies from the vregs to the
+ // physical registers if there is debug info associated with the terminator
+ // of our mbb. We want to include said debug info in our terminator
+ // sequence, so we return true in that case.
+ return MI.isDebugValue();
+
+ // We have left the terminator sequence if we are not doing one of the
+ // following:
+ //
+ // 1. Copying a vreg into a physical register.
+ // 2. Copying a vreg into a vreg.
+ // 3. Defining a register via an implicit def.
+
+ // OPI should always be a register definition...
+ MachineInstr::const_mop_iterator OPI = MI.operands_begin();
+ if (!OPI->isReg() || !OPI->isDef())
+ return false;
+
+ // Defining any register via an implicit def is always ok.
+ if (MI.isImplicitDef())
+ return true;
+
+ // Grab the copy source...
+ MachineInstr::const_mop_iterator OPI2 = OPI;
+ ++OPI2;
+ assert(OPI2 != MI.operands_end()
+ && "Should have a copy implying we should have 2 arguments.");
+
+ // Make sure that the copy dest is not a vreg when the copy source is a
+ // physical register.
+ if (!OPI2->isReg() ||
+ (!TargetRegisterInfo::isPhysicalRegister(OPI->getReg()) &&
+ TargetRegisterInfo::isPhysicalRegister(OPI2->getReg())))
+ return false;
+
+ return true;
+}
+
+/// Find the split point at which to splice the end of BB into its stack
+/// protector check's success machine basic block.
+///
+/// On many platforms, due to ABI constraints, terminators, even before register
+/// allocation, use physical registers. This creates an issue for us since
+/// physical registers at this point cannot travel across basic
+/// blocks. Luckily, SelectionDAG always moves physical registers into vregs
+/// when they enter functions and moves them back through a sequence of copies
+/// into the physical registers right before the terminator, creating a
+/// ``Terminator Sequence''. This function searches for the beginning of the
+/// terminator sequence so that we splice off not just the terminator, but
+/// also the copies that move the vregs into the physical registers.
+static MachineBasicBlock::iterator
+FindSplitPointForStackProtector(MachineBasicBlock *BB) {
+ MachineBasicBlock::iterator SplitPoint = BB->getFirstTerminator();
+ if (SplitPoint == BB->begin())
+ return SplitPoint;
+
+ MachineBasicBlock::iterator Start = BB->begin();
+ MachineBasicBlock::iterator Previous = SplitPoint;
+ --Previous;
+
+ while (MIIsInTerminatorSequence(*Previous)) {
+ SplitPoint = Previous;
+ if (Previous == Start)
+ break;
+ --Previous;
+ }
+
+ return SplitPoint;
+}
+
+void
+SelectionDAGISel::FinishBasicBlock() {
+ DEBUG(dbgs() << "Total amount of phi nodes to update: "
+ << FuncInfo->PHINodesToUpdate.size() << "\n";
+ for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i)
+ dbgs() << "Node " << i << " : ("
+ << FuncInfo->PHINodesToUpdate[i].first
+ << ", " << FuncInfo->PHINodesToUpdate[i].second << ")\n");
+
+ // Next, now that we know what the last MBB the LLVM BB expanded is, update
+ // PHI nodes in successors.
+ for (unsigned i = 0, e = FuncInfo->PHINodesToUpdate.size(); i != e; ++i) {
+ MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[i].first);
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+ if (!FuncInfo->MBB->isSuccessor(PHI->getParent()))
+ continue;
+ PHI.addReg(FuncInfo->PHINodesToUpdate[i].second).addMBB(FuncInfo->MBB);
+ }
+
+ // Handle stack protector.
+ if (SDB->SPDescriptor.shouldEmitFunctionBasedCheckStackProtector()) {
+ // The target provides a guard check function. There is no need to
+    // generate error handling code or to split the current basic block.
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+
+    // Add the load and check to the basic block.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt =
+ FindSplitPointForStackProtector(ParentMBB);
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ } else if (SDB->SPDescriptor.shouldEmitStackProtector()) {
+ MachineBasicBlock *ParentMBB = SDB->SPDescriptor.getParentMBB();
+ MachineBasicBlock *SuccessMBB = SDB->SPDescriptor.getSuccessMBB();
+
+ // Find the split point to split the parent mbb. At the same time copy all
+ // physical registers used in the tail of parent mbb into virtual registers
+ // before the split point and back into physical registers after the split
+    // point. This saves us from having to deal with live-ins and many other
+    // register allocation issues caused by splitting the parent mbb. The
+ // register allocator will clean up said virtual copies later on.
+ MachineBasicBlock::iterator SplitPoint =
+ FindSplitPointForStackProtector(ParentMBB);
+
+ // Splice the terminator of ParentMBB into SuccessMBB.
+ SuccessMBB->splice(SuccessMBB->end(), ParentMBB,
+ SplitPoint,
+ ParentMBB->end());
+
+ // Add compare/jump on neq/jump to the parent BB.
+ FuncInfo->MBB = ParentMBB;
+ FuncInfo->InsertPt = ParentMBB->end();
+ SDB->visitSPDescriptorParent(SDB->SPDescriptor, ParentMBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+    // Codegen the failure MBB if we have not codegened it yet.
+ MachineBasicBlock *FailureMBB = SDB->SPDescriptor.getFailureMBB();
+ if (FailureMBB->empty()) {
+ FuncInfo->MBB = FailureMBB;
+ FuncInfo->InsertPt = FailureMBB->end();
+ SDB->visitSPDescriptorFailure(SDB->SPDescriptor);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // Clear the Per-BB State.
+ SDB->SPDescriptor.resetPerBBState();
+ }
+
+ // Lower each BitTestBlock.
+ for (auto &BTB : SDB->BitTestCases) {
+ // Lower header first, if it wasn't already lowered
+ if (!BTB.Emitted) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = BTB.Parent;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitBitTestHeader(BTB, FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ BranchProbability UnhandledProb = BTB.Prob;
+ for (unsigned j = 0, ej = BTB.Cases.size(); j != ej; ++j) {
+ UnhandledProb -= BTB.Cases[j].ExtraProb;
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = BTB.Cases[j].ThisBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+
+ // If all cases cover a contiguous range, it is not necessary to jump to
+ // the default block after the last bit test fails. This is because the
+ // range check during bit test header creation has guaranteed that every
+ // case here doesn't go outside the range. In this case, there is no need
+ // to perform the last bit test, as it will always be true. Instead, make
+ // the second-to-last bit-test fall through to the target of the last bit
+ // test, and delete the last bit test.
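+      // For example (illustrative), if the cases cover the contiguous values
+      // 0-3 and the header's range check has already proved the value lies in
+      // [0, 3], then whenever the second-to-last test fails the last test must
+      // succeed, so the last test can be dropped.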
+
+ MachineBasicBlock *NextMBB;
+ if (BTB.ContiguousRange && j + 2 == ej) {
+ // Second-to-last bit-test with contiguous range: fall through to the
+ // target of the final bit test.
+ NextMBB = BTB.Cases[j + 1].TargetBB;
+ } else if (j + 1 == ej) {
+ // For the last bit test, fall through to Default.
+ NextMBB = BTB.Default;
+ } else {
+ // Otherwise, fall through to the next bit test.
+ NextMBB = BTB.Cases[j + 1].ThisBB;
+ }
+
+ SDB->visitBitTestCase(BTB, NextMBB, UnhandledProb, BTB.Reg, BTB.Cases[j],
+ FuncInfo->MBB);
+
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ if (BTB.ContiguousRange && j + 2 == ej) {
+ // Since we're not going to use the final bit test, remove it.
+ BTB.Cases.pop_back();
+ break;
+ }
+ }
+
+ // Update PHI Nodes
+ for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+ pi != pe; ++pi) {
+ MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first);
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+      // This is the "default" BB. We have two jumps to it: from the "header"
+      // BB and from the last "case" BB, unless the latter was skipped.
+ if (PHIBB == BTB.Default) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(BTB.Parent);
+ if (!BTB.ContiguousRange) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+ .addMBB(BTB.Cases.back().ThisBB);
+ }
+ }
+ // One of "cases" BB.
+ for (unsigned j = 0, ej = BTB.Cases.size();
+ j != ej; ++j) {
+ MachineBasicBlock* cBB = BTB.Cases[j].ThisBB;
+ if (cBB->isSuccessor(PHIBB))
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(cBB);
+ }
+ }
+ }
+ SDB->BitTestCases.clear();
+
+ // If the JumpTable record is filled in, then we need to emit a jump table.
+ // Updating the PHI nodes is tricky in this case, since we need to determine
+  // whether the PHI is a successor of the range check MBB or the jump table MBB.
+ for (unsigned i = 0, e = SDB->JTCases.size(); i != e; ++i) {
+ // Lower header first, if it wasn't already lowered
+ if (!SDB->JTCases[i].first.Emitted) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->JTCases[i].first.HeaderBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitJumpTableHeader(SDB->JTCases[i].second, SDB->JTCases[i].first,
+ FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+ }
+
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->JTCases[i].second.MBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // Emit the code
+ SDB->visitJumpTable(SDB->JTCases[i].second);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Update PHI Nodes
+ for (unsigned pi = 0, pe = FuncInfo->PHINodesToUpdate.size();
+ pi != pe; ++pi) {
+ MachineInstrBuilder PHI(*MF, FuncInfo->PHINodesToUpdate[pi].first);
+ MachineBasicBlock *PHIBB = PHI->getParent();
+ assert(PHI->isPHI() &&
+ "This is not a machine PHI node that we are updating!");
+ // "default" BB. We can go there only from header BB.
+ if (PHIBB == SDB->JTCases[i].second.Default)
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second)
+ .addMBB(SDB->JTCases[i].first.HeaderBB);
+ // JT BB. Just iterate over successors here
+ if (FuncInfo->MBB->isSuccessor(PHIBB))
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pi].second).addMBB(FuncInfo->MBB);
+ }
+ }
+ SDB->JTCases.clear();
+
+ // If we generated any switch lowering information, build and codegen any
+ // additional DAGs necessary.
+ for (unsigned i = 0, e = SDB->SwitchCases.size(); i != e; ++i) {
+ // Set the current basic block to the mbb we wish to insert the code into
+ FuncInfo->MBB = SDB->SwitchCases[i].ThisBB;
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+
+ // Determine the unique successors.
+ SmallVector<MachineBasicBlock *, 2> Succs;
+ Succs.push_back(SDB->SwitchCases[i].TrueBB);
+ if (SDB->SwitchCases[i].TrueBB != SDB->SwitchCases[i].FalseBB)
+ Succs.push_back(SDB->SwitchCases[i].FalseBB);
+
+ // Emit the code. Note that this could result in FuncInfo->MBB being split.
+ SDB->visitSwitchCase(SDB->SwitchCases[i], FuncInfo->MBB);
+ CurDAG->setRoot(SDB->getRoot());
+ SDB->clear();
+ CodeGenAndEmitDAG();
+
+ // Remember the last block, now that any splitting is done, for use in
+ // populating PHI nodes in successors.
+ MachineBasicBlock *ThisBB = FuncInfo->MBB;
+
+ // Handle any PHI nodes in successors of this chunk, as if we were coming
+ // from the original BB before switch expansion. Note that PHI nodes can
+ // occur multiple times in PHINodesToUpdate. We have to be very careful to
+ // handle them the right number of times.
+ for (unsigned i = 0, e = Succs.size(); i != e; ++i) {
+ FuncInfo->MBB = Succs[i];
+ FuncInfo->InsertPt = FuncInfo->MBB->end();
+ // FuncInfo->MBB may have been removed from the CFG if a branch was
+ // constant folded.
+ if (ThisBB->isSuccessor(FuncInfo->MBB)) {
+ for (MachineBasicBlock::iterator
+ MBBI = FuncInfo->MBB->begin(), MBBE = FuncInfo->MBB->end();
+ MBBI != MBBE && MBBI->isPHI(); ++MBBI) {
+ MachineInstrBuilder PHI(*MF, MBBI);
+ // This value for this PHI node is recorded in PHINodesToUpdate.
+ for (unsigned pn = 0; ; ++pn) {
+ assert(pn != FuncInfo->PHINodesToUpdate.size() &&
+ "Didn't find PHI entry!");
+ if (FuncInfo->PHINodesToUpdate[pn].first == PHI) {
+ PHI.addReg(FuncInfo->PHINodesToUpdate[pn].second).addMBB(ThisBB);
+ break;
+ }
+ }
+ }
+ }
+ }
+ }
+ SDB->SwitchCases.clear();
+}
+
+/// Create the scheduler. If a specific scheduler was specified
+/// via the SchedulerRegistry, use it, otherwise select the
+/// one preferred by the target.
+///
+ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() {
+ return ISHeuristic(this, OptLevel);
+}
+
+//===----------------------------------------------------------------------===//
+// Helper functions used by the generated instruction selector.
+//===----------------------------------------------------------------------===//
+// Calls to these methods are generated by tblgen.
+
+/// CheckAndMask - The isel is trying to match something like (and X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an AND, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
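+///
+/// For example (illustrative), if the .td pattern asks for mask 255 but the
+/// DAG combiner has already narrowed the AND mask to 15 because the upper bits
+/// of X are known to be zero, the pattern should still match.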
+bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+  // If the actual AND mask allows bits outside the desired mask, this doesn't
+  // match.
+ if (ActualMask.intersects(~DesiredMask))
+ return false;
+
+ // Otherwise, the DAG Combiner may have proven that the value coming in is
+ // either already zero or is not demanded. Check for known zero input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+ if (CurDAG->MaskedValueIsZero(LHS, NeededMask))
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+/// CheckOrMask - The isel is trying to match something like (or X, 255). If
+/// the dag combiner simplified the 255, we still want to match. RHS is the
+/// actual value in the DAG on the RHS of an OR, and DesiredMaskS is the value
+/// specified in the .td file (e.g. 255).
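+///
+/// For example (illustrative), if the .td pattern asks for (or X, 255) but the
+/// DAG combiner dropped mask bits that are already known to be one in X, the
+/// pattern should still match.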
+bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS,
+ int64_t DesiredMaskS) const {
+ const APInt &ActualMask = RHS->getAPIntValue();
+ const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS);
+
+ // If the actual mask exactly matches, success!
+ if (ActualMask == DesiredMask)
+ return true;
+
+  // If the actual OR mask sets bits outside the desired mask, this doesn't
+  // match.
+ if (ActualMask.intersects(~DesiredMask))
+ return false;
+
+  // Otherwise, the DAG Combiner may have proven that the bits missing from the
+  // OR mask are already known to be set. Check for known one input bits.
+ APInt NeededMask = DesiredMask & ~ActualMask;
+
+ APInt KnownZero, KnownOne;
+ CurDAG->computeKnownBits(LHS, KnownZero, KnownOne);
+
+ // If all the missing bits in the or are already known to be set, match!
+ if ((NeededMask & KnownOne) == NeededMask)
+ return true;
+
+ // TODO: check to see if missing bits are just not demanded.
+
+ // Otherwise, this pattern doesn't match.
+ return false;
+}
+
+/// SelectInlineAsmMemoryOperands - Calls to this are automatically generated
+/// by tblgen. Others should not call it.
+void SelectionDAGISel::SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops,
+ const SDLoc &DL) {
+ std::vector<SDValue> InOps;
+ std::swap(InOps, Ops);
+
+ Ops.push_back(InOps[InlineAsm::Op_InputChain]); // 0
+ Ops.push_back(InOps[InlineAsm::Op_AsmString]); // 1
+ Ops.push_back(InOps[InlineAsm::Op_MDNode]); // 2, !srcloc
+ Ops.push_back(InOps[InlineAsm::Op_ExtraInfo]); // 3 (SideEffect, AlignStack)
+
+ unsigned i = InlineAsm::Op_FirstOperand, e = InOps.size();
+ if (InOps[e-1].getValueType() == MVT::Glue)
+ --e; // Don't process a glue operand if it is here.
+
+ while (i != e) {
+ unsigned Flags = cast<ConstantSDNode>(InOps[i])->getZExtValue();
+ if (!InlineAsm::isMemKind(Flags)) {
+ // Just skip over this operand, copying the operands verbatim.
+ Ops.insert(Ops.end(), InOps.begin()+i,
+ InOps.begin()+i+InlineAsm::getNumOperandRegisters(Flags) + 1);
+ i += InlineAsm::getNumOperandRegisters(Flags) + 1;
+ } else {
+ assert(InlineAsm::getNumOperandRegisters(Flags) == 1 &&
+ "Memory operand with multiple values?");
+
+ unsigned TiedToOperand;
+ if (InlineAsm::isUseOperandTiedToDef(Flags, TiedToOperand)) {
+ // We need the constraint ID from the operand this is tied to.
+ unsigned CurOp = InlineAsm::Op_FirstOperand;
+ Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue();
+ for (; TiedToOperand; --TiedToOperand) {
+ CurOp += InlineAsm::getNumOperandRegisters(Flags)+1;
+ Flags = cast<ConstantSDNode>(InOps[CurOp])->getZExtValue();
+ }
+ }
+
+ // Otherwise, this is a memory operand. Ask the target to select it.
+ std::vector<SDValue> SelOps;
+ unsigned ConstraintID = InlineAsm::getMemoryConstraintID(Flags);
+ if (SelectInlineAsmMemoryOperand(InOps[i+1], ConstraintID, SelOps))
+ report_fatal_error("Could not match memory address. Inline asm"
+ " failure!");
+
+ // Add this to the output node.
+ unsigned NewFlags =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Mem, SelOps.size());
+ NewFlags = InlineAsm::getFlagWordForMem(NewFlags, ConstraintID);
+ Ops.push_back(CurDAG->getTargetConstant(NewFlags, DL, MVT::i32));
+ Ops.insert(Ops.end(), SelOps.begin(), SelOps.end());
+ i += 2;
+ }
+ }
+
+ // Add the glue input back if present.
+ if (e != InOps.size())
+ Ops.push_back(InOps.back());
+}
+
+/// findGlueUse - Return use of MVT::Glue value produced by the specified
+/// SDNode.
+///
+static SDNode *findGlueUse(SDNode *N) {
+ unsigned FlagResNo = N->getNumValues()-1;
+ for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+ SDUse &Use = I.getUse();
+ if (Use.getResNo() == FlagResNo)
+ return Use.getUser();
+ }
+ return nullptr;
+}
+
+/// findNonImmUse - Return true if "Use" is a non-immediate use of "Def".
+/// This function recursively traverses up the operand chain, ignoring
+/// certain nodes.
+static bool findNonImmUse(SDNode *Use, SDNode* Def, SDNode *ImmedUse,
+ SDNode *Root, SmallPtrSetImpl<SDNode*> &Visited,
+ bool IgnoreChains) {
+  // Nodes are given unique IDs, and a node's ID is guaranteed to be greater
+  // than the IDs of all of its (recursive) operands. If we scan to a point
+  // where 'Use' has a smaller ID than the node we're scanning for, then we
+  // know we will never find it.
+  //
+  // The Use's ID may be -1 (unassigned) if it is a newly allocated node. This
+  // can happen because we scan down to newly selected nodes in the case of
+  // glue uses.
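+  //
+  // For example (illustrative), if Def has ID 12 and Use has ID 7, Def cannot
+  // appear among Use's (recursive) operands, so the search can stop here.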
+ if ((Use->getNodeId() < Def->getNodeId() && Use->getNodeId() != -1))
+ return false;
+
+  // Don't revisit a node if we already scanned it and didn't fail; we know we
+  // won't fail if we scan it again.
+ if (!Visited.insert(Use).second)
+ return false;
+
+ for (const SDValue &Op : Use->op_values()) {
+ // Ignore chain uses, they are validated by HandleMergeInputChains.
+ if (Op.getValueType() == MVT::Other && IgnoreChains)
+ continue;
+
+ SDNode *N = Op.getNode();
+ if (N == Def) {
+ if (Use == ImmedUse || Use == Root)
+ continue; // We are not looking for immediate use.
+ assert(N != Root);
+ return true;
+ }
+
+ // Traverse up the operand chain.
+ if (findNonImmUse(N, Def, ImmedUse, Root, Visited, IgnoreChains))
+ return true;
+ }
+ return false;
+}
+
+/// IsProfitableToFold - Returns true if it's profitable to fold the specific
+/// operand node N of U during instruction selection that starts at Root.
+bool SelectionDAGISel::IsProfitableToFold(SDValue N, SDNode *U,
+ SDNode *Root) const {
+ if (OptLevel == CodeGenOpt::None) return false;
+ return N.hasOneUse();
+}
+
+/// IsLegalToFold - Returns true if the specific operand node N of
+/// U can be folded during instruction selection that starts at Root.
+bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
+ CodeGenOpt::Level OptLevel,
+ bool IgnoreChains) {
+ if (OptLevel == CodeGenOpt::None) return false;
+
+  // If the Root use can somehow reach N through a path that doesn't contain
+  // U, then folding N would create a cycle. e.g. In the following
+  // diagram, Root can reach N through X. If N is folded into Root, then
+ // X is both a predecessor and a successor of U.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ / //
+ // \ / //
+ // [Root*] //
+ //
+ // * indicates nodes to be folded together.
+ //
+ // If Root produces glue, then it gets (even more) interesting. Since it
+ // will be "glued" together with its glue use in the scheduler, we need to
+ // check if it might reach N.
+ //
+ // [N*] //
+ // ^ ^ //
+ // / \ //
+ // [U*] [X]? //
+ // ^ ^ //
+ // \ \ //
+ // \ | //
+ // [Root*] | //
+ // ^ | //
+ // f | //
+ // | / //
+ // [Y] / //
+ // ^ / //
+ // f / //
+ // | / //
+ // [GU] //
+ //
+ // If GU (glue use) indirectly reaches N (the load), and Root folds N
+ // (call it Fold), then X is a predecessor of GU and a successor of
+ // Fold. But since Fold and GU are glued together, this will create
+ // a cycle in the scheduling graph.
+
+ // If the node has glue, walk down the graph to the "lowest" node in the
+  // glued set.
+ EVT VT = Root->getValueType(Root->getNumValues()-1);
+ while (VT == MVT::Glue) {
+ SDNode *GU = findGlueUse(Root);
+ if (!GU)
+ break;
+ Root = GU;
+ VT = Root->getValueType(Root->getNumValues()-1);
+
+ // If our query node has a glue result with a use, we've walked up it. If
+ // the user (which has already been selected) has a chain or indirectly uses
+ // the chain, our WalkChainUsers predicate will not consider it. Because of
+ // this, we cannot ignore chains in this predicate.
+ IgnoreChains = false;
+ }
+
+
+ SmallPtrSet<SDNode*, 16> Visited;
+ return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
+}
+
+void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
+ SDLoc DL(N);
+
+ std::vector<SDValue> Ops(N->op_begin(), N->op_end());
+ SelectInlineAsmMemoryOperands(Ops, DL);
+
+ const EVT VTs[] = {MVT::Other, MVT::Glue};
+ SDValue New = CurDAG->getNode(ISD::INLINEASM, DL, VTs, Ops);
+ New->setNodeId(-1);
+ ReplaceUses(N, New.getNode());
+ CurDAG->RemoveDeadNode(N);
+}
+
+void SelectionDAGISel::Select_READ_REGISTER(SDNode *Op) {
+ SDLoc dl(Op);
+ MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
+ const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ unsigned Reg =
+ TLI->getRegisterByName(RegStr->getString().data(), Op->getValueType(0),
+ *CurDAG);
+ SDValue New = CurDAG->getCopyFromReg(
+ Op->getOperand(0), dl, Reg, Op->getValueType(0));
+ New->setNodeId(-1);
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
+}
+
+void SelectionDAGISel::Select_WRITE_REGISTER(SDNode *Op) {
+ SDLoc dl(Op);
+ MDNodeSDNode *MD = dyn_cast<MDNodeSDNode>(Op->getOperand(1));
+ const MDString *RegStr = dyn_cast<MDString>(MD->getMD()->getOperand(0));
+ unsigned Reg = TLI->getRegisterByName(RegStr->getString().data(),
+ Op->getOperand(2).getValueType(),
+ *CurDAG);
+ SDValue New = CurDAG->getCopyToReg(
+ Op->getOperand(0), dl, Reg, Op->getOperand(2));
+ New->setNodeId(-1);
+ ReplaceUses(Op, New.getNode());
+ CurDAG->RemoveDeadNode(Op);
+}
+
+void SelectionDAGISel::Select_UNDEF(SDNode *N) {
+ CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
+}
+
+/// GetVBR - decode a vbr encoding whose top bit is set.
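+/// For example (illustrative), with Val == 0x83 the continuation byte 0x02 is
+/// read from the table and the result is (0x83 & 127) | (0x02 << 7) == 259.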
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t
+GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
+ assert(Val >= 128 && "Not a VBR");
+ Val &= 127; // Remove first vbr bit.
+
+ unsigned Shift = 7;
+ uint64_t NextBits;
+ do {
+ NextBits = MatcherTable[Idx++];
+ Val |= (NextBits&127) << Shift;
+ Shift += 7;
+ } while (NextBits & 128);
+
+ return Val;
+}
+
+/// When a match is complete, this method updates uses of interior chain results
+/// to use the new results.
+void SelectionDAGISel::UpdateChains(
+ SDNode *NodeToMatch, SDValue InputChain,
+ const SmallVectorImpl<SDNode *> &ChainNodesMatched, bool isMorphNodeTo) {
+ SmallVector<SDNode*, 4> NowDeadNodes;
+
+ // Now that all the normal results are replaced, we replace the chain and
+ // glue results if present.
+ if (!ChainNodesMatched.empty()) {
+ assert(InputChain.getNode() &&
+ "Matched input chains but didn't produce a chain");
+ // Loop over all of the nodes we matched that produced a chain result.
+ // Replace all the chain results with the final chain we ended up with.
+ for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+ SDNode *ChainNode = ChainNodesMatched[i];
+ assert(ChainNode->getOpcode() != ISD::DELETED_NODE &&
+ "Deleted node left in chain");
+
+ // Don't replace the results of the root node if we're doing a
+ // MorphNodeTo.
+ if (ChainNode == NodeToMatch && isMorphNodeTo)
+ continue;
+
+ SDValue ChainVal = SDValue(ChainNode, ChainNode->getNumValues()-1);
+ if (ChainVal.getValueType() == MVT::Glue)
+ ChainVal = ChainVal.getValue(ChainVal->getNumValues()-2);
+ assert(ChainVal.getValueType() == MVT::Other && "Not a chain?");
+ CurDAG->ReplaceAllUsesOfValueWith(ChainVal, InputChain);
+
+ // If the node became dead and we haven't already seen it, delete it.
+ if (ChainNode != NodeToMatch && ChainNode->use_empty() &&
+ !std::count(NowDeadNodes.begin(), NowDeadNodes.end(), ChainNode))
+ NowDeadNodes.push_back(ChainNode);
+ }
+ }
+
+ if (!NowDeadNodes.empty())
+ CurDAG->RemoveDeadNodes(NowDeadNodes);
+
+ DEBUG(dbgs() << "ISEL: Match complete!\n");
+}
+
+enum ChainResult {
+ CR_Simple,
+ CR_InducesCycle,
+ CR_LeadsToInteriorNode
+};
+
+/// WalkChainUsers - Walk down the users of the specified chained node that is
+/// part of the pattern we're matching, looking at all of the users we find.
+/// This determines whether something is an interior node, whether we have a
+/// non-pattern node in between two pattern nodes (which prevents folding because
+/// it would induce a cycle) and whether we have a TokenFactor node sandwiched
+/// between pattern nodes (in which case the TF becomes part of the pattern).
+///
+/// The walk we do here is guaranteed to be small because we quickly get down to
+/// already selected nodes "below" us.
+static ChainResult
+WalkChainUsers(const SDNode *ChainedNode,
+ SmallVectorImpl<SDNode *> &ChainedNodesInPattern,
+ DenseMap<const SDNode *, ChainResult> &TokenFactorResult,
+ SmallVectorImpl<SDNode *> &InteriorChainedNodes) {
+ ChainResult Result = CR_Simple;
+
+ for (SDNode::use_iterator UI = ChainedNode->use_begin(),
+ E = ChainedNode->use_end(); UI != E; ++UI) {
+ // Make sure the use is of the chain, not some other value we produce.
+ if (UI.getUse().getValueType() != MVT::Other) continue;
+
+ SDNode *User = *UI;
+
+ if (User->getOpcode() == ISD::HANDLENODE) // Root of the graph.
+ continue;
+
+ // If we see an already-selected machine node, then we've gone beyond the
+ // pattern that we're selecting down into the already selected chunk of the
+ // DAG.
+ unsigned UserOpcode = User->getOpcode();
+ if (User->isMachineOpcode() ||
+ UserOpcode == ISD::CopyToReg ||
+ UserOpcode == ISD::CopyFromReg ||
+ UserOpcode == ISD::INLINEASM ||
+ UserOpcode == ISD::EH_LABEL ||
+ UserOpcode == ISD::LIFETIME_START ||
+ UserOpcode == ISD::LIFETIME_END) {
+ // If their node ID got reset to -1 then they've already been selected.
+ // Treat them like a MachineOpcode.
+ if (User->getNodeId() == -1)
+ continue;
+ }
+
+ // If we have a TokenFactor, we handle it specially.
+ if (User->getOpcode() != ISD::TokenFactor) {
+ // If the node isn't a token factor and isn't part of our pattern, then it
+ // must be a random chained node in between two nodes we're selecting.
+ // This happens when we have something like:
+ // x = load ptr
+ // call
+ // y = x+4
+ // store y -> ptr
+ // Because we structurally match the load/store as a read/modify/write,
+ // but the call is chained between them. We cannot fold in this case
+ // because it would induce a cycle in the graph.
+ if (!std::count(ChainedNodesInPattern.begin(),
+ ChainedNodesInPattern.end(), User))
+ return CR_InducesCycle;
+
+ // Otherwise we found a node that is part of our pattern. For example in:
+ // x = load ptr
+ // y = x+4
+ // store y -> ptr
+ // This would happen when we're scanning down from the load and see the
+ // store as a user. Record that there is a use of ChainedNode that is
+ // part of the pattern and keep scanning uses.
+ Result = CR_LeadsToInteriorNode;
+ InteriorChainedNodes.push_back(User);
+ continue;
+ }
+
+ // If we found a TokenFactor, there are two cases to consider: first if the
+ // TokenFactor is just hanging "below" the pattern we're matching (i.e. no
+ // uses of the TF are in our pattern) we just want to ignore it. Second,
+ // the TokenFactor can be sandwiched in between two chained nodes, like so:
+ // [Load chain]
+ // ^
+ // |
+ // [Load]
+ // ^ ^
+ // | \ DAG's like cheese
+ // / \ do you?
+ // / |
+ // [TokenFactor] [Op]
+ // ^ ^
+ // | |
+ // \ /
+ // \ /
+ // [Store]
+ //
+ // In this case, the TokenFactor becomes part of our match and we rewrite it
+ // as a new TokenFactor.
+ //
+ // To distinguish these two cases, do a recursive walk down the uses.
+ auto MemoizeResult = TokenFactorResult.find(User);
+ bool Visited = MemoizeResult != TokenFactorResult.end();
+ // Recursively walk chain users only if the result is not memoized.
+ if (!Visited) {
+ auto Res = WalkChainUsers(User, ChainedNodesInPattern, TokenFactorResult,
+ InteriorChainedNodes);
+ MemoizeResult = TokenFactorResult.insert(std::make_pair(User, Res)).first;
+ }
+ switch (MemoizeResult->second) {
+ case CR_Simple:
+ // If the uses of the TokenFactor are just already-selected nodes, ignore
+ // it, it is "below" our pattern.
+ continue;
+ case CR_InducesCycle:
+ // If the uses of the TokenFactor lead to nodes that are not part of our
+ // pattern that are not selected, folding would turn this into a cycle,
+ // bail out now.
+ return CR_InducesCycle;
+ case CR_LeadsToInteriorNode:
+ break; // Otherwise, keep processing.
+ }
+
+ // Okay, we know we're in the interesting interior case. The TokenFactor
+ // is now going to be considered part of the pattern so that we rewrite its
+ // uses (it may have uses that are not part of the pattern) with the
+ // ultimate chain result of the generated code. We will also add its chain
+ // inputs as inputs to the ultimate TokenFactor we create.
+ Result = CR_LeadsToInteriorNode;
+ if (!Visited) {
+ ChainedNodesInPattern.push_back(User);
+ InteriorChainedNodes.push_back(User);
+ }
+ }
+
+ return Result;
+}
+
+/// HandleMergeInputChains - This implements the OPC_EmitMergeInputChains
+/// operation for when the pattern matched at least one node with a chain. The
+/// input vector contains a list of all of the chained nodes that we match. We
+/// must determine if this is a valid thing to cover (i.e. matching it won't
+/// induce cycles in the DAG) and if so, create a TokenFactor node that will
+/// be used as the input node chain for the generated nodes.
+static SDValue
+HandleMergeInputChains(SmallVectorImpl<SDNode*> &ChainNodesMatched,
+ SelectionDAG *CurDAG) {
+ // Used for memoization. Without it WalkChainUsers could take exponential
+ // time to run.
+ DenseMap<const SDNode *, ChainResult> TokenFactorResult;
+ // Walk all of the chained nodes we've matched, recursively scanning down the
+ // users of the chain result. This adds any TokenFactor nodes that are caught
+ // in between chained nodes to the chained and interior nodes list.
+ SmallVector<SDNode*, 3> InteriorChainedNodes;
+ for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+ if (WalkChainUsers(ChainNodesMatched[i], ChainNodesMatched,
+ TokenFactorResult,
+ InteriorChainedNodes) == CR_InducesCycle)
+ return SDValue(); // Would induce a cycle.
+ }
+
+ // Okay, we have walked all the matched nodes and collected TokenFactor nodes
+ // that we are interested in. Form our input TokenFactor node.
+ SmallVector<SDValue, 3> InputChains;
+ for (unsigned i = 0, e = ChainNodesMatched.size(); i != e; ++i) {
+ // Add the input chain of this node to the InputChains list (which will be
+ // the operands of the generated TokenFactor) if it's not an interior node.
+ SDNode *N = ChainNodesMatched[i];
+ if (N->getOpcode() != ISD::TokenFactor) {
+ if (std::count(InteriorChainedNodes.begin(),InteriorChainedNodes.end(),N))
+ continue;
+
+ // Otherwise, add the input chain.
+ SDValue InChain = ChainNodesMatched[i]->getOperand(0);
+ assert(InChain.getValueType() == MVT::Other && "Not a chain");
+ InputChains.push_back(InChain);
+ continue;
+ }
+
+ // If we have a token factor, we want to add all inputs of the token factor
+ // that are not part of the pattern we're matching.
+ for (const SDValue &Op : N->op_values()) {
+ if (!std::count(ChainNodesMatched.begin(), ChainNodesMatched.end(),
+ Op.getNode()))
+ InputChains.push_back(Op);
+ }
+ }
+
+ if (InputChains.size() == 1)
+ return InputChains[0];
+ return CurDAG->getNode(ISD::TokenFactor, SDLoc(ChainNodesMatched[0]),
+ MVT::Other, InputChains);
+}
+
+/// MorphNode - Handle morphing a node in place for the selector.
+SDNode *SelectionDAGISel::
+MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
+ ArrayRef<SDValue> Ops, unsigned EmitNodeInfo) {
+ // It is possible we're using MorphNodeTo to replace a node with no
+ // normal results with one that has a normal result (or we could be
+ // adding a chain) and the input could have glue and chains as well.
+ // In this case we need to shift the operands down.
+ // FIXME: This is a horrible hack and broken in obscure cases, no worse
+ // than the old isel though.
+ int OldGlueResultNo = -1, OldChainResultNo = -1;
+
+ unsigned NTMNumResults = Node->getNumValues();
+ if (Node->getValueType(NTMNumResults-1) == MVT::Glue) {
+ OldGlueResultNo = NTMNumResults-1;
+ if (NTMNumResults != 1 &&
+ Node->getValueType(NTMNumResults-2) == MVT::Other)
+ OldChainResultNo = NTMNumResults-2;
+ } else if (Node->getValueType(NTMNumResults-1) == MVT::Other)
+ OldChainResultNo = NTMNumResults-1;
+
+ // Call the underlying SelectionDAG routine to do the transmogrification. Note
+ // that this deletes operands of the old node that become dead.
+ SDNode *Res = CurDAG->MorphNodeTo(Node, ~TargetOpc, VTList, Ops);
+
+ // MorphNodeTo can operate in two ways: if an existing node with the
+ // specified operands exists, it can just return it. Otherwise, it
+ // updates the node in place to have the requested operands.
+ if (Res == Node) {
+ // If we updated the node in place, reset the node ID. To the isel,
+ // this should be just like a newly allocated machine node.
+ Res->setNodeId(-1);
+ }
+
+ unsigned ResNumResults = Res->getNumValues();
+ // Move the glue if needed.
+ if ((EmitNodeInfo & OPFL_GlueOutput) && OldGlueResultNo != -1 &&
+ (unsigned)OldGlueResultNo != ResNumResults-1)
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldGlueResultNo),
+ SDValue(Res, ResNumResults-1));
+
+ if ((EmitNodeInfo & OPFL_GlueOutput) != 0)
+ --ResNumResults;
+
+ // Move the chain reference if needed.
+ if ((EmitNodeInfo & OPFL_Chain) && OldChainResultNo != -1 &&
+ (unsigned)OldChainResultNo != ResNumResults-1)
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(Node, OldChainResultNo),
+ SDValue(Res, ResNumResults-1));
+
+  // If MorphNodeTo returned a pre-existing equivalent node rather than morphing
+  // Node in place, replace uses of the old node with the new one and delete the
+  // old node.
+ if (Res != Node) {
+ CurDAG->ReplaceAllUsesWith(Node, Res);
+ CurDAG->RemoveDeadNode(Node);
+ }
+
+ return Res;
+}
+
+/// CheckSame - Implements OP_CheckSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ // Accept if it is exactly the same as a previously recorded node.
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
+ return N == RecordedNodes[RecNo].first;
+}
+
+/// CheckChildSame - Implements OP_CheckChildXSame.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N,
+ const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
+ unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
+ RecordedNodes);
+}
+
+/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckPatternPredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ const SelectionDAGISel &SDISel) {
+ return SDISel.CheckPatternPredicate(MatcherTable[MatcherIndex++]);
+}
+
+/// CheckNodePredicate - Implements OP_CheckNodePredicate.
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckNodePredicate(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ const SelectionDAGISel &SDISel, SDNode *N) {
+ return SDISel.CheckNodePredicate(N, MatcherTable[MatcherIndex++]);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckOpcode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDNode *N) {
+ uint16_t Opc = MatcherTable[MatcherIndex++];
+ Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+ return N->getOpcode() == Opc;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckType(const unsigned char *MatcherTable, unsigned &MatcherIndex, SDValue N,
+ const TargetLowering *TLI, const DataLayout &DL) {
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (N.getValueType() == VT) return true;
+
+ // Handle the case when VT is iPTR.
+ return VT == MVT::iPTR && N.getValueType() == TLI->getPointerTy(DL);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckChildType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL,
+ unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckType(MatcherTable, MatcherIndex, N.getOperand(ChildNo), TLI,
+ DL);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckCondCode(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N) {
+ return cast<CondCodeSDNode>(N)->get() ==
+ (ISD::CondCode)MatcherTable[MatcherIndex++];
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckValueType(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const TargetLowering *TLI, const DataLayout &DL) {
+ MVT::SimpleValueType VT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (cast<VTSDNode>(N)->getVT() == VT)
+ return true;
+
+ // Handle the case when VT is iPTR.
+ return VT == MVT::iPTR && cast<VTSDNode>(N)->getVT() == TLI->getPointerTy(DL);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N);
+ return C && C->getSExtValue() == Val;
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckChildInteger(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, unsigned ChildNo) {
+ if (ChildNo >= N.getNumOperands())
+ return false; // Match fails if out of range child #.
+ return ::CheckInteger(MatcherTable, MatcherIndex, N.getOperand(ChildNo));
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckAndImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const SelectionDAGISel &SDISel) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ if (N->getOpcode() != ISD::AND) return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ return C && SDISel.CheckAndMask(N.getOperand(0), C, Val);
+}
+
+LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
+CheckOrImm(const unsigned char *MatcherTable, unsigned &MatcherIndex,
+ SDValue N, const SelectionDAGISel &SDISel) {
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+
+ if (N->getOpcode() != ISD::OR) return false;
+
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ return C && SDISel.CheckOrMask(N.getOperand(0), C, Val);
+}
+
+/// IsPredicateKnownToFail - If we know how and can do so without pushing a
+/// scope, evaluate the current node. If the current predicate is known to
+/// fail, set Result=true and return anything. If the current predicate is
+/// known to pass or cannot be evaluated here, set Result=false and return
+/// the MatcherIndex to continue with.
+static unsigned IsPredicateKnownToFail(const unsigned char *Table,
+ unsigned Index, SDValue N,
+ bool &Result,
+ const SelectionDAGISel &SDISel,
+ SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+ switch (Table[Index++]) {
+ default:
+ Result = false;
+ return Index-1; // Could not evaluate this predicate.
+ case SelectionDAGISel::OPC_CheckSame:
+ Result = !::CheckSame(Table, Index, N, RecordedNodes);
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild0Same:
+ case SelectionDAGISel::OPC_CheckChild1Same:
+ case SelectionDAGISel::OPC_CheckChild2Same:
+ case SelectionDAGISel::OPC_CheckChild3Same:
+ Result = !::CheckChildSame(Table, Index, N, RecordedNodes,
+ Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Same);
+ return Index;
+ case SelectionDAGISel::OPC_CheckPatternPredicate:
+ Result = !::CheckPatternPredicate(Table, Index, SDISel);
+ return Index;
+ case SelectionDAGISel::OPC_CheckPredicate:
+ Result = !::CheckNodePredicate(Table, Index, SDISel, N.getNode());
+ return Index;
+ case SelectionDAGISel::OPC_CheckOpcode:
+ Result = !::CheckOpcode(Table, Index, N.getNode());
+ return Index;
+ case SelectionDAGISel::OPC_CheckType:
+ Result = !::CheckType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild0Type:
+ case SelectionDAGISel::OPC_CheckChild1Type:
+ case SelectionDAGISel::OPC_CheckChild2Type:
+ case SelectionDAGISel::OPC_CheckChild3Type:
+ case SelectionDAGISel::OPC_CheckChild4Type:
+ case SelectionDAGISel::OPC_CheckChild5Type:
+ case SelectionDAGISel::OPC_CheckChild6Type:
+ case SelectionDAGISel::OPC_CheckChild7Type:
+ Result = !::CheckChildType(
+ Table, Index, N, SDISel.TLI, SDISel.CurDAG->getDataLayout(),
+ Table[Index - 1] - SelectionDAGISel::OPC_CheckChild0Type);
+ return Index;
+ case SelectionDAGISel::OPC_CheckCondCode:
+ Result = !::CheckCondCode(Table, Index, N);
+ return Index;
+ case SelectionDAGISel::OPC_CheckValueType:
+ Result = !::CheckValueType(Table, Index, N, SDISel.TLI,
+ SDISel.CurDAG->getDataLayout());
+ return Index;
+ case SelectionDAGISel::OPC_CheckInteger:
+ Result = !::CheckInteger(Table, Index, N);
+ return Index;
+ case SelectionDAGISel::OPC_CheckChild0Integer:
+ case SelectionDAGISel::OPC_CheckChild1Integer:
+ case SelectionDAGISel::OPC_CheckChild2Integer:
+ case SelectionDAGISel::OPC_CheckChild3Integer:
+ case SelectionDAGISel::OPC_CheckChild4Integer:
+ Result = !::CheckChildInteger(Table, Index, N,
+ Table[Index-1] - SelectionDAGISel::OPC_CheckChild0Integer);
+ return Index;
+ case SelectionDAGISel::OPC_CheckAndImm:
+ Result = !::CheckAndImm(Table, Index, N, SDISel);
+ return Index;
+ case SelectionDAGISel::OPC_CheckOrImm:
+ Result = !::CheckOrImm(Table, Index, N, SDISel);
+ return Index;
+ }
+}
+
+namespace {
+struct MatchScope {
+ /// FailIndex - If this match fails, this is the index to continue with.
+ unsigned FailIndex;
+
+ /// NodeStack - The node stack when the scope was formed.
+ SmallVector<SDValue, 4> NodeStack;
+
+ /// NumRecordedNodes - The number of recorded nodes when the scope was formed.
+ unsigned NumRecordedNodes;
+
+ /// NumMatchedMemRefs - The number of matched memref entries.
+ unsigned NumMatchedMemRefs;
+
+ /// InputChain/InputGlue - The current chain/glue
+ SDValue InputChain, InputGlue;
+
+ /// HasChainNodesMatched - True if the ChainNodesMatched list is non-empty.
+ bool HasChainNodesMatched;
+};
+
+/// \brief A DAG update listener to keep the matching state
+/// (i.e. RecordedNodes and MatchScope) up to date if the target is allowed to
+/// change the DAG while matching. The X86 addressing mode matcher is one
+/// example of this.
+class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
+{
+ SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes;
+ SmallVectorImpl<MatchScope> &MatchScopes;
+public:
+ MatchStateUpdater(SelectionDAG &DAG,
+ SmallVectorImpl<std::pair<SDValue, SDNode*> > &RN,
+ SmallVectorImpl<MatchScope> &MS) :
+ SelectionDAG::DAGUpdateListener(DAG),
+ RecordedNodes(RN), MatchScopes(MS) { }
+
+ void NodeDeleted(SDNode *N, SDNode *E) override {
+ // Some early-returns here to avoid the search if we deleted the node or
+ // if the update comes from MorphNodeTo (MorphNodeTo is the last thing we
+ // do, so it's unnecessary to update matching state at that point).
+ // Neither of these can occur currently because we only install this
+    // update listener while matching a complex pattern.
+ if (!E || E->isMachineOpcode())
+ return;
+ // Performing linear search here does not matter because we almost never
+ // run this code. You'd have to have a CSE during complex pattern
+ // matching.
+ for (auto &I : RecordedNodes)
+ if (I.first.getNode() == N)
+ I.first.setNode(E);
+
+ for (auto &I : MatchScopes)
+ for (auto &J : I.NodeStack)
+ if (J.getNode() == N)
+ J.setNode(E);
+ }
+};
+} // end anonymous namespace
+
+void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
+ const unsigned char *MatcherTable,
+ unsigned TableSize) {
+ // FIXME: Should these even be selected? Handle these cases in the caller?
+ switch (NodeToMatch->getOpcode()) {
+ default:
+ break;
+ case ISD::EntryToken: // These nodes remain the same.
+ case ISD::BasicBlock:
+ case ISD::Register:
+ case ISD::RegisterMask:
+ case ISD::HANDLENODE:
+ case ISD::MDNODE_SDNODE:
+ case ISD::TargetConstant:
+ case ISD::TargetConstantFP:
+ case ISD::TargetConstantPool:
+ case ISD::TargetFrameIndex:
+ case ISD::TargetExternalSymbol:
+ case ISD::MCSymbol:
+ case ISD::TargetBlockAddress:
+ case ISD::TargetJumpTable:
+ case ISD::TargetGlobalTLSAddress:
+ case ISD::TargetGlobalAddress:
+ case ISD::TokenFactor:
+ case ISD::CopyFromReg:
+ case ISD::CopyToReg:
+ case ISD::EH_LABEL:
+ case ISD::LIFETIME_START:
+ case ISD::LIFETIME_END:
+ NodeToMatch->setNodeId(-1); // Mark selected.
+ return;
+ case ISD::AssertSext:
+ case ISD::AssertZext:
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, 0),
+ NodeToMatch->getOperand(0));
+ CurDAG->RemoveDeadNode(NodeToMatch);
+ return;
+ case ISD::INLINEASM:
+ Select_INLINEASM(NodeToMatch);
+ return;
+ case ISD::READ_REGISTER:
+ Select_READ_REGISTER(NodeToMatch);
+ return;
+ case ISD::WRITE_REGISTER:
+ Select_WRITE_REGISTER(NodeToMatch);
+ return;
+ case ISD::UNDEF:
+ Select_UNDEF(NodeToMatch);
+ return;
+ }
+
+ assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
+
+ // Set up the node stack with NodeToMatch as the only node on the stack.
+ SmallVector<SDValue, 8> NodeStack;
+ SDValue N = SDValue(NodeToMatch, 0);
+ NodeStack.push_back(N);
+
+ // MatchScopes - Scopes used when matching, if a match failure happens, this
+ // indicates where to continue checking.
+ SmallVector<MatchScope, 8> MatchScopes;
+
+ // RecordedNodes - This is the set of nodes that have been recorded by the
+ // state machine. The second value is the parent of the node, or null if the
+ // root is recorded.
+ SmallVector<std::pair<SDValue, SDNode*>, 8> RecordedNodes;
+
+ // MatchedMemRefs - This is the set of MemRef's we've seen in the input
+ // pattern.
+ SmallVector<MachineMemOperand*, 2> MatchedMemRefs;
+
+ // These are the current input chain and glue for use when generating nodes.
+ // Various Emit operations change these. For example, emitting a copytoreg
+ // uses and updates these.
+ SDValue InputChain, InputGlue;
+
+ // ChainNodesMatched - If a pattern matches nodes that have input/output
+ // chains, the OPC_EmitMergeInputChains operation is emitted which indicates
+ // which ones they are. The result is captured into this list so that we can
+ // update the chain results when the pattern is complete.
+ SmallVector<SDNode*, 3> ChainNodesMatched;
+
+ DEBUG(dbgs() << "ISEL: Starting pattern match on root node: ";
+ NodeToMatch->dump(CurDAG);
+ dbgs() << '\n');
+
+ // Determine where to start the interpreter. Normally we start at opcode #0,
+ // but if the state machine starts with an OPC_SwitchOpcode, then we
+ // accelerate the first lookup (which is guaranteed to be hot) with the
+ // OpcodeOffset table.
+ unsigned MatcherIndex = 0;
+
+ if (!OpcodeOffset.empty()) {
+ // Already computed the OpcodeOffset table, just index into it.
+ if (N.getOpcode() < OpcodeOffset.size())
+ MatcherIndex = OpcodeOffset[N.getOpcode()];
+ DEBUG(dbgs() << " Initial Opcode index to " << MatcherIndex << "\n");
+
+ } else if (MatcherTable[0] == OPC_SwitchOpcode) {
+ // Otherwise, the table isn't computed, but the state machine does start
+ // with an OPC_SwitchOpcode instruction. Populate the table now, since this
+ // is the first time we're selecting an instruction.
+ unsigned Idx = 1;
+ while (1) {
+ // Get the size of this case.
+ unsigned CaseSize = MatcherTable[Idx++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, Idx);
+ if (CaseSize == 0) break;
+
+ // Get the opcode, add the index to the table.
+ uint16_t Opc = MatcherTable[Idx++];
+ Opc |= (unsigned short)MatcherTable[Idx++] << 8;
+ if (Opc >= OpcodeOffset.size())
+ OpcodeOffset.resize((Opc+1)*2);
+ OpcodeOffset[Opc] = Idx;
+ Idx += CaseSize;
+ }
+
+ // Okay, do the lookup for the first opcode.
+ if (N.getOpcode() < OpcodeOffset.size())
+ MatcherIndex = OpcodeOffset[N.getOpcode()];
+ }
+
+ while (1) {
+ assert(MatcherIndex < TableSize && "Invalid index");
+#ifndef NDEBUG
+ unsigned CurrentOpcodeIndex = MatcherIndex;
+#endif
+ BuiltinOpcodes Opcode = (BuiltinOpcodes)MatcherTable[MatcherIndex++];
+ switch (Opcode) {
+ case OPC_Scope: {
+ // Okay, the semantics of this operation are that we should push a scope
+ // then evaluate the first child. However, pushing a scope only to have
+ // the first check fail (which then pops it) is inefficient. If we can
+ // determine immediately that the first check (or first several) will
+ // immediately fail, don't even bother pushing a scope for them.
+ unsigned FailIndex;
+
+ while (1) {
+ unsigned NumToSkip = MatcherTable[MatcherIndex++];
+ if (NumToSkip & 128)
+ NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+ // Found the end of the scope with no match.
+ if (NumToSkip == 0) {
+ FailIndex = 0;
+ break;
+ }
+
+ FailIndex = MatcherIndex+NumToSkip;
+
+ unsigned MatcherIndexOfPredicate = MatcherIndex;
+ (void)MatcherIndexOfPredicate; // silence warning.
+
+ // If we can't evaluate this predicate without pushing a scope (e.g. if
+ // it is a 'MoveParent') or if the predicate succeeds on this node, we
+ // push the scope and evaluate the full predicate chain.
+ bool Result;
+ MatcherIndex = IsPredicateKnownToFail(MatcherTable, MatcherIndex, N,
+ Result, *this, RecordedNodes);
+ if (!Result)
+ break;
+
+ DEBUG(dbgs() << " Skipped scope entry (due to false predicate) at "
+ << "index " << MatcherIndexOfPredicate
+ << ", continuing at " << FailIndex << "\n");
+ ++NumDAGIselRetries;
+
+ // Otherwise, we know that this case of the Scope is guaranteed to fail,
+ // move to the next case.
+ MatcherIndex = FailIndex;
+ }
+
+ // If the whole scope failed to match, bail.
+ if (FailIndex == 0) break;
+
+ // Push a MatchScope which indicates where to go if the first child fails
+ // to match.
+ MatchScope NewEntry;
+ NewEntry.FailIndex = FailIndex;
+ NewEntry.NodeStack.append(NodeStack.begin(), NodeStack.end());
+ NewEntry.NumRecordedNodes = RecordedNodes.size();
+ NewEntry.NumMatchedMemRefs = MatchedMemRefs.size();
+ NewEntry.InputChain = InputChain;
+ NewEntry.InputGlue = InputGlue;
+ NewEntry.HasChainNodesMatched = !ChainNodesMatched.empty();
+ MatchScopes.push_back(NewEntry);
+ continue;
+ }
+ case OPC_RecordNode: {
+ // Remember this node, it may end up being an operand in the pattern.
+ SDNode *Parent = nullptr;
+ if (NodeStack.size() > 1)
+ Parent = NodeStack[NodeStack.size()-2].getNode();
+ RecordedNodes.push_back(std::make_pair(N, Parent));
+ continue;
+ }
+
+ case OPC_RecordChild0: case OPC_RecordChild1:
+ case OPC_RecordChild2: case OPC_RecordChild3:
+ case OPC_RecordChild4: case OPC_RecordChild5:
+ case OPC_RecordChild6: case OPC_RecordChild7: {
+ unsigned ChildNo = Opcode-OPC_RecordChild0;
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+
+ RecordedNodes.push_back(std::make_pair(N->getOperand(ChildNo),
+ N.getNode()));
+ continue;
+ }
+ case OPC_RecordMemRef:
+ MatchedMemRefs.push_back(cast<MemSDNode>(N)->getMemOperand());
+ continue;
+
+ case OPC_CaptureGlueInput:
+ // If the current node has an input glue, capture it in InputGlue.
+ if (N->getNumOperands() != 0 &&
+ N->getOperand(N->getNumOperands()-1).getValueType() == MVT::Glue)
+ InputGlue = N->getOperand(N->getNumOperands()-1);
+ continue;
+
+ case OPC_MoveChild: {
+ unsigned ChildNo = MatcherTable[MatcherIndex++];
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+ N = N.getOperand(ChildNo);
+ NodeStack.push_back(N);
+ continue;
+ }
+
+ case OPC_MoveChild0: case OPC_MoveChild1:
+ case OPC_MoveChild2: case OPC_MoveChild3:
+ case OPC_MoveChild4: case OPC_MoveChild5:
+ case OPC_MoveChild6: case OPC_MoveChild7: {
+ unsigned ChildNo = Opcode-OPC_MoveChild0;
+ if (ChildNo >= N.getNumOperands())
+ break; // Match fails if out of range child #.
+ N = N.getOperand(ChildNo);
+ NodeStack.push_back(N);
+ continue;
+ }
+
+ case OPC_MoveParent:
+ // Pop the current node off the NodeStack.
+ NodeStack.pop_back();
+ assert(!NodeStack.empty() && "Node stack imbalance!");
+ N = NodeStack.back();
+ continue;
+
+ case OPC_CheckSame:
+ if (!::CheckSame(MatcherTable, MatcherIndex, N, RecordedNodes)) break;
+ continue;
+
+ case OPC_CheckChild0Same: case OPC_CheckChild1Same:
+ case OPC_CheckChild2Same: case OPC_CheckChild3Same:
+ if (!::CheckChildSame(MatcherTable, MatcherIndex, N, RecordedNodes,
+ Opcode-OPC_CheckChild0Same))
+ break;
+ continue;
+
+ case OPC_CheckPatternPredicate:
+ if (!::CheckPatternPredicate(MatcherTable, MatcherIndex, *this)) break;
+ continue;
+ case OPC_CheckPredicate:
+ if (!::CheckNodePredicate(MatcherTable, MatcherIndex, *this,
+ N.getNode()))
+ break;
+ continue;
+ case OPC_CheckComplexPat: {
+ unsigned CPNum = MatcherTable[MatcherIndex++];
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid CheckComplexPat");
+
+ // If target can modify DAG during matching, keep the matching state
+ // consistent.
+ std::unique_ptr<MatchStateUpdater> MSU;
+ if (ComplexPatternFuncMutatesDAG())
+ MSU.reset(new MatchStateUpdater(*CurDAG, RecordedNodes,
+ MatchScopes));
+
+ if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
+ RecordedNodes[RecNo].first, CPNum,
+ RecordedNodes))
+ break;
+ continue;
+ }
+ case OPC_CheckOpcode:
+ if (!::CheckOpcode(MatcherTable, MatcherIndex, N.getNode())) break;
+ continue;
+
+ case OPC_CheckType:
+ if (!::CheckType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
+ break;
+ continue;
+
+ case OPC_SwitchOpcode: {
+ unsigned CurNodeOpcode = N.getOpcode();
+ unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+ unsigned CaseSize;
+ while (1) {
+ // Get the size of this case.
+ CaseSize = MatcherTable[MatcherIndex++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+ if (CaseSize == 0) break;
+
+ uint16_t Opc = MatcherTable[MatcherIndex++];
+ Opc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+
+ // If the opcode matches, then we will execute this case.
+ if (CurNodeOpcode == Opc)
+ break;
+
+ // Otherwise, skip over this case.
+ MatcherIndex += CaseSize;
+ }
+
+ // If no cases matched, bail out.
+ if (CaseSize == 0) break;
+
+ // Otherwise, execute the case we found.
+ DEBUG(dbgs() << " OpcodeSwitch from " << SwitchStart
+ << " to " << MatcherIndex << "\n");
+ continue;
+ }
+
+ case OPC_SwitchType: {
+ MVT CurNodeVT = N.getSimpleValueType();
+ unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
+ unsigned CaseSize;
+ while (1) {
+ // Get the size of this case.
+ CaseSize = MatcherTable[MatcherIndex++];
+ if (CaseSize & 128)
+ CaseSize = GetVBR(CaseSize, MatcherTable, MatcherIndex);
+ if (CaseSize == 0) break;
+
+ MVT CaseVT = (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (CaseVT == MVT::iPTR)
+ CaseVT = TLI->getPointerTy(CurDAG->getDataLayout());
+
+ // If the VT matches, then we will execute this case.
+ if (CurNodeVT == CaseVT)
+ break;
+
+ // Otherwise, skip over this case.
+ MatcherIndex += CaseSize;
+ }
+
+ // If no cases matched, bail out.
+ if (CaseSize == 0) break;
+
+ // Otherwise, execute the case we found.
+ DEBUG(dbgs() << " TypeSwitch[" << EVT(CurNodeVT).getEVTString()
+ << "] from " << SwitchStart << " to " << MatcherIndex<<'\n');
+ continue;
+ }
+ case OPC_CheckChild0Type: case OPC_CheckChild1Type:
+ case OPC_CheckChild2Type: case OPC_CheckChild3Type:
+ case OPC_CheckChild4Type: case OPC_CheckChild5Type:
+ case OPC_CheckChild6Type: case OPC_CheckChild7Type:
+ if (!::CheckChildType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout(),
+ Opcode - OPC_CheckChild0Type))
+ break;
+ continue;
+ case OPC_CheckCondCode:
+ if (!::CheckCondCode(MatcherTable, MatcherIndex, N)) break;
+ continue;
+ case OPC_CheckValueType:
+ if (!::CheckValueType(MatcherTable, MatcherIndex, N, TLI,
+ CurDAG->getDataLayout()))
+ break;
+ continue;
+ case OPC_CheckInteger:
+ if (!::CheckInteger(MatcherTable, MatcherIndex, N)) break;
+ continue;
+ case OPC_CheckChild0Integer: case OPC_CheckChild1Integer:
+ case OPC_CheckChild2Integer: case OPC_CheckChild3Integer:
+ case OPC_CheckChild4Integer:
+ if (!::CheckChildInteger(MatcherTable, MatcherIndex, N,
+ Opcode-OPC_CheckChild0Integer)) break;
+ continue;
+ case OPC_CheckAndImm:
+ if (!::CheckAndImm(MatcherTable, MatcherIndex, N, *this)) break;
+ continue;
+ case OPC_CheckOrImm:
+ if (!::CheckOrImm(MatcherTable, MatcherIndex, N, *this)) break;
+ continue;
+
+ case OPC_CheckFoldableChainNode: {
+ assert(NodeStack.size() != 1 && "No parent node");
+ // Verify that all intermediate nodes between the root and this one have
+ // a single use.
+ bool HasMultipleUses = false;
+ for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
+ if (!NodeStack[i].hasOneUse()) {
+ HasMultipleUses = true;
+ break;
+ }
+ if (HasMultipleUses) break;
+
+ // Check to see that the target thinks this is profitable to fold and that
+ // we can fold it without inducing cycles in the graph.
+ if (!IsProfitableToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+ NodeToMatch) ||
+ !IsLegalToFold(N, NodeStack[NodeStack.size()-2].getNode(),
+ NodeToMatch, OptLevel,
+ true/*We validate our own chains*/))
+ break;
+
+ continue;
+ }
+ case OPC_EmitInteger: {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ int64_t Val = MatcherTable[MatcherIndex++];
+ if (Val & 128)
+ Val = GetVBR(Val, MatcherTable, MatcherIndex);
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getTargetConstant(Val, SDLoc(NodeToMatch),
+ VT), nullptr));
+ continue;
+ }
+ case OPC_EmitRegister: {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ unsigned RegNo = MatcherTable[MatcherIndex++];
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getRegister(RegNo, VT), nullptr));
+ continue;
+ }
+ case OPC_EmitRegister2: {
+ // For targets w/ more than 256 register names, the register enum
+ // values are stored in two bytes in the matcher table (just like
+ // opcodes).
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ unsigned RegNo = MatcherTable[MatcherIndex++];
+ RegNo |= MatcherTable[MatcherIndex++] << 8;
+ RecordedNodes.push_back(std::pair<SDValue, SDNode*>(
+ CurDAG->getRegister(RegNo, VT), nullptr));
+ continue;
+ }
+
+ case OPC_EmitConvertToTarget: {
+ // Convert from IMM/FPIMM to target version.
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitConvertToTarget");
+ SDValue Imm = RecordedNodes[RecNo].first;
+
+ if (Imm->getOpcode() == ISD::Constant) {
+ const ConstantInt *Val=cast<ConstantSDNode>(Imm)->getConstantIntValue();
+ Imm = CurDAG->getTargetConstant(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
+ } else if (Imm->getOpcode() == ISD::ConstantFP) {
+ const ConstantFP *Val=cast<ConstantFPSDNode>(Imm)->getConstantFPValue();
+ Imm = CurDAG->getTargetConstantFP(*Val, SDLoc(NodeToMatch),
+ Imm.getValueType());
+ }
+
+ RecordedNodes.push_back(std::make_pair(Imm, RecordedNodes[RecNo].second));
+ continue;
+ }
+
+ case OPC_EmitMergeInputChains1_0: // OPC_EmitMergeInputChains, 1, 0
+ case OPC_EmitMergeInputChains1_1: // OPC_EmitMergeInputChains, 1, 1
+ case OPC_EmitMergeInputChains1_2: { // OPC_EmitMergeInputChains, 1, 2
+ // These are space-optimized forms of OPC_EmitMergeInputChains.
+ assert(!InputChain.getNode() &&
+ "EmitMergeInputChains should be the first chain producing node");
+ assert(ChainNodesMatched.empty() &&
+ "Should only have one EmitMergeInputChains per match");
+
+ // Read all of the chained nodes.
+ unsigned RecNo = Opcode - OPC_EmitMergeInputChains1_0;
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+ ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+ // FIXME: What if other value results of the node have uses not matched
+ // by this pattern?
+ if (ChainNodesMatched.back() != NodeToMatch &&
+ !RecordedNodes[RecNo].first.hasOneUse()) {
+ ChainNodesMatched.clear();
+ break;
+ }
+
+ // Merge the input chains if they are not intra-pattern references.
+ InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+ if (!InputChain.getNode())
+ break; // Failed to merge.
+ continue;
+ }
+
+ case OPC_EmitMergeInputChains: {
+ assert(!InputChain.getNode() &&
+ "EmitMergeInputChains should be the first chain producing node");
+ // This node gets a list of nodes we matched in the input that have
+ // chains. We want to token factor all of the input chains to these nodes
+ // together. However, if any of the input chains is actually one of the
+ // nodes matched in this pattern, then we have an intra-match reference.
+ // Ignore these because the newly token factored chain should not refer to
+ // the old nodes.
+ unsigned NumChains = MatcherTable[MatcherIndex++];
+ assert(NumChains != 0 && "Can't TF zero chains");
+
+ assert(ChainNodesMatched.empty() &&
+ "Should only have one EmitMergeInputChains per match");
+
+ // Read all of the chained nodes.
+ for (unsigned i = 0; i != NumChains; ++i) {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitMergeInputChains");
+ ChainNodesMatched.push_back(RecordedNodes[RecNo].first.getNode());
+
+ // FIXME: What if other value results of the node have uses not matched
+ // by this pattern?
+ if (ChainNodesMatched.back() != NodeToMatch &&
+ !RecordedNodes[RecNo].first.hasOneUse()) {
+ ChainNodesMatched.clear();
+ break;
+ }
+ }
+
+ // If the inner loop broke out, the match fails.
+ if (ChainNodesMatched.empty())
+ break;
+
+ // Merge the input chains if they are not intra-pattern references.
+ InputChain = HandleMergeInputChains(ChainNodesMatched, CurDAG);
+
+ if (!InputChain.getNode())
+ break; // Failed to merge.
+
+ continue;
+ }
+
+ case OPC_EmitCopyToReg: {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitCopyToReg");
+ unsigned DestPhysReg = MatcherTable[MatcherIndex++];
+
+ if (!InputChain.getNode())
+ InputChain = CurDAG->getEntryNode();
+
+ InputChain = CurDAG->getCopyToReg(InputChain, SDLoc(NodeToMatch),
+ DestPhysReg, RecordedNodes[RecNo].first,
+ InputGlue);
+
+ InputGlue = InputChain.getValue(1);
+ continue;
+ }
+
+ case OPC_EmitNodeXForm: {
+ unsigned XFormNo = MatcherTable[MatcherIndex++];
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitNodeXForm");
+ SDValue Res = RunSDNodeXForm(RecordedNodes[RecNo].first, XFormNo);
+ RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));
+ continue;
+ }
+
+ case OPC_EmitNode: case OPC_MorphNodeTo:
+ case OPC_EmitNode0: case OPC_EmitNode1: case OPC_EmitNode2:
+ case OPC_MorphNodeTo0: case OPC_MorphNodeTo1: case OPC_MorphNodeTo2: {
+ uint16_t TargetOpc = MatcherTable[MatcherIndex++];
+ TargetOpc |= (unsigned short)MatcherTable[MatcherIndex++] << 8;
+ unsigned EmitNodeInfo = MatcherTable[MatcherIndex++];
+ // Get the result VT list.
+ unsigned NumVTs;
+ // If this is one of the compressed forms, get the number of VTs based
+ // on the Opcode. Otherwise read the next byte from the table.
+ if (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2)
+ NumVTs = Opcode - OPC_MorphNodeTo0;
+ else if (Opcode >= OPC_EmitNode0 && Opcode <= OPC_EmitNode2)
+ NumVTs = Opcode - OPC_EmitNode0;
+ else
+ NumVTs = MatcherTable[MatcherIndex++];
+ SmallVector<EVT, 4> VTs;
+ for (unsigned i = 0; i != NumVTs; ++i) {
+ MVT::SimpleValueType VT =
+ (MVT::SimpleValueType)MatcherTable[MatcherIndex++];
+ if (VT == MVT::iPTR)
+ VT = TLI->getPointerTy(CurDAG->getDataLayout()).SimpleTy;
+ VTs.push_back(VT);
+ }
+
+ if (EmitNodeInfo & OPFL_Chain)
+ VTs.push_back(MVT::Other);
+ if (EmitNodeInfo & OPFL_GlueOutput)
+ VTs.push_back(MVT::Glue);
+
+ // This is hot code, so optimize the two most common cases of 1 and 2
+ // results.
+ SDVTList VTList;
+ if (VTs.size() == 1)
+ VTList = CurDAG->getVTList(VTs[0]);
+ else if (VTs.size() == 2)
+ VTList = CurDAG->getVTList(VTs[0], VTs[1]);
+ else
+ VTList = CurDAG->getVTList(VTs);
+
+ // Get the operand list.
+ unsigned NumOps = MatcherTable[MatcherIndex++];
+ SmallVector<SDValue, 8> Ops;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ unsigned RecNo = MatcherTable[MatcherIndex++];
+ if (RecNo & 128)
+ RecNo = GetVBR(RecNo, MatcherTable, MatcherIndex);
+
+ assert(RecNo < RecordedNodes.size() && "Invalid EmitNode");
+ Ops.push_back(RecordedNodes[RecNo].first);
+ }
+
+ // If there are variadic operands to add, handle them now.
+ if (EmitNodeInfo & OPFL_VariadicInfo) {
+ // Determine the start index to copy from.
+ unsigned FirstOpToCopy = getNumFixedFromVariadicInfo(EmitNodeInfo);
+ FirstOpToCopy += (EmitNodeInfo & OPFL_Chain) ? 1 : 0;
+ assert(NodeToMatch->getNumOperands() >= FirstOpToCopy &&
+ "Invalid variadic node");
+ // Copy all of the variadic operands, not including a potential glue
+ // input.
+ for (unsigned i = FirstOpToCopy, e = NodeToMatch->getNumOperands();
+ i != e; ++i) {
+ SDValue V = NodeToMatch->getOperand(i);
+ if (V.getValueType() == MVT::Glue) break;
+ Ops.push_back(V);
+ }
+ }
+
+ // If this has chain/glue inputs, add them.
+ if (EmitNodeInfo & OPFL_Chain)
+ Ops.push_back(InputChain);
+ if ((EmitNodeInfo & OPFL_GlueInput) && InputGlue.getNode() != nullptr)
+ Ops.push_back(InputGlue);
+
+ // Create the node.
+ SDNode *Res = nullptr;
+ bool IsMorphNodeTo = Opcode == OPC_MorphNodeTo ||
+ (Opcode >= OPC_MorphNodeTo0 && Opcode <= OPC_MorphNodeTo2);
+ if (!IsMorphNodeTo) {
+ // If this is a normal EmitNode command, just create the new node and
+ // add the results to the RecordedNodes list.
+ Res = CurDAG->getMachineNode(TargetOpc, SDLoc(NodeToMatch),
+ VTList, Ops);
+
+ // Add all the non-glue/non-chain results to the RecordedNodes list.
+ for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+ if (VTs[i] == MVT::Other || VTs[i] == MVT::Glue) break;
+ RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
+ nullptr));
+ }
+
+ } else {
+ assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
+ "NodeToMatch was removed partway through selection");
+ SelectionDAG::DAGNodeDeletedListener NDL(*CurDAG, [&](SDNode *N,
+ SDNode *E) {
+ auto &Chain = ChainNodesMatched;
+ assert((!E || llvm::find(Chain, N) == Chain.end()) &&
+ "Chain node replaced during MorphNode");
+ Chain.erase(std::remove(Chain.begin(), Chain.end(), N), Chain.end());
+ });
+ Res = MorphNode(NodeToMatch, TargetOpc, VTList, Ops, EmitNodeInfo);
+ }
+
+ // If the node had chain/glue results, update our notion of the current
+ // chain and glue.
+ if (EmitNodeInfo & OPFL_GlueOutput) {
+ InputGlue = SDValue(Res, VTs.size()-1);
+ if (EmitNodeInfo & OPFL_Chain)
+ InputChain = SDValue(Res, VTs.size()-2);
+ } else if (EmitNodeInfo & OPFL_Chain)
+ InputChain = SDValue(Res, VTs.size()-1);
+
+ // If the OPFL_MemRefs glue is set on this node, slap all of the
+ // accumulated memrefs onto it.
+ //
+ // FIXME: This is vastly incorrect for patterns with multiple-output
+ // instructions that access memory and for ComplexPatterns that match
+ // loads.
+ if (EmitNodeInfo & OPFL_MemRefs) {
+ // Only attach load or store memory operands if the generated
+ // instruction may load or store.
+ const MCInstrDesc &MCID = TII->get(TargetOpc);
+ bool mayLoad = MCID.mayLoad();
+ bool mayStore = MCID.mayStore();
+
+ unsigned NumMemRefs = 0;
+ for (SmallVectorImpl<MachineMemOperand *>::const_iterator I =
+ MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+ if ((*I)->isLoad()) {
+ if (mayLoad)
+ ++NumMemRefs;
+ } else if ((*I)->isStore()) {
+ if (mayStore)
+ ++NumMemRefs;
+ } else {
+ ++NumMemRefs;
+ }
+ }
+
+ MachineSDNode::mmo_iterator MemRefs =
+ MF->allocateMemRefsArray(NumMemRefs);
+
+ MachineSDNode::mmo_iterator MemRefsPos = MemRefs;
+ for (SmallVectorImpl<MachineMemOperand *>::const_iterator I =
+ MatchedMemRefs.begin(), E = MatchedMemRefs.end(); I != E; ++I) {
+ if ((*I)->isLoad()) {
+ if (mayLoad)
+ *MemRefsPos++ = *I;
+ } else if ((*I)->isStore()) {
+ if (mayStore)
+ *MemRefsPos++ = *I;
+ } else {
+ *MemRefsPos++ = *I;
+ }
+ }
+
+ cast<MachineSDNode>(Res)
+ ->setMemRefs(MemRefs, MemRefs + NumMemRefs);
+ }
+
+ DEBUG(dbgs() << " "
+ << (IsMorphNodeTo ? "Morphed" : "Created")
+ << " node: "; Res->dump(CurDAG); dbgs() << "\n");
+
+ // If this was a MorphNodeTo then we're completely done!
+ if (IsMorphNodeTo) {
+ // Update chain uses.
+ UpdateChains(Res, InputChain, ChainNodesMatched, true);
+ return;
+ }
+ continue;
+ }
+
+ case OPC_CompleteMatch: {
+ // The match has been completed, and new nodes (if any) have been
+ // created. Patch up references to the matched dag to use the newly
+ // created nodes.
+ unsigned NumResults = MatcherTable[MatcherIndex++];
+
+ for (unsigned i = 0; i != NumResults; ++i) {
+ unsigned ResSlot = MatcherTable[MatcherIndex++];
+ if (ResSlot & 128)
+ ResSlot = GetVBR(ResSlot, MatcherTable, MatcherIndex);
+
+ assert(ResSlot < RecordedNodes.size() && "Invalid CompleteMatch");
+ SDValue Res = RecordedNodes[ResSlot].first;
+
+ assert(i < NodeToMatch->getNumValues() &&
+ NodeToMatch->getValueType(i) != MVT::Other &&
+ NodeToMatch->getValueType(i) != MVT::Glue &&
+ "Invalid number of results to complete!");
+ assert((NodeToMatch->getValueType(i) == Res.getValueType() ||
+ NodeToMatch->getValueType(i) == MVT::iPTR ||
+ Res.getValueType() == MVT::iPTR ||
+ NodeToMatch->getValueType(i).getSizeInBits() ==
+ Res.getValueType().getSizeInBits()) &&
+ "invalid replacement");
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(NodeToMatch, i), Res);
+ }
+
+ // Update chain uses.
+ UpdateChains(NodeToMatch, InputChain, ChainNodesMatched, false);
+
+ // If the root node defines glue, we need to update it to the glue result.
+ // TODO: This never happens in our tests and I think it can be removed /
+ // replaced with an assert, but if we do it this way the change is
+ // NFC.
+ if (NodeToMatch->getValueType(NodeToMatch->getNumValues() - 1) ==
+ MVT::Glue &&
+ InputGlue.getNode())
+ CurDAG->ReplaceAllUsesOfValueWith(
+ SDValue(NodeToMatch, NodeToMatch->getNumValues() - 1), InputGlue);
+
+ assert(NodeToMatch->use_empty() &&
+ "Didn't replace all uses of the node?");
+ CurDAG->RemoveDeadNode(NodeToMatch);
+
+ return;
+ }
+ }
+
+ // If the code reached this point, then the match failed. See if there is
+ // another child to try in the current 'Scope', otherwise pop it until we
+ // find a case to check.
+ DEBUG(dbgs() << " Match failed at index " << CurrentOpcodeIndex << "\n");
+ ++NumDAGIselRetries;
+ while (1) {
+ if (MatchScopes.empty()) {
+ CannotYetSelect(NodeToMatch);
+ return;
+ }
+
+ // Restore the interpreter state back to the point where the scope was
+ // formed.
+ MatchScope &LastScope = MatchScopes.back();
+ RecordedNodes.resize(LastScope.NumRecordedNodes);
+ NodeStack.clear();
+ NodeStack.append(LastScope.NodeStack.begin(), LastScope.NodeStack.end());
+ N = NodeStack.back();
+
+ if (LastScope.NumMatchedMemRefs != MatchedMemRefs.size())
+ MatchedMemRefs.resize(LastScope.NumMatchedMemRefs);
+ MatcherIndex = LastScope.FailIndex;
+
+ DEBUG(dbgs() << " Continuing at " << MatcherIndex << "\n");
+
+ InputChain = LastScope.InputChain;
+ InputGlue = LastScope.InputGlue;
+ if (!LastScope.HasChainNodesMatched)
+ ChainNodesMatched.clear();
+
+ // Check to see what the offset is at the new MatcherIndex. If it is zero
+ // we have reached the end of this scope, otherwise we have another child
+ // in the current scope to try.
+ unsigned NumToSkip = MatcherTable[MatcherIndex++];
+ if (NumToSkip & 128)
+ NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
+
+ // If we have another child in this scope to match, update FailIndex and
+ // try it.
+ if (NumToSkip != 0) {
+ LastScope.FailIndex = MatcherIndex+NumToSkip;
+ break;
+ }
+
+ // End of this scope, pop it and try the next child in the containing
+ // scope.
+ MatchScopes.pop_back();
+ }
+ }
+}
+
+void SelectionDAGISel::CannotYetSelect(SDNode *N) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "Cannot select: ";
+
+ if (N->getOpcode() != ISD::INTRINSIC_W_CHAIN &&
+ N->getOpcode() != ISD::INTRINSIC_WO_CHAIN &&
+ N->getOpcode() != ISD::INTRINSIC_VOID) {
+ N->printrFull(Msg, CurDAG);
+ Msg << "\nIn function: " << MF->getName();
+ } else {
+ bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other;
+ unsigned iid =
+ cast<ConstantSDNode>(N->getOperand(HasInputChain))->getZExtValue();
+ if (iid < Intrinsic::num_intrinsics)
+ Msg << "intrinsic %" << Intrinsic::getName((Intrinsic::ID)iid);
+ else if (const TargetIntrinsicInfo *TII = TM.getIntrinsicInfo())
+ Msg << "target intrinsic %" << TII->getName(iid);
+ else
+ Msg << "unknown intrinsic #" << iid;
+ }
+ report_fatal_error(Msg.str());
+}
+
+char SelectionDAGISel::ID = 0;
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
new file mode 100644
index 000000000000..2764688518c2
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -0,0 +1,307 @@
+//===-- SelectionDAGPrinter.cpp - Implement SelectionDAG::viewGraph() -----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAG::viewGraph method.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "ScheduleDAGSDNodes.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "dag-printer"
+
+namespace llvm {
+ template<>
+ struct DOTGraphTraits<SelectionDAG*> : public DefaultDOTGraphTraits {
+
+ explicit DOTGraphTraits(bool isSimple=false) :
+ DefaultDOTGraphTraits(isSimple) {}
+
+ static bool hasEdgeDestLabels() {
+ return true;
+ }
+
+ static unsigned numEdgeDestLabels(const void *Node) {
+ return ((const SDNode *) Node)->getNumValues();
+ }
+
+ static std::string getEdgeDestLabel(const void *Node, unsigned i) {
+ return ((const SDNode *) Node)->getValueType(i).getEVTString();
+ }
+
+ template<typename EdgeIter>
+ static std::string getEdgeSourceLabel(const void *Node, EdgeIter I) {
+ return itostr(I - SDNodeIterator::begin((const SDNode *) Node));
+ }
+
+ /// edgeTargetsEdgeSource - This method returns true if this outgoing edge
+ /// should actually target another edge source, not a node. If this method
+ /// is implemented, getEdgeTarget should be implemented.
+ template<typename EdgeIter>
+ static bool edgeTargetsEdgeSource(const void *Node, EdgeIter I) {
+ return true;
+ }
+
+ /// getEdgeTarget - If edgeTargetsEdgeSource returns true, this method is
+ /// called to determine which outgoing edge of Node is the target of this
+ /// edge.
+ template<typename EdgeIter>
+ static EdgeIter getEdgeTarget(const void *Node, EdgeIter I) {
+ SDNode *TargetNode = *I;
+ SDNodeIterator NI = SDNodeIterator::begin(TargetNode);
+ std::advance(NI, I.getNode()->getOperand(I.getOperand()).getResNo());
+ return NI;
+ }
+
+ static std::string getGraphName(const SelectionDAG *G) {
+ return G->getMachineFunction().getName();
+ }
+
+ static bool renderGraphFromBottomUp() {
+ return true;
+ }
+
+ static std::string getNodeIdentifierLabel(const SDNode *Node,
+ const SelectionDAG *Graph) {
+ std::string R;
+ raw_string_ostream OS(R);
+#ifndef NDEBUG
+ OS << 't' << Node->PersistentId;
+#else
+ OS << static_cast<const void *>(Node);
+#endif
+ return R;
+ }
+
+ /// If you want to override the dot attributes printed for a particular
+ /// edge, override this method.
+ template<typename EdgeIter>
+ static std::string getEdgeAttributes(const void *Node, EdgeIter EI,
+ const SelectionDAG *Graph) {
+ SDValue Op = EI.getNode()->getOperand(EI.getOperand());
+ EVT VT = Op.getValueType();
+ if (VT == MVT::Glue)
+ return "color=red,style=bold";
+ else if (VT == MVT::Other)
+ return "color=blue,style=dashed";
+ return "";
+ }
+
+
+ static std::string getSimpleNodeLabel(const SDNode *Node,
+ const SelectionDAG *G) {
+ std::string Result = Node->getOperationName(G);
+ {
+ raw_string_ostream OS(Result);
+ Node->print_details(OS, G);
+ }
+ return Result;
+ }
+ std::string getNodeLabel(const SDNode *Node, const SelectionDAG *Graph);
+ static std::string getNodeAttributes(const SDNode *N,
+ const SelectionDAG *Graph) {
+#ifndef NDEBUG
+ const std::string &Attrs = Graph->getGraphAttrs(N);
+ if (!Attrs.empty()) {
+ if (Attrs.find("shape=") == std::string::npos)
+ return std::string("shape=Mrecord,") + Attrs;
+ else
+ return Attrs;
+ }
+#endif
+ return "shape=Mrecord";
+ }
+
+ static void addCustomGraphFeatures(SelectionDAG *G,
+ GraphWriter<SelectionDAG*> &GW) {
+ GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot");
+ if (G->getRoot().getNode())
+ GW.emitEdge(nullptr, -1, G->getRoot().getNode(), G->getRoot().getResNo(),
+ "color=blue,style=dashed");
+ }
+ };
+}
+
+std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node,
+ const SelectionDAG *G) {
+ return DOTGraphTraits<SelectionDAG*>::getSimpleNodeLabel(Node, G);
+}
+
+
+/// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
+/// rendered using 'dot'.
+///
+void SelectionDAG::viewGraph(const std::string &Title) {
+// This code is only for debugging!
+#ifndef NDEBUG
+ ViewGraph(this, "dag." + getMachineFunction().getName(),
+ false, Title);
+#else
+ errs() << "SelectionDAG::viewGraph is only available in debug builds on "
+ << "systems with Graphviz or gv!\n";
+#endif // NDEBUG
+}
+
+// This overload is defined out-of-line here instead of just using a
+// default parameter because this is easiest for gdb to call.
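+// For example, while stopped somewhere in SelectionDAGISel code one might
+// do something like "call CurDAG->viewGraph()" from gdb (assuming a debug
+// build on a system with Graphviz or gv installed).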
+void SelectionDAG::viewGraph() {
+ viewGraph("");
+}
+
+/// clearGraphAttrs - Clear all previously defined node graph attributes.
+/// Intended to be used from a debugging tool (e.g. gdb).
+void SelectionDAG::clearGraphAttrs() {
+#ifndef NDEBUG
+ NodeGraphAttrs.clear();
+#else
+ errs() << "SelectionDAG::clearGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// setGraphAttrs - Set graph attributes for a node. (e.g. "color=red".)
+///
+void SelectionDAG::setGraphAttrs(const SDNode *N, const char *Attrs) {
+#ifndef NDEBUG
+ NodeGraphAttrs[N] = Attrs;
+#else
+ errs() << "SelectionDAG::setGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+
+/// getGraphAttrs - Get graph attributes for a node. (e.g. "color=red".)
+/// Used from getNodeAttributes.
+const std::string SelectionDAG::getGraphAttrs(const SDNode *N) const {
+#ifndef NDEBUG
+ std::map<const SDNode *, std::string>::const_iterator I =
+ NodeGraphAttrs.find(N);
+
+ if (I != NodeGraphAttrs.end())
+ return I->second;
+ else
+ return "";
+#else
+ errs() << "SelectionDAG::getGraphAttrs is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+ return std::string();
+#endif
+}
+
+/// setGraphColor - Convenience for setting node color attribute.
+///
+void SelectionDAG::setGraphColor(const SDNode *N, const char *Color) {
+#ifndef NDEBUG
+ NodeGraphAttrs[N] = std::string("color=") + Color;
+#else
+ errs() << "SelectionDAG::setGraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+/// setSubgraphColorHelper - Implement setSubgraphColor. Return
+/// whether we truncated the search.
+///
+bool SelectionDAG::setSubgraphColorHelper(SDNode *N, const char *Color, DenseSet<SDNode *> &visited,
+ int level, bool &printed) {
+ bool hit_limit = false;
+
+#ifndef NDEBUG
+ if (level >= 20) {
+ if (!printed) {
+ printed = true;
+ DEBUG(dbgs() << "setSubgraphColor hit max level\n");
+ }
+ return true;
+ }
+
+ unsigned oldSize = visited.size();
+ visited.insert(N);
+ if (visited.size() != oldSize) {
+ setGraphColor(N, Color);
+ for (SDNodeIterator i = SDNodeIterator::begin(N), iend = SDNodeIterator::end(N);
+ i != iend;
+ ++i) {
+ hit_limit = setSubgraphColorHelper(*i, Color, visited, level+1, printed) || hit_limit;
+ }
+ }
+#else
+ errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+ return hit_limit;
+}
+
+/// setSubgraphColor - Convenience for setting subgraph color attribute.
+///
+void SelectionDAG::setSubgraphColor(SDNode *N, const char *Color) {
+#ifndef NDEBUG
+ DenseSet<SDNode *> visited;
+ bool printed = false;
+ if (setSubgraphColorHelper(N, Color, visited, 0, printed)) {
+ // Visually mark that we hit the limit
+ if (strcmp(Color, "red") == 0) {
+ setSubgraphColorHelper(N, "blue", visited, 0, printed);
+ } else if (strcmp(Color, "yellow") == 0) {
+ setSubgraphColorHelper(N, "green", visited, 0, printed);
+ }
+ }
+
+#else
+ errs() << "SelectionDAG::setSubgraphColor is only available in debug builds"
+ << " on systems with Graphviz or gv!\n";
+#endif
+}
+
+std::string ScheduleDAGSDNodes::getGraphNodeLabel(const SUnit *SU) const {
+ std::string s;
+ raw_string_ostream O(s);
+ O << "SU(" << SU->NodeNum << "): ";
+ if (SU->getNode()) {
+ SmallVector<SDNode *, 4> GluedNodes;
+ for (SDNode *N = SU->getNode(); N; N = N->getGluedNode())
+ GluedNodes.push_back(N);
+ while (!GluedNodes.empty()) {
+ O << DOTGraphTraits<SelectionDAG*>
+ ::getSimpleNodeLabel(GluedNodes.back(), DAG);
+ GluedNodes.pop_back();
+ if (!GluedNodes.empty())
+ O << "\n ";
+ }
+ } else {
+ O << "CROSS RC COPY";
+ }
+ return O.str();
+}
+
+void ScheduleDAGSDNodes::getCustomGraphFeatures(GraphWriter<ScheduleDAG*> &GW) const {
+ if (DAG) {
+ // Draw a special "GraphRoot" node to indicate the root of the graph.
+ GW.emitSimpleNode(nullptr, "plaintext=circle", "GraphRoot");
+ const SDNode *N = DAG->getRoot().getNode();
+ if (N && N->getNodeId() != -1)
+ GW.emitEdge(nullptr, -1, &SUnits[N->getNodeId()], -1,
+ "color=blue,style=dashed");
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
new file mode 100644
index 000000000000..55f70f7d9fd3
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/SelectionDAGTargetInfo.cpp
@@ -0,0 +1,17 @@
+//===-- SelectionDAGTargetInfo.cpp - SelectionDAG Info --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the SelectionDAGTargetInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+using namespace llvm;
+
+SelectionDAGTargetInfo::~SelectionDAGTargetInfo() {}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
new file mode 100644
index 000000000000..90aaba247265
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -0,0 +1,958 @@
+//===-- StatepointLowering.cpp - SDAGBuilder's statepoint code -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#include "StatepointLowering.h"
+#include "SelectionDAGBuilder.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/GCMetadata.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "statepoint-lowering"
+
+STATISTIC(NumSlotsAllocatedForStatepoints,
+ "Number of stack slots allocated for statepoints");
+STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
+STATISTIC(StatepointMaxSlotsRequired,
+ "Maximum number of stack slots required for a singe statepoint");
+
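+/// Append a literal constant to a statepoint/stackmap operand list. For
+/// example, pushStackMapConstant(Ops, Builder, 42) appends two i64 target
+/// constants, StackMaps::ConstantOp followed by 42, which the StackMap
+/// machinery later treats as a single literal constant location.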
+static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
+ SelectionDAGBuilder &Builder, uint64_t Value) {
+ SDLoc L = Builder.getCurSDLoc();
+ Ops.push_back(Builder.DAG.getTargetConstant(StackMaps::ConstantOp, L,
+ MVT::i64));
+ Ops.push_back(Builder.DAG.getTargetConstant(Value, L, MVT::i64));
+}
+
+void StatepointLoweringState::startNewStatepoint(SelectionDAGBuilder &Builder) {
+ // Consistency check
+ assert(PendingGCRelocateCalls.empty() &&
+ "Trying to visit statepoint before finished processing previous one");
+ Locations.clear();
+ NextSlotToAllocate = 0;
+ // Need to resize this on each safepoint - we need the two to stay in sync and
+ // the clear patterns of a SelectionDAGBuilder have no relation to
+ // FunctionLoweringInfo. SmallBitVector::reset initializes all bits to false.
+ AllocatedStackSlots.resize(Builder.FuncInfo.StatepointStackSlots.size());
+}
+
+void StatepointLoweringState::clear() {
+ Locations.clear();
+ AllocatedStackSlots.clear();
+ assert(PendingGCRelocateCalls.empty() &&
+ "cleared before statepoint sequence completed");
+}
+
+SDValue
+StatepointLoweringState::allocateStackSlot(EVT ValueType,
+ SelectionDAGBuilder &Builder) {
+ NumSlotsAllocatedForStatepoints++;
+ auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo();
+
+ unsigned SpillSize = ValueType.getSizeInBits() / 8;
+ assert((SpillSize * 8) == ValueType.getSizeInBits() && "Size not in bytes?");
+
+ // First look for a previously created stack slot which is not in
+ // use (accounting for the fact that arbitrary slots may already be
+ // reserved); failing that, create a new stack slot and use it.
+
+ const size_t NumSlots = AllocatedStackSlots.size();
+ assert(NextSlotToAllocate <= NumSlots && "Broken invariant");
+
+ // The stack slots in StatepointStackSlots beyond the first NumSlots were
+ // added in this instance of StatepointLoweringState, and cannot be re-used.
+ assert(NumSlots <= Builder.FuncInfo.StatepointStackSlots.size() &&
+ "Broken invariant");
+
+ for (; NextSlotToAllocate < NumSlots; NextSlotToAllocate++) {
+ if (!AllocatedStackSlots.test(NextSlotToAllocate)) {
+ const int FI = Builder.FuncInfo.StatepointStackSlots[NextSlotToAllocate];
+ if (MFI->getObjectSize(FI) == SpillSize) {
+ AllocatedStackSlots.set(NextSlotToAllocate);
+ return Builder.DAG.getFrameIndex(FI, ValueType);
+ }
+ }
+ }
+
+ // Couldn't find a free slot, so create a new one:
+
+ SDValue SpillSlot = Builder.DAG.CreateStackTemporary(ValueType);
+ const unsigned FI = cast<FrameIndexSDNode>(SpillSlot)->getIndex();
+ MFI->markAsStatepointSpillSlotObjectIndex(FI);
+
+ Builder.FuncInfo.StatepointStackSlots.push_back(FI);
+
+ StatepointMaxSlotsRequired = std::max<unsigned long>(
+ StatepointMaxSlotsRequired, Builder.FuncInfo.StatepointStackSlots.size());
+
+ return SpillSlot;
+}
+
+/// Utility function for reservePreviousStackSlotForValue. Tries to find
+/// a stack slot index to which we have spilled a value for previous
+/// statepoints. LookUpDepth specifies the maximum DFS depth this function is
+/// allowed to search.
+static Optional<int> findPreviousSpillSlot(const Value *Val,
+ SelectionDAGBuilder &Builder,
+ int LookUpDepth) {
+ // Can not look any further - give up now
+ if (LookUpDepth <= 0)
+ return None;
+
+ // Spill location is known for gc relocates
+ if (const auto *Relocate = dyn_cast<GCRelocateInst>(Val)) {
+ const auto &SpillMap =
+ Builder.FuncInfo.StatepointSpillMaps[Relocate->getStatepoint()];
+
+ auto It = SpillMap.find(Relocate->getDerivedPtr());
+ if (It == SpillMap.end())
+ return None;
+
+ return It->second;
+ }
+
+ // Look through bitcast instructions.
+ if (const BitCastInst *Cast = dyn_cast<BitCastInst>(Val))
+ return findPreviousSpillSlot(Cast->getOperand(0), Builder, LookUpDepth - 1);
+
+ // Look through phi nodes
+ // All incoming values should have the same known stack slot, otherwise the
+ // result is unknown.
+ if (const PHINode *Phi = dyn_cast<PHINode>(Val)) {
+ Optional<int> MergedResult = None;
+
+ for (auto &IncomingValue : Phi->incoming_values()) {
+ Optional<int> SpillSlot =
+ findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth - 1);
+ if (!SpillSlot.hasValue())
+ return None;
+
+ if (MergedResult.hasValue() && *MergedResult != *SpillSlot)
+ return None;
+
+ MergedResult = SpillSlot;
+ }
+ return MergedResult;
+ }
+
+ // TODO: We can do better for PHI nodes. In cases like this:
+ // ptr = phi(relocated_pointer, not_relocated_pointer)
+ // statepoint(ptr)
+ // We will report that the stack slot for ptr is unknown, and later we might
+ // assign different stack slots for ptr and relocated_pointer. This limits
+ // llvm's ability to remove redundant stores.
+ // Unfortunately this is hard to accomplish with the current infrastructure:
+ // we use this function to eliminate the spill store completely, while in the
+ // example above we would still need to emit a store, just to a special
+ // "preferred" location instead of an arbitrary one.
+
+ // TODO: handle simple updates. If a value is modified and the original
+ // value is no longer live, it would be nice to put the modified value in the
+ // same slot. This allows folding of the memory accesses for some
+ // instruction types (like an increment).
+ // statepoint (i)
+ // i1 = i+1
+ // statepoint (i1)
+ // However we need to be careful for cases like this:
+ // statepoint(i)
+ // i1 = i+1
+ // statepoint(i, i1)
+ // Here we want to reserve a spill slot for 'i', but not for 'i+1'. If we
+ // just put handling of simple modifications in this function, as is done
+ // for bitcasts, we might end up reserving i's slot for 'i+1' because the
+ // order in which we visit values is unspecified.
+
+ // Don't know any information about this instruction
+ return None;
+}
+
+/// Try to find existing copies of the incoming values in stack slots used for
+/// statepoint spilling. If we can find a spill slot for the incoming value,
+/// mark that slot as allocated, and reuse the same slot for this safepoint.
+/// This helps to avoid a series of loads and stores that only serve to
+/// reshuffle values on the stack between calls.
+static void reservePreviousStackSlotForValue(const Value *IncomingValue,
+ SelectionDAGBuilder &Builder) {
+
+ SDValue Incoming = Builder.getValue(IncomingValue);
+
+ if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
+ // We won't need to spill this, so no need to check for previously
+ // allocated stack slots
+ return;
+ }
+
+ SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
+ if (OldLocation.getNode())
+ // Duplicates in input
+ return;
+
+ const int LookUpDepth = 6;
+ Optional<int> Index =
+ findPreviousSpillSlot(IncomingValue, Builder, LookUpDepth);
+ if (!Index.hasValue())
+ return;
+
+ const auto &StatepointSlots = Builder.FuncInfo.StatepointStackSlots;
+
+ auto SlotIt = find(StatepointSlots, *Index);
+ assert(SlotIt != StatepointSlots.end() &&
+ "Value spilled to the unknown stack slot");
+
+ // This is one of our dedicated lowering slots
+ const int Offset = std::distance(StatepointSlots.begin(), SlotIt);
+ if (Builder.StatepointLowering.isStackSlotAllocated(Offset)) {
+ // stack slot already assigned to someone else, can't use it!
+ // TODO: currently we reserve space for gc arguments after doing
+ // normal allocation for deopt arguments. We should reserve for
+ // _all_ deopt and gc arguments, then start allocating. This
+ // will prevent some moves being inserted when vm state changes,
+ // but gc state doesn't between two calls.
+ return;
+ }
+ // Reserve this stack slot
+ Builder.StatepointLowering.reserveStackSlot(Offset);
+
+ // Cache this slot so we find it when going through the normal
+ // assignment loop.
+ SDValue Loc = Builder.DAG.getTargetFrameIndex(*Index, Incoming.getValueType());
+ Builder.StatepointLowering.setLocation(Incoming, Loc);
+}
+
+/// Remove any duplicates (as SDValues) from the derived pointer pairs. This
+/// is not required for correctness. Its purpose is to reduce the size of
+/// the StackMap section. It has no effect on the number of spill slots
+/// required or the actual lowering.
+static void
+removeDuplicateGCPtrs(SmallVectorImpl<const Value *> &Bases,
+ SmallVectorImpl<const Value *> &Ptrs,
+ SmallVectorImpl<const GCRelocateInst *> &Relocs,
+ SelectionDAGBuilder &Builder,
+ FunctionLoweringInfo::StatepointSpillMap &SSM) {
+ DenseMap<SDValue, const Value *> Seen;
+
+ SmallVector<const Value *, 64> NewBases, NewPtrs;
+ SmallVector<const GCRelocateInst *, 64> NewRelocs;
+ for (size_t i = 0, e = Ptrs.size(); i < e; i++) {
+ SDValue SD = Builder.getValue(Ptrs[i]);
+ auto SeenIt = Seen.find(SD);
+
+ if (SeenIt == Seen.end()) {
+ // Only add non-duplicates
+ NewBases.push_back(Bases[i]);
+ NewPtrs.push_back(Ptrs[i]);
+ NewRelocs.push_back(Relocs[i]);
+ Seen[SD] = Ptrs[i];
+ } else {
+ // Duplicate pointer found, note in SSM and move on:
+ SSM.DuplicateMap[Ptrs[i]] = SeenIt->second;
+ }
+ }
+ assert(Bases.size() >= NewBases.size());
+ assert(Ptrs.size() >= NewPtrs.size());
+ assert(Relocs.size() >= NewRelocs.size());
+ Bases = NewBases;
+ Ptrs = NewPtrs;
+ Relocs = NewRelocs;
+ assert(Ptrs.size() == Bases.size());
+ assert(Ptrs.size() == Relocs.size());
+}
+
+/// Extract the call from the statepoint, lower it and return a pointer to
+/// the call node. Also update NodeMap so that getValue(statepoint) will
+/// reference the lowered call result.
+static std::pair<SDValue, SDNode *> lowerCallFromStatepointLoweringInfo(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder, SmallVectorImpl<SDValue> &PendingExports) {
+
+ SDValue ReturnValue, CallEndVal;
+ std::tie(ReturnValue, CallEndVal) =
+ Builder.lowerInvokable(SI.CLI, SI.EHPadBB);
+ SDNode *CallEnd = CallEndVal.getNode();
+
+ // Get a call instruction from the call sequence chain. Tail calls are not
+ // allowed. The following code is essentially reverse engineering X86's
+ // LowerCallTo.
+ //
+ // We are expecting DAG to have the following form:
+ //
+ // ch = eh_label (only in case of invoke statepoint)
+ // ch, glue = callseq_start ch
+ // ch, glue = X86::Call ch, glue
+ // ch, glue = callseq_end ch, glue
+ // get_return_value ch, glue
+ //
+ // get_return_value can either be a sequence of CopyFromReg instructions
+ // to grab the return value from the return register(s), or it can be a LOAD
+ // to load a value returned by reference via a stack slot.
+
+ bool HasDef = !SI.CLI.RetTy->isVoidTy();
+ if (HasDef) {
+ if (CallEnd->getOpcode() == ISD::LOAD)
+ CallEnd = CallEnd->getOperand(0).getNode();
+ else
+ while (CallEnd->getOpcode() == ISD::CopyFromReg)
+ CallEnd = CallEnd->getOperand(0).getNode();
+ }
+
+ assert(CallEnd->getOpcode() == ISD::CALLSEQ_END && "expected!");
+ return std::make_pair(ReturnValue, CallEnd->getOperand(0).getNode());
+}
+
+/// Spill a value incoming to the statepoint. It might be either part of the
+/// vmstate or the gcstate; in both cases unconditionally spill it on the
+/// stack unless it is a null constant. Return a pair whose first element is
+/// the frame index containing the saved value and whose second element is
+/// the outgoing chain from the emitted store.
+static std::pair<SDValue, SDValue>
+spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
+ SelectionDAGBuilder &Builder) {
+ SDValue Loc = Builder.StatepointLowering.getLocation(Incoming);
+
+ // Emit new store if we didn't do it for this ptr before
+ if (!Loc.getNode()) {
+ Loc = Builder.StatepointLowering.allocateStackSlot(Incoming.getValueType(),
+ Builder);
+ int Index = cast<FrameIndexSDNode>(Loc)->getIndex();
+ // We use TargetFrameIndex so that isel will not select it into LEA
+ Loc = Builder.DAG.getTargetFrameIndex(Index, Incoming.getValueType());
+
+ // TODO: We could create a TokenFactor node instead of chaining stores one
+ // after another; this may allow a bit more optimal scheduling for them.
+
+#ifndef NDEBUG
+ // Right now we always allocate spill slots that are of the same
+ // size as the value we're about to spill (the size of spillee can
+ // vary since we spill vectors of pointers too). At some point we
+ // can consider allowing spills of smaller values to larger slots
+ // (i.e. change the '==' in the assert below to a '>=').
+ auto *MFI = Builder.DAG.getMachineFunction().getFrameInfo();
+ assert((MFI->getObjectSize(Index) * 8) ==
+ Incoming.getValueType().getSizeInBits() &&
+ "Bad spill: stack slot does not match!");
+#endif
+
+ Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
+ MachinePointerInfo::getFixedStack(
+ Builder.DAG.getMachineFunction(), Index));
+
+ Builder.StatepointLowering.setLocation(Incoming, Loc);
+ }
+
+ assert(Loc.getNode());
+ return std::make_pair(Loc, Chain);
+}
+
+/// Lower a single value incoming to a statepoint node. This value can be
+/// either a deopt value or a gc value; the handling is the same. We special
+/// case constants and allocas, then fall back to spilling if required.
+static void lowerIncomingStatepointValue(SDValue Incoming,
+ SmallVectorImpl<SDValue> &Ops,
+ SelectionDAGBuilder &Builder) {
+ SDValue Chain = Builder.getRoot();
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+ // If the original value was a constant, make sure it gets recorded as
+ // such in the stackmap. This is required so that the consumer can
+ // parse any internal format to the deopt state. It also handles null
+ // pointers and other constant pointers in GC states. Note that constant
+ // vectors do not appear to actually hit this path and that anything larger
+ // than an i64 value (not type!) will fail asserts here.
+ pushStackMapConstant(Ops, Builder, C->getSExtValue());
+ } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+ // This handles allocas as arguments to the statepoint (this is only
+ // really meaningful for a deopt value. For GC, we'd be trying to
+ // relocate the address of the alloca itself?)
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
+ Incoming.getValueType()));
+ } else {
+ // Otherwise, locate a spill slot and explicitly spill it so it
+ // can be found by the runtime later. We currently do not support
+ // tracking values through callee saved registers to their eventual
+ // spill location. This would be a useful optimization, but would
+ // need to be optional since it requires a lot of complexity on the
+ // runtime side which not all would support.
+ auto Res = spillIncomingStatepointValue(Incoming, Chain, Builder);
+ Ops.push_back(Res.first);
+ Chain = Res.second;
+ }
+
+ Builder.DAG.setRoot(Chain);
+}
+
+/// Lower deopt state and gc pointer arguments of the statepoint. The actual
+/// lowering is described in lowerIncomingStatepointValue. This function is
+/// responsible for lowering everything in the right position and playing some
+/// tricks to avoid redundant stack manipulation where possible. On
+/// completion, 'Ops' will contain ready to use operands for machine code
+/// statepoint. The chain nodes will have already been created and the DAG root
+/// will be set to the last value spilled (if any were).
+static void
+lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
+ SelectionDAGBuilder::StatepointLoweringInfo &SI,
+ SelectionDAGBuilder &Builder) {
+ // Lower the deopt and gc arguments for this statepoint. Layout will be:
+ // deopt argument length, deopt arguments.., gc arguments...
+#ifndef NDEBUG
+ if (auto *GFI = Builder.GFI) {
+ // Check that each of the gc pointers and bases we've gotten out of the
+ // safepoint is something the strategy thinks might be a pointer (or vector
+ // of pointers) into the GC heap. This is basically just here to help catch
+ // errors during statepoint insertion. TODO: This should actually be in the
+ // Verifier, but we can't get to the GCStrategy from there (yet).
+ GCStrategy &S = GFI->getStrategy();
+ for (const Value *V : SI.Bases) {
+ auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
+ if (Opt.hasValue()) {
+ assert(Opt.getValue() &&
+ "non gc managed base pointer found in statepoint");
+ }
+ }
+ for (const Value *V : SI.Ptrs) {
+ auto Opt = S.isGCManagedPointer(V->getType()->getScalarType());
+ if (Opt.hasValue()) {
+ assert(Opt.getValue() &&
+ "non gc managed derived pointer found in statepoint");
+ }
+ }
+ } else {
+ assert(SI.Bases.empty() && "No gc specified, so cannot relocate pointers!");
+ assert(SI.Ptrs.empty() && "No gc specified, so cannot relocate pointers!");
+ }
+#endif
+
+ // Before we actually start lowering (and allocating spill slots for values),
+ // reserve any stack slots which we judge to be profitable to reuse for a
+ // particular value. This is purely an optimization over the code below and
+ // doesn't change semantics at all. It is important for performance that we
+ // reserve slots for both deopt and gc values before lowering either.
+ for (const Value *V : SI.DeoptState) {
+ reservePreviousStackSlotForValue(V, Builder);
+ }
+ for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+ reservePreviousStackSlotForValue(SI.Bases[i], Builder);
+ reservePreviousStackSlotForValue(SI.Ptrs[i], Builder);
+ }
+
+ // First, prefix the list with the number of unique values to be
+ // lowered. Note that this is the number of *Values* not the
+ // number of SDValues required to lower them.
+ const int NumVMSArgs = SI.DeoptState.size();
+ pushStackMapConstant(Ops, Builder, NumVMSArgs);
+
+ // The vm state arguments are lowered in an opaque manner. We do not know
+ // what type of values are contained within.
+ for (const Value *V : SI.DeoptState) {
+ SDValue Incoming = Builder.getValue(V);
+ lowerIncomingStatepointValue(Incoming, Ops, Builder);
+ }
+
+ // Finally, go ahead and lower all the gc arguments. There's no prefixed
+ // length for this one. After lowering, we'll have the base and pointer
+ // arrays interwoven with each (lowered) base pointer immediately followed by
+ // its (lowered) derived pointer, i.e.
+ // (base[0], ptr[0], base[1], ptr[1], ...)
+ for (unsigned i = 0; i < SI.Bases.size(); ++i) {
+ const Value *Base = SI.Bases[i];
+ lowerIncomingStatepointValue(Builder.getValue(Base), Ops, Builder);
+
+ const Value *Ptr = SI.Ptrs[i];
+ lowerIncomingStatepointValue(Builder.getValue(Ptr), Ops, Builder);
+ }
+
+ // If there are any explicit spill slots passed to the statepoint, record
+ // them, but otherwise do not do anything special. These are user provided
+ // allocas and give control over placement to the consumer. In this case,
+ // it is the contents of the slot which may get updated, not the pointer to
+ // the alloca
+ for (Value *V : SI.GCArgs) {
+ SDValue Incoming = Builder.getValue(V);
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+ // This handles allocas as arguments to the statepoint
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
+ Incoming.getValueType()));
+ }
+ }
+
+ // Record computed locations for all lowered values.
+ // This cannot be embedded in the lowering loops as we need to record *all*
+ // values, while the previous loops account only for values with unique
+ // SDValues.
+ const Instruction *StatepointInstr = SI.StatepointInstr;
+ auto &SpillMap = Builder.FuncInfo.StatepointSpillMaps[StatepointInstr];
+
+ for (const GCRelocateInst *Relocate : SI.GCRelocates) {
+ const Value *V = Relocate->getDerivedPtr();
+ SDValue SDV = Builder.getValue(V);
+ SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
+
+ if (Loc.getNode()) {
+ SpillMap.SlotMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
+ } else {
+ // Record the value as visited, but not spilled. This is the case for
+ // allocas and constants. For these values we can avoid emitting a spill
+ // load while visiting the corresponding gc_relocate.
+ // Actually we do not need to record them in this map at all;
+ // we do this only to check that we are not relocating any unvisited
+ // value.
+ SpillMap.SlotMap[V] = None;
+
+ // The default llvm mechanisms for exporting values which are used in
+ // different basic blocks do not work for gc relocates.
+ // Note that it would be incorrect to teach llvm that all relocates are
+ // uses of the corresponding values so that it would automatically
+ // export them. Relocates of spilled values do not use the original
+ // value.
+ if (Relocate->getParent() != StatepointInstr->getParent())
+ Builder.ExportFromCurrentBlock(V);
+ }
+ }
+}
+
+SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
+ SelectionDAGBuilder::StatepointLoweringInfo &SI) {
+ // The basic scheme here is that information about both the original call and
+ // the safepoint is encoded in the CallInst. We create a temporary call and
+ // lower it, then reverse engineer the calling sequence.
+
+ NumOfStatepoints++;
+ // Clear state
+ StatepointLowering.startNewStatepoint(*this);
+
+#ifndef NDEBUG
+ // We schedule gc relocates before removeDuplicateGCPtrs since we _will_
+ // encounter the duplicate gc relocates we elide in removeDuplicateGCPtrs.
+ for (auto *Reloc : SI.GCRelocates)
+ if (Reloc->getParent() == SI.StatepointInstr->getParent())
+ StatepointLowering.scheduleRelocCall(*Reloc);
+#endif
+
+ // Remove any redundant llvm::Values which map to the same SDValue as another
+ // input. Also has the effect of removing duplicates in the original
+ // llvm::Value input list as well. This is a useful optimization for
+ // reducing the size of the StackMap section. It has no other impact.
+ removeDuplicateGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this,
+ FuncInfo.StatepointSpillMaps[SI.StatepointInstr]);
+ assert(SI.Bases.size() == SI.Ptrs.size() &&
+ SI.Ptrs.size() == SI.GCRelocates.size());
+
+ // Lower statepoint vmstate and gcstate arguments
+ SmallVector<SDValue, 10> LoweredMetaArgs;
+ lowerStatepointMetaArgs(LoweredMetaArgs, SI, *this);
+
+ // Now that we've emitted the spills, we need to update the root so that the
+ // call sequence is ordered correctly.
+ SI.CLI.setChain(getRoot());
+
+ // Get call node, we will replace it later with statepoint
+ SDValue ReturnVal;
+ SDNode *CallNode;
+ std::tie(ReturnVal, CallNode) =
+ lowerCallFromStatepointLoweringInfo(SI, *this, PendingExports);
+
+ // Construct the actual GC_TRANSITION_START, STATEPOINT, and GC_TRANSITION_END
+ // nodes with all the appropriate arguments and return values.
+
+ // Call Node: Chain, Target, {Args}, RegMask, [Glue]
+ SDValue Chain = CallNode->getOperand(0);
+
+ SDValue Glue;
+ bool CallHasIncomingGlue = CallNode->getGluedNode();
+ if (CallHasIncomingGlue) {
+ // Glue is always last operand
+ Glue = CallNode->getOperand(CallNode->getNumOperands() - 1);
+ }
+
+ // Build the GC_TRANSITION_START node if necessary.
+ //
+ // The operands to the GC_TRANSITION_{START,END} nodes are laid out in the
+ // order in which they appear in the call to the statepoint intrinsic. If
+ // any of the operands is pointer-typed, that operand is immediately
+ // followed by a SRCVALUE for the pointer that may be used during lowering
+ // (e.g. to form MachinePointerInfo values for loads/stores).
+ const bool IsGCTransition =
+ (SI.StatepointFlags & (uint64_t)StatepointFlags::GCTransition) ==
+ (uint64_t)StatepointFlags::GCTransition;
+ if (IsGCTransition) {
+ SmallVector<SDValue, 8> TSOps;
+
+ // Add chain
+ TSOps.push_back(Chain);
+
+ // Add GC transition arguments
+ for (const Value *V : SI.GCTransitionArgs) {
+ TSOps.push_back(getValue(V));
+ if (V->getType()->isPointerTy())
+ TSOps.push_back(DAG.getSrcValue(V));
+ }
+
+ // Add glue if necessary
+ if (CallHasIncomingGlue)
+ TSOps.push_back(Glue);
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ SDValue GCTransitionStart =
+ DAG.getNode(ISD::GC_TRANSITION_START, getCurSDLoc(), NodeTys, TSOps);
+
+ Chain = GCTransitionStart.getValue(0);
+ Glue = GCTransitionStart.getValue(1);
+ }
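+  // Note that Chain and Glue now carry the outputs of GC_TRANSITION_START,
+  // so the STATEPOINT node built below is ordered after the transition start.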
+
+ // TODO: Currently, all of these operands are being marked as read/write in
+  // PrologEpilogInserter.cpp; we should special case the VMState arguments
+  // and flags to be read-only.
+ SmallVector<SDValue, 40> Ops;
+
+ // Add the <id> and <numBytes> constants.
+ Ops.push_back(DAG.getTargetConstant(SI.ID, getCurSDLoc(), MVT::i64));
+ Ops.push_back(
+ DAG.getTargetConstant(SI.NumPatchBytes, getCurSDLoc(), MVT::i32));
+
+ // Calculate and push starting position of vmstate arguments
+ // Get number of arguments incoming directly into call node
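+  // (the call node is laid out as Chain, Target, {Args}, RegMask, [Glue], so
+  // the non-argument operands account for the 3, or 4 with incoming glue)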
+ unsigned NumCallRegArgs =
+ CallNode->getNumOperands() - (CallHasIncomingGlue ? 4 : 3);
+ Ops.push_back(DAG.getTargetConstant(NumCallRegArgs, getCurSDLoc(), MVT::i32));
+
+ // Add call target
+ SDValue CallTarget = SDValue(CallNode->getOperand(1).getNode(), 0);
+ Ops.push_back(CallTarget);
+
+ // Add call arguments
+ // Get position of register mask in the call
+ SDNode::op_iterator RegMaskIt;
+ if (CallHasIncomingGlue)
+ RegMaskIt = CallNode->op_end() - 2;
+ else
+ RegMaskIt = CallNode->op_end() - 1;
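+  // Copy only the actual call arguments: skip the chain (operand 0) and the
+  // call target (operand 1), stopping just before the register mask.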
+ Ops.insert(Ops.end(), CallNode->op_begin() + 2, RegMaskIt);
+
+ // Add a constant argument for the calling convention
+ pushStackMapConstant(Ops, *this, SI.CLI.CallConv);
+
+ // Add a constant argument for the flags
+ uint64_t Flags = SI.StatepointFlags;
+ assert(((Flags & ~(uint64_t)StatepointFlags::MaskAll) == 0) &&
+ "Unknown flag used");
+ pushStackMapConstant(Ops, *this, Flags);
+
+ // Insert all vmstate and gcstate arguments
+ Ops.insert(Ops.end(), LoweredMetaArgs.begin(), LoweredMetaArgs.end());
+
+ // Add register mask from call node
+ Ops.push_back(*RegMaskIt);
+
+ // Add chain
+ Ops.push_back(Chain);
+
+ // Same for the glue, but we add it only if original call had it
+ if (Glue.getNode())
+ Ops.push_back(Glue);
+
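+  // At this point Ops holds, in order: <id>, <numBytes>, <numCallArgs>, the
+  // call target, the call arguments, the calling-convention and flags
+  // constants, the vmstate and gcstate arguments, the register mask, the
+  // chain and (optionally) the glue.
+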
+ // Compute return values. Provide a glue output since we consume one as
+ // input. This allows someone else to chain off us as needed.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+ SDNode *StatepointMCNode =
+ DAG.getMachineNode(TargetOpcode::STATEPOINT, getCurSDLoc(), NodeTys, Ops);
+
+ SDNode *SinkNode = StatepointMCNode;
+
+ // Build the GC_TRANSITION_END node if necessary.
+ //
+ // See the comment above regarding GC_TRANSITION_START for the layout of
+ // the operands to the GC_TRANSITION_END node.
+ if (IsGCTransition) {
+ SmallVector<SDValue, 8> TEOps;
+
+ // Add chain
+ TEOps.push_back(SDValue(StatepointMCNode, 0));
+
+ // Add GC transition arguments
+ for (const Value *V : SI.GCTransitionArgs) {
+ TEOps.push_back(getValue(V));
+ if (V->getType()->isPointerTy())
+ TEOps.push_back(DAG.getSrcValue(V));
+ }
+
+ // Add glue
+ TEOps.push_back(SDValue(StatepointMCNode, 1));
+
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+
+    SDValue GCTransitionEnd =
+        DAG.getNode(ISD::GC_TRANSITION_END, getCurSDLoc(), NodeTys, TEOps);
+
+    SinkNode = GCTransitionEnd.getNode();
+ }
+
+ // Replace original call
+ DAG.ReplaceAllUsesWith(CallNode, SinkNode); // This may update Root
+ // Remove original call node
+ DAG.DeleteNode(CallNode);
+
+ // DON'T set the root - under the assumption that it's already set past the
+ // inserted node we created.
+
+ // TODO: A better future implementation would be to emit a single variable
+ // argument, variable return value STATEPOINT node here and then hookup the
+ // return value of each gc.relocate to the respective output of the
+ // previously emitted STATEPOINT value. Unfortunately, this doesn't appear
+ // to actually be possible today.
+
+ return ReturnVal;
+}
+
+void
+SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
+ const BasicBlock *EHPadBB /*= nullptr*/) {
+ assert(ISP.getCallSite().getCallingConv() != CallingConv::AnyReg &&
+ "anyregcc is not supported on statepoints!");
+
+#ifndef NDEBUG
+ // If this is a malformed statepoint, report it early to simplify debugging.
+ // This should catch any IR level mistake that's made when constructing or
+ // transforming statepoints.
+ ISP.verify();
+
+ // Check that the associated GCStrategy expects to encounter statepoints.
+ assert(GFI->getStrategy().useStatepoints() &&
+ "GCStrategy does not expect to encounter statepoints");
+#endif
+
+ SDValue ActualCallee;
+
+ if (ISP.getNumPatchBytes() > 0) {
+ // If we've been asked to emit a nop sequence instead of a call instruction
+ // for this statepoint then don't lower the call target, but use a constant
+ // `null` instead. Not lowering the call target lets statepoint clients get
+ // away without providing a physical address for the symbolic call target at
+ // link time.
+
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ const auto &DL = DAG.getDataLayout();
+
+ unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace();
+ ActualCallee = DAG.getConstant(0, getCurSDLoc(), TLI.getPointerTy(DL, AS));
+ } else {
+ ActualCallee = getValue(ISP.getCalledValue());
+ }
+
+ StatepointLoweringInfo SI(DAG);
+ populateCallLoweringInfo(SI.CLI, ISP.getCallSite(),
+ ImmutableStatepoint::CallArgsBeginPos,
+ ISP.getNumCallArgs(), ActualCallee,
+ ISP.getActualReturnType(), false /* IsPatchPoint */);
+
+ for (const GCRelocateInst *Relocate : ISP.getRelocates()) {
+ SI.GCRelocates.push_back(Relocate);
+ SI.Bases.push_back(Relocate->getBasePtr());
+ SI.Ptrs.push_back(Relocate->getDerivedPtr());
+ }
+
+ SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
+ SI.StatepointInstr = ISP.getInstruction();
+ SI.GCTransitionArgs =
+ ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
+ SI.ID = ISP.getID();
+ SI.DeoptState = ArrayRef<const Use>(ISP.vm_state_begin(), ISP.vm_state_end());
+ SI.StatepointFlags = ISP.getFlags();
+ SI.NumPatchBytes = ISP.getNumPatchBytes();
+ SI.EHPadBB = EHPadBB;
+
+ SDValue ReturnValue = LowerAsSTATEPOINT(SI);
+
+ // Export the result value if needed
+ const GCResultInst *GCResult = ISP.getGCResult();
+ Type *RetTy = ISP.getActualReturnType();
+ if (!RetTy->isVoidTy() && GCResult) {
+ if (GCResult->getParent() != ISP.getCallSite().getParent()) {
+      // The result value will be used in a different basic block, so we need
+      // to export it now. The default exporting mechanism will not work here
+      // because the statepoint call has a different type than the actual
+      // call. This means that by default llvm would create an export register
+      // of the wrong type (always i32 in our case). So instead we create an
+      // export register with the correct type manually.
+ // TODO: To eliminate this problem we can remove gc.result intrinsics
+ // completely and make statepoint call to return a tuple.
+ unsigned Reg = FuncInfo.CreateRegs(RetTy);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), Reg, RetTy);
+ SDValue Chain = DAG.getEntryNode();
+
+ RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
+ PendingExports.push_back(Chain);
+ FuncInfo.ValueMap[ISP.getInstruction()] = Reg;
+ } else {
+      // The result value will be used in the same basic block. Don't export
+      // it or perform any explicit register copies.
+      // We'll replace the actual call node shortly. gc_result will grab
+      // this value.
+ setValue(ISP.getInstruction(), ReturnValue);
+ }
+ } else {
+ // The token value is never used from here on, just generate a poison value
+ setValue(ISP.getInstruction(), DAG.getIntPtrConstant(-1, getCurSDLoc()));
+ }
+}
+
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl(
+ ImmutableCallSite CS, SDValue Callee, const BasicBlock *EHPadBB,
+ bool VarArgDisallowed, bool ForceVoidReturnTy) {
+ StatepointLoweringInfo SI(DAG);
+ unsigned ArgBeginIndex = CS.arg_begin() - CS.getInstruction()->op_begin();
+ populateCallLoweringInfo(
+ SI.CLI, CS, ArgBeginIndex, CS.getNumArgOperands(), Callee,
+ ForceVoidReturnTy ? Type::getVoidTy(*DAG.getContext()) : CS.getType(),
+ false);
+ if (!VarArgDisallowed)
+ SI.CLI.IsVarArg = CS.getFunctionType()->isVarArg();
+
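+  // This lowering path is only used for call sites that carry a "deopt"
+  // operand bundle, so the Optional returned by getOperandBundle is expected
+  // to be populated here.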
+ auto DeoptBundle = *CS.getOperandBundle(LLVMContext::OB_deopt);
+
+ unsigned DefaultID = StatepointDirectives::DeoptBundleStatepointID;
+
+ auto SD = parseStatepointDirectivesFromAttrs(CS.getAttributes());
+ SI.ID = SD.StatepointID.getValueOr(DefaultID);
+ SI.NumPatchBytes = SD.NumPatchBytes.getValueOr(0);
+
+ SI.DeoptState =
+ ArrayRef<const Use>(DeoptBundle.Inputs.begin(), DeoptBundle.Inputs.end());
+ SI.StatepointFlags = static_cast<uint64_t>(StatepointFlags::None);
+ SI.EHPadBB = EHPadBB;
+
+ // NB! The GC arguments are deliberately left empty.
+
+ if (SDValue ReturnVal = LowerAsSTATEPOINT(SI)) {
+ const Instruction *Inst = CS.getInstruction();
+ ReturnVal = lowerRangeToAssertZExt(DAG, *Inst, ReturnVal);
+ setValue(Inst, ReturnVal);
+ }
+}
+
+void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
+ ImmutableCallSite CS, SDValue Callee, const BasicBlock *EHPadBB) {
+ LowerCallSiteWithDeoptBundleImpl(CS, Callee, EHPadBB,
+ /* VarArgDisallowed = */ false,
+ /* ForceVoidReturnTy = */ false);
+}
+
+void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
+ // The result value of the gc_result is simply the result of the actual
+ // call. We've already emitted this, so just grab the value.
+ const Instruction *I = CI.getStatepoint();
+
+ if (I->getParent() != CI.getParent()) {
+    // The statepoint is in a different basic block, so we should have stored
+    // the call result in a virtual register.
+    // We cannot use the default getValue() functionality to copy the value
+    // from this register because the statepoint and actual call return types
+    // can be different, and getValue() would use CopyFromReg of the wrong
+    // type, which is always i32 in our case.
+ PointerType *CalleeType = cast<PointerType>(
+ ImmutableStatepoint(I).getCalledValue()->getType());
+ Type *RetTy =
+ cast<FunctionType>(CalleeType->getElementType())->getReturnType();
+ SDValue CopyFromReg = getCopyFromRegs(I, RetTy);
+
+ assert(CopyFromReg.getNode());
+ setValue(&CI, CopyFromReg);
+ } else {
+ setValue(&CI, getValue(I));
+ }
+}
+
+void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
+#ifndef NDEBUG
+ // Consistency check
+  // We skip this check for relocates not in the same basic block as their
+ // statepoint. It would be too expensive to preserve validation info through
+ // different basic blocks.
+ if (Relocate.getStatepoint()->getParent() == Relocate.getParent())
+ StatepointLowering.relocCallVisited(Relocate);
+
+ auto *Ty = Relocate.getType()->getScalarType();
+ if (auto IsManaged = GFI->getStrategy().isGCManagedPointer(Ty))
+ assert(*IsManaged && "Non gc managed pointer relocated!");
+#endif
+
+ const Value *DerivedPtr = Relocate.getDerivedPtr();
+ SDValue SD = getValue(DerivedPtr);
+
+ auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()];
+ auto SlotIt = SpillMap.find(DerivedPtr);
+  assert(SlotIt != SpillMap.end() && "Relocating an unlowered gc value");
+ Optional<int> DerivedPtrLocation = SlotIt->second;
+
+ // We didn't need to spill these special cases (constants and allocas).
+  // See the handling in spillIncomingValueForStatepoint for details.
+ if (!DerivedPtrLocation) {
+ setValue(&Relocate, SD);
+ return;
+ }
+
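+  // The value was spilled to a stack slot when the statepoint itself was
+  // lowered; rebuild a frame index for that slot and reload the relocated
+  // value from it below.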
+ SDValue SpillSlot = DAG.getTargetFrameIndex(*DerivedPtrLocation,
+ SD.getValueType());
+
+ // Be conservative: flush all pending loads
+ // TODO: Probably we can be less restrictive on this,
+ // it may allow more scheduling opportunities.
+ SDValue Chain = getRoot();
+
+ SDValue SpillLoad =
+ DAG.getLoad(SpillSlot.getValueType(), getCurSDLoc(), Chain, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(),
+ *DerivedPtrLocation));
+
+ // Again, be conservative, don't emit pending loads
+ DAG.setRoot(SpillLoad.getValue(1));
+
+ assert(SpillLoad.getNode());
+ setValue(&Relocate, SpillLoad);
+}
+
+void SelectionDAGBuilder::LowerDeoptimizeCall(const CallInst *CI) {
+ const auto &TLI = DAG.getTargetLoweringInfo();
+ SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(RTLIB::DEOPTIMIZE),
+ TLI.getPointerTy(DAG.getDataLayout()));
+
+ // We don't lower calls to __llvm_deoptimize as varargs, but as a regular
+ // call. We also do not lower the return value to any virtual register, and
+ // change the immediately following return to a trap instruction.
+ LowerCallSiteWithDeoptBundleImpl(CI, Callee, /* EHPadBB = */ nullptr,
+ /* VarArgDisallowed = */ true,
+ /* ForceVoidReturnTy = */ true);
+}
+
+void SelectionDAGBuilder::LowerDeoptimizingReturn() {
+ // We do not lower the return value from llvm.deoptimize to any virtual
+ // register, and change the immediately following return to a trap
+ // instruction.
+ if (DAG.getTarget().Options.TrapUnreachable)
+ DAG.setRoot(
+ DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot()));
+}
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
new file mode 100644
index 000000000000..b043184003a0
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -0,0 +1,118 @@
+//===-- StatepointLowering.h - SDAGBuilder's statepoint code -*- C++ -*---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file includes support code used by SelectionDAGBuilder when lowering a
+// statepoint sequence in SelectionDAG IR.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
+
+namespace llvm {
+class SelectionDAGBuilder;
+
+/// This class tracks both per-statepoint and per-selectiondag information.
+/// For each statepoint it tracks the locations of its gc values (incoming
+/// and relocated) and the list of gcrelocate calls scheduled for visiting
+/// (this is used for a debug-mode consistency check only). The spill slot
+/// tracking works in concert with information in FunctionLoweringInfo.
+class StatepointLoweringState {
+public:
+ StatepointLoweringState() : NextSlotToAllocate(0) {}
+
+ /// Reset all state tracking for a newly encountered safepoint. Also
+ /// performs some consistency checking.
+ void startNewStatepoint(SelectionDAGBuilder &Builder);
+
+ /// Clear the memory usage of this object. This is called from
+ /// SelectionDAGBuilder::clear. We require this is never called in the
+ /// midst of processing a statepoint sequence.
+ void clear();
+
+ /// Returns the spill location of a value incoming to the current
+ /// statepoint. Will return SDValue() if this value hasn't been
+ /// spilled. Otherwise, the value has already been spilled and no
+ /// further action is required by the caller.
+ SDValue getLocation(SDValue Val) {
+ auto I = Locations.find(Val);
+ if (I == Locations.end())
+ return SDValue();
+ return I->second;
+ }
+
+ void setLocation(SDValue Val, SDValue Location) {
+ assert(!Locations.count(Val) &&
+ "Trying to allocate already allocated location");
+ Locations[Val] = Location;
+ }
+
+  /// Record the fact that we expect to encounter a given gc_relocate
+  /// before the next statepoint. If we don't see it, we'll trigger an
+  /// assertion failure.
+ void scheduleRelocCall(const CallInst &RelocCall) {
+ PendingGCRelocateCalls.push_back(&RelocCall);
+ }
+
+  /// Remove this gc_relocate from the list we're expecting to see
+  /// before the next statepoint. If we weren't expecting to see it,
+  /// we'll trigger an assertion failure.
+ void relocCallVisited(const CallInst &RelocCall) {
+ auto I = find(PendingGCRelocateCalls, &RelocCall);
+ assert(I != PendingGCRelocateCalls.end() &&
+ "Visited unexpected gcrelocate call");
+ PendingGCRelocateCalls.erase(I);
+ }
+
+ // TODO: Should add consistency tracking to ensure we encounter
+ // expected gc_result calls too.
+
+  /// Get a stack slot we can use to store a value of type ValueType. This
+  /// will hopefully be a recycled slot from another statepoint.
+ SDValue allocateStackSlot(EVT ValueType, SelectionDAGBuilder &Builder);
+
+ void reserveStackSlot(int Offset) {
+ assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+ "out of bounds");
+ assert(!AllocatedStackSlots.test(Offset) && "already reserved!");
+ assert(NextSlotToAllocate <= (unsigned)Offset && "consistency!");
+ AllocatedStackSlots.set(Offset);
+ }
+
+ bool isStackSlotAllocated(int Offset) {
+ assert(Offset >= 0 && Offset < (int)AllocatedStackSlots.size() &&
+ "out of bounds");
+ return AllocatedStackSlots.test(Offset);
+ }
+
+private:
+  /// Maps a pre-relocation value (a gc pointer incoming directly into the
+  /// statepoint) to its location (currently only stack slots).
+ DenseMap<SDValue, SDValue> Locations;
+
+ /// A boolean indicator for each slot listed in the FunctionInfo as to
+ /// whether it has been used in the current statepoint. Since we try to
+ /// preserve stack slots across safepoints, there can be gaps in which
+ /// slots have been allocated.
+ SmallBitVector AllocatedStackSlots;
+
+ /// Points just beyond the last slot known to have been allocated
+ unsigned NextSlotToAllocate;
+
+ /// Keep track of pending gcrelocate calls for consistency check
+ SmallVector<const CallInst *, 10> PendingGCRelocateCalls;
+};
+} // end namespace llvm
+
+#endif // LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
diff --git a/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
new file mode 100644
index 000000000000..f2bc88a98597
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -0,0 +1,3550 @@
+//===-- TargetLowering.cpp - Implement the TargetLowering class -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLowering class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cctype>
+using namespace llvm;
+
+/// NOTE: The TargetMachine owns TLOF.
+TargetLowering::TargetLowering(const TargetMachine &tm)
+ : TargetLoweringBase(tm) {}
+
+const char *TargetLowering::getTargetNodeName(unsigned Opcode) const {
+ return nullptr;
+}
+
+bool TargetLowering::isPositionIndependent() const {
+ return getTargetMachine().isPositionIndependent();
+}
+
+/// Check whether a given call node is in tail position within its function. If
+/// so, it sets Chain to the input chain of the tail call.
+bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
+ SDValue &Chain) const {
+ const Function *F = DAG.getMachineFunction().getFunction();
+
+ // Conservatively require the attributes of the call to match those of
+ // the return. Ignore noalias because it doesn't affect the call sequence.
+ AttributeSet CallerAttrs = F->getAttributes();
+ if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex)
+ .removeAttribute(Attribute::NoAlias).hasAttributes())
+ return false;
+
+ // It's not safe to eliminate the sign / zero extension of the return value.
+ if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
+ CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ return false;
+
+ // Check if the only use is a function return node.
+ return isUsedByReturnOnly(Node, Chain);
+}
+
+bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
+ const uint32_t *CallerPreservedMask,
+ const SmallVectorImpl<CCValAssign> &ArgLocs,
+ const SmallVectorImpl<SDValue> &OutVals) const {
+ for (unsigned I = 0, E = ArgLocs.size(); I != E; ++I) {
+ const CCValAssign &ArgLoc = ArgLocs[I];
+ if (!ArgLoc.isRegLoc())
+ continue;
+ unsigned Reg = ArgLoc.getLocReg();
+ // Only look at callee saved registers.
+ if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
+ continue;
+ // Check that we pass the value used for the caller.
+ // (We look for a CopyFromReg reading a virtual register that is used
+ // for the function live-in value of register Reg)
+ SDValue Value = OutVals[I];
+ if (Value->getOpcode() != ISD::CopyFromReg)
+ return false;
+ unsigned ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
+ if (MRI.getLiveInPhysReg(ArgReg) != Reg)
+ return false;
+ }
+ return true;
+}
+
+/// \brief Set CallLoweringInfo attribute flags based on a call instruction
+/// and called function attributes.
+void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+ unsigned AttrIdx) {
+ isSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+ isZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+ isInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+ isSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+ isNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+ isByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+ isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
+ isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+ isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
+ isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
+ Alignment = CS->getParamAlignment(AttrIdx);
+}
+
+/// Generate a libcall taking the given operands as arguments and returning a
+/// result of type RetVT.
+std::pair<SDValue, SDValue>
+TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
+ ArrayRef<SDValue> Ops, bool isSigned,
+ const SDLoc &dl, bool doesNotReturn,
+ bool isReturnValueUsed) const {
+ TargetLowering::ArgListTy Args;
+ Args.reserve(Ops.size());
+
+ TargetLowering::ArgListEntry Entry;
+ for (SDValue Op : Ops) {
+ Entry.Node = Op;
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+ Args.push_back(Entry);
+ }
+
+ if (LC == RTLIB::UNKNOWN_LIBCALL)
+ report_fatal_error("Unsupported library call operation!");
+ SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC),
+ getPointerTy(DAG.getDataLayout()));
+
+ Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
+ .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+ .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
+ .setSExtResult(signExtend).setZExtResult(!signExtend);
+ return LowerCallTo(CLI);
+}
+
+/// Soften the operands of a comparison. This code is shared among BR_CC,
+/// SELECT_CC, and SETCC handlers.
+void TargetLowering::softenSetCCOperands(SelectionDAG &DAG, EVT VT,
+ SDValue &NewLHS, SDValue &NewRHS,
+ ISD::CondCode &CCCode,
+ const SDLoc &dl) const {
+  assert((VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f128 ||
+          VT == MVT::ppcf128) &&
+         "Unsupported setcc type!");
+
+ // Expand into one or more soft-fp libcall(s).
+ RTLIB::Libcall LC1 = RTLIB::UNKNOWN_LIBCALL, LC2 = RTLIB::UNKNOWN_LIBCALL;
+ bool ShouldInvertCC = false;
+ switch (CCCode) {
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ LC1 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
+ break;
+ case ISD::SETNE:
+ case ISD::SETUNE:
+ LC1 = (VT == MVT::f32) ? RTLIB::UNE_F32 :
+ (VT == MVT::f64) ? RTLIB::UNE_F64 :
+ (VT == MVT::f128) ? RTLIB::UNE_F128 : RTLIB::UNE_PPCF128;
+ break;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
+ break;
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
+ break;
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
+ break;
+ case ISD::SETUO:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+ (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
+ break;
+ case ISD::SETO:
+ LC1 = (VT == MVT::f32) ? RTLIB::O_F32 :
+ (VT == MVT::f64) ? RTLIB::O_F64 :
+ (VT == MVT::f128) ? RTLIB::O_F128 : RTLIB::O_PPCF128;
+ break;
+ case ISD::SETONE:
+ // SETONE = SETOLT | SETOGT
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
+ LC2 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
+ break;
+ case ISD::SETUEQ:
+ LC1 = (VT == MVT::f32) ? RTLIB::UO_F32 :
+ (VT == MVT::f64) ? RTLIB::UO_F64 :
+          (VT == MVT::f128) ? RTLIB::UO_F128 : RTLIB::UO_PPCF128;
+ LC2 = (VT == MVT::f32) ? RTLIB::OEQ_F32 :
+ (VT == MVT::f64) ? RTLIB::OEQ_F64 :
+ (VT == MVT::f128) ? RTLIB::OEQ_F128 : RTLIB::OEQ_PPCF128;
+ break;
+ default:
+ // Invert CC for unordered comparisons
+ ShouldInvertCC = true;
+ switch (CCCode) {
+ case ISD::SETULT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGE_F32 :
+ (VT == MVT::f64) ? RTLIB::OGE_F64 :
+ (VT == MVT::f128) ? RTLIB::OGE_F128 : RTLIB::OGE_PPCF128;
+ break;
+ case ISD::SETULE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OGT_F32 :
+ (VT == MVT::f64) ? RTLIB::OGT_F64 :
+ (VT == MVT::f128) ? RTLIB::OGT_F128 : RTLIB::OGT_PPCF128;
+ break;
+ case ISD::SETUGT:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLE_F32 :
+ (VT == MVT::f64) ? RTLIB::OLE_F64 :
+ (VT == MVT::f128) ? RTLIB::OLE_F128 : RTLIB::OLE_PPCF128;
+ break;
+ case ISD::SETUGE:
+ LC1 = (VT == MVT::f32) ? RTLIB::OLT_F32 :
+ (VT == MVT::f64) ? RTLIB::OLT_F64 :
+ (VT == MVT::f128) ? RTLIB::OLT_F128 : RTLIB::OLT_PPCF128;
+ break;
+ default: llvm_unreachable("Do not know how to soften this setcc!");
+ }
+ }
+
+  // Use the target specific return value for comparison libcalls.
+ EVT RetVT = getCmpLibcallReturnType();
+ SDValue Ops[2] = {NewLHS, NewRHS};
+ NewLHS = makeLibCall(DAG, LC1, RetVT, Ops, false /*sign irrelevant*/,
+ dl).first;
+ NewRHS = DAG.getConstant(0, dl, RetVT);
+
+ CCCode = getCmpLibcallCC(LC1);
+ if (ShouldInvertCC)
+ CCCode = getSetCCInverse(CCCode, /*isInteger=*/true);
+
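+  // For the two-libcall cases (SETONE, SETUEQ) the predicate cannot be
+  // expressed as a single soft-float comparison, so the second call's result
+  // is compared against zero as well and the two setcc results are OR'd
+  // together below.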
+ if (LC2 != RTLIB::UNKNOWN_LIBCALL) {
+ SDValue Tmp = DAG.getNode(
+ ISD::SETCC, dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
+ NewLHS, NewRHS, DAG.getCondCode(CCCode));
+ NewLHS = makeLibCall(DAG, LC2, RetVT, Ops, false/*sign irrelevant*/,
+ dl).first;
+ NewLHS = DAG.getNode(
+ ISD::SETCC, dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), RetVT),
+ NewLHS, NewRHS, DAG.getCondCode(getCmpLibcallCC(LC2)));
+ NewLHS = DAG.getNode(ISD::OR, dl, Tmp.getValueType(), Tmp, NewLHS);
+ NewRHS = SDValue();
+ }
+}
+
+/// Return the entry encoding for a jump table in the current function. The
+/// returned value is a member of the MachineJumpTableInfo::JTEntryKind enum.
+unsigned TargetLowering::getJumpTableEncoding() const {
+ // In non-pic modes, just use the address of a block.
+ if (!isPositionIndependent())
+ return MachineJumpTableInfo::EK_BlockAddress;
+
+ // In PIC mode, if the target supports a GPRel32 directive, use it.
+ if (getTargetMachine().getMCAsmInfo()->getGPRel32Directive() != nullptr)
+ return MachineJumpTableInfo::EK_GPRel32BlockAddress;
+
+ // Otherwise, use a label difference.
+ return MachineJumpTableInfo::EK_LabelDifference32;
+}
+
+SDValue TargetLowering::getPICJumpTableRelocBase(SDValue Table,
+ SelectionDAG &DAG) const {
+ // If our PIC model is GP relative, use the global offset table as the base.
+ unsigned JTEncoding = getJumpTableEncoding();
+
+ if ((JTEncoding == MachineJumpTableInfo::EK_GPRel64BlockAddress) ||
+ (JTEncoding == MachineJumpTableInfo::EK_GPRel32BlockAddress))
+ return DAG.getGLOBAL_OFFSET_TABLE(getPointerTy(DAG.getDataLayout()));
+
+ return Table;
+}
+
+/// This returns the relocation base for the given PIC jumptable, the same as
+/// getPICJumpTableRelocBase, but as an MCExpr.
+const MCExpr *
+TargetLowering::getPICJumpTableRelocBaseExpr(const MachineFunction *MF,
+ unsigned JTI,MCContext &Ctx) const{
+ // The normal PIC reloc base is the label at the start of the jump table.
+ return MCSymbolRefExpr::create(MF->getJTISymbol(JTI, Ctx), Ctx);
+}
+
+bool
+TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
+ const TargetMachine &TM = getTargetMachine();
+ const GlobalValue *GV = GA->getGlobal();
+
+ // If the address is not even local to this DSO we will have to load it from
+ // a got and then add the offset.
+ if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
+ return false;
+
+ // If the code is position independent we will have to add a base register.
+ if (isPositionIndependent())
+ return false;
+
+ // Otherwise we can do it.
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Optimization Methods
+//===----------------------------------------------------------------------===//
+
+/// Check to see if the specified operand of the specified instruction is a
+/// constant integer. If so, check to see if there are any bits set in the
+/// constant that are not demanded. If so, shrink the constant and return true.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
+ const APInt &Demanded) {
+ SDLoc dl(Op);
+
+ // FIXME: ISD::SELECT, ISD::SELECT_CC
+ switch (Op.getOpcode()) {
+ default: break;
+ case ISD::XOR:
+ case ISD::AND:
+ case ISD::OR: {
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (!C) return false;
+
+ if (Op.getOpcode() == ISD::XOR &&
+ (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+ return false;
+
+ // if we can expand it to have all bits set, do it
+ if (C->getAPIntValue().intersects(~Demanded)) {
+ EVT VT = Op.getValueType();
+ SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
+ DAG.getConstant(Demanded &
+ C->getAPIntValue(),
+ dl, VT));
+ return CombineTo(Op, New);
+ }
+
+ break;
+ }
+ }
+
+ return false;
+}
+
+/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
+/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
+/// generalized for targets with other types of implicit widening casts.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedOp(SDValue Op,
+ unsigned BitWidth,
+ const APInt &Demanded,
+ const SDLoc &dl) {
+ assert(Op.getNumOperands() == 2 &&
+ "ShrinkDemandedOp only supports binary operators!");
+ assert(Op.getNode()->getNumValues() == 1 &&
+ "ShrinkDemandedOp only supports nodes with one result!");
+
+ // Early return, as this function cannot handle vector types.
+ if (Op.getValueType().isVector())
+ return false;
+
+ // Don't do this if the node has another user, which may require the
+ // full value.
+ if (!Op.getNode()->hasOneUse())
+ return false;
+
+ // Search for the smallest integer type with free casts to and from
+ // Op's type. For expedience, just check power-of-2 integer types.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned DemandedSize = BitWidth - Demanded.countLeadingZeros();
+ unsigned SmallVTBits = DemandedSize;
+ if (!isPowerOf2_32(SmallVTBits))
+ SmallVTBits = NextPowerOf2(SmallVTBits);
+ for (; SmallVTBits < BitWidth; SmallVTBits = NextPowerOf2(SmallVTBits)) {
+ EVT SmallVT = EVT::getIntegerVT(*DAG.getContext(), SmallVTBits);
+ if (TLI.isTruncateFree(Op.getValueType(), SmallVT) &&
+ TLI.isZExtFree(SmallVT, Op.getValueType())) {
+ // We found a type with free casts.
+ SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT,
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+ Op.getNode()->getOperand(0)),
+ DAG.getNode(ISD::TRUNCATE, dl, SmallVT,
+ Op.getNode()->getOperand(1)));
+ bool NeedZext = DemandedSize > SmallVTBits;
+ SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND,
+ dl, Op.getValueType(), X);
+ return CombineTo(Op, Z);
+ }
+ }
+ return false;
+}
+
+/// Look at Op. At this point, we know that only the DemandedMask bits of the
+/// result of Op are ever used downstream. If we can use this information to
+/// simplify Op, create a new simplified DAG node and return true, returning the
+/// original and new nodes in Old and New. Otherwise, analyze the expression and
+/// return a mask of KnownOne and KnownZero bits for the expression (used to
+/// simplify the caller). The KnownZero/One bits may only be accurate for those
+/// bits in the DemandedMask.
+bool TargetLowering::SimplifyDemandedBits(SDValue Op,
+ const APInt &DemandedMask,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ unsigned BitWidth = DemandedMask.getBitWidth();
+ assert(Op.getValueType().getScalarType().getSizeInBits() == BitWidth &&
+ "Mask size mismatches value type size!");
+ APInt NewMask = DemandedMask;
+ SDLoc dl(Op);
+ auto &DL = TLO.DAG.getDataLayout();
+
+ // Don't know anything.
+ KnownZero = KnownOne = APInt(BitWidth, 0);
+
+ // Other users may use these bits.
+ if (!Op.getNode()->hasOneUse()) {
+ if (Depth != 0) {
+      // If not at the root, just compute the KnownZero/KnownOne bits to
+ // simplify things downstream.
+ TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ return false;
+ }
+ // If this is the root being simplified, allow it to have multiple uses,
+ // just set the NewMask to all bits.
+ NewMask = APInt::getAllOnesValue(BitWidth);
+ } else if (DemandedMask == 0) {
+ // Not demanding any bits from Op.
+ if (!Op.isUndef())
+ return TLO.CombineTo(Op, TLO.DAG.getUNDEF(Op.getValueType()));
+ return false;
+ } else if (Depth == 6) { // Limit search depth.
+ return false;
+ }
+
+ APInt KnownZero2, KnownOne2, KnownZeroOut, KnownOneOut;
+ switch (Op.getOpcode()) {
+ case ISD::Constant:
+ // We know all of the bits for a constant!
+ KnownOne = cast<ConstantSDNode>(Op)->getAPIntValue();
+ KnownZero = ~KnownOne;
+ return false; // Don't fall through, will infinitely loop.
+ case ISD::AND:
+ // If the RHS is a constant, check to see if the LHS would be zero without
+ // using the bits from the RHS. Below, we use knowledge about the RHS to
+    // simplify the LHS; here we're using information from the LHS to simplify
+ // the RHS.
+ if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt LHSZero, LHSOne;
+ // Do not increment Depth here; that can cause an infinite loop.
+ TLO.DAG.computeKnownBits(Op.getOperand(0), LHSZero, LHSOne, Depth);
+ // If the LHS already has zeros where RHSC does, this and is dead.
+ if ((LHSZero & NewMask) == (~RHSC->getAPIntValue() & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ // If any of the set bits in the RHS are known zero on the LHS, shrink
+ // the constant.
+ if (TLO.ShrinkDemandedConstant(Op, ~LHSZero & NewMask))
+ return true;
+ }
+
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~KnownZero & NewMask,
+ KnownZero2, KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known one on one side, return the other.
+ // These bits cannot contribute to the result of the 'and'.
+ if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If all of the demanded bits in the inputs are known zeros, return zero.
+ if ((NewMask & (KnownZero|KnownZero2)) == NewMask)
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, Op.getValueType()));
+ // If the RHS is a constant, see if we can simplify it.
+ if (TLO.ShrinkDemandedConstant(Op, ~KnownZero2 & NewMask))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // Output known-1 bits are only known if set in both the LHS & RHS.
+ KnownOne &= KnownOne2;
+    // Output known-0 bits are known to be clear if zero in either LHS | RHS.
+ KnownZero |= KnownZero2;
+ break;
+ case ISD::OR:
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), ~KnownOne & NewMask,
+ KnownZero2, KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'or'.
+ if ((NewMask & ~KnownOne2 & KnownZero) == (~KnownOne2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownOne & KnownZero2) == (~KnownOne & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If all of the potentially set bits on one side are known to be set on
+ // the other side, just use the 'other' side.
+ if ((NewMask & ~KnownZero & KnownOne2) == (~KnownZero & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((NewMask & ~KnownZero2 & KnownOne) == (~KnownZero2 & NewMask))
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If the RHS is a constant, see if we can simplify it.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // Output known-0 bits are only known if clear in both the LHS & RHS.
+ KnownZero &= KnownZero2;
+    // Output known-1 bits are known to be set if set in either LHS | RHS.
+ KnownOne |= KnownOne2;
+ break;
+ case ISD::XOR:
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ if (SimplifyDemandedBits(Op.getOperand(0), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If all of the demanded bits are known zero on one side, return the other.
+ // These bits cannot contribute to the result of the 'xor'.
+ if ((KnownZero & NewMask) == NewMask)
+ return TLO.CombineTo(Op, Op.getOperand(0));
+ if ((KnownZero2 & NewMask) == NewMask)
+ return TLO.CombineTo(Op, Op.getOperand(1));
+ // If the operation can be done in a smaller type, do so.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+
+ // If all of the unknown bits are known to be zero on one side or the other
+ // (but not both) turn this into an *inclusive* or.
+ // e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+ if ((NewMask & ~KnownZero & ~KnownZero2) == 0)
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, Op.getValueType(),
+ Op.getOperand(0),
+ Op.getOperand(1)));
+
+ // Output known-0 bits are known if clear or set in both the LHS & RHS.
+ KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
+    // Output known-1 bits are set if set in only one of the LHS and RHS.
+ KnownOneOut = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
+
+ // If all of the demanded bits on one side are known, and all of the set
+ // bits on that side are also known to be set on the other side, turn this
+ // into an AND, as we know the bits will be cleared.
+ // e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+ // NB: it is okay if more bits are known than are requested
+ if ((NewMask & (KnownZero|KnownOne)) == NewMask) { // all known on one side
+ if (KnownOne == KnownOne2) { // set bits are the same on both sides
+ EVT VT = Op.getValueType();
+ SDValue ANDC = TLO.DAG.getConstant(~KnownOne & NewMask, dl, VT);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::AND, dl, VT,
+ Op.getOperand(0), ANDC));
+ }
+ }
+
+ // If the RHS is a constant, see if we can simplify it.
+    // For XOR, we prefer to force bits to 1 if they will make a -1.
+    // If we can't force bits, try to shrink the constant.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ APInt Expanded = C->getAPIntValue() | (~NewMask);
+ // if we can expand it to have all bits set, do it
+ if (Expanded.isAllOnesValue()) {
+ if (Expanded != C->getAPIntValue()) {
+ EVT VT = Op.getValueType();
+ SDValue New = TLO.DAG.getNode(Op.getOpcode(), dl,VT, Op.getOperand(0),
+ TLO.DAG.getConstant(Expanded, dl, VT));
+ return TLO.CombineTo(Op, New);
+ }
+ // if it already has all the bits set, nothing to change
+ // but don't shrink either!
+ } else if (TLO.ShrinkDemandedConstant(Op, NewMask)) {
+ return true;
+ }
+ }
+
+ KnownZero = KnownZeroOut;
+ KnownOne = KnownOneOut;
+ break;
+ case ISD::SELECT:
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SELECT_CC:
+ if (SimplifyDemandedBits(Op.getOperand(3), NewMask, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(2), NewMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?");
+
+ // If the operands are constants, see if we can simplify them.
+ if (TLO.ShrinkDemandedConstant(Op, NewMask))
+ return true;
+
+ // Only known if known in both the LHS and RHS.
+ KnownOne &= KnownOne2;
+ KnownZero &= KnownZero2;
+ break;
+ case ISD::SHL:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ unsigned ShAmt = SA->getZExtValue();
+ SDValue InOp = Op.getOperand(0);
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
+ if (InOp.getOpcode() == ISD::SRL &&
+ isa<ConstantSDNode>(InOp.getOperand(1))) {
+ if (ShAmt && (NewMask & APInt::getLowBitsSet(BitWidth, ShAmt)) == 0) {
+ unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+ unsigned Opc = ISD::SHL;
+ int Diff = ShAmt-C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SRL;
+ }
+
+ SDValue NewSA =
+ TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+ EVT VT = Op.getValueType();
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+ InOp.getOperand(0), NewSA));
+ }
+ }
+
+ if (SimplifyDemandedBits(InOp, NewMask.lshr(ShAmt),
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+
+ // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
+ // are not demanded. This will likely allow the anyext to be folded away.
+ if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) {
+ SDValue InnerOp = InOp.getNode()->getOperand(0);
+ EVT InnerVT = InnerOp.getValueType();
+ unsigned InnerBits = InnerVT.getSizeInBits();
+ if (ShAmt < InnerBits && NewMask.lshr(InnerBits) == 0 &&
+ isTypeDesirableForOp(ISD::SHL, InnerVT)) {
+ EVT ShTy = getShiftAmountTy(InnerVT, DL);
+ if (!APInt(BitWidth, ShAmt).isIntN(ShTy.getSizeInBits()))
+ ShTy = InnerVT;
+ SDValue NarrowShl =
+ TLO.DAG.getNode(ISD::SHL, dl, InnerVT, InnerOp,
+ TLO.DAG.getConstant(ShAmt, dl, ShTy));
+ return
+ TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::ANY_EXTEND, dl, Op.getValueType(),
+ NarrowShl));
+ }
+ // Repeat the SHL optimization above in cases where an extension
+ // intervenes: (shl (anyext (shr x, c1)), c2) to
+ // (shl (anyext x), c2-c1). This requires that the bottom c1 bits
+ // aren't demanded (as above) and that the shifted upper c1 bits of
+ // x aren't demanded.
+ if (InOp.hasOneUse() &&
+ InnerOp.getOpcode() == ISD::SRL &&
+ InnerOp.hasOneUse() &&
+ isa<ConstantSDNode>(InnerOp.getOperand(1))) {
+ uint64_t InnerShAmt = cast<ConstantSDNode>(InnerOp.getOperand(1))
+ ->getZExtValue();
+ if (InnerShAmt < ShAmt &&
+ InnerShAmt < InnerBits &&
+ NewMask.lshr(InnerBits - InnerShAmt + ShAmt) == 0 &&
+ NewMask.trunc(ShAmt) == 0) {
+ SDValue NewSA =
+ TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
+ Op.getOperand(1).getValueType());
+ EVT VT = Op.getValueType();
+ SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
+ InnerOp.getOperand(0));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl, VT,
+ NewExt, NewSA));
+ }
+ }
+ }
+
+ KnownZero <<= SA->getZExtValue();
+ KnownOne <<= SA->getZExtValue();
+ // low bits known zero.
+ KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+ }
+ break;
+ case ISD::SRL:
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ EVT VT = Op.getValueType();
+ unsigned ShAmt = SA->getZExtValue();
+ unsigned VTSize = VT.getSizeInBits();
+ SDValue InOp = Op.getOperand(0);
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt InDemandedMask = (NewMask << ShAmt);
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
+ InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+
+ // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the top bits (which are shifted out)
+ // are never demanded.
+ if (InOp.getOpcode() == ISD::SHL &&
+ isa<ConstantSDNode>(InOp.getOperand(1))) {
+ if (ShAmt && (NewMask & APInt::getHighBitsSet(VTSize, ShAmt)) == 0) {
+ unsigned C1= cast<ConstantSDNode>(InOp.getOperand(1))->getZExtValue();
+ unsigned Opc = ISD::SRL;
+ int Diff = ShAmt-C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SHL;
+ }
+
+ SDValue NewSA =
+ TLO.DAG.getConstant(Diff, dl, Op.getOperand(1).getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, dl, VT,
+ InOp.getOperand(0), NewSA));
+ }
+ }
+
+ // Compute the new bits that are at the top now.
+ if (SimplifyDemandedBits(InOp, InDemandedMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+ KnownZero |= HighBits; // High bits known zero.
+ }
+ break;
+ case ISD::SRA:
+ // If this is an arithmetic shift right and only the low-bit is set, we can
+ // always convert this into a logical shr, even if the shift amount is
+ // variable. The low bit of the shift cannot be an input sign bit unless
+ // the shift amount is >= the size of the datatype, which is undefined.
+ if (NewMask == 1)
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::SRL, dl, Op.getValueType(),
+ Op.getOperand(0), Op.getOperand(1)));
+
+ if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ EVT VT = Op.getValueType();
+ unsigned ShAmt = SA->getZExtValue();
+
+ // If the shift count is an invalid immediate, don't do anything.
+ if (ShAmt >= BitWidth)
+ break;
+
+ APInt InDemandedMask = (NewMask << ShAmt);
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
+ InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
+ if (HighBits.intersects(NewMask))
+ InDemandedMask |= APInt::getSignBit(VT.getScalarType().getSizeInBits());
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.lshr(ShAmt);
+ KnownOne = KnownOne.lshr(ShAmt);
+
+ // Handle the sign bit, adjusted to where it is now in the mask.
+ APInt SignBit = APInt::getSignBit(BitWidth).lshr(ShAmt);
+
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (KnownZero.intersects(SignBit) || (HighBits & ~NewMask) == HighBits) {
+ SDNodeFlags Flags;
+ Flags.setExact(cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact());
+ return TLO.CombineTo(Op,
+ TLO.DAG.getNode(ISD::SRL, dl, VT, Op.getOperand(0),
+ Op.getOperand(1), &Flags));
+ }
+
+ int Log2 = NewMask.exactLogBase2();
+ if (Log2 >= 0) {
+ // The bit must come from the sign.
+ SDValue NewSA =
+ TLO.DAG.getConstant(BitWidth - 1 - Log2, dl,
+ Op.getOperand(1).getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT,
+ Op.getOperand(0), NewSA));
+ }
+
+ if (KnownOne.intersects(SignBit))
+ // New bits are known one.
+ KnownOne |= HighBits;
+ }
+ break;
+ case ISD::SIGN_EXTEND_INREG: {
+ EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+
+ APInt MsbMask = APInt::getHighBitsSet(BitWidth, 1);
+ // If we only care about the highest bit, don't bother shifting right.
+ if (MsbMask == NewMask) {
+ unsigned ShAmt = ExVT.getScalarType().getSizeInBits();
+ SDValue InOp = Op.getOperand(0);
+ unsigned VTBits = Op->getValueType(0).getScalarType().getSizeInBits();
+ bool AlreadySignExtended =
+ TLO.DAG.ComputeNumSignBits(InOp) >= VTBits-ShAmt+1;
+ // However if the input is already sign extended we expect the sign
+ // extension to be dropped altogether later and do not simplify.
+ if (!AlreadySignExtended) {
+ // Compute the correct shift amount type, which must be getShiftAmountTy
+ // for scalar types after legalization.
+ EVT ShiftAmtTy = Op.getValueType();
+ if (TLO.LegalTypes() && !ShiftAmtTy.isVector())
+ ShiftAmtTy = getShiftAmountTy(ShiftAmtTy, DL);
+
+ SDValue ShiftAmt = TLO.DAG.getConstant(BitWidth - ShAmt, dl,
+ ShiftAmtTy);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+ Op.getValueType(), InOp,
+ ShiftAmt));
+ }
+ }
+
+ // Sign extension. Compute the demanded bits in the result that are not
+ // present in the input.
+ APInt NewBits =
+ APInt::getHighBitsSet(BitWidth,
+ BitWidth - ExVT.getScalarType().getSizeInBits());
+
+ // If none of the extended bits are demanded, eliminate the sextinreg.
+ if ((NewBits & NewMask) == 0)
+ return TLO.CombineTo(Op, Op.getOperand(0));
+
+ APInt InSignBit =
+ APInt::getSignBit(ExVT.getScalarType().getSizeInBits()).zext(BitWidth);
+ APInt InputDemandedBits =
+ APInt::getLowBitsSet(BitWidth,
+ ExVT.getScalarType().getSizeInBits()) &
+ NewMask;
+
+ // Since the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ InputDemandedBits |= InSignBit;
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InputDemandedBits,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ // If the sign bit of the input is known set or clear, then we know the
+ // top bits of the result.
+
+ // If the input sign bit is known zero, convert this into a zero extension.
+ if (KnownZero.intersects(InSignBit))
+ return TLO.CombineTo(Op,
+ TLO.DAG.getZeroExtendInReg(Op.getOperand(0),dl,ExVT));
+
+ if (KnownOne.intersects(InSignBit)) { // Input sign bit known set
+ KnownOne |= NewBits;
+ KnownZero &= ~NewBits;
+ } else { // Input sign bit unknown
+ KnownZero &= ~NewBits;
+ KnownOne &= ~NewBits;
+ }
+ break;
+ }
+ case ISD::BUILD_PAIR: {
+ EVT HalfVT = Op.getOperand(0).getValueType();
+ unsigned HalfBitWidth = HalfVT.getScalarSizeInBits();
+
+ APInt MaskLo = NewMask.getLoBits(HalfBitWidth).trunc(HalfBitWidth);
+ APInt MaskHi = NewMask.getHiBits(HalfBitWidth).trunc(HalfBitWidth);
+
+ APInt KnownZeroLo, KnownOneLo;
+ APInt KnownZeroHi, KnownOneHi;
+
+ if (SimplifyDemandedBits(Op.getOperand(0), MaskLo, KnownZeroLo,
+ KnownOneLo, TLO, Depth + 1))
+ return true;
+
+ if (SimplifyDemandedBits(Op.getOperand(1), MaskHi, KnownZeroHi,
+ KnownOneHi, TLO, Depth + 1))
+ return true;
+
+ KnownZero = KnownZeroLo.zext(BitWidth) |
+ KnownZeroHi.zext(BitWidth).shl(HalfBitWidth);
+
+ KnownOne = KnownOneLo.zext(BitWidth) |
+ KnownOneHi.zext(BitWidth).shl(HalfBitWidth);
+ break;
+ }
+ case ISD::ZERO_EXTEND: {
+ unsigned OperandBitWidth =
+ Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+ APInt InMask = NewMask.trunc(OperandBitWidth);
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ APInt NewBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - OperandBitWidth) & NewMask;
+ if (!NewBits.intersects(NewMask))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+ KnownZero |= NewBits;
+ break;
+ }
+ case ISD::SIGN_EXTEND: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned InBits = InVT.getScalarType().getSizeInBits();
+ APInt InMask = APInt::getLowBitsSet(BitWidth, InBits);
+ APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+ APInt NewBits = ~InMask & NewMask;
+
+ // If none of the top bits are demanded, convert this into an any_extend.
+ if (NewBits == 0)
+ return TLO.CombineTo(Op,TLO.DAG.getNode(ISD::ANY_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ // Since some of the sign extended bits are demanded, we know that the sign
+ // bit is demanded.
+ APInt InDemandedBits = InMask & NewMask;
+ InDemandedBits |= InSignBit;
+ InDemandedBits = InDemandedBits.trunc(InBits);
+
+ if (SimplifyDemandedBits(Op.getOperand(0), InDemandedBits, KnownZero,
+ KnownOne, TLO, Depth+1))
+ return true;
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+
+ // If the sign bit is known zero, convert this to a zero extend.
+ if (KnownZero.intersects(InSignBit))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::ZERO_EXTEND, dl,
+ Op.getValueType(),
+ Op.getOperand(0)));
+
+ // If the sign bit is known one, the top bits match.
+ if (KnownOne.intersects(InSignBit)) {
+ KnownOne |= NewBits;
+ assert((KnownZero & NewBits) == 0);
+ } else { // Otherwise, top bits aren't known.
+ assert((KnownOne & NewBits) == 0);
+ assert((KnownZero & NewBits) == 0);
+ }
+ break;
+ }
+ case ISD::ANY_EXTEND: {
+ unsigned OperandBitWidth =
+ Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+ APInt InMask = NewMask.trunc(OperandBitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(0), InMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ KnownZero = KnownZero.zext(BitWidth);
+ KnownOne = KnownOne.zext(BitWidth);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // Simplify the input, using demanded bit information, and compute the known
+ // zero/one bits live out.
+ unsigned OperandBitWidth =
+ Op.getOperand(0).getValueType().getScalarType().getSizeInBits();
+ APInt TruncMask = NewMask.zext(OperandBitWidth);
+ if (SimplifyDemandedBits(Op.getOperand(0), TruncMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ KnownZero = KnownZero.trunc(BitWidth);
+ KnownOne = KnownOne.trunc(BitWidth);
+
+ // If the input is only used by this truncate, see if we can shrink it based
+ // on the known demanded bits.
+ if (Op.getOperand(0).getNode()->hasOneUse()) {
+ SDValue In = Op.getOperand(0);
+ switch (In.getOpcode()) {
+ default: break;
+ case ISD::SRL:
+ // Shrink SRL by a constant if none of the high bits shifted in are
+ // demanded.
+ if (TLO.LegalTypes() &&
+ !isTypeDesirableForOp(ISD::SRL, Op.getValueType()))
+ // Do not turn (vt1 truncate (vt2 srl)) into (vt1 srl) if vt1 is
+ // undesirable.
+ break;
+ ConstantSDNode *ShAmt = dyn_cast<ConstantSDNode>(In.getOperand(1));
+ if (!ShAmt)
+ break;
+ SDValue Shift = In.getOperand(1);
+ if (TLO.LegalTypes()) {
+ uint64_t ShVal = ShAmt->getZExtValue();
+ Shift = TLO.DAG.getConstant(ShVal, dl,
+ getShiftAmountTy(Op.getValueType(), DL));
+ }
+
+ APInt HighBits = APInt::getHighBitsSet(OperandBitWidth,
+ OperandBitWidth - BitWidth);
+ HighBits = HighBits.lshr(ShAmt->getZExtValue()).trunc(BitWidth);
+
+ if (ShAmt->getZExtValue() < BitWidth && !(HighBits & NewMask)) {
+ // None of the shifted in bits are needed. Add a truncate of the
+ // shift input, then shift it.
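+ // Illustrative: for (i32 truncate (i64 srl X, 4)), only result bits 28-31
+ // are fed from the high half of X; if those are not demanded, it is safe
+ // to truncate X to i32 first and shift the narrower value by 4.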
+ SDValue NewTrunc = TLO.DAG.getNode(ISD::TRUNCATE, dl,
+ Op.getValueType(),
+ In.getOperand(0));
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl,
+ Op.getValueType(),
+ NewTrunc,
+ Shift));
+ }
+ break;
+ }
+ }
+
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+ break;
+ }
+ case ISD::AssertZext: {
+ // AssertZext demands all of the high bits, plus any of the low bits
+ // demanded by its users.
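+ // Illustrative: an AssertZext i8 of an i32 value demands bits 8-31 of its
+ // operand unconditionally (they are asserted to be zero), plus whatever
+ // subset of bits 0-7 NewMask says the users need.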
+ EVT VT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+ APInt InMask = APInt::getLowBitsSet(BitWidth,
+ VT.getSizeInBits());
+ if (SimplifyDemandedBits(Op.getOperand(0), ~InMask | NewMask,
+ KnownZero, KnownOne, TLO, Depth+1))
+ return true;
+ assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
+
+ KnownZero |= ~InMask & NewMask;
+ break;
+ }
+ case ISD::BITCAST:
+ // If this is an FP->Int bitcast and if the sign bit is the only
+ // thing demanded, turn this into a FGETSIGN.
+ if (!TLO.LegalOperations() &&
+ !Op.getValueType().isVector() &&
+ !Op.getOperand(0).getValueType().isVector() &&
+ NewMask == APInt::getSignBit(Op.getValueType().getSizeInBits()) &&
+ Op.getOperand(0).getValueType().isFloatingPoint()) {
+ bool OpVTLegal = isOperationLegalOrCustom(ISD::FGETSIGN, Op.getValueType());
+ bool i32Legal = isOperationLegalOrCustom(ISD::FGETSIGN, MVT::i32);
+ if ((OpVTLegal || i32Legal) && Op.getValueType().isSimple() &&
+ Op.getOperand(0).getValueType() != MVT::f128) {
+ // Cannot eliminate/lower SHL for f128 yet.
+ EVT Ty = OpVTLegal ? Op.getValueType() : MVT::i32;
+ // Make a FGETSIGN + SHL to move the sign bit into the appropriate
+ // place. We expect the SHL to be eliminated by other optimizations.
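+ // Illustrative: for (i64 bitcast (f64 X)) where only bit 63 is demanded,
+ // this emits FGETSIGN of X (extended if needed) shifted left by 63; later
+ // combines are expected to fold the shift away.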
+ SDValue Sign = TLO.DAG.getNode(ISD::FGETSIGN, dl, Ty, Op.getOperand(0));
+ unsigned OpVTSizeInBits = Op.getValueType().getSizeInBits();
+ if (!OpVTLegal && OpVTSizeInBits > 32)
+ Sign = TLO.DAG.getNode(ISD::ZERO_EXTEND, dl, Op.getValueType(), Sign);
+ unsigned ShVal = Op.getValueType().getSizeInBits()-1;
+ SDValue ShAmt = TLO.DAG.getConstant(ShVal, dl, Op.getValueType());
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SHL, dl,
+ Op.getValueType(),
+ Sign, ShAmt));
+ }
+ }
+ break;
+ case ISD::ADD:
+ case ISD::MUL:
+ case ISD::SUB: {
+ // Add, Sub, and Mul don't demand any bits in positions beyond that
+ // of the highest bit demanded of them.
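+ // Illustrative: if only the low 8 bits of (add X, Y) are demanded, only
+ // the low 8 bits of X and Y matter; carries propagate from low bits to
+ // high bits, never the other way around.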
+ APInt LoMask = APInt::getLowBitsSet(BitWidth,
+ BitWidth - NewMask.countLeadingZeros());
+ if (SimplifyDemandedBits(Op.getOperand(0), LoMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ if (SimplifyDemandedBits(Op.getOperand(1), LoMask, KnownZero2,
+ KnownOne2, TLO, Depth+1))
+ return true;
+ // See if the operation should be performed at a smaller bit width.
+ if (TLO.ShrinkDemandedOp(Op, BitWidth, NewMask, dl))
+ return true;
+ }
+ // FALL THROUGH
+ default:
+ // Just use computeKnownBits to compute output bits.
+ TLO.DAG.computeKnownBits(Op, KnownZero, KnownOne, Depth);
+ break;
+ }
+
+ // If we know the value of all of the demanded bits, return this as a
+ // constant.
+ if ((NewMask & (KnownZero|KnownOne)) == NewMask) {
+ // Avoid folding to a constant if any OpaqueConstant is involved.
+ const SDNode *N = Op.getNode();
+ for (SDNodeIterator I = SDNodeIterator::begin(N),
+ E = SDNodeIterator::end(N); I != E; ++I) {
+ SDNode *Op = *I;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op))
+ if (C->isOpaque())
+ return false;
+ }
+ return TLO.CombineTo(Op,
+ TLO.DAG.getConstant(KnownOne, dl, Op.getValueType()));
+ }
+
+ return false;
+}
+
+/// Determine which of the bits specified in Mask are known to be either zero or
+/// one and return them in the KnownZero/KnownOne bitsets.
+void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
+ APInt &KnownZero,
+ APInt &KnownOne,
+ const SelectionDAG &DAG,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use MaskedValueIsZero if you don't know whether Op"
+ " is a target node!");
+ KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0);
+}
+
+/// This method can be implemented by targets that want to expose additional
+/// information about sign bits to the DAG Combiner.
+unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+ const SelectionDAG &,
+ unsigned Depth) const {
+ assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
+ Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
+ Op.getOpcode() == ISD::INTRINSIC_VOID) &&
+ "Should use ComputeNumSignBits if you don't know whether Op"
+ " is a target node!");
+ return 1;
+}
+
+bool TargetLowering::isConstTrueVal(const SDNode *N) const {
+ if (!N)
+ return false;
+
+ const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+ if (!CN) {
+ const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || UndefElements.any())
+ return false;
+ }
+
+ switch (getBooleanContents(N->getValueType(0))) {
+ case UndefinedBooleanContent:
+ return CN->getAPIntValue()[0];
+ case ZeroOrOneBooleanContent:
+ return CN->isOne();
+ case ZeroOrNegativeOneBooleanContent:
+ return CN->isAllOnesValue();
+ }
+
+ llvm_unreachable("Invalid boolean contents");
+}
+
+bool TargetLowering::isConstFalseVal(const SDNode *N) const {
+ if (!N)
+ return false;
+
+ const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N);
+ if (!CN) {
+ const BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(N);
+ if (!BV)
+ return false;
+
+ BitVector UndefElements;
+ CN = BV->getConstantSplatNode(&UndefElements);
+ // Only interested in constant splats, and we don't try to handle undef
+ // elements in identifying boolean constants.
+ if (!CN || UndefElements.any())
+ return false;
+ }
+
+ if (getBooleanContents(N->getValueType(0)) == UndefinedBooleanContent)
+ return !CN->getAPIntValue()[0];
+
+ return CN->isNullValue();
+}
+
+bool TargetLowering::isExtendedTrueVal(const ConstantSDNode *N, EVT VT,
+ bool SExt) const {
+ if (VT == MVT::i1)
+ return N->isOne();
+
+ TargetLowering::BooleanContent Cnt = getBooleanContents(VT);
+ switch (Cnt) {
+ case TargetLowering::ZeroOrOneBooleanContent:
+ // An extended value of 1 is always true, unless its original type is i1,
+ // in which case it will be sign extended to -1.
+ return (N->isOne() && !SExt) || (SExt && (N->getValueType(0) != MVT::i1));
+ case TargetLowering::UndefinedBooleanContent:
+ case TargetLowering::ZeroOrNegativeOneBooleanContent:
+ return N->isAllOnesValue() && SExt;
+ }
+ llvm_unreachable("Unexpected enumeration.");
+}
+
+/// This helper function of SimplifySetCC tries to optimize the comparison when
+/// either operand of the SetCC node is a bitwise-and instruction.
+SDValue TargetLowering::simplifySetCCWithAnd(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond,
+ DAGCombinerInfo &DCI,
+ const SDLoc &DL) const {
+ // Match these patterns in any of their permutations:
+ // (X & Y) == Y
+ // (X & Y) != Y
+ if (N1.getOpcode() == ISD::AND && N0.getOpcode() != ISD::AND)
+ std::swap(N0, N1);
+
+ EVT OpVT = N0.getValueType();
+ if (N0.getOpcode() != ISD::AND || !OpVT.isInteger() ||
+ (Cond != ISD::SETEQ && Cond != ISD::SETNE))
+ return SDValue();
+
+ SDValue X, Y;
+ if (N0.getOperand(0) == N1) {
+ X = N0.getOperand(1);
+ Y = N0.getOperand(0);
+ } else if (N0.getOperand(1) == N1) {
+ X = N0.getOperand(0);
+ Y = N0.getOperand(1);
+ } else {
+ return SDValue();
+ }
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Zero = DAG.getConstant(0, DL, OpVT);
+ if (DAG.isKnownToBeAPowerOfTwo(Y)) {
+ // Simplify X & Y == Y to X & Y != 0 if Y has exactly one bit set.
+ // Note that where Y is variable and is known to have at most one bit set
+ // (for example, if it is Z & 1) we cannot do this; the expressions are not
+ // equivalent when Y == 0.
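+ // Illustrative: with Y == 8, (X & 8) == 8 becomes (X & 8) != 0, which
+ // many targets can select as a single bit test.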
+ Cond = ISD::getSetCCInverse(Cond, /*isInteger=*/true);
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(Cond, N0.getSimpleValueType()))
+ return DAG.getSetCC(DL, VT, N0, Zero, Cond);
+ } else if (N0.hasOneUse() && hasAndNotCompare(Y)) {
+ // If the target supports an 'and-not' or 'and-complement' logic operation,
+ // try to use that to make a comparison operation more efficient.
+ // But don't do this transform if the mask is a single bit because there are
+ // more efficient ways to deal with that case (for example, 'bt' on x86 or
+ // 'rlwinm' on PPC).
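+ // Illustrative: (X & Mask) == Mask becomes (~X & Mask) == 0, which a
+ // target with an and-not instruction (e.g. ANDN with BMI on x86, or BIC
+ // on ARM) can lower to one logic op plus a compare against zero.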
+
+ // Bail out if the compare operand that we want to turn into a zero is
+ // already a zero (otherwise, infinite loop).
+ auto *YConst = dyn_cast<ConstantSDNode>(Y);
+ if (YConst && YConst->isNullValue())
+ return SDValue();
+
+ // Transform this into: ~X & Y == 0.
+ SDValue NotX = DAG.getNOT(SDLoc(X), X, OpVT);
+ SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, NotX, Y);
+ return DAG.getSetCC(DL, VT, NewAnd, Zero, Cond);
+ }
+
+ return SDValue();
+}
+
+/// Try to simplify a setcc built with the specified operands and cc. If it is
+/// unable to simplify it, return a null SDValue.
+SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
+ ISD::CondCode Cond, bool foldBooleans,
+ DAGCombinerInfo &DCI,
+ const SDLoc &dl) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ // These setcc operations always fold.
+ switch (Cond) {
+ default: break;
+ case ISD::SETFALSE:
+ case ISD::SETFALSE2: return DAG.getConstant(0, dl, VT);
+ case ISD::SETTRUE:
+ case ISD::SETTRUE2: {
+ TargetLowering::BooleanContent Cnt =
+ getBooleanContents(N0->getValueType(0));
+ return DAG.getConstant(
+ Cnt == TargetLowering::ZeroOrNegativeOneBooleanContent ? -1ULL : 1, dl,
+ VT);
+ }
+ }
+
+ // Ensure that the constant occurs on the RHS, and fold constant
+ // comparisons.
+ ISD::CondCode SwappedCC = ISD::getSetCCSwappedOperands(Cond);
+ if (isa<ConstantSDNode>(N0.getNode()) &&
+ (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(SwappedCC, N0.getSimpleValueType())))
+ return DAG.getSetCC(dl, VT, N1, N0, SwappedCC);
+
+ if (auto *N1C = dyn_cast<ConstantSDNode>(N1.getNode())) {
+ const APInt &C1 = N1C->getAPIntValue();
+
+ // If the LHS is '(srl (ctlz x), 5)', the RHS is 0/1, and this is an
+ // equality comparison, then we're just comparing whether X itself is
+ // zero.
+ if (N0.getOpcode() == ISD::SRL && (C1 == 0 || C1 == 1) &&
+ N0.getOperand(0).getOpcode() == ISD::CTLZ &&
+ N0.getOperand(1).getOpcode() == ISD::Constant) {
+ const APInt &ShAmt
+ = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ ShAmt == Log2_32(N0.getValueType().getSizeInBits())) {
+ if ((C1 == 0) == (Cond == ISD::SETEQ)) {
+ // (srl (ctlz x), 5) == 0 -> X != 0
+ // (srl (ctlz x), 5) != 1 -> X != 0
+ Cond = ISD::SETNE;
+ } else {
+ // (srl (ctlz x), 5) != 0 -> X == 0
+ // (srl (ctlz x), 5) == 1 -> X == 0
+ Cond = ISD::SETEQ;
+ }
+ SDValue Zero = DAG.getConstant(0, dl, N0.getValueType());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0).getOperand(0),
+ Zero, Cond);
+ }
+ }
+
+ SDValue CTPOP = N0;
+ // Look through truncs that don't change the value of a ctpop.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::TRUNCATE)
+ CTPOP = N0.getOperand(0);
+
+ if (CTPOP.hasOneUse() && CTPOP.getOpcode() == ISD::CTPOP &&
+ (N0 == CTPOP || N0.getValueType().getSizeInBits() >
+ Log2_32_Ceil(CTPOP.getValueType().getSizeInBits()))) {
+ EVT CTVT = CTPOP.getValueType();
+ SDValue CTOp = CTPOP.getOperand(0);
+
+ // (ctpop x) u< 2 -> (x & x-1) == 0
+ // (ctpop x) u> 1 -> (x & x-1) != 0
+ if ((Cond == ISD::SETULT && C1 == 2) || (Cond == ISD::SETUGT && C1 == 1)){
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, CTVT, CTOp,
+ DAG.getConstant(1, dl, CTVT));
+ SDValue And = DAG.getNode(ISD::AND, dl, CTVT, CTOp, Sub);
+ ISD::CondCode CC = Cond == ISD::SETULT ? ISD::SETEQ : ISD::SETNE;
+ return DAG.getSetCC(dl, VT, And, DAG.getConstant(0, dl, CTVT), CC);
+ }
+
+ // TODO: (ctpop x) == 1 -> x && (x & x-1) == 0 iff ctpop is illegal.
+ }
+
+ // (zext x) == C --> x == (trunc C)
+ // (sext x) == C --> x == (trunc C)
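+ // Illustrative: (i32 zero_extend (i8 X)) == 42 can be tested as
+ // (i8 X) == 42 because 42 is representable in 8 bits; the
+ // MinBits >= ReqdBits check below rejects constants that are not.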
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ DCI.isBeforeLegalize() && N0->hasOneUse()) {
+ unsigned MinBits = N0.getValueSizeInBits();
+ SDValue PreExt;
+ bool Signed = false;
+ if (N0->getOpcode() == ISD::ZERO_EXTEND) {
+ // ZExt
+ MinBits = N0->getOperand(0).getValueSizeInBits();
+ PreExt = N0->getOperand(0);
+ } else if (N0->getOpcode() == ISD::AND) {
+ // DAGCombine turns costly ZExts into ANDs
+ if (auto *C = dyn_cast<ConstantSDNode>(N0->getOperand(1)))
+ if ((C->getAPIntValue()+1).isPowerOf2()) {
+ MinBits = C->getAPIntValue().countTrailingOnes();
+ PreExt = N0->getOperand(0);
+ }
+ } else if (N0->getOpcode() == ISD::SIGN_EXTEND) {
+ // SExt
+ MinBits = N0->getOperand(0).getValueSizeInBits();
+ PreExt = N0->getOperand(0);
+ Signed = true;
+ } else if (auto *LN0 = dyn_cast<LoadSDNode>(N0)) {
+ // ZEXTLOAD / SEXTLOAD
+ if (LN0->getExtensionType() == ISD::ZEXTLOAD) {
+ MinBits = LN0->getMemoryVT().getSizeInBits();
+ PreExt = N0;
+ } else if (LN0->getExtensionType() == ISD::SEXTLOAD) {
+ Signed = true;
+ MinBits = LN0->getMemoryVT().getSizeInBits();
+ PreExt = N0;
+ }
+ }
+
+ // Figure out how many bits we need to preserve this constant.
+ unsigned ReqdBits = Signed ?
+ C1.getBitWidth() - C1.getNumSignBits() + 1 :
+ C1.getActiveBits();
+
+ // Make sure we're not losing bits from the constant.
+ if (MinBits > 0 &&
+ MinBits < C1.getBitWidth() &&
+ MinBits >= ReqdBits) {
+ EVT MinVT = EVT::getIntegerVT(*DAG.getContext(), MinBits);
+ if (isTypeDesirableForOp(ISD::SETCC, MinVT)) {
+ // Will get folded away.
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, MinVT, PreExt);
+ SDValue C = DAG.getConstant(C1.trunc(MinBits), dl, MinVT);
+ return DAG.getSetCC(dl, VT, Trunc, C, Cond);
+ }
+
+ // If truncating the setcc operands is not desirable, we can still
+ // simplify the expression in some cases:
+ // setcc ([sz]ext (setcc x, y, cc)), 0, setne) -> setcc (x, y, cc)
+ // setcc ([sz]ext (setcc x, y, cc)), 0, seteq) -> setcc (x, y, inv(cc))
+ // setcc (zext (setcc x, y, cc)), 1, setne) -> setcc (x, y, inv(cc))
+ // setcc (zext (setcc x, y, cc)), 1, seteq) -> setcc (x, y, cc)
+ // setcc (sext (setcc x, y, cc)), -1, setne) -> setcc (x, y, inv(cc))
+ // setcc (sext (setcc x, y, cc)), -1, seteq) -> setcc (x, y, cc)
+ SDValue TopSetCC = N0->getOperand(0);
+ unsigned N0Opc = N0->getOpcode();
+ bool SExt = (N0Opc == ISD::SIGN_EXTEND);
+ if (TopSetCC.getValueType() == MVT::i1 && VT == MVT::i1 &&
+ TopSetCC.getOpcode() == ISD::SETCC &&
+ (N0Opc == ISD::ZERO_EXTEND || N0Opc == ISD::SIGN_EXTEND) &&
+ (isConstFalseVal(N1C) ||
+ isExtendedTrueVal(N1C, N0->getValueType(0), SExt))) {
+
+ bool Inverse = (N1C->isNullValue() && Cond == ISD::SETEQ) ||
+ (!N1C->isNullValue() && Cond == ISD::SETNE);
+
+ if (!Inverse)
+ return TopSetCC;
+
+ ISD::CondCode InvCond = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(TopSetCC.getOperand(2))->get(),
+ TopSetCC.getOperand(0).getValueType().isInteger());
+ return DAG.getSetCC(dl, VT, TopSetCC.getOperand(0),
+ TopSetCC.getOperand(1),
+ InvCond);
+
+ }
+ }
+ }
+
+ // If the LHS is '(and load, const)', the RHS is 0,
+ // the test is for equality or unsigned, and all 1 bits of the const are
+ // in the same partial word, see if we can shorten the load.
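+ // Illustrative: ((i32 load P) & 0xFF00) == 0 can be narrowed to an i8
+ // load of the byte holding bits 8-15 (offset 1 on a little-endian
+ // target), masked with 0xFF and compared against zero.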
+ if (DCI.isBeforeLegalize() &&
+ !ISD::isSignedIntSetCC(Cond) &&
+ N0.getOpcode() == ISD::AND && C1 == 0 &&
+ N0.getNode()->hasOneUse() &&
+ isa<LoadSDNode>(N0.getOperand(0)) &&
+ N0.getOperand(0).getNode()->hasOneUse() &&
+ isa<ConstantSDNode>(N0.getOperand(1))) {
+ LoadSDNode *Lod = cast<LoadSDNode>(N0.getOperand(0));
+ APInt bestMask;
+ unsigned bestWidth = 0, bestOffset = 0;
+ if (!Lod->isVolatile() && Lod->isUnindexed()) {
+ unsigned origWidth = N0.getValueType().getSizeInBits();
+ unsigned maskWidth = origWidth;
+ // We can narrow (e.g.) 16-bit extending loads on 32-bit target to
+ // 8 bits, but have to be careful...
+ if (Lod->getExtensionType() != ISD::NON_EXTLOAD)
+ origWidth = Lod->getMemoryVT().getSizeInBits();
+ const APInt &Mask =
+ cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
+ for (unsigned width = origWidth / 2; width>=8; width /= 2) {
+ APInt newMask = APInt::getLowBitsSet(maskWidth, width);
+ for (unsigned offset=0; offset<origWidth/width; offset++) {
+ if ((newMask & Mask) == Mask) {
+ if (!DAG.getDataLayout().isLittleEndian())
+ bestOffset = (origWidth/width - offset - 1) * (width/8);
+ else
+ bestOffset = (uint64_t)offset * (width/8);
+ bestMask = Mask.lshr(offset * (width/8) * 8);
+ bestWidth = width;
+ break;
+ }
+ newMask = newMask << width;
+ }
+ }
+ }
+ if (bestWidth) {
+ EVT newVT = EVT::getIntegerVT(*DAG.getContext(), bestWidth);
+ if (newVT.isRound()) {
+ EVT PtrType = Lod->getOperand(1).getValueType();
+ SDValue Ptr = Lod->getBasePtr();
+ if (bestOffset != 0)
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrType, Lod->getBasePtr(),
+ DAG.getConstant(bestOffset, dl, PtrType));
+ unsigned NewAlign = MinAlign(Lod->getAlignment(), bestOffset);
+ SDValue NewLoad = DAG.getLoad(
+ newVT, dl, Lod->getChain(), Ptr,
+ Lod->getPointerInfo().getWithOffset(bestOffset), NewAlign);
+ return DAG.getSetCC(dl, VT,
+ DAG.getNode(ISD::AND, dl, newVT, NewLoad,
+ DAG.getConstant(bestMask.trunc(bestWidth),
+ dl, newVT)),
+ DAG.getConstant(0LL, dl, newVT), Cond);
+ }
+ }
+ }
+
+ // If the LHS is a ZERO_EXTEND, perform the comparison on the input.
+ if (N0.getOpcode() == ISD::ZERO_EXTEND) {
+ unsigned InSize = N0.getOperand(0).getValueType().getSizeInBits();
+
+ // If the comparison constant has bits in the upper part, the
+ // zero-extended value could never match.
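+ // Illustrative: (i32 zero_extend (i8 X)) == 0x100 is always false, so the
+ // equality and the unsigned orderings fold directly to constants here.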
+ if (C1.intersects(APInt::getHighBitsSet(C1.getBitWidth(),
+ C1.getBitWidth() - InSize))) {
+ switch (Cond) {
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETEQ: return DAG.getConstant(0, dl, VT);
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETNE: return DAG.getConstant(1, dl, VT);
+ case ISD::SETGT:
+ case ISD::SETGE:
+ // True if the sign bit of C1 is set.
+ return DAG.getConstant(C1.isNegative(), dl, VT);
+ case ISD::SETLT:
+ case ISD::SETLE:
+ // True if the sign bit of C1 isn't set.
+ return DAG.getConstant(C1.isNonNegative(), dl, VT);
+ default:
+ break;
+ }
+ }
+
+ // Otherwise, we can perform the comparison with the low bits.
+ switch (Cond) {
+ case ISD::SETEQ:
+ case ISD::SETNE:
+ case ISD::SETUGT:
+ case ISD::SETUGE:
+ case ISD::SETULT:
+ case ISD::SETULE: {
+ EVT newVT = N0.getOperand(0).getValueType();
+ if (DCI.isBeforeLegalizeOps() ||
+ (isOperationLegal(ISD::SETCC, newVT) &&
+ getCondCodeAction(Cond, newVT.getSimpleVT()) == Legal)) {
+ EVT NewSetCCVT =
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), newVT);
+ SDValue NewConst = DAG.getConstant(C1.trunc(InSize), dl, newVT);
+
+ SDValue NewSetCC = DAG.getSetCC(dl, NewSetCCVT, N0.getOperand(0),
+ NewConst, Cond);
+ return DAG.getBoolExtOrTrunc(NewSetCC, dl, VT, N0.getValueType());
+ }
+ break;
+ }
+ default:
+ break; // TODO: be more careful with signed comparisons
+ }
+ } else if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ EVT ExtSrcTy = cast<VTSDNode>(N0.getOperand(1))->getVT();
+ unsigned ExtSrcTyBits = ExtSrcTy.getSizeInBits();
+ EVT ExtDstTy = N0.getValueType();
+ unsigned ExtDstTyBits = ExtDstTy.getSizeInBits();
+
+ // If the constant doesn't fit into the number of bits for the source of
+ // the sign extension, it is impossible for both sides to be equal.
+ if (C1.getMinSignedBits() > ExtSrcTyBits)
+ return DAG.getConstant(Cond == ISD::SETNE, dl, VT);
+
+ SDValue ZextOp;
+ EVT Op0Ty = N0.getOperand(0).getValueType();
+ if (Op0Ty == ExtSrcTy) {
+ ZextOp = N0.getOperand(0);
+ } else {
+ APInt Imm = APInt::getLowBitsSet(ExtDstTyBits, ExtSrcTyBits);
+ ZextOp = DAG.getNode(ISD::AND, dl, Op0Ty, N0.getOperand(0),
+ DAG.getConstant(Imm, dl, Op0Ty));
+ }
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(ZextOp.getNode());
+ // Otherwise, make this a use of a zext.
+ return DAG.getSetCC(dl, VT, ZextOp,
+ DAG.getConstant(C1 & APInt::getLowBitsSet(
+ ExtDstTyBits,
+ ExtSrcTyBits),
+ dl, ExtDstTy),
+ Cond);
+ } else if ((N1C->isNullValue() || N1C->getAPIntValue() == 1) &&
+ (Cond == ISD::SETEQ || Cond == ISD::SETNE)) {
+ // SETCC (SETCC), [0|1], [EQ|NE] -> SETCC
+ if (N0.getOpcode() == ISD::SETCC &&
+ isTypeLegal(VT) && VT.bitsLE(N0.getValueType())) {
+ bool TrueWhenTrue = (Cond == ISD::SETEQ) ^ (N1C->getAPIntValue() != 1);
+ if (TrueWhenTrue)
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
+ // Invert the condition.
+ ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+ CC = ISD::getSetCCInverse(CC,
+ N0.getOperand(0).getValueType().isInteger());
+ if (DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(CC, N0.getOperand(0).getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
+ }
+
+ if ((N0.getOpcode() == ISD::XOR ||
+ (N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR &&
+ N0.getOperand(1) == N0.getOperand(0).getOperand(1))) &&
+ isa<ConstantSDNode>(N0.getOperand(1)) &&
+ cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue() == 1) {
+ // If this is (X^1) == 0/1, swap the RHS and eliminate the xor. We
+ // can only do this if the top bits are known zero.
+ unsigned BitWidth = N0.getValueSizeInBits();
+ if (DAG.MaskedValueIsZero(N0,
+ APInt::getHighBitsSet(BitWidth,
+ BitWidth-1))) {
+ // Okay, get the un-inverted input value.
+ SDValue Val;
+ if (N0.getOpcode() == ISD::XOR)
+ Val = N0.getOperand(0);
+ else {
+ assert(N0.getOpcode() == ISD::AND &&
+ N0.getOperand(0).getOpcode() == ISD::XOR);
+ // ((X^1)&1)^1 -> X & 1
+ Val = DAG.getNode(ISD::AND, dl, N0.getValueType(),
+ N0.getOperand(0).getOperand(0),
+ N0.getOperand(1));
+ }
+
+ return DAG.getSetCC(dl, VT, Val, N1,
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ } else if (N1C->getAPIntValue() == 1 &&
+ (VT == MVT::i1 ||
+ getBooleanContents(N0->getValueType(0)) ==
+ ZeroOrOneBooleanContent)) {
+ SDValue Op0 = N0;
+ if (Op0.getOpcode() == ISD::TRUNCATE)
+ Op0 = Op0.getOperand(0);
+
+ if ((Op0.getOpcode() == ISD::XOR) &&
+ Op0.getOperand(0).getOpcode() == ISD::SETCC &&
+ Op0.getOperand(1).getOpcode() == ISD::SETCC) {
+ // (xor (setcc), (setcc)) == / != 1 -> (setcc) != / == (setcc)
+ Cond = (Cond == ISD::SETEQ) ? ISD::SETNE : ISD::SETEQ;
+ return DAG.getSetCC(dl, VT, Op0.getOperand(0), Op0.getOperand(1),
+ Cond);
+ }
+ if (Op0.getOpcode() == ISD::AND &&
+ isa<ConstantSDNode>(Op0.getOperand(1)) &&
+ cast<ConstantSDNode>(Op0.getOperand(1))->getAPIntValue() == 1) {
+ // If this is (X&1) == / != 1, normalize it to (X&1) != / == 0.
+ if (Op0.getValueType().bitsGT(VT))
+ Op0 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::TRUNCATE, dl, VT, Op0.getOperand(0)),
+ DAG.getConstant(1, dl, VT));
+ else if (Op0.getValueType().bitsLT(VT))
+ Op0 = DAG.getNode(ISD::AND, dl, VT,
+ DAG.getNode(ISD::ANY_EXTEND, dl, VT, Op0.getOperand(0)),
+ DAG.getConstant(1, dl, VT));
+
+ return DAG.getSetCC(dl, VT, Op0,
+ DAG.getConstant(0, dl, Op0.getValueType()),
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ if (Op0.getOpcode() == ISD::AssertZext &&
+ cast<VTSDNode>(Op0.getOperand(1))->getVT() == MVT::i1)
+ return DAG.getSetCC(dl, VT, Op0,
+ DAG.getConstant(0, dl, Op0.getValueType()),
+ Cond == ISD::SETEQ ? ISD::SETNE : ISD::SETEQ);
+ }
+ }
+
+ APInt MinVal, MaxVal;
+ unsigned OperandBitSize = N1C->getValueType(0).getSizeInBits();
+ if (ISD::isSignedIntSetCC(Cond)) {
+ MinVal = APInt::getSignedMinValue(OperandBitSize);
+ MaxVal = APInt::getSignedMaxValue(OperandBitSize);
+ } else {
+ MinVal = APInt::getMinValue(OperandBitSize);
+ MaxVal = APInt::getMaxValue(OperandBitSize);
+ }
+
+ // Canonicalize GE/LE comparisons to use GT/LT comparisons.
+ if (Cond == ISD::SETGE || Cond == ISD::SETUGE) {
+ if (C1 == MinVal) return DAG.getConstant(1, dl, VT); // X >= MIN --> true
+ // X >= C0 --> X > (C0 - 1)
+ APInt C = C1 - 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETGE) ? ISD::SETGT : ISD::SETUGT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (N1C->isOpaque() && C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
+ }
+
+ if (Cond == ISD::SETLE || Cond == ISD::SETULE) {
+ if (C1 == MaxVal) return DAG.getConstant(1, dl, VT); // X <= MAX --> true
+ // X <= C0 --> X < (C0 + 1)
+ APInt C = C1 + 1;
+ ISD::CondCode NewCC = (Cond == ISD::SETLE) ? ISD::SETLT : ISD::SETULT;
+ if ((DCI.isBeforeLegalizeOps() ||
+ isCondCodeLegal(NewCC, VT.getSimpleVT())) &&
+ (!N1C->isOpaque() || (N1C->isOpaque() && C.getBitWidth() <= 64 &&
+ isLegalICmpImmediate(C.getSExtValue())))) {
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(C, dl, N1.getValueType()),
+ NewCC);
+ }
+ }
+
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal)
+ return DAG.getConstant(0, dl, VT); // X < MIN --> false
+ if ((Cond == ISD::SETGE || Cond == ISD::SETUGE) && C1 == MinVal)
+ return DAG.getConstant(1, dl, VT); // X >= MIN --> true
+ if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal)
+ return DAG.getConstant(0, dl, VT); // X > MAX --> false
+ if ((Cond == ISD::SETLE || Cond == ISD::SETULE) && C1 == MaxVal)
+ return DAG.getConstant(1, dl, VT); // X <= MAX --> true
+
+ // Canonicalize setgt X, Min --> setne X, Min
+ if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MinVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+ // Canonicalize setlt X, Max --> setne X, Max
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MaxVal)
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETNE);
+
+ // If we have setult X, 1, turn it into seteq X, 0
+ if ((Cond == ISD::SETLT || Cond == ISD::SETULT) && C1 == MinVal+1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MinVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+ // If we have setugt X, Max-1, turn it into seteq X, Max
+ if ((Cond == ISD::SETGT || Cond == ISD::SETUGT) && C1 == MaxVal-1)
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(MaxVal, dl, N0.getValueType()),
+ ISD::SETEQ);
+
+ // If we have "setcc X, C0", check to see if we can shrink the immediate
+ // by changing cc.
+
+ // SETUGT X, SINTMAX -> SETLT X, 0
+ if (Cond == ISD::SETUGT &&
+ C1 == APInt::getSignedMaxValue(OperandBitSize))
+ return DAG.getSetCC(dl, VT, N0,
+ DAG.getConstant(0, dl, N1.getValueType()),
+ ISD::SETLT);
+
+ // SETULT X, SINTMIN -> SETGT X, -1
+ if (Cond == ISD::SETULT &&
+ C1 == APInt::getSignedMinValue(OperandBitSize)) {
+ SDValue ConstMinusOne =
+ DAG.getConstant(APInt::getAllOnesValue(OperandBitSize), dl,
+ N1.getValueType());
+ return DAG.getSetCC(dl, VT, N0, ConstMinusOne, ISD::SETGT);
+ }
+
+ // Fold bit comparisons when we can.
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ (VT == N0.getValueType() ||
+ (isTypeLegal(VT) && VT.bitsLE(N0.getValueType()))) &&
+ N0.getOpcode() == ISD::AND) {
+ auto &DL = DAG.getDataLayout();
+ if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
+ if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3
+ // Perform the xform if the AND RHS is a single bit.
+ if (AndRHS->getAPIntValue().isPowerOf2()) {
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+ DAG.getConstant(AndRHS->getAPIntValue().logBase2(), dl,
+ ShiftTy)));
+ }
+ } else if (Cond == ISD::SETEQ && C1 == AndRHS->getAPIntValue()) {
+ // (X & 8) == 8 --> (X & 8) >> 3
+ // Perform the xform if C1 is a single bit.
+ if (C1.isPowerOf2()) {
+ return DAG.getNode(ISD::TRUNCATE, dl, VT,
+ DAG.getNode(ISD::SRL, dl, N0.getValueType(), N0,
+ DAG.getConstant(C1.logBase2(), dl,
+ ShiftTy)));
+ }
+ }
+ }
+ }
+
+ if (C1.getMinSignedBits() <= 64 &&
+ !isLegalICmpImmediate(C1.getSExtValue())) {
+ // (X & -256) == 256 -> (X >> 8) == 1
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+ if (auto *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ const APInt &AndRHSC = AndRHS->getAPIntValue();
+ if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) {
+ unsigned ShiftBits = AndRHSC.countTrailingZeros();
+ auto &DL = DAG.getDataLayout();
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
+ EVT CmpTy = N0.getValueType();
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0),
+ DAG.getConstant(ShiftBits, dl,
+ ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), dl, CmpTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond);
+ }
+ }
+ } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE ||
+ Cond == ISD::SETULE || Cond == ISD::SETUGT) {
+ bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT);
+ // X < 0x100000000 -> (X >> 32) < 1
+ // X >= 0x100000000 -> (X >> 32) >= 1
+ // X <= 0x0ffffffff -> (X >> 32) < 1
+ // X > 0x0ffffffff -> (X >> 32) >= 1
+ unsigned ShiftBits;
+ APInt NewC = C1;
+ ISD::CondCode NewCond = Cond;
+ if (AdjOne) {
+ ShiftBits = C1.countTrailingOnes();
+ NewC = NewC + 1;
+ NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE;
+ } else {
+ ShiftBits = C1.countTrailingZeros();
+ }
+ NewC = NewC.lshr(ShiftBits);
+ if (ShiftBits && NewC.getMinSignedBits() <= 64 &&
+ isLegalICmpImmediate(NewC.getSExtValue())) {
+ auto &DL = DAG.getDataLayout();
+ EVT ShiftTy = DCI.isBeforeLegalize()
+ ? getPointerTy(DL)
+ : getShiftAmountTy(N0.getValueType(), DL);
+ EVT CmpTy = N0.getValueType();
+ SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0,
+ DAG.getConstant(ShiftBits, dl, ShiftTy));
+ SDValue CmpRHS = DAG.getConstant(NewC, dl, CmpTy);
+ return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond);
+ }
+ }
+ }
+ }
+
+ if (isa<ConstantFPSDNode>(N0.getNode())) {
+ // Constant fold or commute setcc.
+ SDValue O = DAG.FoldSetCC(VT, N0, N1, Cond, dl);
+ if (O.getNode()) return O;
+ } else if (auto *CFP = dyn_cast<ConstantFPSDNode>(N1.getNode())) {
+ // If the RHS of an FP comparison is a constant, simplify it away in
+ // some cases.
+ if (CFP->getValueAPF().isNaN()) {
+ // If an operand is known to be a nan, we can fold it.
+ switch (ISD::getUnorderedFlavor(Cond)) {
+ default: llvm_unreachable("Unknown flavor!");
+ case 0: // Known false.
+ return DAG.getConstant(0, dl, VT);
+ case 1: // Known true.
+ return DAG.getConstant(1, dl, VT);
+ case 2: // Undefined.
+ return DAG.getUNDEF(VT);
+ }
+ }
+
+ // Otherwise, we know the RHS is not a NaN. Simplify the node to drop the
+ // constant if knowing that the operand is non-nan is enough. We prefer to
+ // have SETO(x,x) instead of SETO(x, 0.0) because this avoids having to
+ // materialize 0.0.
+ if (Cond == ISD::SETO || Cond == ISD::SETUO)
+ return DAG.getSetCC(dl, VT, N0, N0, Cond);
+
+ // If the condition is not legal, see if we can find an equivalent one
+ // which is legal.
+ if (!isCondCodeLegal(Cond, N0.getSimpleValueType())) {
+ // If the comparison was an awkward floating-point == or != and one of
+ // the comparison operands is infinity or negative infinity, convert the
+ // condition to a less-awkward <= or >=.
+ if (CFP->getValueAPF().isInfinity()) {
+ if (CFP->getValueAPF().isNegative()) {
+ if (Cond == ISD::SETOEQ &&
+ isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
+ if (Cond == ISD::SETUEQ &&
+ isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
+ if (Cond == ISD::SETUNE &&
+ isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
+ if (Cond == ISD::SETONE &&
+ isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
+ } else {
+ if (Cond == ISD::SETOEQ &&
+ isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
+ if (Cond == ISD::SETUEQ &&
+ isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
+ if (Cond == ISD::SETUNE &&
+ isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
+ if (Cond == ISD::SETONE &&
+ isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
+ }
+ }
+ }
+ }
+
+ if (N0 == N1) {
+ // The sext(setcc()) => setcc() optimization relies on the appropriate
+ // constant being emitted.
+ uint64_t EqVal = 0;
+ switch (getBooleanContents(N0.getValueType())) {
+ case UndefinedBooleanContent:
+ case ZeroOrOneBooleanContent:
+ EqVal = ISD::isTrueWhenEqual(Cond);
+ break;
+ case ZeroOrNegativeOneBooleanContent:
+ EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0;
+ break;
+ }
+
+ // We can always fold X == X for integer setcc's.
+ if (N0.getValueType().isInteger()) {
+ return DAG.getConstant(EqVal, dl, VT);
+ }
+ unsigned UOF = ISD::getUnorderedFlavor(Cond);
+ if (UOF == 2) // FP operators that are undefined on NaNs.
+ return DAG.getConstant(EqVal, dl, VT);
+ if (UOF == unsigned(ISD::isTrueWhenEqual(Cond)))
+ return DAG.getConstant(EqVal, dl, VT);
+ // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO
+ // if it is not already.
+ ISD::CondCode NewCond = UOF == 0 ? ISD::SETO : ISD::SETUO;
+ if (NewCond != Cond && (DCI.isBeforeLegalizeOps() ||
+ getCondCodeAction(NewCond, N0.getSimpleValueType()) == Legal))
+ return DAG.getSetCC(dl, VT, N0, N1, NewCond);
+ }
+
+ if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) &&
+ N0.getValueType().isInteger()) {
+ if (N0.getOpcode() == ISD::ADD || N0.getOpcode() == ISD::SUB ||
+ N0.getOpcode() == ISD::XOR) {
+ // Simplify (X+Y) == (X+Z) --> Y == Z
+ if (N0.getOpcode() == N1.getOpcode()) {
+ if (N0.getOperand(0) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(1), Cond);
+ if (N0.getOperand(1) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(0), Cond);
+ if (DAG.isCommutativeBinOp(N0.getOpcode())) {
+ // If X op Y == Y op X, try other combinations.
+ if (N0.getOperand(0) == N1.getOperand(1))
+ return DAG.getSetCC(dl, VT, N0.getOperand(1), N1.getOperand(0),
+ Cond);
+ if (N0.getOperand(1) == N1.getOperand(0))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), N1.getOperand(1),
+ Cond);
+ }
+ }
+
+ // If RHS is a legal immediate value for a compare instruction, we need
+ // to be careful about increasing register pressure needlessly.
+ bool LegalRHSImm = false;
+
+ if (auto *RHSC = dyn_cast<ConstantSDNode>(N1)) {
+ if (auto *LHSR = dyn_cast<ConstantSDNode>(N0.getOperand(1))) {
+ // Turn (X+C1) == C2 --> X == C2-C1
+ if (N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse()) {
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(RHSC->getAPIntValue()-
+ LHSR->getAPIntValue(),
+ dl, N0.getValueType()), Cond);
+ }
+
+ // Turn (X^C1) == C2 into X == C1^C2 iff X&~C1 = 0.
+ if (N0.getOpcode() == ISD::XOR)
+ // If we know that all of the inverted bits are zero, don't bother
+ // performing the inversion.
+ if (DAG.MaskedValueIsZero(N0.getOperand(0), ~LHSR->getAPIntValue()))
+ return
+ DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(LHSR->getAPIntValue() ^
+ RHSC->getAPIntValue(),
+ dl, N0.getValueType()),
+ Cond);
+ }
+
+ // Turn (C1-X) == C2 --> X == C1-C2
+ if (auto *SUBC = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ if (N0.getOpcode() == ISD::SUB && N0.getNode()->hasOneUse()) {
+ return
+ DAG.getSetCC(dl, VT, N0.getOperand(1),
+ DAG.getConstant(SUBC->getAPIntValue() -
+ RHSC->getAPIntValue(),
+ dl, N0.getValueType()),
+ Cond);
+ }
+ }
+
+ // Could RHSC fold directly into a compare?
+ if (RHSC->getValueType(0).getSizeInBits() <= 64)
+ LegalRHSImm = isLegalICmpImmediate(RHSC->getSExtValue());
+ }
+
+ // Simplify (X+Z) == X --> Z == 0
+ // Don't do this if X is an immediate that can fold into a cmp
+ // instruction and X+Z has other uses. It could be an induction variable
+ // chain, and the transform would increase register pressure.
+ if (!LegalRHSImm || N0.getNode()->hasOneUse()) {
+ if (N0.getOperand(0) == N1)
+ return DAG.getSetCC(dl, VT, N0.getOperand(1),
+ DAG.getConstant(0, dl, N0.getValueType()), Cond);
+ if (N0.getOperand(1) == N1) {
+ if (DAG.isCommutativeBinOp(N0.getOpcode()))
+ return DAG.getSetCC(dl, VT, N0.getOperand(0),
+ DAG.getConstant(0, dl, N0.getValueType()),
+ Cond);
+ if (N0.getNode()->hasOneUse()) {
+ assert(N0.getOpcode() == ISD::SUB && "Unexpected operation!");
+ auto &DL = DAG.getDataLayout();
+ // (Z-X) == X --> Z == X<<1
+ SDValue SH = DAG.getNode(
+ ISD::SHL, dl, N1.getValueType(), N1,
+ DAG.getConstant(1, dl,
+ getShiftAmountTy(N1.getValueType(), DL)));
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(SH.getNode());
+ return DAG.getSetCC(dl, VT, N0.getOperand(0), SH, Cond);
+ }
+ }
+ }
+ }
+
+ if (N1.getOpcode() == ISD::ADD || N1.getOpcode() == ISD::SUB ||
+ N1.getOpcode() == ISD::XOR) {
+ // Simplify X == (X+Z) --> Z == 0
+ if (N1.getOperand(0) == N0)
+ return DAG.getSetCC(dl, VT, N1.getOperand(1),
+ DAG.getConstant(0, dl, N1.getValueType()), Cond);
+ if (N1.getOperand(1) == N0) {
+ if (DAG.isCommutativeBinOp(N1.getOpcode()))
+ return DAG.getSetCC(dl, VT, N1.getOperand(0),
+ DAG.getConstant(0, dl, N1.getValueType()), Cond);
+ if (N1.getNode()->hasOneUse()) {
+ assert(N1.getOpcode() == ISD::SUB && "Unexpected operation!");
+ auto &DL = DAG.getDataLayout();
+ // X == (Z-X) --> X<<1 == Z
+ SDValue SH = DAG.getNode(
+ ISD::SHL, dl, N1.getValueType(), N0,
+ DAG.getConstant(1, dl, getShiftAmountTy(N0.getValueType(), DL)));
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(SH.getNode());
+ return DAG.getSetCC(dl, VT, SH, N1.getOperand(0), Cond);
+ }
+ }
+ }
+
+ if (SDValue V = simplifySetCCWithAnd(VT, N0, N1, Cond, DCI, dl))
+ return V;
+ }
+
+ // Fold away ALL boolean setcc's.
+ SDValue Temp;
+ if (N0.getValueType() == MVT::i1 && foldBooleans) {
+ switch (Cond) {
+ default: llvm_unreachable("Unknown integer setcc!");
+ case ISD::SETEQ: // X == Y -> ~(X^Y)
+ Temp = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+ N0 = DAG.getNOT(dl, Temp, MVT::i1);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETNE: // X != Y --> (X^Y)
+ N0 = DAG.getNode(ISD::XOR, dl, MVT::i1, N0, N1);
+ break;
+ case ISD::SETGT: // X >s Y --> X == 0 & Y == 1 --> ~X & Y
+ case ISD::SETULT: // X <u Y --> X == 0 & Y == 1 --> ~X & Y
+ Temp = DAG.getNOT(dl, N0, MVT::i1);
+ N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETLT: // X <s Y --> X == 1 & Y == 0 --> ~Y & X
+ case ISD::SETUGT: // X >u Y --> X == 1 & Y == 0 --> ~Y & X
+ Temp = DAG.getNOT(dl, N1, MVT::i1);
+ N0 = DAG.getNode(ISD::AND, dl, MVT::i1, N0, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETULE: // X <=u Y --> X == 0 | Y == 1 --> ~X | Y
+ case ISD::SETGE: // X >=s Y --> X == 0 | Y == 1 --> ~X | Y
+ Temp = DAG.getNOT(dl, N0, MVT::i1);
+ N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N1, Temp);
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(Temp.getNode());
+ break;
+ case ISD::SETUGE: // X >=u Y --> X == 1 | Y == 0 --> ~Y | X
+ case ISD::SETLE: // X <=s Y --> X == 1 | Y == 0 --> ~Y | X
+ Temp = DAG.getNOT(dl, N1, MVT::i1);
+ N0 = DAG.getNode(ISD::OR, dl, MVT::i1, N0, Temp);
+ break;
+ }
+ if (VT != MVT::i1) {
+ if (!DCI.isCalledByLegalizer())
+ DCI.AddToWorklist(N0.getNode());
+ // FIXME: If running after legalize, we probably can't do this.
+ N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, VT, N0);
+ }
+ return N0;
+ }
+
+ // Could not fold it.
+ return SDValue();
+}
+
+/// Returns true (and the GlobalValue and the offset) if the node is a
+/// GlobalAddress + offset.
+bool TargetLowering::isGAPlusOffset(SDNode *N, const GlobalValue *&GA,
+ int64_t &Offset) const {
+ if (auto *GASD = dyn_cast<GlobalAddressSDNode>(N)) {
+ GA = GASD->getGlobal();
+ Offset += GASD->getOffset();
+ return true;
+ }
+
+ if (N->getOpcode() == ISD::ADD) {
+ SDValue N1 = N->getOperand(0);
+ SDValue N2 = N->getOperand(1);
+ if (isGAPlusOffset(N1.getNode(), GA, Offset)) {
+ if (auto *V = dyn_cast<ConstantSDNode>(N2)) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ } else if (isGAPlusOffset(N2.getNode(), GA, Offset)) {
+ if (auto *V = dyn_cast<ConstantSDNode>(N1)) {
+ Offset += V->getSExtValue();
+ return true;
+ }
+ }
+ }
+
+ return false;
+}
+
+SDValue TargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ // Default implementation: no optimization.
+ return SDValue();
+}
+
+//===----------------------------------------------------------------------===//
+// Inline Assembler Implementation Methods
+//===----------------------------------------------------------------------===//
+
+TargetLowering::ConstraintType
+TargetLowering::getConstraintType(StringRef Constraint) const {
+ unsigned S = Constraint.size();
+
+ if (S == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'r': return C_RegisterClass;
+ case 'm': // memory
+ case 'o': // offsetable
+ case 'V': // not offsetable
+ return C_Memory;
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // Simple Integer
+ case 'E': // Floating Point Constant
+ case 'F': // Floating Point Constant
+ case 's': // Relocatable Constant
+ case 'p': // Address.
+ case 'X': // Allow ANY value.
+ case 'I': // Target registers.
+ case 'J':
+ case 'K':
+ case 'L':
+ case 'M':
+ case 'N':
+ case 'O':
+ case 'P':
+ case '<':
+ case '>':
+ return C_Other;
+ }
+ }
+
+ if (S > 1 && Constraint[0] == '{' && Constraint[S-1] == '}') {
+ if (S == 8 && Constraint.substr(1, 6) == "memory") // "{memory}"
+ return C_Memory;
+ return C_Register;
+ }
+ return C_Unknown;
+}
+
+/// Try to replace an X constraint, which matches anything, with another that
+/// has more specific requirements based on the type of the corresponding
+/// operand.
+const char *TargetLowering::LowerXConstraint(EVT ConstraintVT) const{
+ if (ConstraintVT.isInteger())
+ return "r";
+ if (ConstraintVT.isFloatingPoint())
+ return "f"; // works for many targets
+ return nullptr;
+}
+
+/// Lower the specified operand into the Ops vector.
+/// If it is invalid, don't add anything to Ops.
+void TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+
+ if (Constraint.length() > 1) return;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default: break;
+ case 'X': // Allows any operand; labels (basic block) use this.
+ if (Op.getOpcode() == ISD::BasicBlock) {
+ Ops.push_back(Op);
+ return;
+ }
+ // fall through
+ case 'i': // Simple Integer or Relocatable Constant
+ case 'n': // Simple Integer
+ case 's': { // Relocatable Constant
+ // These operands are interested in values of the form (GV+C), where C may
+ // be folded in as an offset of GV, or it may be explicitly added. Also, it
+ // is possible and fine if either GV or C are missing.
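+ // Illustrative: an operand of the form (add (GlobalAddress @g), 16),
+ // where @g is some symbol, is folded below into a single
+ // TargetGlobalAddress of @g with offset 16, i.e. the relocatable
+ // expression g+16.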
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
+ GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op);
+
+ // If we have "(add GV, C)", pull out GV/C
+ if (Op.getOpcode() == ISD::ADD) {
+ C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(0));
+ if (!C || !GA) {
+ C = dyn_cast<ConstantSDNode>(Op.getOperand(0));
+ GA = dyn_cast<GlobalAddressSDNode>(Op.getOperand(1));
+ }
+ if (!C || !GA) {
+ C = nullptr;
+ GA = nullptr;
+ }
+ }
+
+ // If we find a valid operand, map to the TargetXXX version so that the
+ // value itself doesn't get selected.
+ if (GA) { // Either &GV or &GV+C
+ if (ConstraintLetter != 'n') {
+ int64_t Offs = GA->getOffset();
+ if (C) Offs += C->getZExtValue();
+ Ops.push_back(DAG.getTargetGlobalAddress(GA->getGlobal(),
+ C ? SDLoc(C) : SDLoc(),
+ Op.getValueType(), Offs));
+ }
+ return;
+ }
+ if (C) { // just C, no GV.
+ // Simple constants are not allowed for 's'.
+ if (ConstraintLetter != 's') {
+ // gcc prints these as sign extended. Sign extend value to 64 bits
+ // now; without this it would get ZExt'd later in
+ // ScheduleDAGSDNodes::EmitNode, which is very generic.
+ Ops.push_back(DAG.getTargetConstant(C->getAPIntValue().getSExtValue(),
+ SDLoc(C), MVT::i64));
+ }
+ return;
+ }
+ break;
+ }
+ }
+}
+
+std::pair<unsigned, const TargetRegisterClass *>
+TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *RI,
+ StringRef Constraint,
+ MVT VT) const {
+ if (Constraint.empty() || Constraint[0] != '{')
+ return std::make_pair(0u, static_cast<TargetRegisterClass*>(nullptr));
+ assert(*(Constraint.end()-1) == '}' && "Not a brace enclosed constraint?");
+
+ // Remove the braces from around the name.
+ StringRef RegName(Constraint.data()+1, Constraint.size()-2);
+
+ std::pair<unsigned, const TargetRegisterClass*> R =
+ std::make_pair(0u, static_cast<const TargetRegisterClass*>(nullptr));
+
+ // Figure out which register class contains this reg.
+ for (TargetRegisterInfo::regclass_iterator RCI = RI->regclass_begin(),
+ E = RI->regclass_end(); RCI != E; ++RCI) {
+ const TargetRegisterClass *RC = *RCI;
+
+ // If none of the value types for this register class are valid, we
+ // can't use it. For example, 64-bit reg classes on 32-bit targets.
+ if (!isLegalRC(RC))
+ continue;
+
+ for (TargetRegisterClass::iterator I = RC->begin(), E = RC->end();
+ I != E; ++I) {
+ if (RegName.equals_lower(RI->getRegAsmName(*I))) {
+ std::pair<unsigned, const TargetRegisterClass*> S =
+ std::make_pair(*I, RC);
+
+ // If this register class has the requested value type, return it,
+ // otherwise keep searching and return the first class found
+ // if no other is found which explicitly has the requested type.
+ if (RC->hasType(VT))
+ return S;
+ else if (!R.second)
+ R = S;
+ }
+ }
+ }
+
+ return R;
+}
+
+//===----------------------------------------------------------------------===//
+// Constraint Selection.
+
+/// Return true if this is an input operand that is a matching constraint like
+/// "4".
+bool TargetLowering::AsmOperandInfo::isMatchingInputConstraint() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return isdigit(static_cast<unsigned char>(ConstraintCode[0]));
+}
+
+/// If this is an input matching constraint, this method returns the output
+/// operand it matches.
+unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
+ assert(!ConstraintCode.empty() && "No known constraint!");
+ return atoi(ConstraintCode.c_str());
+}
+
+/// Split up the constraint string from the inline assembly value into the
+/// specific constraints and their prefixes, and also tie in the associated
+/// operand values.
+/// If this returns an empty vector, and if the constraint string itself
+/// isn't empty, there was an error parsing.
+TargetLowering::AsmOperandInfoVector
+TargetLowering::ParseConstraints(const DataLayout &DL,
+ const TargetRegisterInfo *TRI,
+ ImmutableCallSite CS) const {
+ /// Information about all of the constraints.
+ AsmOperandInfoVector ConstraintOperands;
+ const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+ unsigned maCount = 0; // Largest number of multiple alternative constraints.
+
+ // Do a prepass over the constraints, canonicalizing them, and building up the
+ // ConstraintOperands list.
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
+
+ for (InlineAsm::ConstraintInfo &CI : IA->ParseConstraints()) {
+ ConstraintOperands.emplace_back(std::move(CI));
+ AsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ // Update multiple alternative constraint count.
+ if (OpInfo.multipleAlternatives.size() > maCount)
+ maCount = OpInfo.multipleAlternatives.size();
+
+ OpInfo.ConstraintVT = MVT::Other;
+
+ // Compute the value type for each operand.
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ // Indirect outputs just consume an argument.
+ if (OpInfo.isIndirect) {
+ OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ break;
+ }
+
+ // The return value of the call is this value. As such, there is no
+ // corresponding argument.
+ assert(!CS.getType()->isVoidTy() &&
+ "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ OpInfo.ConstraintVT =
+ getSimpleValueType(DL, STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpInfo.ConstraintVT = getSimpleValueType(DL, CS.getType());
+ }
+ ++ResNo;
+ break;
+ case InlineAsm::isInput:
+ OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ break;
+ case InlineAsm::isClobber:
+ // Nothing to do.
+ break;
+ }
+
+ if (OpInfo.CallOperandVal) {
+ llvm::Type *OpTy = OpInfo.CallOperandVal->getType();
+ if (OpInfo.isIndirect) {
+ llvm::PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+ if (!PtrTy)
+ report_fatal_error("Indirect operand for inline asm not a pointer!");
+ OpTy = PtrTy->getElementType();
+ }
+
+ // Look for vector wrapped in a struct. e.g. { <16 x i8> }.
+ if (StructType *STy = dyn_cast<StructType>(OpTy))
+ if (STy->getNumElements() == 1)
+ OpTy = STy->getElementType(0);
+
+ // If OpTy is not a single value, it may be a struct/union that we
+ // can tile with integers.
+ if (!OpTy->isSingleValueType() && OpTy->isSized()) {
+ unsigned BitSize = DL.getTypeSizeInBits(OpTy);
+ switch (BitSize) {
+ default: break;
+ case 1:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ OpInfo.ConstraintVT =
+ MVT::getVT(IntegerType::get(OpTy->getContext(), BitSize), true);
+ break;
+ }
+ } else if (PointerType *PT = dyn_cast<PointerType>(OpTy)) {
+ unsigned PtrSize = DL.getPointerSizeInBits(PT->getAddressSpace());
+ OpInfo.ConstraintVT = MVT::getIntegerVT(PtrSize);
+ } else {
+ OpInfo.ConstraintVT = MVT::getVT(OpTy, true);
+ }
+ }
+ }
+
+ // If we have multiple alternative constraints, select the best alternative.
+ if (!ConstraintOperands.empty()) {
+ if (maCount) {
+ unsigned bestMAIndex = 0;
+ int bestWeight = -1;
+ // weight: -1 = invalid match, and 0 = so-so match to 5 = good match.
+ int weight = -1;
+ unsigned maIndex;
+ // Compute the sums of the weights for each alternative, keeping track
+ // of the best (highest weight) one so far.
+ for (maIndex = 0; maIndex < maCount; ++maIndex) {
+ int weightSum = 0;
+ for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+ cIndex != eIndex; ++cIndex) {
+ AsmOperandInfo& OpInfo = ConstraintOperands[cIndex];
+ if (OpInfo.Type == InlineAsm::isClobber)
+ continue;
+
+ // If this is an output operand with a matching input operand,
+ // look up the matching input. If their types mismatch, e.g. one
+ // is an integer, the other is floating point, or their sizes are
+ // different, flag this alternative as maCantMatch.
+ if (OpInfo.hasMatchingInput()) {
+ AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (OpInfo.ConstraintVT.getSizeInBits() !=
+ Input.ConstraintVT.getSizeInBits())) {
+ weightSum = -1; // Can't match.
+ break;
+ }
+ }
+ }
+ weight = getMultipleConstraintMatchWeight(OpInfo, maIndex);
+ if (weight == -1) {
+ weightSum = -1;
+ break;
+ }
+ weightSum += weight;
+ }
+ // Update best.
+ if (weightSum > bestWeight) {
+ bestWeight = weightSum;
+ bestMAIndex = maIndex;
+ }
+ }
+
+ // Now select chosen alternative in each constraint.
+ for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+ cIndex != eIndex; ++cIndex) {
+ AsmOperandInfo& cInfo = ConstraintOperands[cIndex];
+ if (cInfo.Type == InlineAsm::isClobber)
+ continue;
+ cInfo.selectAlternative(bestMAIndex);
+ }
+ }
+ }
+
+ // Check and hook up tied operands, choose constraint code to use.
+ for (unsigned cIndex = 0, eIndex = ConstraintOperands.size();
+ cIndex != eIndex; ++cIndex) {
+ AsmOperandInfo& OpInfo = ConstraintOperands[cIndex];
+
+ // If this is an output operand with a matching input operand, look up the
+ // matching input. If their types mismatch, e.g. one is an integer, the
+ // other is floating point, or their sizes are different, flag it as an
+ // error.
+ if (OpInfo.hasMatchingInput()) {
+ AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput];
+
+ if (OpInfo.ConstraintVT != Input.ConstraintVT) {
+ std::pair<unsigned, const TargetRegisterClass *> MatchRC =
+ getRegForInlineAsmConstraint(TRI, OpInfo.ConstraintCode,
+ OpInfo.ConstraintVT);
+ std::pair<unsigned, const TargetRegisterClass *> InputRC =
+ getRegForInlineAsmConstraint(TRI, Input.ConstraintCode,
+ Input.ConstraintVT);
+ if ((OpInfo.ConstraintVT.isInteger() !=
+ Input.ConstraintVT.isInteger()) ||
+ (MatchRC.second != InputRC.second)) {
+ report_fatal_error("Unsupported asm: input constraint"
+ " with a matching output constraint of"
+ " incompatible type!");
+ }
+ }
+ }
+ }
+
+ return ConstraintOperands;
+}
+
+/// Return an integer indicating how general CT is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+ switch (CT) {
+ case TargetLowering::C_Other:
+ case TargetLowering::C_Unknown:
+ return 0;
+ case TargetLowering::C_Register:
+ return 1;
+ case TargetLowering::C_RegisterClass:
+ return 2;
+ case TargetLowering::C_Memory:
+ return 3;
+ }
+ llvm_unreachable("Invalid constraint type");
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ TargetLowering::getMultipleConstraintMatchWeight(
+ AsmOperandInfo &info, int maIndex) const {
+ InlineAsm::ConstraintCodeVector *rCodes;
+ if (maIndex >= (int)info.multipleAlternatives.size())
+ rCodes = &info.Codes;
+ else
+ rCodes = &info.multipleAlternatives[maIndex].Codes;
+ ConstraintWeight BestWeight = CW_Invalid;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = rCodes->size(); i != e; ++i) {
+ ConstraintWeight weight =
+ getSingleConstraintMatchWeight(info, (*rCodes)[i].c_str());
+ if (weight > BestWeight)
+ BestWeight = weight;
+ }
+
+ return BestWeight;
+}
+
+/// Examine constraint type and operand type and determine a weight value.
+/// This object must already have been set up with the operand type
+/// and the current alternative constraint selected.
+TargetLowering::ConstraintWeight
+ TargetLowering::getSingleConstraintMatchWeight(
+ AsmOperandInfo &info, const char *constraint) const {
+ ConstraintWeight weight = CW_Invalid;
+ Value *CallOperandVal = info.CallOperandVal;
+ // If we don't have a value, we can't do a match,
+ // but allow it at the lowest weight.
+ if (!CallOperandVal)
+ return CW_Default;
+ // Look at the constraint type.
+ switch (*constraint) {
+ case 'i': // immediate integer.
+ case 'n': // immediate integer with a known value.
+ if (isa<ConstantInt>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+ case 's': // non-explicit integral immediate.
+ if (isa<GlobalValue>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+ case 'E': // immediate float, if in host format.
+ case 'F': // immediate float.
+ if (isa<ConstantFP>(CallOperandVal))
+ weight = CW_Constant;
+ break;
+ case '<': // memory operand with autodecrement.
+ case '>': // memory operand with autoincrement.
+ case 'm': // memory operand.
+ case 'o': // offsettable memory operand
+ case 'V': // non-offsettable memory operand
+ weight = CW_Memory;
+ break;
+ case 'r': // general register.
+ case 'g': // general register, memory operand or immediate integer.
+ // note: Clang converts "g" to "imr".
+ if (CallOperandVal->getType()->isIntegerTy())
+ weight = CW_Register;
+ break;
+ case 'X': // any operand.
+ default:
+ weight = CW_Default;
+ break;
+ }
+ return weight;
+}
+
+/// If there are multiple different constraints that we could pick for this
+/// operand (e.g. "imr") try to pick the 'best' one.
+/// This is somewhat tricky: constraints fall into four classes:
+/// Other -> immediates and magic values
+/// Register -> one specific register
+/// RegisterClass -> a group of regs
+/// Memory -> memory
+/// Ideally, we would pick the most specific constraint possible: if we have
+/// something that fits into a register, we would pick it. The problem is
+/// that if we have something that could either be in a register or in
+/// memory, using the register could cause selection of *other* operands to
+/// fail: they might only succeed if we pick memory. Because of this, the
+/// heuristic we use is:
+///
+/// 1) If there is an 'other' constraint, and if the operand is valid for
+/// that constraint, use it. This makes us take advantage of 'i'
+/// constraints when available.
+/// 2) Otherwise, pick the most general constraint present. This prefers
+/// 'm' over 'r', for example.
+///
+static void ChooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+ const TargetLowering &TLI,
+ SDValue Op, SelectionDAG *DAG) {
+ assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+ unsigned BestIdx = 0;
+ TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+ int BestGenerality = -1;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+ TargetLowering::ConstraintType CType =
+ TLI.getConstraintType(OpInfo.Codes[i]);
+
+ // If this is an 'other' constraint, see if the operand is valid for it.
+ // For example, on X86 we might have an 'rI' constraint. If the operand
+ // is an integer in the range [0..31] we want to use I (saving a load
+ // of a register), otherwise we must use 'r'.
+ if (CType == TargetLowering::C_Other && Op.getNode()) {
+ assert(OpInfo.Codes[i].size() == 1 &&
+ "Unhandled multi-letter 'other' constraint");
+ std::vector<SDValue> ResultOps;
+ TLI.LowerAsmOperandForConstraint(Op, OpInfo.Codes[i],
+ ResultOps, *DAG);
+ if (!ResultOps.empty()) {
+ BestType = CType;
+ BestIdx = i;
+ break;
+ }
+ }
+
+ // Things with matching constraints can only be registers, per gcc
+ // documentation. This mainly affects "g" constraints.
+ if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+ continue;
+
+ // This constraint letter is more general than the previous one, use it.
+ int Generality = getConstraintGenerality(CType);
+ if (Generality > BestGenerality) {
+ BestType = CType;
+ BestIdx = i;
+ BestGenerality = Generality;
+ }
+ }
+
+ OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+ OpInfo.ConstraintType = BestType;
+}
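+
+// Worked example (illustrative): for an "imr" constraint on a non-constant
+// integer operand, 'i' produces no lowered operands in step 1, so step 2
+// compares generalities from getConstraintGenerality(): 'i' (C_Other, 0),
+// 'r' (C_RegisterClass, 2), 'm' (C_Memory, 3), and 'm' wins. If the operand
+// were a small constant such as 4, 'i' would lower successfully in step 1
+// and be chosen outright.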
+
+/// Determines the constraint code and constraint type to use for the specific
+/// AsmOperandInfo, setting OpInfo.ConstraintCode and OpInfo.ConstraintType.
+void TargetLowering::ComputeConstraintToUse(AsmOperandInfo &OpInfo,
+ SDValue Op,
+ SelectionDAG *DAG) const {
+ assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+ // Single-letter constraints ('r') are very common.
+ if (OpInfo.Codes.size() == 1) {
+ OpInfo.ConstraintCode = OpInfo.Codes[0];
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ } else {
+ ChooseConstraint(OpInfo, *this, Op, DAG);
+ }
+
+ // 'X' matches anything.
+ if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+ // Labels and constants are handled elsewhere ('X' is the only thing
+ // that matches labels). For Functions, the type here is the type of
+ // the result, which is not what we want to look at; leave them alone.
+ Value *v = OpInfo.CallOperandVal;
+ if (isa<BasicBlock>(v) || isa<ConstantInt>(v) || isa<Function>(v)) {
+ OpInfo.CallOperandVal = v;
+ return;
+ }
+
+ // Otherwise, try to resolve it to something we know about by looking at
+ // the actual operand type.
+ if (const char *Repl = LowerXConstraint(OpInfo.ConstraintVT)) {
+ OpInfo.ConstraintCode = Repl;
+ OpInfo.ConstraintType = getConstraintType(OpInfo.ConstraintCode);
+ }
+ }
+}
+
+/// \brief Given an exact SDIV by a constant, create a multiplication
+/// with the multiplicative inverse of the constant.
+static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
+ const SDLoc &dl, SelectionDAG &DAG,
+ std::vector<SDNode *> &Created) {
+ assert(d != 0 && "Division by zero!");
+
+ // Shift the value upfront if it is even, so the LSB is one.
+ unsigned ShAmt = d.countTrailingZeros();
+ if (ShAmt) {
+ // TODO: For UDIV use SRL instead of SRA.
+ SDValue Amt =
+ DAG.getConstant(ShAmt, dl, TLI.getShiftAmountTy(Op1.getValueType(),
+ DAG.getDataLayout()));
+ SDNodeFlags Flags;
+ Flags.setExact(true);
+ Op1 = DAG.getNode(ISD::SRA, dl, Op1.getValueType(), Op1, Amt, &Flags);
+ Created.push_back(Op1.getNode());
+ d = d.ashr(ShAmt);
+ }
+
+ // Calculate the multiplicative inverse, using Newton's method.
+ APInt t, xn = d;
+ while ((t = d*xn) != 1)
+ xn *= APInt(d.getBitWidth(), 2) - t;
+
+ SDValue Op2 = DAG.getConstant(xn, dl, Op1.getValueType());
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, Op1.getValueType(), Op1, Op2);
+ Created.push_back(Mul.getNode());
+ return Mul;
+}
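+
+// Worked example for BuildExactSDIV (illustrative): an exact "sdiv i32 %x, 6"
+// is first shifted right by countTrailingZeros(6) == 1 with an exact SRA,
+// leaving a division by 3, and then multiplied by the inverse of 3 modulo
+// 2^32, 0xAAAAAAAB, since 3 * 0xAAAAAAAB == 0x200000001 == 1 (mod 2^32).
+// The Newton step xn *= (2 - d*xn) doubles the number of correct low bits
+// per iteration, so the loop above terminates quickly for any odd d.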
+
+SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG,
+ std::vector<SDNode *> *Created) const {
+ AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.isIntDivCheap(N->getValueType(0), Attr))
+ return SDValue(N,0); // Lower SDIV as SDIV
+ return SDValue();
+}
+
+/// \brief Given an ISD::SDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue TargetLowering::BuildSDIV(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG, bool IsAfterLegalization,
+ std::vector<SDNode *> *Created) const {
+ assert(Created && "No vector to hold sdiv ops.");
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ // If the sdiv has an 'exact' bit we can use a simpler lowering.
+ if (cast<BinaryWithFlagsSDNode>(N)->Flags.hasExact())
+ return BuildExactSDIV(*this, N->getOperand(0), Divisor, dl, DAG, *Created);
+
+ APInt::ms magics = Divisor.magic();
+
+ // Multiply the numerator (operand 0) by the magic value
+ // FIXME: We should support doing a MUL in a wider type
+ SDValue Q;
+ if (IsAfterLegalization ? isOperationLegal(ISD::MULHS, VT) :
+ isOperationLegalOrCustom(ISD::MULHS, VT))
+ Q = DAG.getNode(ISD::MULHS, dl, VT, N->getOperand(0),
+ DAG.getConstant(magics.m, dl, VT));
+ else if (IsAfterLegalization ? isOperationLegal(ISD::SMUL_LOHI, VT) :
+ isOperationLegalOrCustom(ISD::SMUL_LOHI, VT))
+ Q = SDValue(DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(VT, VT),
+ N->getOperand(0),
+ DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
+ else
+ return SDValue(); // No mulhs or equivalent
+ // If d > 0 and m < 0, add the numerator
+ if (Divisor.isStrictlyPositive() && magics.m.isNegative()) {
+ Q = DAG.getNode(ISD::ADD, dl, VT, Q, N->getOperand(0));
+ Created->push_back(Q.getNode());
+ }
+ // If d < 0 and m > 0, subtract the numerator.
+ if (Divisor.isNegative() && magics.m.isStrictlyPositive()) {
+ Q = DAG.getNode(ISD::SUB, dl, VT, Q, N->getOperand(0));
+ Created->push_back(Q.getNode());
+ }
+ auto &DL = DAG.getDataLayout();
+ // Shift right algebraic if shift value is nonzero
+ if (magics.s > 0) {
+ Q = DAG.getNode(
+ ISD::SRA, dl, VT, Q,
+ DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
+ Created->push_back(Q.getNode());
+ }
+ // Extract the sign bit and add it to the quotient
+ SDValue T =
+ DAG.getNode(ISD::SRL, dl, VT, Q,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, dl,
+ getShiftAmountTy(Q.getValueType(), DL)));
+ Created->push_back(T.getNode());
+ return DAG.getNode(ISD::ADD, dl, VT, Q, T);
+}
+
+/// \brief Given an ISD::UDIV node expressing a divide by constant,
+/// return a DAG expression to select that will generate the same value by
+/// multiplying by a magic number.
+/// Ref: "Hacker's Delight" or "The PowerPC Compiler Writer's Guide".
+SDValue TargetLowering::BuildUDIV(SDNode *N, const APInt &Divisor,
+ SelectionDAG &DAG, bool IsAfterLegalization,
+ std::vector<SDNode *> *Created) const {
+ assert(Created && "No vector to hold udiv ops.");
+
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+ auto &DL = DAG.getDataLayout();
+
+ // Check to see if we can do this.
+ // FIXME: We should be more aggressive here.
+ if (!isTypeLegal(VT))
+ return SDValue();
+
+ // FIXME: We should use a narrower constant when the upper
+ // bits are known to be zero.
+ APInt::mu magics = Divisor.magicu();
+
+ SDValue Q = N->getOperand(0);
+
+ // If the divisor is even, we can avoid using the expensive fixup by shifting
+ // the divided value upfront.
+ if (magics.a != 0 && !Divisor[0]) {
+ unsigned Shift = Divisor.countTrailingZeros();
+ Q = DAG.getNode(
+ ISD::SRL, dl, VT, Q,
+ DAG.getConstant(Shift, dl, getShiftAmountTy(Q.getValueType(), DL)));
+ Created->push_back(Q.getNode());
+
+ // Get magic number for the shifted divisor.
+ magics = Divisor.lshr(Shift).magicu(Shift);
+ assert(magics.a == 0 && "Should use cheap fixup now");
+ }
+
+ // Multiply the numerator (operand 0) by the magic value
+ // FIXME: We should support doing a MUL in a wider type
+ if (IsAfterLegalization ? isOperationLegal(ISD::MULHU, VT) :
+ isOperationLegalOrCustom(ISD::MULHU, VT))
+ Q = DAG.getNode(ISD::MULHU, dl, VT, Q, DAG.getConstant(magics.m, dl, VT));
+ else if (IsAfterLegalization ? isOperationLegal(ISD::UMUL_LOHI, VT) :
+ isOperationLegalOrCustom(ISD::UMUL_LOHI, VT))
+ Q = SDValue(DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(VT, VT), Q,
+ DAG.getConstant(magics.m, dl, VT)).getNode(), 1);
+ else
+ return SDValue(); // No mulhu or equivalent
+
+ Created->push_back(Q.getNode());
+
+ if (magics.a == 0) {
+ assert(magics.s < Divisor.getBitWidth() &&
+ "We shouldn't generate an undefined shift!");
+ return DAG.getNode(
+ ISD::SRL, dl, VT, Q,
+ DAG.getConstant(magics.s, dl, getShiftAmountTy(Q.getValueType(), DL)));
+ } else {
+ SDValue NPQ = DAG.getNode(ISD::SUB, dl, VT, N->getOperand(0), Q);
+ Created->push_back(NPQ.getNode());
+ NPQ = DAG.getNode(
+ ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(1, dl, getShiftAmountTy(NPQ.getValueType(), DL)));
+ Created->push_back(NPQ.getNode());
+ NPQ = DAG.getNode(ISD::ADD, dl, VT, NPQ, Q);
+ Created->push_back(NPQ.getNode());
+ return DAG.getNode(
+ ISD::SRL, dl, VT, NPQ,
+ DAG.getConstant(magics.s - 1, dl,
+ getShiftAmountTy(NPQ.getValueType(), DL)));
+ }
+}
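+
+// Worked example for BuildUDIV (illustrative; magic constants as tabulated in
+// Hacker's Delight): for "udiv i32 %n, 7" magicu() gives m = 0x24924925,
+// a = 1, s = 3, so the "expensive fixup" branch above is taken:
+//   q      = MULHU(%n, 0x24924925)
+//   npq    = ((%n - q) >>u 1) + q
+//   result = npq >>u (s - 1)          // i.e. >>u 2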
+
+bool TargetLowering::
+verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
+ if (!isa<ConstantSDNode>(Op.getOperand(0))) {
+ DAG.getContext()->emitError("argument to '__builtin_return_address' must "
+ "be a constant integer");
+ return true;
+ }
+
+ return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Legalization Utilities
+//===----------------------------------------------------------------------===//
+
+bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
+ SelectionDAG &DAG, SDValue LL, SDValue LH,
+ SDValue RL, SDValue RH) const {
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ bool HasMULHS = isOperationLegalOrCustom(ISD::MULHS, HiLoVT);
+ bool HasMULHU = isOperationLegalOrCustom(ISD::MULHU, HiLoVT);
+ bool HasSMUL_LOHI = isOperationLegalOrCustom(ISD::SMUL_LOHI, HiLoVT);
+ bool HasUMUL_LOHI = isOperationLegalOrCustom(ISD::UMUL_LOHI, HiLoVT);
+ if (HasMULHU || HasMULHS || HasUMUL_LOHI || HasSMUL_LOHI) {
+ unsigned OuterBitSize = VT.getSizeInBits();
+ unsigned InnerBitSize = HiLoVT.getSizeInBits();
+ unsigned LHSSB = DAG.ComputeNumSignBits(N->getOperand(0));
+ unsigned RHSSB = DAG.ComputeNumSignBits(N->getOperand(1));
+
+ // LL, LH, RL, and RH must be either all NULL or all set to a value.
+ assert((LL.getNode() && LH.getNode() && RL.getNode() && RH.getNode()) ||
+ (!LL.getNode() && !LH.getNode() && !RL.getNode() && !RH.getNode()));
+
+ if (!LL.getNode() && !RL.getNode() &&
+ isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
+ LL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, N->getOperand(0));
+ RL = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, N->getOperand(1));
+ }
+
+ if (!LL.getNode())
+ return false;
+
+ APInt HighMask = APInt::getHighBitsSet(OuterBitSize, InnerBitSize);
+ if (DAG.MaskedValueIsZero(N->getOperand(0), HighMask) &&
+ DAG.MaskedValueIsZero(N->getOperand(1), HighMask)) {
+ // The inputs are both zero-extended.
+ if (HasUMUL_LOHI) {
+ // We can emit a umul_lohi.
+ Lo = DAG.getNode(ISD::UMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL,
+ RL);
+ Hi = SDValue(Lo.getNode(), 1);
+ return true;
+ }
+ if (HasMULHU) {
+ // We can emit a mulhu+mul.
+ Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHU, dl, HiLoVT, LL, RL);
+ return true;
+ }
+ }
+ if (LHSSB > InnerBitSize && RHSSB > InnerBitSize) {
+ // The input values are both sign-extended.
+ if (HasSMUL_LOHI) {
+ // We can emit a smul_lohi.
+ Lo = DAG.getNode(ISD::SMUL_LOHI, dl, DAG.getVTList(HiLoVT, HiLoVT), LL,
+ RL);
+ Hi = SDValue(Lo.getNode(), 1);
+ return true;
+ }
+ if (HasMULHS) {
+ // We can emit a mulhs+mul.
+ Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHS, dl, HiLoVT, LL, RL);
+ return true;
+ }
+ }
+
+ if (!LH.getNode() && !RH.getNode() &&
+ isOperationLegalOrCustom(ISD::SRL, VT) &&
+ isOperationLegalOrCustom(ISD::TRUNCATE, HiLoVT)) {
+ auto &DL = DAG.getDataLayout();
+ unsigned ShiftAmt = VT.getSizeInBits() - HiLoVT.getSizeInBits();
+ SDValue Shift = DAG.getConstant(ShiftAmt, dl, getShiftAmountTy(VT, DL));
+ LH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(0), Shift);
+ LH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, LH);
+ RH = DAG.getNode(ISD::SRL, dl, VT, N->getOperand(1), Shift);
+ RH = DAG.getNode(ISD::TRUNCATE, dl, HiLoVT, RH);
+ }
+
+ if (!LH.getNode())
+ return false;
+
+ if (HasUMUL_LOHI) {
+ // Lo,Hi = umul LHS, RHS.
+ SDValue UMulLOHI = DAG.getNode(ISD::UMUL_LOHI, dl,
+ DAG.getVTList(HiLoVT, HiLoVT), LL, RL);
+ Lo = UMulLOHI;
+ Hi = UMulLOHI.getValue(1);
+ RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
+ LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
+ return true;
+ }
+ if (HasMULHU) {
+ Lo = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RL);
+ Hi = DAG.getNode(ISD::MULHU, dl, HiLoVT, LL, RL);
+ RH = DAG.getNode(ISD::MUL, dl, HiLoVT, LL, RH);
+ LH = DAG.getNode(ISD::MUL, dl, HiLoVT, LH, RL);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, RH);
+ Hi = DAG.getNode(ISD::ADD, dl, HiLoVT, Hi, LH);
+ return true;
+ }
+ }
+ return false;
+}
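+
+// The decomposition behind expandMUL (illustrative): with k = bits(HiLoVT)
+// and L = LH*2^k + LL, R = RH*2^k + RL, the low 2*k bits of L*R are
+//   Lo = (LL*RL) mod 2^k
+//   Hi = (MULHU(LL, RL) + LL*RH + LH*RL) mod 2^k
+// which is the UMUL_LOHI/MULHU fallback sequence above; the earlier special
+// cases drop the cross terms when the high halves are known to be plain
+// zero or sign extensions of the low halves.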
+
+bool TargetLowering::expandFP_TO_SINT(SDNode *Node, SDValue &Result,
+ SelectionDAG &DAG) const {
+ EVT VT = Node->getOperand(0).getValueType();
+ EVT NVT = Node->getValueType(0);
+ SDLoc dl(SDValue(Node, 0));
+
+ // FIXME: Only f32 to i64 conversions are supported.
+ if (VT != MVT::f32 || NVT != MVT::i64)
+ return false;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm-mirror/compiler-rt/blob/master/lib/builtins/fixsfdi.c
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(),
+ VT.getSizeInBits());
+ SDValue ExponentMask = DAG.getConstant(0x7F800000, dl, IntVT);
+ SDValue ExponentLoBit = DAG.getConstant(23, dl, IntVT);
+ SDValue Bias = DAG.getConstant(127, dl, IntVT);
+ SDValue SignMask = DAG.getConstant(APInt::getSignBit(VT.getSizeInBits()), dl,
+ IntVT);
+ SDValue SignLowBit = DAG.getConstant(VT.getSizeInBits() - 1, dl, IntVT);
+ SDValue MantissaMask = DAG.getConstant(0x007FFFFF, dl, IntVT);
+
+ SDValue Bits = DAG.getNode(ISD::BITCAST, dl, IntVT, Node->getOperand(0));
+
+ auto &DL = DAG.getDataLayout();
+ SDValue ExponentBits = DAG.getNode(
+ ISD::SRL, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, ExponentMask),
+ DAG.getZExtOrTrunc(ExponentLoBit, dl, getShiftAmountTy(IntVT, DL)));
+ SDValue Exponent = DAG.getNode(ISD::SUB, dl, IntVT, ExponentBits, Bias);
+
+ SDValue Sign = DAG.getNode(
+ ISD::SRA, dl, IntVT, DAG.getNode(ISD::AND, dl, IntVT, Bits, SignMask),
+ DAG.getZExtOrTrunc(SignLowBit, dl, getShiftAmountTy(IntVT, DL)));
+ Sign = DAG.getSExtOrTrunc(Sign, dl, NVT);
+
+ SDValue R = DAG.getNode(ISD::OR, dl, IntVT,
+ DAG.getNode(ISD::AND, dl, IntVT, Bits, MantissaMask),
+ DAG.getConstant(0x00800000, dl, IntVT));
+
+ R = DAG.getZExtOrTrunc(R, dl, NVT);
+
+ R = DAG.getSelectCC(
+ dl, Exponent, ExponentLoBit,
+ DAG.getNode(ISD::SHL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, Exponent, ExponentLoBit),
+ dl, getShiftAmountTy(IntVT, DL))),
+ DAG.getNode(ISD::SRL, dl, NVT, R,
+ DAG.getZExtOrTrunc(
+ DAG.getNode(ISD::SUB, dl, IntVT, ExponentLoBit, Exponent),
+ dl, getShiftAmountTy(IntVT, DL))),
+ ISD::SETGT);
+
+ SDValue Ret = DAG.getNode(ISD::SUB, dl, NVT,
+ DAG.getNode(ISD::XOR, dl, NVT, R, Sign),
+ Sign);
+
+ Result = DAG.getSelectCC(dl, Exponent, DAG.getConstant(0, dl, IntVT),
+ DAG.getConstant(0, dl, NVT), Ret, ISD::SETLT);
+ return true;
+}
+
+SDValue TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
+ SelectionDAG &DAG) const {
+ SDLoc SL(LD);
+ SDValue Chain = LD->getChain();
+ SDValue BasePTR = LD->getBasePtr();
+ EVT SrcVT = LD->getMemoryVT();
+ ISD::LoadExtType ExtType = LD->getExtensionType();
+
+ unsigned NumElem = SrcVT.getVectorNumElements();
+
+ EVT SrcEltVT = SrcVT.getScalarType();
+ EVT DstEltVT = LD->getValueType(0).getScalarType();
+
+ unsigned Stride = SrcEltVT.getSizeInBits() / 8;
+ assert(SrcEltVT.isByteSized());
+
+ EVT PtrVT = BasePTR.getValueType();
+
+ SmallVector<SDValue, 8> Vals;
+ SmallVector<SDValue, 8> LoadChains;
+
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue ScalarLoad =
+ DAG.getExtLoad(ExtType, SL, DstEltVT, Chain, BasePTR,
+ LD->getPointerInfo().getWithOffset(Idx * Stride),
+ SrcEltVT, MinAlign(LD->getAlignment(), Idx * Stride),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+ BasePTR = DAG.getNode(ISD::ADD, SL, PtrVT, BasePTR,
+ DAG.getConstant(Stride, SL, PtrVT));
+
+ Vals.push_back(ScalarLoad.getValue(0));
+ LoadChains.push_back(ScalarLoad.getValue(1));
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoadChains);
+ SDValue Value = DAG.getNode(ISD::BUILD_VECTOR, SL, LD->getValueType(0), Vals);
+
+ return DAG.getMergeValues({ Value, NewChain }, SL);
+}
+
+// FIXME: This relies on each element having a byte size, otherwise the stride
+// is 0 and just overwrites the same location. ExpandStore currently expects
+// this broken behavior.
+SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ SDLoc SL(ST);
+
+ SDValue Chain = ST->getChain();
+ SDValue BasePtr = ST->getBasePtr();
+ SDValue Value = ST->getValue();
+ EVT StVT = ST->getMemoryVT();
+
+ // The type of the data we want to save
+ EVT RegVT = Value.getValueType();
+ EVT RegSclVT = RegVT.getScalarType();
+
+ // The type of data as saved in memory.
+ EVT MemSclVT = StVT.getScalarType();
+
+ EVT PtrVT = BasePtr.getValueType();
+
+ // Store Stride in bytes
+ unsigned Stride = MemSclVT.getSizeInBits() / 8;
+ EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
+ unsigned NumElem = StVT.getVectorNumElements();
+
+ // Extract each of the elements from the original vector and save them into
+ // memory individually.
+ SmallVector<SDValue, 8> Stores;
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
+ DAG.getConstant(Idx, SL, IdxVT));
+
+ SDValue Ptr = DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
+ DAG.getConstant(Idx * Stride, SL, PtrVT));
+
+ // This scalar TruncStore may be illegal, but we legalize it later.
+ SDValue Store = DAG.getTruncStore(
+ Chain, SL, Elt, Ptr, ST->getPointerInfo().getWithOffset(Idx * Stride),
+ MemSclVT, MinAlign(ST->getAlignment(), Idx * Stride),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+
+ Stores.push_back(Store);
+ }
+
+ return DAG.getNode(ISD::TokenFactor, SL, MVT::Other, Stores);
+}
+
+std::pair<SDValue, SDValue>
+TargetLowering::expandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG) const {
+ assert(LD->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed loads not implemented!");
+ SDValue Chain = LD->getChain();
+ SDValue Ptr = LD->getBasePtr();
+ EVT VT = LD->getValueType(0);
+ EVT LoadedVT = LD->getMemoryVT();
+ SDLoc dl(LD);
+ if (VT.isFloatingPoint() || VT.isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits());
+ if (isTypeLegal(intVT) && isTypeLegal(LoadedVT)) {
+ if (!isOperationLegalOrCustom(ISD::LOAD, intVT)) {
+ // Scalarize the load and let the individual components be handled.
+ SDValue Scalarized = scalarizeVectorLoad(LD, DAG);
+ return std::make_pair(Scalarized.getValue(0), Scalarized.getValue(1));
+ }
+
+ // Expand to a (misaligned) integer load of the same size,
+ // then bitconvert to floating point or vector.
+ SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr,
+ LD->getMemOperand());
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad);
+ if (LoadedVT != VT)
+ Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND :
+ ISD::ANY_EXTEND, dl, VT, Result);
+
+ return std::make_pair(Result, newLoad.getValue(1));
+ }
+
+ // Copy the value to an (aligned) stack slot using (unaligned) integer
+ // loads and stores, then do a (aligned) load from the stack slot.
+ MVT RegVT = getRegisterType(*DAG.getContext(), intVT);
+ unsigned LoadedBytes = LoadedVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (LoadedBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackBase = DAG.CreateStackTemporary(LoadedVT, RegVT);
+
+ SmallVector<SDValue, 8> Stores;
+ SDValue StackPtr = StackBase;
+ unsigned Offset = 0;
+
+ EVT PtrVT = Ptr.getValueType();
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+
+ // Do all but one of the copies using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the original location.
+ SDValue Load = DAG.getLoad(
+ RegVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(Offset),
+ MinAlign(LD->getAlignment(), Offset), LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo()));
+ // Increment the pointers.
+ Offset += RegBytes;
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, PtrIncrement);
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtrVT, StackPtr,
+ StackPtrIncrement);
+ }
+
+ // The last copy may be partial. Do an extending load.
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+ 8 * (LoadedBytes - Offset));
+ SDValue Load =
+ DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(Offset), MemVT,
+ MinAlign(LD->getAlignment(), Offset),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ // Follow the load with a store to the stack slot. Remember the store.
+ // On big-endian machines this requires a truncating store to ensure
+ // that the bits end up in the right place.
+ Stores.push_back(DAG.getTruncStore(Load.getValue(1), dl, Load, StackPtr,
+ MachinePointerInfo(), MemVT));
+
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+
+ // Finally, perform the original load only redirected to the stack slot.
+ Load = DAG.getExtLoad(LD->getExtensionType(), dl, VT, TF, StackBase,
+ MachinePointerInfo(), LoadedVT);
+
+ // Callers expect a MERGE_VALUES node.
+ return std::make_pair(Load, TF);
+ }
+
+ assert(LoadedVT.isInteger() && !LoadedVT.isVector() &&
+ "Unaligned load of unsupported type.");
+
+ // Compute the new VT that is half the size of the old one. This is an
+ // integer MVT.
+ unsigned NumBits = LoadedVT.getSizeInBits();
+ EVT NewLoadedVT;
+ NewLoadedVT = EVT::getIntegerVT(*DAG.getContext(), NumBits/2);
+ NumBits >>= 1;
+
+ unsigned Alignment = LD->getAlignment();
+ unsigned IncrementSize = NumBits / 8;
+ ISD::LoadExtType HiExtType = LD->getExtensionType();
+
+ // If the original load is NON_EXTLOAD, the hi part load must be ZEXTLOAD.
+ if (HiExtType == ISD::NON_EXTLOAD)
+ HiExtType = ISD::ZEXTLOAD;
+
+ // Load the value in two parts
+ SDValue Lo, Hi;
+ if (DAG.getDataLayout().isLittleEndian()) {
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ } else {
+ Hi = DAG.getExtLoad(HiExtType, dl, VT, Chain, Ptr, LD->getPointerInfo(),
+ NewLoadedVT, Alignment, LD->getMemOperand()->getFlags(),
+ LD->getAAInfo());
+ Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
+ DAG.getConstant(IncrementSize, dl, Ptr.getValueType()));
+ Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, VT, Chain, Ptr,
+ LD->getPointerInfo().getWithOffset(IncrementSize),
+ NewLoadedVT, MinAlign(Alignment, IncrementSize),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+ }
+
+ // Aggregate the two parts.
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, getShiftAmountTy(Hi.getValueType(),
+ DAG.getDataLayout()));
+ SDValue Result = DAG.getNode(ISD::SHL, dl, VT, Hi, ShiftAmount);
+ Result = DAG.getNode(ISD::OR, dl, VT, Result, Lo);
+
+ SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1),
+ Hi.getValue(1));
+
+ return std::make_pair(Result, TF);
+}
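+
+// Integer-path sketch for expandUnalignedLoad (illustrative): an unaligned
+// "load i32, align 1" on a little-endian target becomes
+//   lo = zextload i16 from Ptr                 (original alignment)
+//   hi = zextload i16 from Ptr + 2             (ZEXTLOAD because the original
+//                                               load was NON_EXTLOAD)
+//   result = (hi << 16) | lo
+// with the two load chains joined by a TokenFactor.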
+
+SDValue TargetLowering::expandUnalignedStore(StoreSDNode *ST,
+ SelectionDAG &DAG) const {
+ assert(ST->getAddressingMode() == ISD::UNINDEXED &&
+ "unaligned indexed stores not implemented!");
+ SDValue Chain = ST->getChain();
+ SDValue Ptr = ST->getBasePtr();
+ SDValue Val = ST->getValue();
+ EVT VT = Val.getValueType();
+ int Alignment = ST->getAlignment();
+
+ SDLoc dl(ST);
+ if (ST->getMemoryVT().isFloatingPoint() ||
+ ST->getMemoryVT().isVector()) {
+ EVT intVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (isTypeLegal(intVT)) {
+ if (!isOperationLegalOrCustom(ISD::STORE, intVT)) {
+ // Scalarize the store and let the individual components be handled.
+ SDValue Result = scalarizeVectorStore(ST, DAG);
+
+ return Result;
+ }
+ // Expand to a bitconvert of the value to the integer type of the
+ // same size, then a (misaligned) int store.
+ // FIXME: Does not handle truncating floating point stores!
+ SDValue Result = DAG.getNode(ISD::BITCAST, dl, intVT, Val);
+ Result = DAG.getStore(Chain, dl, Result, Ptr, ST->getPointerInfo(),
+ Alignment, ST->getMemOperand()->getFlags());
+ return Result;
+ }
+ // Do an (aligned) store to a stack slot, then copy from the stack slot
+ // to the final destination using (unaligned) integer loads and stores.
+ EVT StoredVT = ST->getMemoryVT();
+ MVT RegVT =
+ getRegisterType(*DAG.getContext(),
+ EVT::getIntegerVT(*DAG.getContext(),
+ StoredVT.getSizeInBits()));
+ EVT PtrVT = Ptr.getValueType();
+ unsigned StoredBytes = StoredVT.getSizeInBits() / 8;
+ unsigned RegBytes = RegVT.getSizeInBits() / 8;
+ unsigned NumRegs = (StoredBytes + RegBytes - 1) / RegBytes;
+
+ // Make sure the stack slot is also aligned for the register type.
+ SDValue StackPtr = DAG.CreateStackTemporary(StoredVT, RegVT);
+
+ // Perform the original store, only redirected to the stack slot.
+ SDValue Store = DAG.getTruncStore(Chain, dl, Val, StackPtr,
+ MachinePointerInfo(), StoredVT);
+
+ EVT StackPtrVT = StackPtr.getValueType();
+
+ SDValue PtrIncrement = DAG.getConstant(RegBytes, dl, PtrVT);
+ SDValue StackPtrIncrement = DAG.getConstant(RegBytes, dl, StackPtrVT);
+ SmallVector<SDValue, 8> Stores;
+ unsigned Offset = 0;
+
+ // Do all but one of the copies using the full register width.
+ for (unsigned i = 1; i < NumRegs; i++) {
+ // Load one integer register's worth from the stack slot.
+ SDValue Load =
+ DAG.getLoad(RegVT, dl, Store, StackPtr, MachinePointerInfo());
+ // Store it to the final location. Remember the store.
+ Stores.push_back(DAG.getStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset),
+ MinAlign(ST->getAlignment(), Offset),
+ ST->getMemOperand()->getFlags()));
+ // Increment the pointers.
+ Offset += RegBytes;
+ StackPtr = DAG.getNode(ISD::ADD, dl, StackPtrVT,
+ StackPtr, StackPtrIncrement);
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr, PtrIncrement);
+ }
+
+ // The last store may be partial. Do a truncating store. On big-endian
+ // machines this requires an extending load from the stack slot to ensure
+ // that the bits are in the right place.
+ EVT MemVT = EVT::getIntegerVT(*DAG.getContext(),
+ 8 * (StoredBytes - Offset));
+
+ // Load from the stack slot.
+ SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, RegVT, Store, StackPtr,
+ MachinePointerInfo(), MemVT);
+
+ Stores.push_back(
+ DAG.getTruncStore(Load.getValue(1), dl, Load, Ptr,
+ ST->getPointerInfo().getWithOffset(Offset), MemVT,
+ MinAlign(ST->getAlignment(), Offset),
+ ST->getMemOperand()->getFlags(), ST->getAAInfo()));
+ // The order of the stores doesn't matter - say it with a TokenFactor.
+ SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Stores);
+ return Result;
+ }
+
+ assert(ST->getMemoryVT().isInteger() &&
+ !ST->getMemoryVT().isVector() &&
+ "Unaligned store of unknown type.");
+ // Get the half-size VT
+ EVT NewStoredVT = ST->getMemoryVT().getHalfSizedIntegerVT(*DAG.getContext());
+ int NumBits = NewStoredVT.getSizeInBits();
+ int IncrementSize = NumBits / 8;
+
+ // Divide the stored value in two parts.
+ SDValue ShiftAmount =
+ DAG.getConstant(NumBits, dl, getShiftAmountTy(Val.getValueType(),
+ DAG.getDataLayout()));
+ SDValue Lo = Val;
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, VT, Val, ShiftAmount);
+
+ // Store the two parts
+ SDValue Store1, Store2;
+ Store1 = DAG.getTruncStore(Chain, dl,
+ DAG.getDataLayout().isLittleEndian() ? Lo : Hi,
+ Ptr, ST->getPointerInfo(), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags());
+
+ EVT PtrVT = Ptr.getValueType();
+ Ptr = DAG.getNode(ISD::ADD, dl, PtrVT, Ptr,
+ DAG.getConstant(IncrementSize, dl, PtrVT));
+ Alignment = MinAlign(Alignment, IncrementSize);
+ Store2 = DAG.getTruncStore(
+ Chain, dl, DAG.getDataLayout().isLittleEndian() ? Hi : Lo, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize), NewStoredVT, Alignment,
+ ST->getMemOperand()->getFlags(), ST->getAAInfo());
+
+ SDValue Result =
+ DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+ return Result;
+}
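+
+// Integer-path sketch for expandUnalignedStore (illustrative): an unaligned
+// "store i32 %v, align 1" on a little-endian target becomes
+//   truncstore i16 %v          to Ptr
+//   truncstore i16 (%v >> 16)  to Ptr + 2
+// joined by a TokenFactor; on big-endian targets the two halves are swapped.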
+
+//===----------------------------------------------------------------------===//
+// Implementation of Emulated TLS Model
+//===----------------------------------------------------------------------===//
+
+SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
+ SelectionDAG &DAG) const {
+ // Access to address of TLS variable xyz is lowered to a function call:
+ // __emutls_get_address( address of global variable named "__emutls_v.xyz" )
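+ // For example (illustrative IR shape), an access to a thread_local global
+ // @x is lowered roughly to
+ //   %addr = call i8* @__emutls_get_address(i8* bitcast (... @__emutls_v.x))
+ // and the caller then uses the returned pointer as the address of x.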
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ PointerType *VoidPtrType = Type::getInt8PtrTy(*DAG.getContext());
+ SDLoc dl(GA);
+
+ ArgListTy Args;
+ ArgListEntry Entry;
+ std::string NameString = ("__emutls_v." + GA->getGlobal()->getName()).str();
+ Module *VariableModule = const_cast<Module*>(GA->getGlobal()->getParent());
+ StringRef EmuTlsVarName(NameString);
+ GlobalVariable *EmuTlsVar = VariableModule->getNamedGlobal(EmuTlsVarName);
+ assert(EmuTlsVar && "Cannot find EmuTlsVar ");
+ Entry.Node = DAG.getGlobalAddress(EmuTlsVar, dl, PtrVT);
+ Entry.Ty = VoidPtrType;
+ Args.push_back(Entry);
+
+ SDValue EmuTlsGetAddr = DAG.getExternalSymbol("__emutls_get_address", PtrVT);
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
+ CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
+ std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
+
+ // TLSADDR will be codegen'ed as a call. Inform MFI that the function has
+ // calls. At least for X86 targets; maybe good for other targets too?
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI->setAdjustsStack(true); // Is this only for X86 target?
+ MFI->setHasCalls(true);
+
+ assert((GA->getOffset() == 0) &&
+ "Emulated TLS must have zero offset in GlobalAddressSDNode");
+ return CallResult.first;
+}
diff --git a/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
new file mode 100644
index 000000000000..1efc440cd701
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ShadowStackGCLowering.cpp
@@ -0,0 +1,468 @@
+//===-- ShadowStackGCLowering.cpp - Custom lowering for shadow-stack gc ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the custom lowering code required by the shadow-stack GC
+// strategy.
+//
+// This pass implements the code transformation described in this paper:
+// "Accurate Garbage Collection in an Uncooperative Environment"
+// Fergus Henderson, ISMM, 2002
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "shadowstackgclowering"
+
+namespace {
+
+class ShadowStackGCLowering : public FunctionPass {
+ /// RootChain - This is the global linked-list that contains the chain of GC
+ /// roots.
+ GlobalVariable *Head;
+
+ /// StackEntryTy - Abstract type of a link in the shadow stack.
+ ///
+ StructType *StackEntryTy;
+ StructType *FrameMapTy;
+
+ /// Roots - GC roots in the current function. Each is a pair of the
+ /// intrinsic call and its corresponding alloca.
+ std::vector<std::pair<CallInst *, AllocaInst *>> Roots;
+
+public:
+ static char ID;
+ ShadowStackGCLowering();
+
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+private:
+ bool IsNullValue(Value *V);
+ Constant *GetFrameMap(Function &F);
+ Type *GetConcreteStackEntryType(Function &F);
+ void CollectRoots(Function &F);
+ static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+ Type *Ty, Value *BasePtr, int Idx1,
+ const char *Name);
+ static GetElementPtrInst *CreateGEP(LLVMContext &Context, IRBuilder<> &B,
+ Type *Ty, Value *BasePtr, int Idx1, int Idx2,
+ const char *Name);
+};
+}
+
+INITIALIZE_PASS_BEGIN(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+ "Shadow Stack GC Lowering", false, false)
+INITIALIZE_PASS_DEPENDENCY(GCModuleInfo)
+INITIALIZE_PASS_END(ShadowStackGCLowering, "shadow-stack-gc-lowering",
+ "Shadow Stack GC Lowering", false, false)
+
+FunctionPass *llvm::createShadowStackGCLoweringPass() { return new ShadowStackGCLowering(); }
+
+char ShadowStackGCLowering::ID = 0;
+
+ShadowStackGCLowering::ShadowStackGCLowering()
+ : FunctionPass(ID), Head(nullptr), StackEntryTy(nullptr),
+ FrameMapTy(nullptr) {
+ initializeShadowStackGCLoweringPass(*PassRegistry::getPassRegistry());
+}
+
+namespace {
+/// EscapeEnumerator - This is a little algorithm to find all escape points
+/// from a function so that "finally"-style code can be inserted. In addition
+/// to finding the existing return and unwind instructions, it also (if
+/// necessary) transforms any call instructions into invokes and sends them to
+/// a landing pad.
+///
+/// It's wrapped up in a state machine using the same transform C# uses for
+/// 'yield return' enumerators. This transform allows it to be non-allocating.
+class EscapeEnumerator {
+ Function &F;
+ const char *CleanupBBName;
+
+ // State.
+ int State;
+ Function::iterator StateBB, StateE;
+ IRBuilder<> Builder;
+
+public:
+ EscapeEnumerator(Function &F, const char *N = "cleanup")
+ : F(F), CleanupBBName(N), State(0), Builder(F.getContext()) {}
+
+ IRBuilder<> *Next() {
+ switch (State) {
+ default:
+ return nullptr;
+
+ case 0:
+ StateBB = F.begin();
+ StateE = F.end();
+ State = 1;
+
+ case 1:
+ // Find all 'return', 'resume', and 'unwind' instructions.
+ while (StateBB != StateE) {
+ BasicBlock *CurBB = &*StateBB++;
+
+ // Branches and invokes do not escape; only unwind, resume, and return
+ // do.
+ TerminatorInst *TI = CurBB->getTerminator();
+ if (!isa<ReturnInst>(TI) && !isa<ResumeInst>(TI))
+ continue;
+
+ Builder.SetInsertPoint(TI);
+ return &Builder;
+ }
+
+ State = 2;
+
+ // Find all 'call' instructions.
+ SmallVector<Instruction *, 16> Calls;
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), EE = BB->end(); II != EE;
+ ++II)
+ if (CallInst *CI = dyn_cast<CallInst>(II))
+ if (!CI->getCalledFunction() ||
+ !CI->getCalledFunction()->getIntrinsicID())
+ Calls.push_back(CI);
+
+ if (Calls.empty())
+ return nullptr;
+
+ // Create a cleanup block.
+ LLVMContext &C = F.getContext();
+ BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F);
+ Type *ExnTy =
+ StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr);
+ if (!F.hasPersonalityFn()) {
+ Constant *PersFn = F.getParent()->getOrInsertFunction(
+ "__gcc_personality_v0",
+ FunctionType::get(Type::getInt32Ty(C), true));
+ F.setPersonalityFn(PersFn);
+ }
+ LandingPadInst *LPad =
+ LandingPadInst::Create(ExnTy, 1, "cleanup.lpad", CleanupBB);
+ LPad->setCleanup(true);
+ ResumeInst *RI = ResumeInst::Create(LPad, CleanupBB);
+
+ // Transform the 'call' instructions into 'invoke's branching to the
+ // cleanup block. Go in reverse order to make prettier BB names.
+ SmallVector<Value *, 16> Args;
+ for (unsigned I = Calls.size(); I != 0;) {
+ CallInst *CI = cast<CallInst>(Calls[--I]);
+
+ // Split the basic block containing the function call.
+ BasicBlock *CallBB = CI->getParent();
+ BasicBlock *NewBB = CallBB->splitBasicBlock(
+ CI->getIterator(), CallBB->getName() + ".cont");
+
+ // Remove the unconditional branch inserted at the end of CallBB.
+ CallBB->getInstList().pop_back();
+ NewBB->getInstList().remove(CI);
+
+ // Create a new invoke instruction.
+ Args.clear();
+ CallSite CS(CI);
+ Args.append(CS.arg_begin(), CS.arg_end());
+
+ InvokeInst *II =
+ InvokeInst::Create(CI->getCalledValue(), NewBB, CleanupBB, Args,
+ CI->getName(), CallBB);
+ II->setCallingConv(CI->getCallingConv());
+ II->setAttributes(CI->getAttributes());
+ CI->replaceAllUsesWith(II);
+ delete CI;
+ }
+
+ Builder.SetInsertPoint(RI);
+ return &Builder;
+ }
+ }
+};
+}
+
+
+Constant *ShadowStackGCLowering::GetFrameMap(Function &F) {
+ // doInitialization creates the abstract type of this value.
+ Type *VoidPtr = Type::getInt8PtrTy(F.getContext());
+
+ // Truncate the ShadowStackDescriptor if some metadata is null.
+ unsigned NumMeta = 0;
+ SmallVector<Constant *, 16> Metadata;
+ for (unsigned I = 0; I != Roots.size(); ++I) {
+ Constant *C = cast<Constant>(Roots[I].first->getArgOperand(1));
+ if (!C->isNullValue())
+ NumMeta = I + 1;
+ Metadata.push_back(ConstantExpr::getBitCast(C, VoidPtr));
+ }
+ Metadata.resize(NumMeta);
+
+ Type *Int32Ty = Type::getInt32Ty(F.getContext());
+
+ Constant *BaseElts[] = {
+ ConstantInt::get(Int32Ty, Roots.size(), false),
+ ConstantInt::get(Int32Ty, NumMeta, false),
+ };
+
+ Constant *DescriptorElts[] = {
+ ConstantStruct::get(FrameMapTy, BaseElts),
+ ConstantArray::get(ArrayType::get(VoidPtr, NumMeta), Metadata)};
+
+ Type *EltTys[] = {DescriptorElts[0]->getType(), DescriptorElts[1]->getType()};
+ StructType *STy = StructType::create(EltTys, "gc_map." + utostr(NumMeta));
+
+ Constant *FrameMap = ConstantStruct::get(STy, DescriptorElts);
+
+ // FIXME: Is this actually dangerous as WritingAnLLVMPass.html claims? Seems
+ // that, short of multithreaded LLVM, it should be safe; all that is
+ // necessary is that a simple Module::iterator loop not be invalidated.
+ // Appending to the GlobalVariable list is safe in that sense.
+ //
+ // All of the output passes emit globals last. The ExecutionEngine
+ // explicitly supports adding globals to the module after
+ // initialization.
+ //
+ // Still, if it isn't deemed acceptable, then this transformation needs
+ // to be a ModulePass (which means it cannot be in the 'llc' pipeline
+ // (which uses a FunctionPassManager (which segfaults (not asserts) if
+ // provided a ModulePass))).
+ Constant *GV = new GlobalVariable(*F.getParent(), FrameMap->getType(), true,
+ GlobalVariable::InternalLinkage, FrameMap,
+ "__gc_" + F.getName());
+
+ Constant *GEPIndices[2] = {
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), 0),
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), 0)};
+ return ConstantExpr::getGetElementPtr(FrameMap->getType(), GV, GEPIndices);
+}
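+
+// Shape of the emitted constant (illustrative): for a function F with two
+// gcroots and no metadata, GetFrameMap builds roughly
+//   @__gc_F = internal constant { %gc_map, [0 x i8*] } { { i32 2, i32 0 }, [] }
+// and returns the {0, 0} GEP, i.e. a pointer to the leading %gc_map field.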
+
+Type *ShadowStackGCLowering::GetConcreteStackEntryType(Function &F) {
+ // doInitialization creates the generic version of this type.
+ std::vector<Type *> EltTys;
+ EltTys.push_back(StackEntryTy);
+ for (size_t I = 0; I != Roots.size(); I++)
+ EltTys.push_back(Roots[I].second->getAllocatedType());
+
+ return StructType::create(EltTys, ("gc_stackentry." + F.getName()).str());
+}
+
+/// doInitialization - If this module uses the GC intrinsics, find them now. If
+/// not, exit fast.
+bool ShadowStackGCLowering::doInitialization(Module &M) {
+ bool Active = false;
+ for (Function &F : M) {
+ if (F.hasGC() && F.getGC() == std::string("shadow-stack")) {
+ Active = true;
+ break;
+ }
+ }
+ if (!Active)
+ return false;
+
+ // struct FrameMap {
+ // int32_t NumRoots; // Number of roots in stack frame.
+ // int32_t NumMeta; // Number of metadata descriptors. May be < NumRoots.
+ // void *Meta[]; // May be absent for roots without metadata.
+ // };
+ std::vector<Type *> EltTys;
+ // 32 bits is ok up to a 32GB stack frame. :)
+ EltTys.push_back(Type::getInt32Ty(M.getContext()));
+ // Specifies length of variable length array.
+ EltTys.push_back(Type::getInt32Ty(M.getContext()));
+ FrameMapTy = StructType::create(EltTys, "gc_map");
+ PointerType *FrameMapPtrTy = PointerType::getUnqual(FrameMapTy);
+
+ // struct StackEntry {
+ // ShadowStackEntry *Next; // Caller's stack entry.
+ // FrameMap *Map; // Pointer to constant FrameMap.
+ // void *Roots[]; // Stack roots (in-place array, so we pretend).
+ // };
+
+ StackEntryTy = StructType::create(M.getContext(), "gc_stackentry");
+
+ EltTys.clear();
+ EltTys.push_back(PointerType::getUnqual(StackEntryTy));
+ EltTys.push_back(FrameMapPtrTy);
+ StackEntryTy->setBody(EltTys);
+ PointerType *StackEntryPtrTy = PointerType::getUnqual(StackEntryTy);
+
+ // Get the root chain if it already exists.
+ Head = M.getGlobalVariable("llvm_gc_root_chain");
+ if (!Head) {
+ // If the root chain does not exist, insert a new one with linkonce
+ // linkage!
+ Head = new GlobalVariable(
+ M, StackEntryPtrTy, false, GlobalValue::LinkOnceAnyLinkage,
+ Constant::getNullValue(StackEntryPtrTy), "llvm_gc_root_chain");
+ } else if (Head->hasExternalLinkage() && Head->isDeclaration()) {
+ Head->setInitializer(Constant::getNullValue(StackEntryPtrTy));
+ Head->setLinkage(GlobalValue::LinkOnceAnyLinkage);
+ }
+
+ return true;
+}
+
+bool ShadowStackGCLowering::IsNullValue(Value *V) {
+ if (Constant *C = dyn_cast<Constant>(V))
+ return C->isNullValue();
+ return false;
+}
+
+void ShadowStackGCLowering::CollectRoots(Function &F) {
+ // FIXME: Account for original alignment. Could fragment the root array.
+ // Approach 1: Null initialize empty slots at runtime. Yuck.
+ // Approach 2: Emit a map of the array instead of just a count.
+
+ assert(Roots.empty() && "Not cleaned up?");
+
+ SmallVector<std::pair<CallInst *, AllocaInst *>, 16> MetaRoots;
+
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+ for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+ if (IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++))
+ if (Function *F = CI->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::gcroot) {
+ std::pair<CallInst *, AllocaInst *> Pair = std::make_pair(
+ CI,
+ cast<AllocaInst>(CI->getArgOperand(0)->stripPointerCasts()));
+ if (IsNullValue(CI->getArgOperand(1)))
+ Roots.push_back(Pair);
+ else
+ MetaRoots.push_back(Pair);
+ }
+
+ // Number roots with metadata (usually empty) at the beginning, so that the
+ // FrameMap::Meta array can be elided.
+ Roots.insert(Roots.begin(), MetaRoots.begin(), MetaRoots.end());
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+ IRBuilder<> &B, Type *Ty,
+ Value *BasePtr, int Idx,
+ int Idx2,
+ const char *Name) {
+ Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx2)};
+ Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+GetElementPtrInst *ShadowStackGCLowering::CreateGEP(LLVMContext &Context,
+ IRBuilder<> &B, Type *Ty, Value *BasePtr,
+ int Idx, const char *Name) {
+ Value *Indices[] = {ConstantInt::get(Type::getInt32Ty(Context), 0),
+ ConstantInt::get(Type::getInt32Ty(Context), Idx)};
+ Value *Val = B.CreateGEP(Ty, BasePtr, Indices, Name);
+
+ assert(isa<GetElementPtrInst>(Val) && "Unexpected folded constant");
+
+ return dyn_cast<GetElementPtrInst>(Val);
+}
+
+/// runOnFunction - Insert code to maintain the shadow stack.
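+/// The generated prologue/epilogue has roughly this shape (illustrative):
+///   %gc_frame = alloca %gc_stackentry.F      ; { { next, map }, root slots }
+///   %gc_frame.map  <- the constant FrameMap
+///   %gc_frame.next <- current llvm_gc_root_chain
+///   llvm_gc_root_chain <- %gc_frame          ; push on entry
+///   ... gcroot allocas are replaced by slots in %gc_frame ...
+///   llvm_gc_root_chain <- %gc_frame.next     ; pop at every escape point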
+bool ShadowStackGCLowering::runOnFunction(Function &F) {
+ // Quick exit for functions that do not use the shadow stack GC.
+ if (!F.hasGC() ||
+ F.getGC() != std::string("shadow-stack"))
+ return false;
+
+ LLVMContext &Context = F.getContext();
+
+ // Find calls to llvm.gcroot.
+ CollectRoots(F);
+
+ // If there are no roots in this function, then there is no need to add a
+ // stack map entry for it.
+ if (Roots.empty())
+ return false;
+
+ // Build the constant map and figure the type of the shadow stack entry.
+ Value *FrameMap = GetFrameMap(F);
+ Type *ConcreteStackEntryTy = GetConcreteStackEntryType(F);
+
+ // Build the shadow stack entry at the very start of the function.
+ BasicBlock::iterator IP = F.getEntryBlock().begin();
+ IRBuilder<> AtEntry(IP->getParent(), IP);
+
+ Instruction *StackEntry =
+ AtEntry.CreateAlloca(ConcreteStackEntryTy, nullptr, "gc_frame");
+
+ while (isa<AllocaInst>(IP))
+ ++IP;
+ AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+ // Initialize the map pointer and load the current head of the shadow stack.
+ Instruction *CurrentHead = AtEntry.CreateLoad(Head, "gc_currhead");
+ Instruction *EntryMapPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, 1, "gc_frame.map");
+ AtEntry.CreateStore(FrameMap, EntryMapPtr);
+
+ // After all the allocas...
+ for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+ // For each root, find the corresponding slot in the aggregate...
+ Value *SlotPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 1 + I, "gc_root");
+
+ // And use it in lieu of the alloca.
+ AllocaInst *OriginalAlloca = Roots[I].second;
+ SlotPtr->takeName(OriginalAlloca);
+ OriginalAlloca->replaceAllUsesWith(SlotPtr);
+ }
+
+ // Move past the original stores inserted by GCStrategy::InitRoots. This isn't
+ // really necessary (the collector would never see the intermediate state at
+ // runtime), but it's nicer not to push the half-initialized entry onto the
+ // shadow stack.
+ while (isa<StoreInst>(IP))
+ ++IP;
+ AtEntry.SetInsertPoint(IP->getParent(), IP);
+
+ // Push the entry onto the shadow stack.
+ Instruction *EntryNextPtr = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, 0, "gc_frame.next");
+ Instruction *NewHeadVal = CreateGEP(Context, AtEntry, ConcreteStackEntryTy,
+ StackEntry, 0, "gc_newhead");
+ AtEntry.CreateStore(CurrentHead, EntryNextPtr);
+ AtEntry.CreateStore(NewHeadVal, Head);
+
+ // For each instruction that escapes...
+ EscapeEnumerator EE(F, "gc_cleanup");
+ while (IRBuilder<> *AtExit = EE.Next()) {
+ // Pop the entry from the shadow stack. Don't reuse CurrentHead from
+ // AtEntry, since that would make the value live for the entire function.
+ Instruction *EntryNextPtr2 =
+ CreateGEP(Context, *AtExit, ConcreteStackEntryTy, StackEntry, 0, 0,
+ "gc_frame.next");
+ Value *SavedHead = AtExit->CreateLoad(EntryNextPtr2, "gc_savedhead");
+ AtExit->CreateStore(SavedHead, Head);
+ }
+
+ // Delete the original allocas (which are no longer used) and the intrinsic
+ // calls (which are no longer valid). Doing this last avoids invalidating
+ // iterators.
+ for (unsigned I = 0, E = Roots.size(); I != E; ++I) {
+ Roots[I].first->eraseFromParent();
+ Roots[I].second->eraseFromParent();
+ }
+
+ Roots.clear();
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
new file mode 100644
index 000000000000..d361a6c4ad06
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -0,0 +1,556 @@
+//===-- ShrinkWrap.cpp - Compute safe point for prolog/epilog insertion ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for safe points where the prologue and epilogue can be
+// inserted.
+// The safe point for the prologue (resp. epilogue) is called Save
+// (resp. Restore).
+// A point is safe for the prologue (resp. epilogue) if and only if
+// 1) it dominates (resp. post-dominates) all the frame-related operations and
+// 2) between two executions of the Save (resp. Restore) point there is an
+// execution of the Restore (resp. Save) point.
+//
+// For instance, the following points are safe:
+// for (int i = 0; i < 10; ++i) {
+// Save
+// ...
+// Restore
+// }
+// Indeed, the execution looks like Save -> Restore -> Save -> Restore ...
+// And the following points are not:
+// for (int i = 0; i < 10; ++i) {
+// Save
+// ...
+// }
+// for (int i = 0; i < 10; ++i) {
+// ...
+// Restore
+// }
+// Indeed, the execution looks like Save -> Save -> ... -> Restore -> Restore.
+//
+// This pass also ensures that the safe points are 3) cheaper than the regular
+// entry and exit blocks.
+//
+// Property #1 is ensured via the use of MachineDominatorTree and
+// MachinePostDominatorTree.
+// Property #2 is ensured via property #1 and MachineLoopInfo, i.e., both
+// points must be in the same loop.
+// Property #3 is ensured via the MachineBlockFrequencyInfo.
+//
+// If this pass found points matching all these properties, then
+// MachineFrameInfo is updated with this information.
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+// To check for profitability.
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+// For property #1 for Save.
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+// To record the result of the analysis.
+#include "llvm/CodeGen/MachineFrameInfo.h"
+// For property #2.
+#include "llvm/CodeGen/MachineLoopInfo.h"
+// For property #1 for Restore.
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/Passes.h"
+// To know about callee-saved.
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+// To query the target about frame lowering.
+#include "llvm/Target/TargetFrameLowering.h"
+// To know about frame setup operation.
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+// To access TargetInstrInfo.
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+#define DEBUG_TYPE "shrink-wrap"
+
+using namespace llvm;
+
+STATISTIC(NumFunc, "Number of functions");
+STATISTIC(NumCandidates, "Number of shrink-wrapping candidates");
+STATISTIC(NumCandidatesDropped,
+ "Number of shrink-wrapping candidates dropped because of frequency");
+
+static cl::opt<cl::boolOrDefault>
+ EnableShrinkWrapOpt("enable-shrink-wrap", cl::Hidden,
+ cl::desc("enable the shrink-wrapping pass"));
+
+namespace {
+/// \brief Class to determine where the safe points to insert the
+/// prologue and epilogue are.
+/// Unlike the paper from Fred C. Chow, PLDI'88, which introduced the
+/// shrink-wrapping term for prologue/epilogue placement, this pass
+/// does not rely on expensive data-flow analysis. Instead we use
+/// dominance properties and loop information to decide which points
+/// are safe for such insertion.
+class ShrinkWrap : public MachineFunctionPass {
+ /// Hold callee-saved information.
+ RegisterClassInfo RCI;
+ MachineDominatorTree *MDT;
+ MachinePostDominatorTree *MPDT;
+ /// Current safe point found for the prologue.
+ /// The prologue will be inserted before the first instruction
+ /// in this basic block.
+ MachineBasicBlock *Save;
+ /// Current safe point found for the epilogue.
+ /// The epilogue will be inserted before the first terminator instruction
+ /// in this basic block.
+ MachineBasicBlock *Restore;
+ /// Hold the information of the basic block frequency.
+ /// Used to check the profitability of the new points.
+ MachineBlockFrequencyInfo *MBFI;
+ /// Hold the loop information. Used to determine if Save and Restore
+ /// are in the same loop.
+ MachineLoopInfo *MLI;
+ /// Frequency of the Entry block.
+ uint64_t EntryFreq;
+ /// Current opcode for frame setup.
+ unsigned FrameSetupOpcode;
+ /// Current opcode for frame destroy.
+ unsigned FrameDestroyOpcode;
+ /// Entry block.
+ const MachineBasicBlock *Entry;
+ typedef SmallSetVector<unsigned, 16> SetOfRegs;
+ /// Registers that need to be saved for the current function.
+ mutable SetOfRegs CurrentCSRs;
+ /// Current MachineFunction.
+ MachineFunction *MachineFunc;
+
+ /// \brief Check if \p MI uses or defines a callee-saved register or
+ /// a frame index. If this is the case, this means \p MI must happen
+ /// after Save and before Restore.
+ bool useOrDefCSROrFI(const MachineInstr &MI, RegScavenger *RS) const;
+
+ const SetOfRegs &getCurrentCSRs(RegScavenger *RS) const {
+ if (CurrentCSRs.empty()) {
+ BitVector SavedRegs;
+ const TargetFrameLowering *TFI =
+ MachineFunc->getSubtarget().getFrameLowering();
+
+ TFI->determineCalleeSaves(*MachineFunc, SavedRegs, RS);
+
+ for (int Reg = SavedRegs.find_first(); Reg != -1;
+ Reg = SavedRegs.find_next(Reg))
+ CurrentCSRs.insert((unsigned)Reg);
+ }
+ return CurrentCSRs;
+ }
+
+ /// \brief Update the Save and Restore points such that \p MBB is in
+ /// the region that is dominated by Save and post-dominated by Restore
+ /// and Save and Restore still match the safe point definition.
+ /// Such points may not exist, and Save and/or Restore may be null after
+ /// this call.
+ void updateSaveRestorePoints(MachineBasicBlock &MBB, RegScavenger *RS);
+
+ /// \brief Initialize the pass for \p MF.
+ void init(MachineFunction &MF) {
+ RCI.runOnMachineFunction(MF);
+ MDT = &getAnalysis<MachineDominatorTree>();
+ MPDT = &getAnalysis<MachinePostDominatorTree>();
+ Save = nullptr;
+ Restore = nullptr;
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ MLI = &getAnalysis<MachineLoopInfo>();
+ EntryFreq = MBFI->getEntryFreq();
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+ FrameSetupOpcode = TII.getCallFrameSetupOpcode();
+ FrameDestroyOpcode = TII.getCallFrameDestroyOpcode();
+ Entry = &MF.front();
+ CurrentCSRs.clear();
+ MachineFunc = &MF;
+
+ ++NumFunc;
+ }
+
+ /// Check whether or not Save and Restore points are still interesting for
+ /// shrink-wrapping.
+ bool ArePointsInteresting() const { return Save != Entry && Save && Restore; }
+
+ /// \brief Check if shrink wrapping is enabled for this target and function.
+ static bool isShrinkWrapEnabled(const MachineFunction &MF);
+
+public:
+ static char ID;
+
+ ShrinkWrap() : MachineFunctionPass(ID) {
+ initializeShrinkWrapPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addRequired<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ const char *getPassName() const override {
+ return "Shrink Wrapping analysis";
+ }
+
+ /// \brief Perform the shrink-wrapping analysis and update
+ /// the MachineFrameInfo attached to \p MF with the results.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char ShrinkWrap::ID = 0;
+char &llvm::ShrinkWrapID = ShrinkWrap::ID;
+
+INITIALIZE_PASS_BEGIN(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(ShrinkWrap, "shrink-wrap", "Shrink Wrap Pass", false, false)
+
+bool ShrinkWrap::useOrDefCSROrFI(const MachineInstr &MI,
+ RegScavenger *RS) const {
+ if (MI.getOpcode() == FrameSetupOpcode ||
+ MI.getOpcode() == FrameDestroyOpcode) {
+ DEBUG(dbgs() << "Frame instruction: " << MI << '\n');
+ return true;
+ }
+ for (const MachineOperand &MO : MI.operands()) {
+ bool UseOrDefCSR = false;
+ if (MO.isReg()) {
+ unsigned PhysReg = MO.getReg();
+ if (!PhysReg)
+ continue;
+ assert(TargetRegisterInfo::isPhysicalRegister(PhysReg) &&
+ "Unallocated register?!");
+ UseOrDefCSR = RCI.getLastCalleeSavedAlias(PhysReg);
+ } else if (MO.isRegMask()) {
+ // Check if this regmask clobbers any of the CSRs.
+ for (unsigned Reg : getCurrentCSRs(RS)) {
+ if (MO.clobbersPhysReg(Reg)) {
+ UseOrDefCSR = true;
+ break;
+ }
+ }
+ }
+ if (UseOrDefCSR || MO.isFI()) {
+ DEBUG(dbgs() << "Use or define CSR(" << UseOrDefCSR << ") or FI("
+ << MO.isFI() << "): " << MI << '\n');
+ return true;
+ }
+ }
+ return false;
+}
+
+/// \brief Helper function to find the immediate (post) dominator.
+template <typename ListOfBBs, typename DominanceAnalysis>
+MachineBasicBlock *FindIDom(MachineBasicBlock &Block, ListOfBBs BBs,
+ DominanceAnalysis &Dom) {
+ MachineBasicBlock *IDom = &Block;
+ for (MachineBasicBlock *BB : BBs) {
+ IDom = Dom.findNearestCommonDominator(IDom, BB);
+ if (!IDom)
+ break;
+ }
+ if (IDom == &Block)
+ return nullptr;
+ return IDom;
+}
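+// For instance (hypothetical CFG), in a diamond A -> {B, C} -> D,
+// FindIDom(D, D.predecessors(), MDT) folds the common dominators of B and C
+// down to A, the immediate dominator of D, and returns it.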
+
+void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB,
+ RegScavenger *RS) {
+ // Get rid of the easy cases first.
+ if (!Save)
+ Save = &MBB;
+ else
+ Save = MDT->findNearestCommonDominator(Save, &MBB);
+
+ if (!Save) {
+ DEBUG(dbgs() << "Found a block that is not reachable from Entry\n");
+ return;
+ }
+
+ if (!Restore)
+ Restore = &MBB;
+ else
+ Restore = MPDT->findNearestCommonDominator(Restore, &MBB);
+
+ // Make sure we would be able to insert the restore code before the
+ // terminator.
+ if (Restore == &MBB) {
+ for (const MachineInstr &Terminator : MBB.terminators()) {
+ if (!useOrDefCSROrFI(Terminator, RS))
+ continue;
+ // One of the terminators needs to happen before the restore point.
+ if (MBB.succ_empty()) {
+ Restore = nullptr;
+ break;
+ }
+ // Look for a restore point that post-dominates all the successors.
+ // The immediate post-dominator is what we are looking for.
+ Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT);
+ break;
+ }
+ }
+
+ if (!Restore) {
+ DEBUG(dbgs() << "Restore point needs to be spanned on several blocks\n");
+ return;
+ }
+
+ // Make sure Save and Restore are suitable for shrink-wrapping:
+ // 1. all paths from Save need to lead to Restore before exiting.
+ // 2. all paths to Restore need to go through Save from Entry.
+ // We achieve that by making sure that:
+ // A. Save dominates Restore.
+ // B. Restore post-dominates Save.
+ // C. Save and Restore are in the same loop.
+ bool SaveDominatesRestore = false;
+ bool RestorePostDominatesSave = false;
+ while (Save && Restore &&
+ (!(SaveDominatesRestore = MDT->dominates(Save, Restore)) ||
+ !(RestorePostDominatesSave = MPDT->dominates(Restore, Save)) ||
+ // Post-dominance is not enough in loops to ensure that all uses/defs
+ // are after the prologue and before the epilogue at runtime.
+ // E.g.,
+ // while(1) {
+ // Save
+ // Restore
+ // if (...)
+ // break;
+ // use/def CSRs
+ // }
+ // All the uses/defs of CSRs are dominated by Save and post-dominated
+ // by Restore. However, the CSR uses are still reachable after Restore
+ // executes and before Save executes again.
+ //
+ // For now, just push the restore/save points outside of loops.
+ // FIXME: Refine the criteria to still find interesting cases
+ // for loops.
+ MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) {
+ // Fix (A).
+ if (!SaveDominatesRestore) {
+ Save = MDT->findNearestCommonDominator(Save, Restore);
+ continue;
+ }
+ // Fix (B).
+ if (!RestorePostDominatesSave)
+ Restore = MPDT->findNearestCommonDominator(Restore, Save);
+
+ // Fix (C).
+ if (Save && Restore &&
+ (MLI->getLoopFor(Save) || MLI->getLoopFor(Restore))) {
+ if (MLI->getLoopDepth(Save) > MLI->getLoopDepth(Restore)) {
+ // Push Save outside of this loop if the immediate dominator is different
+ // from the save block; if it is not, bail out.
+ Save = FindIDom<>(*Save, Save->predecessors(), *MDT);
+ if (!Save)
+ break;
+ } else {
+ // If the loop does not exit, there is no point in looking
+ // for a post-dominator outside the loop.
+ SmallVector<MachineBasicBlock*, 4> ExitBlocks;
+ MLI->getLoopFor(Restore)->getExitingBlocks(ExitBlocks);
+ // Push Restore outside of this loop.
+ // Look for the immediate post-dominator of the loop exits.
+ MachineBasicBlock *IPdom = Restore;
+ for (MachineBasicBlock *LoopExitBB: ExitBlocks) {
+ IPdom = FindIDom<>(*IPdom, LoopExitBB->successors(), *MPDT);
+ if (!IPdom)
+ break;
+ }
+ // If the immediate post-dominator is not in a less nested loop,
+ // then we are stuck in a program with an infinite loop.
+ // In that case, we will not find a safe point, hence, bail out.
+ if (IPdom && MLI->getLoopDepth(IPdom) < MLI->getLoopDepth(Restore))
+ Restore = IPdom;
+ else {
+ Restore = nullptr;
+ break;
+ }
+ }
+ }
+ }
+}
+
+/// Check whether the edge (\p SrcBB, \p DestBB) is a backedge according to MLI.
+/// I.e., check whether there exists a loop that contains SrcBB and where DestBB is the
+/// loop header.
+static bool isProperBackedge(const MachineLoopInfo &MLI,
+ const MachineBasicBlock *SrcBB,
+ const MachineBasicBlock *DestBB) {
+ for (const MachineLoop *Loop = MLI.getLoopFor(SrcBB); Loop;
+ Loop = Loop->getParentLoop()) {
+ if (Loop->getHeader() == DestBB)
+ return true;
+ }
+ return false;
+}
+
+/// Check if the CFG of \p MF is irreducible.
+static bool isIrreducibleCFG(const MachineFunction &MF,
+ const MachineLoopInfo &MLI) {
+ const MachineBasicBlock *Entry = &*MF.begin();
+ ReversePostOrderTraversal<const MachineBasicBlock *> RPOT(Entry);
+ BitVector VisitedBB(MF.getNumBlockIDs());
+ for (const MachineBasicBlock *MBB : RPOT) {
+ VisitedBB.set(MBB->getNumber());
+ for (const MachineBasicBlock *SuccBB : MBB->successors()) {
+ if (!VisitedBB.test(SuccBB->getNumber()))
+ continue;
+ // We already visited SuccBB, thus MBB->SuccBB must be a backedge.
+ // Check that the header matches what we have in the loop information.
+ // Otherwise, we have an irreducible graph.
+ if (!isProperBackedge(MLI, MBB, SuccBB))
+ return true;
+ }
+ }
+ return false;
+}
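+// As a hypothetical example of what the check above rejects: if the entry
+// block branches to both B and C, and B and C branch to each other, the edge
+// from the later-visited block back to the earlier one is a backedge in the
+// RPO sense, but MachineLoopInfo has no natural loop with that block as
+// header, so isProperBackedge() fails and the CFG is reported as irreducible.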
+
+bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.empty() || !isShrinkWrapEnabled(MF))
+ return false;
+
+ DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+ init(MF);
+
+ if (isIrreducibleCFG(MF, *MLI)) {
+ // If MF is irreducible, a block may be in a loop without
+ // MachineLoopInfo reporting it. I.e., we may use the
+ // post-dominance property in loops, which leads to incorrect
+ // results. Moreover, we may miss that the prologue and
+ // epilogue are not in the same loop, leading to unbalanced
+ // construction/deconstruction of the stack frame.
+ DEBUG(dbgs() << "Irreducible CFGs are not supported yet\n");
+ return false;
+ }
+
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ std::unique_ptr<RegScavenger> RS(
+ TRI->requiresRegisterScavenging(MF) ? new RegScavenger() : nullptr);
+
+ for (MachineBasicBlock &MBB : MF) {
+ DEBUG(dbgs() << "Look into: " << MBB.getNumber() << ' ' << MBB.getName()
+ << '\n');
+
+ if (MBB.isEHFuncletEntry()) {
+ DEBUG(dbgs() << "EH Funclets are not supported yet.\n");
+ return false;
+ }
+
+ for (const MachineInstr &MI : MBB) {
+ if (!useOrDefCSROrFI(MI, RS.get()))
+ continue;
+ // Save (resp. restore) point must dominate (resp. post dominate)
+ // MI. Look for the proper basic block for those.
+ updateSaveRestorePoints(MBB, RS.get());
+ // If we are at a point where we cannot improve the placement of
+ // save/restore instructions, just give up.
+ if (!ArePointsInteresting()) {
+ DEBUG(dbgs() << "No Shrink wrap candidate found\n");
+ return false;
+ }
+ // No need to look for other instructions, this basic block
+ // will already be part of the handled region.
+ break;
+ }
+ }
+ if (!ArePointsInteresting()) {
+ // If the points are not interesting at this point, then they must be null
+ // because it means we did not encounter any frame/CSR related code.
+ // Otherwise, we would have returned from the previous loop.
+ assert(!Save && !Restore && "We miss a shrink-wrap opportunity?!");
+ DEBUG(dbgs() << "Nothing to shrink-wrap\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "\n ** Results **\nFrequency of the Entry: " << EntryFreq
+ << '\n');
+
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ do {
+ DEBUG(dbgs() << "Shrink wrap candidates (#, Name, Freq):\nSave: "
+ << Save->getNumber() << ' ' << Save->getName() << ' '
+ << MBFI->getBlockFreq(Save).getFrequency() << "\nRestore: "
+ << Restore->getNumber() << ' ' << Restore->getName() << ' '
+ << MBFI->getBlockFreq(Restore).getFrequency() << '\n');
+
+ bool IsSaveCheap, TargetCanUseSaveAsPrologue = false;
+ if (((IsSaveCheap = EntryFreq >= MBFI->getBlockFreq(Save).getFrequency()) &&
+ EntryFreq >= MBFI->getBlockFreq(Restore).getFrequency()) &&
+ ((TargetCanUseSaveAsPrologue = TFI->canUseAsPrologue(*Save)) &&
+ TFI->canUseAsEpilogue(*Restore)))
+ break;
+ DEBUG(dbgs() << "New points are too expensive or invalid for the target\n");
+ MachineBasicBlock *NewBB;
+ if (!IsSaveCheap || !TargetCanUseSaveAsPrologue) {
+ Save = FindIDom<>(*Save, Save->predecessors(), *MDT);
+ if (!Save)
+ break;
+ NewBB = Save;
+ } else {
+ // Restore is expensive.
+ Restore = FindIDom<>(*Restore, Restore->successors(), *MPDT);
+ if (!Restore)
+ break;
+ NewBB = Restore;
+ }
+ updateSaveRestorePoints(*NewBB, RS.get());
+ } while (Save && Restore);
+
+ if (!ArePointsInteresting()) {
+ ++NumCandidatesDropped;
+ return false;
+ }
+
+ DEBUG(dbgs() << "Final shrink wrap candidates:\nSave: " << Save->getNumber()
+ << ' ' << Save->getName() << "\nRestore: "
+ << Restore->getNumber() << ' ' << Restore->getName() << '\n');
+
+ MachineFrameInfo *MFI = MF.getFrameInfo();
+ MFI->setSavePoint(Save);
+ MFI->setRestorePoint(Restore);
+ ++NumCandidates;
+ return false;
+}
+
+bool ShrinkWrap::isShrinkWrapEnabled(const MachineFunction &MF) {
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+
+ switch (EnableShrinkWrapOpt) {
+ case cl::BOU_UNSET:
+ return TFI->enableShrinkWrapping(MF) &&
+ // Windows with CFI has some limitations that make it impossible
+ // to use shrink-wrapping.
+ !MF.getTarget().getMCAsmInfo()->usesWindowsCFI() &&
+ // Sanitizers look at the value of the stack at the location
+ // of the crash. Since a crash can happen anywhere, the
+ // frame must be lowered before anything else happens for the
+ // sanitizers to be able to get a correct stack frame.
+ !(MF.getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
+ MF.getFunction()->hasFnAttribute(Attribute::SanitizeThread) ||
+ MF.getFunction()->hasFnAttribute(Attribute::SanitizeMemory));
+ // If EnableShrinkWrap is set, it takes precedence over whatever the
+ // target sets. The rationale is that we assume we want to test
+ // something related to shrink-wrapping.
+ case cl::BOU_TRUE:
+ return true;
+ case cl::BOU_FALSE:
+ return false;
+ }
+ llvm_unreachable("Invalid shrink-wrapping state");
+}
diff --git a/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
new file mode 100644
index 000000000000..ce01c5f23e57
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -0,0 +1,480 @@
+//===- SjLjEHPrepare.cpp - Eliminate Invoke & Unwind instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation is designed for use by code generators which use SjLj
+// based exception handling.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "sjljehprepare"
+
+STATISTIC(NumInvokes, "Number of invokes replaced");
+STATISTIC(NumSpilled, "Number of registers live across unwind edges");
+
+namespace {
+class SjLjEHPrepare : public FunctionPass {
+ Type *doubleUnderDataTy;
+ Type *doubleUnderJBufTy;
+ Type *FunctionContextTy;
+ Constant *RegisterFn;
+ Constant *UnregisterFn;
+ Constant *BuiltinSetupDispatchFn;
+ Constant *FrameAddrFn;
+ Constant *StackAddrFn;
+ Constant *StackRestoreFn;
+ Constant *LSDAAddrFn;
+ Constant *CallSiteFn;
+ Constant *FuncCtxFn;
+ AllocaInst *FuncCtx;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ explicit SjLjEHPrepare() : FunctionPass(ID) {}
+ bool doInitialization(Module &M) override;
+ bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {}
+ const char *getPassName() const override {
+ return "SJLJ Exception Handling preparation";
+ }
+
+private:
+ bool setupEntryBlockAndCallSites(Function &F);
+ void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
+ Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
+ void lowerIncomingArguments(Function &F);
+ void lowerAcrossUnwindEdges(Function &F, ArrayRef<InvokeInst *> Invokes);
+ void insertCallSiteStore(Instruction *I, int Number);
+};
+} // end anonymous namespace
+
+char SjLjEHPrepare::ID = 0;
+INITIALIZE_PASS(SjLjEHPrepare, "sjljehprepare", "Prepare SjLj exceptions",
+ false, false)
+
+// Public Interface To the SjLjEHPrepare pass.
+FunctionPass *llvm::createSjLjEHPreparePass() { return new SjLjEHPrepare(); }
+// doInitialization - Set up declarations and types needed to process
+// exceptions.
+bool SjLjEHPrepare::doInitialization(Module &M) {
+ // Build the function context structure.
+ // builtin_setjmp uses a five word jbuf
+ Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
+ Type *Int32Ty = Type::getInt32Ty(M.getContext());
+ doubleUnderDataTy = ArrayType::get(Int32Ty, 4);
+ doubleUnderJBufTy = ArrayType::get(VoidPtrTy, 5);
+ FunctionContextTy = StructType::get(VoidPtrTy, // __prev
+ Int32Ty, // call_site
+ doubleUnderDataTy, // __data
+ VoidPtrTy, // __personality
+ VoidPtrTy, // __lsda
+ doubleUnderJBufTy, // __jbuf
+ nullptr);
+
+ return true;
+}
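+
+// For reference, the FunctionContextTy built above corresponds roughly to the
+// following C struct (illustrative only; the field names are informal):
+//   struct SjLjFunctionContext {
+//     void *__prev;        // chain of registered contexts
+//     int   call_site;     // currently active call site number
+//     int   __data[4];     // exception value and selector land here
+//     void *__personality; // personality routine
+//     void *__lsda;        // language-specific data area
+//     void *__jbuf[5];     // builtin_setjmp buffer (fp, sp, ...)
+//   };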
+
+/// insertCallSiteStore - Insert a store of the call-site value to the
+/// function context
+void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
+ IRBuilder<> Builder(I);
+
+ // Get a reference to the call_site field.
+ Type *Int32Ty = Type::getInt32Ty(I->getContext());
+ Value *Zero = ConstantInt::get(Int32Ty, 0);
+ Value *One = ConstantInt::get(Int32Ty, 1);
+ Value *Idxs[2] = { Zero, One };
+ Value *CallSite =
+ Builder.CreateGEP(FunctionContextTy, FuncCtx, Idxs, "call_site");
+
+ // Insert a store of the call-site number
+ ConstantInt *CallSiteNoC =
+ ConstantInt::get(Type::getInt32Ty(I->getContext()), Number);
+ Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
+}
+
+/// MarkBlocksLiveIn - Insert BB and all of its predecessors into LiveBBs until
+/// we reach blocks we've already seen.
+static void MarkBlocksLiveIn(BasicBlock *BB,
+ SmallPtrSetImpl<BasicBlock *> &LiveBBs) {
+ if (!LiveBBs.insert(BB).second)
+ return; // already been here.
+
+ for (BasicBlock *PredBB : predecessors(BB))
+ MarkBlocksLiveIn(PredBB, LiveBBs);
+}
+
+/// substituteLPadValues - Substitute the values returned by the landingpad
+/// instruction with those returned by the personality function.
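+/// For example (illustrative IR), a user such as
+///   %exn = extractvalue { i8*, i32 } %lpad, 0
+/// is rewritten to use \p ExnVal directly, and an extractvalue of index 1 is
+/// rewritten to use \p SelVal.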
+void SjLjEHPrepare::substituteLPadValues(LandingPadInst *LPI, Value *ExnVal,
+ Value *SelVal) {
+ SmallVector<Value *, 8> UseWorkList(LPI->user_begin(), LPI->user_end());
+ while (!UseWorkList.empty()) {
+ Value *Val = UseWorkList.pop_back_val();
+ auto *EVI = dyn_cast<ExtractValueInst>(Val);
+ if (!EVI)
+ continue;
+ if (EVI->getNumIndices() != 1)
+ continue;
+ if (*EVI->idx_begin() == 0)
+ EVI->replaceAllUsesWith(ExnVal);
+ else if (*EVI->idx_begin() == 1)
+ EVI->replaceAllUsesWith(SelVal);
+ if (EVI->use_empty())
+ EVI->eraseFromParent();
+ }
+
+ if (LPI->use_empty())
+ return;
+
+ // There are still some uses of LPI. Construct an aggregate with the exception
+ // values and replace the LPI with that aggregate.
+ Type *LPadType = LPI->getType();
+ Value *LPadVal = UndefValue::get(LPadType);
+ auto *SelI = cast<Instruction>(SelVal);
+ IRBuilder<> Builder(SelI->getParent(), std::next(SelI->getIterator()));
+ LPadVal = Builder.CreateInsertValue(LPadVal, ExnVal, 0, "lpad.val");
+ LPadVal = Builder.CreateInsertValue(LPadVal, SelVal, 1, "lpad.val");
+
+ LPI->replaceAllUsesWith(LPadVal);
+}
+
+/// setupFunctionContext - Allocate the function context on the stack and fill
+/// it with all of the data that we know at this point.
+Value *SjLjEHPrepare::setupFunctionContext(Function &F,
+ ArrayRef<LandingPadInst *> LPads) {
+ BasicBlock *EntryBB = &F.front();
+
+ // Create an alloca for the incoming jump buffer ptr and the new jump buffer
+ // that needs to be restored on all exits from the function. This is an alloca
+ // because the value needs to be added to the global context list.
+ auto &DL = F.getParent()->getDataLayout();
+ unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy);
+ FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
+ &EntryBB->front());
+
+ // Fill in the function context structure.
+ for (LandingPadInst *LPI : LPads) {
+ IRBuilder<> Builder(LPI->getParent(),
+ LPI->getParent()->getFirstInsertionPt());
+
+ // Reference the __data field.
+ Value *FCData =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 2, "__data");
+
+ // The exception values come back in context->__data[0].
+ Value *ExceptionAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
+ 0, 0, "exception_gep");
+ Value *ExnVal = Builder.CreateLoad(ExceptionAddr, true, "exn_val");
+ ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
+
+ Value *SelectorAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
+ 0, 1, "exn_selector_gep");
+ Value *SelVal = Builder.CreateLoad(SelectorAddr, true, "exn_selector_val");
+
+ substituteLPadValues(LPI, ExnVal, SelVal);
+ }
+
+ // Personality function
+ IRBuilder<> Builder(EntryBB->getTerminator());
+ Value *PersonalityFn = F.getPersonalityFn();
+ Value *PersonalityFieldPtr = Builder.CreateConstGEP2_32(
+ FunctionContextTy, FuncCtx, 0, 3, "pers_fn_gep");
+ Builder.CreateStore(
+ Builder.CreateBitCast(PersonalityFn, Builder.getInt8PtrTy()),
+ PersonalityFieldPtr, /*isVolatile=*/true);
+
+ // LSDA address
+ Value *LSDA = Builder.CreateCall(LSDAAddrFn, {}, "lsda_addr");
+ Value *LSDAFieldPtr =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 4, "lsda_gep");
+ Builder.CreateStore(LSDA, LSDAFieldPtr, /*isVolatile=*/true);
+
+ return FuncCtx;
+}
+
+/// lowerIncomingArguments - To avoid having to handle incoming arguments
+/// specially, we lower each arg to a copy instruction in the entry block. This
+/// ensures that the argument value itself cannot be live out of the entry
+/// block.
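+/// For example (illustrative IR), an i32 argument %x is copied with
+///   %x.tmp = select i1 true, i32 %x, i32 undef
+/// and every other use of %x is then rewired to %x.tmp.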
+void SjLjEHPrepare::lowerIncomingArguments(Function &F) {
+ BasicBlock::iterator AfterAllocaInsPt = F.begin()->begin();
+ while (isa<AllocaInst>(AfterAllocaInsPt) &&
+ cast<AllocaInst>(AfterAllocaInsPt)->isStaticAlloca())
+ ++AfterAllocaInsPt;
+ assert(AfterAllocaInsPt != F.front().end());
+
+ for (auto &AI : F.args()) {
+ Type *Ty = AI.getType();
+
+ // Use 'select i8 true, %arg, undef' to simulate a 'no-op' instruction.
+ Value *TrueValue = ConstantInt::getTrue(F.getContext());
+ Value *UndefValue = UndefValue::get(Ty);
+ Instruction *SI = SelectInst::Create(
+ TrueValue, &AI, UndefValue, AI.getName() + ".tmp", &*AfterAllocaInsPt);
+ AI.replaceAllUsesWith(SI);
+
+ // Reset the operand, because it was clobbered by the RAUW above.
+ SI->setOperand(1, &AI);
+ }
+}
+
+/// lowerAcrossUnwindEdges - Find all variables which are alive across an unwind
+/// edge and spill them.
+void SjLjEHPrepare::lowerAcrossUnwindEdges(Function &F,
+ ArrayRef<InvokeInst *> Invokes) {
+ // Finally, scan the code looking for instructions with bad live ranges.
+ for (BasicBlock &BB : F) {
+ for (Instruction &Inst : BB) {
+ // Ignore obvious cases we don't have to handle. In particular, most
+ // instructions either have no uses or only have a single use inside the
+ // current block. Ignore them quickly.
+ if (Inst.use_empty())
+ continue;
+ if (Inst.hasOneUse() &&
+ cast<Instruction>(Inst.user_back())->getParent() == &BB &&
+ !isa<PHINode>(Inst.user_back()))
+ continue;
+
+ // If this is an alloca in the entry block, it's not a real register
+ // value.
+ if (auto *AI = dyn_cast<AllocaInst>(&Inst))
+ if (AI->isStaticAlloca())
+ continue;
+
+ // Avoid iterator invalidation by copying users to a temporary vector.
+ SmallVector<Instruction *, 16> Users;
+ for (User *U : Inst.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ if (UI->getParent() != &BB || isa<PHINode>(UI))
+ Users.push_back(UI);
+ }
+
+ // Find all of the blocks that this value is live in.
+ SmallPtrSet<BasicBlock *, 32> LiveBBs;
+ LiveBBs.insert(&BB);
+ while (!Users.empty()) {
+ Instruction *U = Users.pop_back_val();
+
+ if (!isa<PHINode>(U)) {
+ MarkBlocksLiveIn(U->getParent(), LiveBBs);
+ } else {
+ // Uses for a PHI node occur in their predecessor block.
+ PHINode *PN = cast<PHINode>(U);
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == &Inst)
+ MarkBlocksLiveIn(PN->getIncomingBlock(i), LiveBBs);
+ }
+ }
+
+ // Now that we know all of the blocks that this thing is live in, see if
+ // it includes any of the unwind locations.
+ bool NeedsSpill = false;
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
+ if (UnwindBlock != &BB && LiveBBs.count(UnwindBlock)) {
+ DEBUG(dbgs() << "SJLJ Spill: " << Inst << " around "
+ << UnwindBlock->getName() << "\n");
+ NeedsSpill = true;
+ break;
+ }
+ }
+
+ // If we decided we need a spill, do it.
+ // FIXME: Spilling this way is overkill, as it forces all uses of
+ // the value to be reloaded from the stack slot, even those that aren't
+ // in the unwind blocks. We should be more selective.
+ if (NeedsSpill) {
+ DemoteRegToStack(Inst, true);
+ ++NumSpilled;
+ }
+ }
+ }
+
+ // Go through the landing pads and remove any PHIs there.
+ for (InvokeInst *Invoke : Invokes) {
+ BasicBlock *UnwindBlock = Invoke->getUnwindDest();
+ LandingPadInst *LPI = UnwindBlock->getLandingPadInst();
+
+ // Place PHIs into a set to avoid invalidating the iterator.
+ SmallPtrSet<PHINode *, 8> PHIsToDemote;
+ for (BasicBlock::iterator PN = UnwindBlock->begin(); isa<PHINode>(PN); ++PN)
+ PHIsToDemote.insert(cast<PHINode>(PN));
+ if (PHIsToDemote.empty())
+ continue;
+
+ // Demote the PHIs to the stack.
+ for (PHINode *PN : PHIsToDemote)
+ DemotePHIToStack(PN);
+
+ // Move the landingpad instruction back to the top of the landing pad block.
+ LPI->moveBefore(&UnwindBlock->front());
+ }
+}
+
+/// setupEntryBlockAndCallSites - Set up the entry block by creating and filling
+/// the function context and marking the call sites with the appropriate
+/// values. These values are used by the DWARF EH emitter.
+bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
+ SmallVector<ReturnInst *, 16> Returns;
+ SmallVector<InvokeInst *, 16> Invokes;
+ SmallSetVector<LandingPadInst *, 16> LPads;
+
+ // Look through the terminators of the basic blocks to find invokes.
+ for (BasicBlock &BB : F)
+ if (auto *II = dyn_cast<InvokeInst>(BB.getTerminator())) {
+ if (Function *Callee = II->getCalledFunction())
+ if (Callee->getIntrinsicID() == Intrinsic::donothing) {
+ // Remove the NOP invoke.
+ BranchInst::Create(II->getNormalDest(), II);
+ II->eraseFromParent();
+ continue;
+ }
+
+ Invokes.push_back(II);
+ LPads.insert(II->getUnwindDest()->getLandingPadInst());
+ } else if (auto *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
+ Returns.push_back(RI);
+ }
+
+ if (Invokes.empty())
+ return false;
+
+ NumInvokes += Invokes.size();
+
+ lowerIncomingArguments(F);
+ lowerAcrossUnwindEdges(F, Invokes);
+
+ Value *FuncCtx =
+ setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end()));
+ BasicBlock *EntryBB = &F.front();
+ IRBuilder<> Builder(EntryBB->getTerminator());
+
+ // Get a reference to the jump buffer.
+ Value *JBufPtr =
+ Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 5, "jbuf_gep");
+
+ // Save the frame pointer.
+ Value *FramePtr = Builder.CreateConstGEP2_32(doubleUnderJBufTy, JBufPtr, 0, 0,
+ "jbuf_fp_gep");
+
+ Value *Val = Builder.CreateCall(FrameAddrFn, Builder.getInt32(0), "fp");
+ Builder.CreateStore(Val, FramePtr, /*isVolatile=*/true);
+
+ // Save the stack pointer.
+ Value *StackPtr = Builder.CreateConstGEP2_32(doubleUnderJBufTy, JBufPtr, 0, 2,
+ "jbuf_sp_gep");
+
+ Val = Builder.CreateCall(StackAddrFn, {}, "sp");
+ Builder.CreateStore(Val, StackPtr, /*isVolatile=*/true);
+
+ // Call the setup_dispatch intrinsic. It fills in the rest of the jmpbuf.
+ Builder.CreateCall(BuiltinSetupDispatchFn, {});
+
+ // Store a pointer to the function context so that the back-end will know
+ // where to look for it.
+ Value *FuncCtxArg = Builder.CreateBitCast(FuncCtx, Builder.getInt8PtrTy());
+ Builder.CreateCall(FuncCtxFn, FuncCtxArg);
+
+ // At this point, we are all set up; update the invoke instructions to mark
+ // their call_site values.
+ for (unsigned I = 0, E = Invokes.size(); I != E; ++I) {
+ insertCallSiteStore(Invokes[I], I + 1);
+
+ ConstantInt *CallSiteNum =
+ ConstantInt::get(Type::getInt32Ty(F.getContext()), I + 1);
+
+ // Record the call site value for the back end so it stays associated with
+ // the invoke.
+ CallInst::Create(CallSiteFn, CallSiteNum, "", Invokes[I]);
+ }
+
+ // Mark call instructions that aren't nounwind as no-action (call_site ==
+ // -1). Skip the entry block: before that point, no function context has been
+ // created for this function, so any unexpected exceptions thrown will go
+ // directly to the caller's context, which is what we want anyway; there is
+ // no need to do anything here.
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
+ continue;
+ for (Instruction &I : BB)
+ if (I.mayThrow())
+ insertCallSiteStore(&I, -1);
+ }
+
+ // Register the function context and make sure it's known not to throw.
+ CallInst *Register =
+ CallInst::Create(RegisterFn, FuncCtx, "", EntryBB->getTerminator());
+ Register->setDoesNotThrow();
+
+ // Following any allocas not in the entry block, update the saved SP in the
+ // jmpbuf to the new value.
+ for (BasicBlock &BB : F) {
+ if (&BB == &F.front())
+ continue;
+ for (Instruction &I : BB) {
+ if (auto *CI = dyn_cast<CallInst>(&I)) {
+ if (CI->getCalledFunction() != StackRestoreFn)
+ continue;
+ } else if (!isa<AllocaInst>(&I)) {
+ continue;
+ }
+ Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
+ StackAddr->insertAfter(&I);
+ Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true);
+ StoreStackAddr->insertAfter(StackAddr);
+ }
+ }
+
+ // Finally, for any returns from this function, if this function contains an
+ // invoke, add a call to unregister the function context.
+ for (ReturnInst *Return : Returns)
+ CallInst::Create(UnregisterFn, FuncCtx, "", Return);
+
+ return true;
+}
+
+bool SjLjEHPrepare::runOnFunction(Function &F) {
+ Module &M = *F.getParent();
+ RegisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), nullptr);
+ UnregisterFn = M.getOrInsertFunction(
+ "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
+ PointerType::getUnqual(FunctionContextTy), nullptr);
+ FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
+ StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
+ StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
+ BuiltinSetupDispatchFn =
+ Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setup_dispatch);
+ LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda);
+ CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite);
+ FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
+
+ bool Res = setupEntryBlockAndCallSites(F);
+ return Res;
+}
diff --git a/contrib/llvm/lib/CodeGen/SlotIndexes.cpp b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
new file mode 100644
index 000000000000..dba103e9bfb1
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -0,0 +1,245 @@
+//===-- SlotIndexes.cpp - Slot Indexes Pass ------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "slotindexes"
+
+char SlotIndexes::ID = 0;
+INITIALIZE_PASS(SlotIndexes, "slotindexes",
+ "Slot index numbering", false, false)
+
+STATISTIC(NumLocalRenum, "Number of local renumberings");
+STATISTIC(NumGlobalRenum, "Number of global renumberings");
+
+void SlotIndexes::getAnalysisUsage(AnalysisUsage &au) const {
+ au.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(au);
+}
+
+void SlotIndexes::releaseMemory() {
+ mi2iMap.clear();
+ MBBRanges.clear();
+ idx2MBBMap.clear();
+ indexList.clear();
+ ileAllocator.Reset();
+}
+
+bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
+
+ // Compute numbering as follows:
+ // Grab an iterator to the start of the index list.
+ // Iterate over all MBBs, and within each MBB all MIs, keeping the MI
+ // iterator in lock-step (though skipping it over indexes which have
+ // null pointers in the instruction field).
+ // At each iteration assert that the instruction pointed to in the index
+ // is the same one pointed to by the MI iterator.
+
+ // FIXME: This can be simplified. The mi2iMap_, Idx2MBBMap, etc. should
+ // only need to be set up once after the first numbering is computed.
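+ //
+ // To illustrate the numbering (assuming the default spacing of
+ // SlotIndex::InstrDist, 16 at the time of writing), a block with two real
+ // instructions is numbered as:
+ //   0   block start entry (Slot_Block)
+ //   16  first instruction
+ //   32  second instruction
+ //   48  blank entry separating this block from the next one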
+
+ mf = &fn;
+
+ // Check that the index list is empty.
+ assert(indexList.empty() && "Index list non-empty at initial numbering?");
+ assert(idx2MBBMap.empty() &&
+ "Index -> MBB mapping non-empty at initial numbering?");
+ assert(MBBRanges.empty() &&
+ "MBB -> Index mapping non-empty at initial numbering?");
+ assert(mi2iMap.empty() &&
+ "MachineInstr -> Index mapping non-empty at initial numbering?");
+
+ unsigned index = 0;
+ MBBRanges.resize(mf->getNumBlockIDs());
+ idx2MBBMap.reserve(mf->size());
+
+ indexList.push_back(createEntry(nullptr, index));
+
+ // Iterate over the function.
+ for (MachineBasicBlock &MBB : *mf) {
+ // Insert an index for the MBB start.
+ SlotIndex blockStartIndex(&indexList.back(), SlotIndex::Slot_Block);
+
+ for (MachineInstr &MI : MBB) {
+ if (MI.isDebugValue())
+ continue;
+
+ // Insert a store index for the instr.
+ indexList.push_back(createEntry(&MI, index += SlotIndex::InstrDist));
+
+ // Save this base index in the maps.
+ mi2iMap.insert(std::make_pair(
+ &MI, SlotIndex(&indexList.back(), SlotIndex::Slot_Block)));
+ }
+
+ // We insert one blank instruction between basic blocks.
+ indexList.push_back(createEntry(nullptr, index += SlotIndex::InstrDist));
+
+ MBBRanges[MBB.getNumber()].first = blockStartIndex;
+ MBBRanges[MBB.getNumber()].second = SlotIndex(&indexList.back(),
+ SlotIndex::Slot_Block);
+ idx2MBBMap.push_back(IdxMBBPair(blockStartIndex, &MBB));
+ }
+
+ // Sort the Idx2MBBMap
+ std::sort(idx2MBBMap.begin(), idx2MBBMap.end(), Idx2MBBCompare());
+
+ DEBUG(mf->print(dbgs(), this));
+
+ // And we're done!
+ return false;
+}
+
+void SlotIndexes::renumberIndexes() {
+ // Renumber updates the index of every element of the index list.
+ DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
+ ++NumGlobalRenum;
+
+ unsigned index = 0;
+
+ for (IndexList::iterator I = indexList.begin(), E = indexList.end();
+ I != E; ++I) {
+ I->setIndex(index);
+ index += SlotIndex::InstrDist;
+ }
+}
+
+// Renumber indexes locally after curItr was inserted, but failed to get a new
+// index.
+void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
+ // Number indexes with half the default spacing so we can catch up quickly.
+ const unsigned Space = SlotIndex::InstrDist/2;
+ static_assert((Space & 3) == 0, "InstrDist must be a multiple of 2*NUM");
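+ // E.g., assuming InstrDist == 16 (so Space == 8): if the entry before curItr
+ // has index 100, the newly inserted entries get 108, 116, ... until the next
+ // pre-existing index is larger and we have caught up.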
+
+ IndexList::iterator startItr = std::prev(curItr);
+ unsigned index = startItr->getIndex();
+ do {
+ curItr->setIndex(index += Space);
+ ++curItr;
+ // If the next index is bigger, we have caught up.
+ } while (curItr != indexList.end() && curItr->getIndex() <= index);
+
+ DEBUG(dbgs() << "\n*** Renumbered SlotIndexes " << startItr->getIndex() << '-'
+ << index << " ***\n");
+ ++NumLocalRenum;
+}
+
+// Repair indexes after adding and removing instructions.
+void SlotIndexes::repairIndexesInRange(MachineBasicBlock *MBB,
+ MachineBasicBlock::iterator Begin,
+ MachineBasicBlock::iterator End) {
+ // FIXME: Is this really necessary? The only caller repairIntervalsForRange()
+ // does the same thing.
+ // Find anchor points, which are at the beginning/end of blocks or at
+ // instructions that already have indexes.
+ while (Begin != MBB->begin() && !hasIndex(*Begin))
+ --Begin;
+ while (End != MBB->end() && !hasIndex(*End))
+ ++End;
+
+ bool includeStart = (Begin == MBB->begin());
+ SlotIndex startIdx;
+ if (includeStart)
+ startIdx = getMBBStartIdx(MBB);
+ else
+ startIdx = getInstructionIndex(*Begin);
+
+ SlotIndex endIdx;
+ if (End == MBB->end())
+ endIdx = getMBBEndIdx(MBB);
+ else
+ endIdx = getInstructionIndex(*End);
+
+ // FIXME: Conceptually, this code is implementing an iterator on MBB that
+ // optionally includes an additional position prior to MBB->begin(), indicated
+ // by the includeStart flag. This is done so that we can iterate MIs in a MBB
+ // in parallel with SlotIndexes, but there should be a better way to do this.
+ IndexList::iterator ListB = startIdx.listEntry()->getIterator();
+ IndexList::iterator ListI = endIdx.listEntry()->getIterator();
+ MachineBasicBlock::iterator MBBI = End;
+ bool pastStart = false;
+ while (ListI != ListB || MBBI != Begin || (includeStart && !pastStart)) {
+ assert(ListI->getIndex() >= startIdx.getIndex() &&
+ (includeStart || !pastStart) &&
+ "Decremented past the beginning of region to repair.");
+
+ MachineInstr *SlotMI = ListI->getInstr();
+ MachineInstr *MI = (MBBI != MBB->end() && !pastStart) ? &*MBBI : nullptr;
+ bool MBBIAtBegin = MBBI == Begin && (!includeStart || pastStart);
+
+ if (SlotMI == MI && !MBBIAtBegin) {
+ --ListI;
+ if (MBBI != Begin)
+ --MBBI;
+ else
+ pastStart = true;
+ } else if (MI && mi2iMap.find(MI) == mi2iMap.end()) {
+ if (MBBI != Begin)
+ --MBBI;
+ else
+ pastStart = true;
+ } else {
+ --ListI;
+ if (SlotMI)
+ removeMachineInstrFromMaps(*SlotMI);
+ }
+ }
+
+ // In theory this could be combined with the previous loop, but it is tricky
+ // to update the IndexList while we are iterating it.
+ for (MachineBasicBlock::iterator I = End; I != Begin;) {
+ --I;
+ MachineInstr &MI = *I;
+ if (!MI.isDebugValue() && mi2iMap.find(&MI) == mi2iMap.end())
+ insertMachineInstrInMaps(MI);
+ }
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SlotIndexes::dump() const {
+ for (IndexList::const_iterator itr = indexList.begin();
+ itr != indexList.end(); ++itr) {
+ dbgs() << itr->getIndex() << " ";
+
+ if (itr->getInstr()) {
+ dbgs() << *itr->getInstr();
+ } else {
+ dbgs() << "\n";
+ }
+ }
+
+ for (unsigned i = 0, e = MBBRanges.size(); i != e; ++i)
+ dbgs() << "BB#" << i << "\t[" << MBBRanges[i].first << ';'
+ << MBBRanges[i].second << ")\n";
+}
+#endif
+
+// Print a SlotIndex to a raw_ostream.
+void SlotIndex::print(raw_ostream &os) const {
+ if (isValid())
+ os << listEntry()->getIndex() << "Berd"[getSlot()];
+ else
+ os << "invalid";
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+// Dump a SlotIndex to stderr.
+LLVM_DUMP_METHOD void SlotIndex::dump() const {
+ print(dbgs());
+ dbgs() << "\n";
+}
+#endif
+
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.cpp b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
new file mode 100644
index 000000000000..f10c98ef4e50
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.cpp
@@ -0,0 +1,375 @@
+//===-- SpillPlacement.cpp - Optimal Spill Code Placement -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the spill code placement analysis.
+//
+// Each edge bundle corresponds to a node in a Hopfield network. Constraints on
+// basic blocks are weighted by the block frequency and added to become the node
+// bias.
+//
+// Transparent basic blocks have the variable live through, but don't care if it
+// is spilled or in a register. These blocks become connections in the Hopfield
+// network, again weighted by block frequency.
+//
+// The Hopfield network minimizes (possibly locally) its energy function:
+//
+// E = -sum_n V_n * ( B_n + sum_{n, m linked by b} V_m * F_b )
+//
+// The energy function represents the expected spill code execution frequency,
+// or the cost of spilling. This is a Lyapunov function which never increases
+// when a node is updated. It is guaranteed to converge to a local minimum.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SpillPlacement.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "spillplacement"
+
+char SpillPlacement::ID = 0;
+INITIALIZE_PASS_BEGIN(SpillPlacement, "spill-code-placement",
+ "Spill Code Placement Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(EdgeBundles)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(SpillPlacement, "spill-code-placement",
+ "Spill Code Placement Analysis", true, true)
+
+char &llvm::SpillPlacementID = SpillPlacement::ID;
+
+void SpillPlacement::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addRequiredTransitive<EdgeBundles>();
+ AU.addRequiredTransitive<MachineLoopInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Node - Each edge bundle corresponds to a Hopfield node.
+///
+/// The node contains precomputed frequency data that only depends on the CFG,
+/// but Bias and Links are computed each time placeSpills is called.
+///
+/// The node Value is positive when the variable should be in a register. The
+/// value can change when linked nodes change, but convergence is very fast
+/// because all weights are positive.
+///
+struct SpillPlacement::Node {
+ /// BiasN - Sum of blocks that prefer a spill.
+ BlockFrequency BiasN;
+ /// BiasP - Sum of blocks that prefer a register.
+ BlockFrequency BiasP;
+
+ /// Value - Output value of this node computed from the Bias and links.
+ /// This is always one of the values {-1, 0, 1}. A positive number means the
+ /// variable should go in a register through this bundle.
+ int Value;
+
+ typedef SmallVector<std::pair<BlockFrequency, unsigned>, 4> LinkVector;
+
+ /// Links - (Weight, BundleNo) for all transparent blocks connecting to other
+ /// bundles. The weights are all positive block frequencies.
+ LinkVector Links;
+
+ /// SumLinkWeights - Cached sum of the weights of all links + Threshold.
+ BlockFrequency SumLinkWeights;
+
+ /// preferReg - Return true when this node prefers to be in a register.
+ bool preferReg() const {
+ // Undecided nodes (Value==0) go on the stack.
+ return Value > 0;
+ }
+
+ /// mustSpill - Return True if this node is so biased that it must spill.
+ bool mustSpill() const {
+ // We must spill if Bias < -sum(weights) or the MustSpill flag was set.
+ // BiasN is saturated when MustSpill is set; make sure this still returns
+ // true when the RHS saturates. Note that SumLinkWeights includes Threshold.
+ return BiasN >= BiasP + SumLinkWeights;
+ }
+
+ /// clear - Reset per-query data, but preserve frequencies that only depend on
+ /// the CFG.
+ void clear(const BlockFrequency &Threshold) {
+ BiasN = BiasP = Value = 0;
+ SumLinkWeights = Threshold;
+ Links.clear();
+ }
+
+ /// addLink - Add a link to bundle b with weight w.
+ void addLink(unsigned b, BlockFrequency w) {
+ // Update cached sum.
+ SumLinkWeights += w;
+
+ // There can be multiple links to the same bundle, add them up.
+ for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I)
+ if (I->second == b) {
+ I->first += w;
+ return;
+ }
+ // This must be the first link to b.
+ Links.push_back(std::make_pair(w, b));
+ }
+
+ /// addBias - Bias this node.
+ void addBias(BlockFrequency freq, BorderConstraint direction) {
+ switch (direction) {
+ default:
+ break;
+ case PrefReg:
+ BiasP += freq;
+ break;
+ case PrefSpill:
+ BiasN += freq;
+ break;
+ case MustSpill:
+ BiasN = BlockFrequency::getMaxFrequency();
+ break;
+ }
+ }
+
+ /// update - Recompute Value from Bias and Links. Return true when node
+ /// preference changes.
+ bool update(const Node nodes[], const BlockFrequency &Threshold) {
+ // Compute the weighted sum of inputs.
+ BlockFrequency SumN = BiasN;
+ BlockFrequency SumP = BiasP;
+ for (LinkVector::iterator I = Links.begin(), E = Links.end(); I != E; ++I) {
+ if (nodes[I->second].Value == -1)
+ SumN += I->first;
+ else if (nodes[I->second].Value == 1)
+ SumP += I->first;
+ }
+
+ // Each weighted sum is going to be less than the total frequency of the
+ // bundle. Ideally, we should simply set Value = sign(SumP - SumN), but we
+ // will add a dead zone around 0 for two reasons:
+ //
+ // 1. It avoids arbitrary bias when all links are 0 as is possible during
+ // initial iterations.
+ // 2. It helps tame rounding errors when the links nominally sum to 0.
+ //
+ bool Before = preferReg();
+ if (SumN >= SumP + Threshold)
+ Value = -1;
+ else if (SumP >= SumN + Threshold)
+ Value = 1;
+ else
+ Value = 0;
+ return Before != preferReg();
+ }
+
+ void getDissentingNeighbors(SparseSet<unsigned> &List,
+ const Node nodes[]) const {
+ for (const auto &Elt : Links) {
+ unsigned n = Elt.second;
+ // Neighbors that already have the same value are not going to
+ // change because of this node changing.
+ if (Value != nodes[n].Value)
+ List.insert(n);
+ }
+ }
+};
+
+bool SpillPlacement::runOnMachineFunction(MachineFunction &mf) {
+ MF = &mf;
+ bundles = &getAnalysis<EdgeBundles>();
+ loops = &getAnalysis<MachineLoopInfo>();
+
+ assert(!nodes && "Leaking node array");
+ nodes = new Node[bundles->getNumBundles()];
+ TodoList.clear();
+ TodoList.setUniverse(bundles->getNumBundles());
+
+ // Compute total ingoing and outgoing block frequencies for all bundles.
+ BlockFrequencies.resize(mf.getNumBlockIDs());
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+ setThreshold(MBFI->getEntryFreq());
+ for (auto &I : mf) {
+ unsigned Num = I.getNumber();
+ BlockFrequencies[Num] = MBFI->getBlockFreq(&I);
+ }
+
+ // We never change the function.
+ return false;
+}
+
+void SpillPlacement::releaseMemory() {
+ delete[] nodes;
+ nodes = nullptr;
+ TodoList.clear();
+}
+
+/// activate - mark node n as active if it wasn't already.
+void SpillPlacement::activate(unsigned n) {
+ TodoList.insert(n);
+ if (ActiveNodes->test(n))
+ return;
+ ActiveNodes->set(n);
+ nodes[n].clear(Threshold);
+
+ // Very large bundles usually come from big switches, indirect branches,
+ // landing pads, or loops with many 'continue' statements. It is difficult to
+ // allocate registers when so many different blocks are involved.
+ //
+ // Give a small negative bias to large bundles such that a substantial
+ // fraction of the connected blocks need to be interested before we consider
+ // expanding the region through the bundle. This helps compile time by
+ // limiting the number of blocks visited and the number of links in the
+ // Hopfield network.
+ if (bundles->getBlocks(n).size() > 100) {
+ nodes[n].BiasP = 0;
+ nodes[n].BiasN = (MBFI->getEntryFreq() / 16);
+ }
+}
+
+/// \brief Set the threshold for a given entry frequency.
+///
+/// Set the threshold relative to \c Entry. Since the threshold is used as a
+/// bound on the open interval (-Threshold;Threshold), 1 is the minimum
+/// threshold.
+void SpillPlacement::setThreshold(const BlockFrequency &Entry) {
+ // Apparently 2 is a good threshold when Entry==2^14, but we need to scale
+ // it. Divide by 2^13, rounding as appropriate.
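+ // For example, when Entry == 2^14: Freq >> 13 == 2 and the rounding bit
+ // (Freq & (1 << 12)) is clear, so Threshold becomes 2, matching the
+ // reference value mentioned above.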
+ uint64_t Freq = Entry.getFrequency();
+ uint64_t Scaled = (Freq >> 13) + bool(Freq & (1 << 12));
+ Threshold = std::max(UINT64_C(1), Scaled);
+}
+
+/// addConstraints - Compute node biases and weights from a set of constraints.
+/// Set a bit in NodeMask for each active node.
+void SpillPlacement::addConstraints(ArrayRef<BlockConstraint> LiveBlocks) {
+ for (ArrayRef<BlockConstraint>::iterator I = LiveBlocks.begin(),
+ E = LiveBlocks.end(); I != E; ++I) {
+ BlockFrequency Freq = BlockFrequencies[I->Number];
+
+ // Live-in to block?
+ if (I->Entry != DontCare) {
+ unsigned ib = bundles->getBundle(I->Number, 0);
+ activate(ib);
+ nodes[ib].addBias(Freq, I->Entry);
+ }
+
+ // Live-out from block?
+ if (I->Exit != DontCare) {
+ unsigned ob = bundles->getBundle(I->Number, 1);
+ activate(ob);
+ nodes[ob].addBias(Freq, I->Exit);
+ }
+ }
+}
+
+/// addPrefSpill - Same as addConstraints(PrefSpill)
+void SpillPlacement::addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong) {
+ for (ArrayRef<unsigned>::iterator I = Blocks.begin(), E = Blocks.end();
+ I != E; ++I) {
+ BlockFrequency Freq = BlockFrequencies[*I];
+ if (Strong)
+ Freq += Freq;
+ unsigned ib = bundles->getBundle(*I, 0);
+ unsigned ob = bundles->getBundle(*I, 1);
+ activate(ib);
+ activate(ob);
+ nodes[ib].addBias(Freq, PrefSpill);
+ nodes[ob].addBias(Freq, PrefSpill);
+ }
+}
+
+void SpillPlacement::addLinks(ArrayRef<unsigned> Links) {
+ for (ArrayRef<unsigned>::iterator I = Links.begin(), E = Links.end(); I != E;
+ ++I) {
+ unsigned Number = *I;
+ unsigned ib = bundles->getBundle(Number, 0);
+ unsigned ob = bundles->getBundle(Number, 1);
+
+ // Ignore self-loops.
+ if (ib == ob)
+ continue;
+ activate(ib);
+ activate(ob);
+ BlockFrequency Freq = BlockFrequencies[Number];
+ nodes[ib].addLink(ob, Freq);
+ nodes[ob].addLink(ib, Freq);
+ }
+}
+
+bool SpillPlacement::scanActiveBundles() {
+ RecentPositive.clear();
+ for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n)) {
+ update(n);
+ // A node that must spill, or a node without any links is not going to
+ // change its value ever again, so exclude it from iterations.
+ if (nodes[n].mustSpill())
+ continue;
+ if (nodes[n].preferReg())
+ RecentPositive.push_back(n);
+ }
+ return !RecentPositive.empty();
+}
+
+bool SpillPlacement::update(unsigned n) {
+ if (!nodes[n].update(nodes, Threshold))
+ return false;
+ nodes[n].getDissentingNeighbors(TodoList, nodes);
+ return true;
+}
+
+/// iterate - Repeatedly update the Hopfield nodes until stability or the
+/// maximum number of iterations is reached.
+void SpillPlacement::iterate() {
+ // We do not need to push those nodes into the todolist.
+ // They have already been processed as part of the previous iteration.
+ RecentPositive.clear();
+
+ // Since the last iteration, the todolist has been augmented by calls
+ // to addConstraints, addLinks, and co.
+ // Update the network energy starting at this new frontier.
+ // The call to ::update will add the nodes that changed into the todolist.
+ unsigned Limit = bundles->getNumBundles() * 10;
+ while(Limit-- > 0 && !TodoList.empty()) {
+ unsigned n = TodoList.pop_back_val();
+ if (!update(n))
+ continue;
+ if (nodes[n].preferReg())
+ RecentPositive.push_back(n);
+ }
+}
+
+void SpillPlacement::prepare(BitVector &RegBundles) {
+ RecentPositive.clear();
+ TodoList.clear();
+ // Reuse RegBundles as our ActiveNodes vector.
+ ActiveNodes = &RegBundles;
+ ActiveNodes->clear();
+ ActiveNodes->resize(bundles->getNumBundles());
+}
+
+bool
+SpillPlacement::finish() {
+ assert(ActiveNodes && "Call prepare() first");
+
+ // Write preferences back to ActiveNodes.
+ bool Perfect = true;
+ for (int n = ActiveNodes->find_first(); n>=0; n = ActiveNodes->find_next(n))
+ if (!nodes[n].preferReg()) {
+ ActiveNodes->reset(n);
+ Perfect = false;
+ }
+ ActiveNodes = nullptr;
+ return Perfect;
+}
diff --git a/contrib/llvm/lib/CodeGen/SpillPlacement.h b/contrib/llvm/lib/CodeGen/SpillPlacement.h
new file mode 100644
index 000000000000..9b9ecccf9049
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SpillPlacement.h
@@ -0,0 +1,170 @@
+//===-- SpillPlacement.h - Optimal Spill Code Placement --------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This analysis computes the optimal spill code placement between basic blocks.
+//
+// The runOnMachineFunction() method only precomputes some profiling information
+// about the CFG. The real work is done by prepare(), addConstraints(), and
+// finish() which are called by the register allocator.
+//
+// Given a variable that is live across multiple basic blocks, and given
+// constraints on the basic blocks where the variable is live, determine which
+// edge bundles should have the variable in a register and which edge bundles
+// should have the variable in a stack slot.
+//
+// The returned bit vector can be used to place optimal spill code at basic
+// block entries and exits. Spill code placement inside a basic block is not
+// considered.
+//
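+// A typical use by a register allocator looks roughly like this (illustrative
+// sketch only; SP, LiveBlocks, and TransparentBlocks are placeholder names):
+//
+//   BitVector RegBundles;
+//   SP.prepare(RegBundles);
+//   SP.addConstraints(LiveBlocks);    // per-block entry/exit constraints
+//   SP.addLinks(TransparentBlocks);   // blocks that are live-through
+//   SP.scanActiveBundles();
+//   SP.iterate();
+//   bool Perfect = SP.finish();       // RegBundles now holds the answer
+//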
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
+#define LLVM_LIB_CODEGEN_SPILLPLACEMENT_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/BlockFrequency.h"
+
+namespace llvm {
+
+class BitVector;
+class EdgeBundles;
+class MachineBasicBlock;
+class MachineLoopInfo;
+class MachineBlockFrequencyInfo;
+
+class SpillPlacement : public MachineFunctionPass {
+ struct Node;
+ const MachineFunction *MF;
+ const EdgeBundles *bundles;
+ const MachineLoopInfo *loops;
+ const MachineBlockFrequencyInfo *MBFI;
+ Node *nodes;
+
+ // Nodes that are active in the current computation. Owned by the prepare()
+ // caller.
+ BitVector *ActiveNodes;
+
+ // Nodes with active links. Populated by scanActiveBundles.
+ SmallVector<unsigned, 8> Linked;
+
+ // Nodes that went positive during the last call to scanActiveBundles or
+ // iterate.
+ SmallVector<unsigned, 8> RecentPositive;
+
+ // Block frequencies are computed once. Indexed by block number.
+ SmallVector<BlockFrequency, 8> BlockFrequencies;
+
+ /// Decision threshold. A node gets the output value 0 if the weighted sum of
+ /// its inputs falls in the open interval (-Threshold;Threshold).
+ BlockFrequency Threshold;
+
+ /// List of nodes that need to be updated in ::iterate.
+ SparseSet<unsigned> TodoList;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ SpillPlacement() : MachineFunctionPass(ID), nodes(nullptr) {}
+ ~SpillPlacement() override { releaseMemory(); }
+
+ /// BorderConstraint - A basic block has separate constraints for entry and
+ /// exit.
+ enum BorderConstraint {
+ DontCare, ///< Block doesn't care / variable not live.
+ PrefReg, ///< Block entry/exit prefers a register.
+ PrefSpill, ///< Block entry/exit prefers a stack slot.
+ PrefBoth, ///< Block entry prefers both register and stack.
+ MustSpill ///< A register is impossible, variable must be spilled.
+ };
+
+ /// BlockConstraint - Entry and exit constraints for a basic block.
+ struct BlockConstraint {
+ unsigned Number; ///< Basic block number (from MBB::getNumber()).
+ BorderConstraint Entry : 8; ///< Constraint on block entry.
+ BorderConstraint Exit : 8; ///< Constraint on block exit.
+
+ /// True when this block changes the value of the live range. This means
+ /// the block has a non-PHI def. When this is false, a live-in value on
+ /// the stack can be live-out on the stack without inserting a spill.
+ bool ChangesValue;
+ };
+
+ /// prepare - Reset state and prepare for a new spill placement computation.
+ /// @param RegBundles Bit vector to receive the edge bundles where the
+ /// variable should be kept in a register. Each bit
+ /// corresponds to an edge bundle, a set bit means the
+ /// variable should be kept in a register through the
+ /// bundle. A clear bit means the variable should be
+ /// spilled. This vector is retained.
+ void prepare(BitVector &RegBundles);
+
+ /// addConstraints - Add constraints and biases. This method may be called
+ /// more than once to accumulate constraints.
+ /// @param LiveBlocks Constraints for blocks that have the variable live in or
+ /// live out.
+ void addConstraints(ArrayRef<BlockConstraint> LiveBlocks);
+
+ /// addPrefSpill - Add PrefSpill constraints to all blocks listed. This is
+ /// equivalent to calling addConstraint with identical BlockConstraints with
+ /// Entry = Exit = PrefSpill, and ChangesValue = false.
+ ///
+ /// @param Blocks Array of block numbers that prefer to spill in and out.
+ /// @param Strong When true, double the negative bias for these blocks.
+ void addPrefSpill(ArrayRef<unsigned> Blocks, bool Strong);
+
+ /// addLinks - Add transparent blocks with the given numbers.
+ void addLinks(ArrayRef<unsigned> Links);
+
+ /// scanActiveBundles - Perform an initial scan of all bundles activated by
+ /// addConstraints and addLinks, updating their state. Add all the bundles
+ /// that now prefer a register to RecentPositive.
+ /// Prepare internal data structures for iterate.
+ /// Return true if there are any positive nodes.
+ bool scanActiveBundles();
+
+ /// iterate - Update the network iteratively until convergence, or new bundles
+ /// are found.
+ void iterate();
+
+ /// getRecentPositive - Return an array of bundles that became positive during
+ /// the previous call to scanActiveBundles or iterate.
+ ArrayRef<unsigned> getRecentPositive() { return RecentPositive; }
+
+ /// finish - Compute the optimal spill code placement given the
+ /// constraints. No MustSpill constraints will be violated, and the smallest
+ /// possible number of PrefX constraints will be violated, weighted by
+ /// expected execution frequencies.
+ /// The selected bundles are returned in the bitvector passed to prepare().
+ /// @return True if a perfect solution was found, allowing the variable to be
+ /// in a register through all relevant bundles.
+ bool finish();
+
+ /// getBlockFrequency - Return the estimated block execution frequency per
+ /// function invocation.
+ BlockFrequency getBlockFrequency(unsigned Number) const {
+ return BlockFrequencies[Number];
+ }
+
+private:
+ bool runOnMachineFunction(MachineFunction&) override;
+ void getAnalysisUsage(AnalysisUsage&) const override;
+ void releaseMemory() override;
+
+ void activate(unsigned);
+ void setThreshold(const BlockFrequency &Entry);
+
+ bool update(unsigned);
+};
+
+} // end namespace llvm
+
+#endif
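
The intended calling sequence for the solver above is prepare(), one or more addConstraints()/addPrefSpill()/addLinks() calls, scanActiveBundles(), iterate() until no nodes remain positive, and finally finish(). A minimal driver sketch follows; it is not part of the imported file, and the names SpillPlacer, Bundles, SplitConstraints and TransparentBlocks stand in for objects the calling register allocator already owns (a real caller would typically also grow its candidate region from getRecentPositive() before iterating again).

static bool placeSpills(SpillPlacement &SpillPlacer, const EdgeBundles &Bundles,
                        ArrayRef<SpillPlacement::BlockConstraint> SplitConstraints,
                        ArrayRef<unsigned> TransparentBlocks,
                        BitVector &RegBundles) {
  RegBundles.resize(Bundles.getNumBundles());
  SpillPlacer.prepare(RegBundles);              // RegBundles receives the result.
  SpillPlacer.addConstraints(SplitConstraints); // Biases from blocks with uses.
  SpillPlacer.addLinks(TransparentBlocks);      // Live-through blocks.
  SpillPlacer.scanActiveBundles();
  while (!SpillPlacer.getRecentPositive().empty())
    SpillPlacer.iterate();                      // Update the network until it settles.
  return SpillPlacer.finish();                  // True if a perfect (all-register) solution was found.
}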
diff --git a/contrib/llvm/lib/CodeGen/Spiller.h b/contrib/llvm/lib/CodeGen/Spiller.h
new file mode 100644
index 000000000000..61ee508c8394
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/Spiller.h
@@ -0,0 +1,42 @@
+//===-- llvm/CodeGen/Spiller.h - Spiller -*- C++ -*------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SPILLER_H
+#define LLVM_LIB_CODEGEN_SPILLER_H
+
+namespace llvm {
+
+ class LiveRangeEdit;
+ class MachineFunction;
+ class MachineFunctionPass;
+ class VirtRegMap;
+ class LiveIntervals;
+
+ /// Spiller interface.
+ ///
+ /// Implementations are utility classes which insert spill or remat code on
+ /// demand.
+ class Spiller {
+ virtual void anchor();
+ public:
+ virtual ~Spiller() = 0;
+
+ /// spill - Spill the LRE.getParent() live interval.
+ virtual void spill(LiveRangeEdit &LRE) = 0;
+ virtual void postOptimization(){};
+ };
+
+ /// Create and return a spiller that will insert spill code directly instead
+ /// of deferring through VirtRegMap.
+ Spiller *createInlineSpiller(MachineFunctionPass &pass,
+ MachineFunction &mf,
+ VirtRegMap &vrm);
+}
+
+#endif
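
A typical consumer holds the spiller for the lifetime of the allocator pass and invokes it once per live range chosen for spilling. A short usage sketch, not part of the imported file; Pass, MF, VRM and LRE are whatever the calling allocator already has in hand.

#include <memory>

std::unique_ptr<Spiller> SpillerInstance(createInlineSpiller(Pass, MF, VRM));
// Later, once a LiveRangeEdit 'LRE' has been set up for the victim register:
SpillerInstance->spill(LRE);         // Insert spill or remat code for LRE.getParent().
SpillerInstance->postOptimization(); // Optional post-spilling cleanup hook.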
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.cpp b/contrib/llvm/lib/CodeGen/SplitKit.cpp
new file mode 100644
index 000000000000..07be24b18dd5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SplitKit.cpp
@@ -0,0 +1,1525 @@
+//===---------- SplitKit.cpp - Toolkit for splitting live ranges ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SplitKit.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRangeEdit.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumFinished, "Number of splits finished");
+STATISTIC(NumSimple, "Number of splits that were simple");
+STATISTIC(NumCopies, "Number of copies inserted for splitting");
+STATISTIC(NumRemats, "Number of rematerialized defs for splitting");
+STATISTIC(NumRepairs, "Number of invalid live ranges repaired");
+
+//===----------------------------------------------------------------------===//
+// Last Insert Point Analysis
+//===----------------------------------------------------------------------===//
+
+InsertPointAnalysis::InsertPointAnalysis(const LiveIntervals &lis,
+ unsigned BBNum)
+ : LIS(lis), LastInsertPoint(BBNum) {}
+
+SlotIndex
+InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
+ SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);
+
+ SmallVector<const MachineBasicBlock *, 1> EHPadSucessors;
+ for (const MachineBasicBlock *SMBB : MBB.successors())
+ if (SMBB->isEHPad())
+ EHPadSucessors.push_back(SMBB);
+
+ // Compute insert points on the first call. The pair is independent of the
+ // current live interval.
+ if (!LIP.first.isValid()) {
+ MachineBasicBlock::const_iterator FirstTerm = MBB.getFirstTerminator();
+ if (FirstTerm == MBB.end())
+ LIP.first = MBBEnd;
+ else
+ LIP.first = LIS.getInstructionIndex(*FirstTerm);
+
+ // If there is a landing pad successor, also find the call instruction.
+ if (EHPadSucessors.empty())
+ return LIP.first;
+ // There may not be a call instruction, in which case we ignore the landing pad.
+ LIP.second = LIP.first;
+ for (MachineBasicBlock::const_iterator I = MBB.end(), E = MBB.begin();
+ I != E;) {
+ --I;
+ if (I->isCall()) {
+ LIP.second = LIS.getInstructionIndex(*I);
+ break;
+ }
+ }
+ }
+
+ // If CurLI is live into a landing pad successor, move the last insert point
+ // back to the call that may throw.
+ if (!LIP.second)
+ return LIP.first;
+
+ if (none_of(EHPadSucessors, [&](const MachineBasicBlock *EHPad) {
+ return LIS.isLiveInToMBB(CurLI, EHPad);
+ }))
+ return LIP.first;
+
+ // Find the value leaving MBB.
+ const VNInfo *VNI = CurLI.getVNInfoBefore(MBBEnd);
+ if (!VNI)
+ return LIP.first;
+
+ // If the value leaving MBB was defined after the call in MBB, it can't
+ // really be live-in to the landing pad. This can happen if the landing pad
+ // has a PHI, and this register is undef on the exceptional edge.
+ // <rdar://problem/10664933>
+ if (!SlotIndex::isEarlierInstr(VNI->def, LIP.second) && VNI->def < MBBEnd)
+ return LIP.first;
+
+ // Value is properly live-in to the landing pad.
+ // Only allow inserts before the call.
+ return LIP.second;
+}
+
+MachineBasicBlock::iterator
+InsertPointAnalysis::getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB) {
+ SlotIndex LIP = getLastInsertPoint(CurLI, MBB);
+ if (LIP == LIS.getMBBEndIdx(&MBB))
+ return MBB.end();
+ return LIS.getInstructionFromIndex(LIP);
+}
+
+//===----------------------------------------------------------------------===//
+// Split Analysis
+//===----------------------------------------------------------------------===//
+
+SplitAnalysis::SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+ const MachineLoopInfo &mli)
+ : MF(vrm.getMachineFunction()), VRM(vrm), LIS(lis), Loops(mli),
+ TII(*MF.getSubtarget().getInstrInfo()), CurLI(nullptr),
+ IPA(lis, MF.getNumBlockIDs()) {}
+
+void SplitAnalysis::clear() {
+ UseSlots.clear();
+ UseBlocks.clear();
+ ThroughBlocks.clear();
+ CurLI = nullptr;
+ DidRepairRange = false;
+}
+
+/// analyzeUses - Count instructions, basic blocks, and loops using CurLI.
+void SplitAnalysis::analyzeUses() {
+ assert(UseSlots.empty() && "Call clear first");
+
+ // First get all the defs from the interval values. This provides the correct
+ // slots for early clobbers.
+ for (const VNInfo *VNI : CurLI->valnos)
+ if (!VNI->isPHIDef() && !VNI->isUnused())
+ UseSlots.push_back(VNI->def);
+
+ // Get use slots from the use-def chain.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (MachineOperand &MO : MRI.use_nodbg_operands(CurLI->reg))
+ if (!MO.isUndef())
+ UseSlots.push_back(LIS.getInstructionIndex(*MO.getParent()).getRegSlot());
+
+ array_pod_sort(UseSlots.begin(), UseSlots.end());
+
+ // Remove duplicates, keeping the smaller slot for each instruction.
+ // That is what we want for early clobbers.
+ UseSlots.erase(std::unique(UseSlots.begin(), UseSlots.end(),
+ SlotIndex::isSameInstr),
+ UseSlots.end());
+
+ // Compute per-live block info.
+ if (!calcLiveBlockInfo()) {
+ // FIXME: calcLiveBlockInfo found inconsistencies in the live range.
+ // I am looking at you, RegisterCoalescer!
+ DidRepairRange = true;
+ ++NumRepairs;
+ DEBUG(dbgs() << "*** Fixing inconsistent live interval! ***\n");
+ const_cast<LiveIntervals&>(LIS)
+ .shrinkToUses(const_cast<LiveInterval*>(CurLI));
+ UseBlocks.clear();
+ ThroughBlocks.clear();
+ bool fixed = calcLiveBlockInfo();
+ (void)fixed;
+ assert(fixed && "Couldn't fix broken live interval");
+ }
+
+ DEBUG(dbgs() << "Analyze counted "
+ << UseSlots.size() << " instrs in "
+ << UseBlocks.size() << " blocks, through "
+ << NumThroughBlocks << " blocks.\n");
+}
+
+/// calcLiveBlockInfo - Fill the LiveBlocks array with information about blocks
+/// where CurLI is live.
+bool SplitAnalysis::calcLiveBlockInfo() {
+ ThroughBlocks.resize(MF.getNumBlockIDs());
+ NumThroughBlocks = NumGapBlocks = 0;
+ if (CurLI->empty())
+ return true;
+
+ LiveInterval::const_iterator LVI = CurLI->begin();
+ LiveInterval::const_iterator LVE = CurLI->end();
+
+ SmallVectorImpl<SlotIndex>::const_iterator UseI, UseE;
+ UseI = UseSlots.begin();
+ UseE = UseSlots.end();
+
+ // Loop over basic blocks where CurLI is live.
+ MachineFunction::iterator MFI =
+ LIS.getMBBFromIndex(LVI->start)->getIterator();
+ for (;;) {
+ BlockInfo BI;
+ BI.MBB = &*MFI;
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ // If the block contains no uses, the range must be live through. At one
+ // point, RegisterCoalescer could create dangling ranges that ended
+ // mid-block.
+ if (UseI == UseE || *UseI >= Stop) {
+ ++NumThroughBlocks;
+ ThroughBlocks.set(BI.MBB->getNumber());
+ // The range shouldn't end mid-block if there are no uses. This shouldn't
+ // happen.
+ if (LVI->end < Stop)
+ return false;
+ } else {
+ // This block has uses. Find the first and last uses in the block.
+ BI.FirstInstr = *UseI;
+ assert(BI.FirstInstr >= Start);
+ do ++UseI;
+ while (UseI != UseE && *UseI < Stop);
+ BI.LastInstr = UseI[-1];
+ assert(BI.LastInstr < Stop);
+
+ // LVI is the first live segment overlapping MBB.
+ BI.LiveIn = LVI->start <= Start;
+
+ // When not live in, the first use should be a def.
+ if (!BI.LiveIn) {
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
+ assert(LVI->start == BI.FirstInstr && "First instr should be a def");
+ BI.FirstDef = BI.FirstInstr;
+ }
+
+ // Look for gaps in the live range.
+ BI.LiveOut = true;
+ while (LVI->end < Stop) {
+ SlotIndex LastStop = LVI->end;
+ if (++LVI == LVE || LVI->start >= Stop) {
+ BI.LiveOut = false;
+ BI.LastInstr = LastStop;
+ break;
+ }
+
+ if (LastStop < LVI->start) {
+ // There is a gap in the live range. Create duplicate entries for the
+ // live-in snippet and the live-out snippet.
+ ++NumGapBlocks;
+
+ // Push the Live-in part.
+ BI.LiveOut = false;
+ UseBlocks.push_back(BI);
+ UseBlocks.back().LastInstr = LastStop;
+
+ // Set up BI for the live-out part.
+ BI.LiveIn = false;
+ BI.LiveOut = true;
+ BI.FirstInstr = BI.FirstDef = LVI->start;
+ }
+
+ // A Segment that starts in the middle of the block must be a def.
+ assert(LVI->start == LVI->valno->def && "Dangling Segment start");
+ if (!BI.FirstDef)
+ BI.FirstDef = LVI->start;
+ }
+
+ UseBlocks.push_back(BI);
+
+ // LVI is now at LVE or LVI->end >= Stop.
+ if (LVI == LVE)
+ break;
+ }
+
+ // Live segment ends exactly at Stop. Move to the next segment.
+ if (LVI->end == Stop && ++LVI == LVE)
+ break;
+
+ // Pick the next basic block.
+ if (LVI->start < Stop)
+ ++MFI;
+ else
+ MFI = LIS.getMBBFromIndex(LVI->start)->getIterator();
+ }
+
+ assert(getNumLiveBlocks() == countLiveBlocks(CurLI) && "Bad block count");
+ return true;
+}
+
+unsigned SplitAnalysis::countLiveBlocks(const LiveInterval *cli) const {
+ if (cli->empty())
+ return 0;
+ LiveInterval *li = const_cast<LiveInterval*>(cli);
+ LiveInterval::iterator LVI = li->begin();
+ LiveInterval::iterator LVE = li->end();
+ unsigned Count = 0;
+
+ // Loop over basic blocks where li is live.
+ MachineFunction::const_iterator MFI =
+ LIS.getMBBFromIndex(LVI->start)->getIterator();
+ SlotIndex Stop = LIS.getMBBEndIdx(&*MFI);
+ for (;;) {
+ ++Count;
+ LVI = li->advanceTo(LVI, Stop);
+ if (LVI == LVE)
+ return Count;
+ do {
+ ++MFI;
+ Stop = LIS.getMBBEndIdx(&*MFI);
+ } while (Stop <= LVI->start);
+ }
+}
+
+bool SplitAnalysis::isOriginalEndpoint(SlotIndex Idx) const {
+ unsigned OrigReg = VRM.getOriginal(CurLI->reg);
+ const LiveInterval &Orig = LIS.getInterval(OrigReg);
+ assert(!Orig.empty() && "Splitting empty interval?");
+ LiveInterval::const_iterator I = Orig.find(Idx);
+
+ // Range containing Idx should begin at Idx.
+ if (I != Orig.end() && I->start <= Idx)
+ return I->start == Idx;
+
+ // Range does not contain Idx, previous must end at Idx.
+ return I != Orig.begin() && (--I)->end == Idx;
+}
+
+void SplitAnalysis::analyze(const LiveInterval *li) {
+ clear();
+ CurLI = li;
+ analyzeUses();
+}
+
+
+//===----------------------------------------------------------------------===//
+// Split Editor
+//===----------------------------------------------------------------------===//
+
+/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+SplitEditor::SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa,
+ LiveIntervals &lis, VirtRegMap &vrm,
+ MachineDominatorTree &mdt,
+ MachineBlockFrequencyInfo &mbfi)
+ : SA(sa), AA(aa), LIS(lis), VRM(vrm),
+ MRI(vrm.getMachineFunction().getRegInfo()), MDT(mdt),
+ TII(*vrm.getMachineFunction().getSubtarget().getInstrInfo()),
+ TRI(*vrm.getMachineFunction().getSubtarget().getRegisterInfo()),
+ MBFI(mbfi), Edit(nullptr), OpenIdx(0), SpillMode(SM_Partition),
+ RegAssign(Allocator) {}
+
+void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
+ Edit = &LRE;
+ SpillMode = SM;
+ OpenIdx = 0;
+ RegAssign.clear();
+ Values.clear();
+
+ // Reset the LiveRangeCalc instances needed for this spill mode.
+ LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+ if (SpillMode)
+ LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ &LIS.getVNInfoAllocator());
+
+ // We don't need an AliasAnalysis since we will only be performing
+ // cheap-as-a-copy remats anyway.
+ Edit->anyRematerializable(nullptr);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SplitEditor::dump() const {
+ if (RegAssign.empty()) {
+ dbgs() << " empty\n";
+ return;
+ }
+
+ for (RegAssignMap::const_iterator I = RegAssign.begin(); I.valid(); ++I)
+ dbgs() << " [" << I.start() << ';' << I.stop() << "):" << I.value();
+ dbgs() << '\n';
+}
+#endif
+
+VNInfo *SplitEditor::defValue(unsigned RegIdx,
+ const VNInfo *ParentVNI,
+ SlotIndex Idx) {
+ assert(ParentVNI && "Mapping NULL value");
+ assert(Idx.isValid() && "Invalid SlotIndex");
+ assert(Edit->getParent().getVNInfoAt(Idx) == ParentVNI && "Bad Parent VNI");
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+
+ // Create a new value.
+ VNInfo *VNI = LI->getNextValue(Idx, LIS.getVNInfoAllocator());
+
+ // Use insert for lookup, so we can add missing values with a second lookup.
+ std::pair<ValueMap::iterator, bool> InsP =
+ Values.insert(std::make_pair(std::make_pair(RegIdx, ParentVNI->id),
+ ValueForcePair(VNI, false)));
+
+ // This was the first time (RegIdx, ParentVNI) was mapped.
+ // Keep it as a simple def without any liveness.
+ if (InsP.second)
+ return VNI;
+
+ // If the previous value was a simple mapping, add liveness for it now.
+ if (VNInfo *OldVNI = InsP.first->second.getPointer()) {
+ SlotIndex Def = OldVNI->def;
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), OldVNI));
+ // No longer a simple mapping. Switch to a complex, non-forced mapping.
+ InsP.first->second = ValueForcePair();
+ }
+
+ // This is a complex mapping, add liveness for VNI
+ SlotIndex Def = VNI->def;
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
+
+ return VNI;
+}
+
+void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
+ assert(ParentVNI && "Mapping NULL value");
+ ValueForcePair &VFP = Values[std::make_pair(RegIdx, ParentVNI->id)];
+ VNInfo *VNI = VFP.getPointer();
+
+ // ParentVNI was either unmapped or already complex mapped. Either way, just
+ // set the force bit.
+ if (!VNI) {
+ VFP.setInt(true);
+ return;
+ }
+
+ // This was previously a single mapping. Make sure the old def is represented
+ // by a trivial live range.
+ SlotIndex Def = VNI->def;
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+ LI->addSegment(LiveInterval::Segment(Def, Def.getDeadSlot(), VNI));
+ // Mark as complex mapped, forced.
+ VFP = ValueForcePair(nullptr, true);
+}
+
+VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
+ VNInfo *ParentVNI,
+ SlotIndex UseIdx,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I) {
+ MachineInstr *CopyMI = nullptr;
+ SlotIndex Def;
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+
+ // We may be trying to avoid interference that ends at a deleted instruction,
+ // so always begin RegIdx 0 early and all others late.
+ bool Late = RegIdx != 0;
+
+ // Attempt cheap-as-a-copy rematerialization.
+ unsigned Original = VRM.getOriginal(Edit->get(RegIdx));
+ LiveInterval &OrigLI = LIS.getInterval(Original);
+ VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
+ LiveRangeEdit::Remat RM(ParentVNI);
+ RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
+
+ if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
+ Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+ ++NumRemats;
+ } else {
+ // Can't remat, just insert a copy from parent.
+ CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
+ .addReg(Edit->getReg());
+ Def = LIS.getSlotIndexes()
+ ->insertMachineInstrInMaps(*CopyMI, Late)
+ .getRegSlot();
+ ++NumCopies;
+ }
+
+ // Define the value in Reg.
+ return defValue(RegIdx, ParentVNI, Def);
+}
+
+/// Create a new virtual register and live interval.
+unsigned SplitEditor::openIntv() {
+ // Create the complement as index 0.
+ if (Edit->empty())
+ Edit->createEmptyInterval();
+
+ // Create the open interval.
+ OpenIdx = Edit->size();
+ Edit->createEmptyInterval();
+ return OpenIdx;
+}
+
+void SplitEditor::selectIntv(unsigned Idx) {
+ assert(Idx != 0 && "Cannot select the complement interval");
+ assert(Idx < Edit->size() && "Can only select previously opened interval");
+ DEBUG(dbgs() << " selectIntv " << OpenIdx << " -> " << Idx << '\n');
+ OpenIdx = Idx;
+}
+
+SlotIndex SplitEditor::enterIntvBefore(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before enterIntvBefore");
+ DEBUG(dbgs() << " enterIntvBefore " << Idx);
+ Idx = Idx.getBaseIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return Idx;
+ }
+ DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "enterIntvBefore called with invalid index");
+
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(), MI);
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAfter(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before enterIntvAfter");
+ DEBUG(dbgs() << " enterIntvAfter " << Idx);
+ Idx = Idx.getBoundaryIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return Idx;
+ }
+ DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "enterIntvAfter called with invalid index");
+
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Idx, *MI->getParent(),
+ std::next(MachineBasicBlock::iterator(MI)));
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::enterIntvAtEnd(MachineBasicBlock &MBB) {
+ assert(OpenIdx && "openIntv not called before enterIntvAtEnd");
+ SlotIndex End = LIS.getMBBEndIdx(&MBB);
+ SlotIndex Last = End.getPrevSlot();
+ DEBUG(dbgs() << " enterIntvAtEnd BB#" << MBB.getNumber() << ", " << Last);
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Last);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return End;
+ }
+ DEBUG(dbgs() << ": valno " << ParentVNI->id);
+ VNInfo *VNI = defFromParent(OpenIdx, ParentVNI, Last, MBB,
+ SA.getLastSplitPointIter(&MBB));
+ RegAssign.insert(VNI->def, End, OpenIdx);
+ DEBUG(dump());
+ return VNI->def;
+}
+
+/// useIntv - indicate that all instructions in MBB should use OpenLI.
+void SplitEditor::useIntv(const MachineBasicBlock &MBB) {
+ useIntv(LIS.getMBBStartIdx(&MBB), LIS.getMBBEndIdx(&MBB));
+}
+
+void SplitEditor::useIntv(SlotIndex Start, SlotIndex End) {
+ assert(OpenIdx && "openIntv not called before useIntv");
+ DEBUG(dbgs() << " useIntv [" << Start << ';' << End << "):");
+ RegAssign.insert(Start, End, OpenIdx);
+ DEBUG(dump());
+}
+
+SlotIndex SplitEditor::leaveIntvAfter(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before leaveIntvAfter");
+ DEBUG(dbgs() << " leaveIntvAfter " << Idx);
+
+ // The interval must be live beyond the instruction at Idx.
+ SlotIndex Boundary = Idx.getBoundaryIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Boundary);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return Boundary.getNextSlot();
+ }
+ DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+ MachineInstr *MI = LIS.getInstructionFromIndex(Boundary);
+ assert(MI && "No instruction at index");
+
+ // In spill mode, make live ranges as short as possible by inserting the copy
+ // before MI. This is only possible if that instruction doesn't redefine the
+ // value. The inserted COPY is not a kill, and we don't need to recompute
+ // the source live range. The spiller also won't try to hoist this copy.
+ if (SpillMode && !SlotIndex::isSameInstr(ParentVNI->def, Idx) &&
+ MI->readsVirtualRegister(Edit->getReg())) {
+ forceRecompute(0, ParentVNI);
+ defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+ return Idx;
+ }
+
+ VNInfo *VNI = defFromParent(0, ParentVNI, Boundary, *MI->getParent(),
+ std::next(MachineBasicBlock::iterator(MI)));
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvBefore(SlotIndex Idx) {
+ assert(OpenIdx && "openIntv not called before leaveIntvBefore");
+ DEBUG(dbgs() << " leaveIntvBefore " << Idx);
+
+ // The interval must be live into the instruction at Idx.
+ Idx = Idx.getBaseIndex();
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Idx);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return Idx.getNextSlot();
+ }
+ DEBUG(dbgs() << ": valno " << ParentVNI->id << '\n');
+
+ MachineInstr *MI = LIS.getInstructionFromIndex(Idx);
+ assert(MI && "No instruction at index");
+ VNInfo *VNI = defFromParent(0, ParentVNI, Idx, *MI->getParent(), MI);
+ return VNI->def;
+}
+
+SlotIndex SplitEditor::leaveIntvAtTop(MachineBasicBlock &MBB) {
+ assert(OpenIdx && "openIntv not called before leaveIntvAtTop");
+ SlotIndex Start = LIS.getMBBStartIdx(&MBB);
+ DEBUG(dbgs() << " leaveIntvAtTop BB#" << MBB.getNumber() << ", " << Start);
+
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+ if (!ParentVNI) {
+ DEBUG(dbgs() << ": not live\n");
+ return Start;
+ }
+
+ VNInfo *VNI = defFromParent(0, ParentVNI, Start, MBB,
+ MBB.SkipPHIsAndLabels(MBB.begin()));
+ RegAssign.insert(Start, VNI->def, OpenIdx);
+ DEBUG(dump());
+ return VNI->def;
+}
+
+void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
+ assert(OpenIdx && "openIntv not called before overlapIntv");
+ const VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(Start);
+ assert(ParentVNI == Edit->getParent().getVNInfoBefore(End) &&
+ "Parent changes value in extended range");
+ assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
+ "Range cannot span basic blocks");
+
+ // The complement interval will be extended as needed by LRCalc.extend().
+ if (ParentVNI)
+ forceRecompute(0, ParentVNI);
+ DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
+ RegAssign.insert(Start, End, OpenIdx);
+ DEBUG(dump());
+}
+
+//===----------------------------------------------------------------------===//
+// Spill modes
+//===----------------------------------------------------------------------===//
+
+void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) {
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ DEBUG(dbgs() << "Removing " << Copies.size() << " back-copies.\n");
+ RegAssignMap::iterator AssignI;
+ AssignI.setMap(RegAssign);
+
+ for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+ SlotIndex Def = Copies[i]->def;
+ MachineInstr *MI = LIS.getInstructionFromIndex(Def);
+ assert(MI && "No instruction for back-copy");
+
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineBasicBlock::iterator MBBI(MI);
+ bool AtBegin;
+ do AtBegin = MBBI == MBB->begin();
+ while (!AtBegin && (--MBBI)->isDebugValue());
+
+ DEBUG(dbgs() << "Removing " << Def << '\t' << *MI);
+ LIS.removeVRegDefAt(*LI, Def);
+ LIS.RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+
+ // Adjust RegAssign if a register assignment is killed at Def. We want to
+ // avoid calculating the live range of the source register if possible.
+ AssignI.find(Def.getPrevSlot());
+ if (!AssignI.valid() || AssignI.start() >= Def)
+ continue;
+ // If MI doesn't kill the assigned register, just leave it.
+ if (AssignI.stop() != Def)
+ continue;
+ unsigned RegIdx = AssignI.value();
+ if (AtBegin || !MBBI->readsVirtualRegister(Edit->getReg())) {
+ DEBUG(dbgs() << " cannot find simple kill of RegIdx " << RegIdx << '\n');
+ forceRecompute(RegIdx, Edit->getParent().getVNInfoAt(Def));
+ } else {
+ SlotIndex Kill = LIS.getInstructionIndex(*MBBI).getRegSlot();
+ DEBUG(dbgs() << " move kill to " << Kill << '\t' << *MBBI);
+ AssignI.setStop(Kill);
+ }
+ }
+}
+
+MachineBasicBlock*
+SplitEditor::findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB) {
+ if (MBB == DefMBB)
+ return MBB;
+ assert(MDT.dominates(DefMBB, MBB) && "MBB must be dominated by the def.");
+
+ const MachineLoopInfo &Loops = SA.Loops;
+ const MachineLoop *DefLoop = Loops.getLoopFor(DefMBB);
+ MachineDomTreeNode *DefDomNode = MDT[DefMBB];
+
+ // Best candidate so far.
+ MachineBasicBlock *BestMBB = MBB;
+ unsigned BestDepth = UINT_MAX;
+
+ for (;;) {
+ const MachineLoop *Loop = Loops.getLoopFor(MBB);
+
+ // MBB isn't in a loop, it doesn't get any better. All dominators have a
+ // higher frequency by definition.
+ if (!Loop) {
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " at depth 0\n");
+ return MBB;
+ }
+
+ // We'll never be able to exit the DefLoop.
+ if (Loop == DefLoop) {
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " in the same loop\n");
+ return MBB;
+ }
+
+ // Least busy dominator seen so far.
+ unsigned Depth = Loop->getLoopDepth();
+ if (Depth < BestDepth) {
+ BestMBB = MBB;
+ BestDepth = Depth;
+ DEBUG(dbgs() << "Def in BB#" << DefMBB->getNumber() << " dominates BB#"
+ << MBB->getNumber() << " at depth " << Depth << '\n');
+ }
+
+ // Leave loop by going to the immediate dominator of the loop header.
+ // This is a bigger stride than simply walking up the dominator tree.
+ MachineDomTreeNode *IDom = MDT[Loop->getHeader()]->getIDom();
+
+ // Too far up the dominator tree?
+ if (!IDom || !MDT.dominates(DefDomNode, IDom))
+ return BestMBB;
+
+ MBB = IDom->getBlock();
+ }
+}
+
+void SplitEditor::computeRedundantBackCopies(
+ DenseSet<unsigned> &NotToHoistSet, SmallVectorImpl<VNInfo *> &BackCopies) {
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ LiveInterval *Parent = &Edit->getParent();
+ SmallVector<SmallPtrSet<VNInfo *, 8>, 8> EqualVNs(Parent->getNumValNums());
+ SmallPtrSet<VNInfo *, 8> DominatedVNIs;
+
+ // Aggregate VNIs having the same value as ParentVNI.
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ EqualVNs[ParentVNI->id].insert(VNI);
+ }
+
+ // For VNI aggregation of each ParentVNI, collect dominated, i.e.,
+ // redundant VNIs to BackCopies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ if (!NotToHoistSet.count(ParentVNI->id))
+ continue;
+ SmallPtrSetIterator<VNInfo *> It1 = EqualVNs[ParentVNI->id].begin();
+ SmallPtrSetIterator<VNInfo *> It2 = It1;
+ for (; It1 != EqualVNs[ParentVNI->id].end(); ++It1) {
+ It2 = It1;
+ for (++It2; It2 != EqualVNs[ParentVNI->id].end(); ++It2) {
+ if (DominatedVNIs.count(*It1) || DominatedVNIs.count(*It2))
+ continue;
+
+ MachineBasicBlock *MBB1 = LIS.getMBBFromIndex((*It1)->def);
+ MachineBasicBlock *MBB2 = LIS.getMBBFromIndex((*It2)->def);
+ if (MBB1 == MBB2) {
+ DominatedVNIs.insert((*It1)->def < (*It2)->def ? (*It2) : (*It1));
+ } else if (MDT.dominates(MBB1, MBB2)) {
+ DominatedVNIs.insert(*It2);
+ } else if (MDT.dominates(MBB2, MBB1)) {
+ DominatedVNIs.insert(*It1);
+ }
+ }
+ }
+ if (!DominatedVNIs.empty()) {
+ forceRecompute(0, ParentVNI);
+ for (auto VNI : DominatedVNIs) {
+ BackCopies.push_back(VNI);
+ }
+ DominatedVNIs.clear();
+ }
+ }
+}
+
+/// For SM_Size mode, find a common dominator for all the back-copies for
+/// the same ParentVNI and hoist the back-copies to the dominator BB.
+/// For SM_Speed mode, if the common dominator is hot and it is not beneficial
+/// to do the hoisting, simply remove the dominated back-copies for the same
+/// ParentVNI.
+void SplitEditor::hoistCopies() {
+ // Get the complement interval, always RegIdx 0.
+ LiveInterval *LI = &LIS.getInterval(Edit->get(0));
+ LiveInterval *Parent = &Edit->getParent();
+
+ // Track the nearest common dominator for all back-copies for each ParentVNI,
+ // indexed by ParentVNI->id.
+ typedef std::pair<MachineBasicBlock*, SlotIndex> DomPair;
+ SmallVector<DomPair, 8> NearestDom(Parent->getNumValNums());
+ // The total cost of all the back-copies for each ParentVNI.
+ SmallVector<BlockFrequency, 8> Costs(Parent->getNumValNums());
+ // The ParentVNI->id set for which hoisting back-copies are not beneficial
+ // for Speed.
+ DenseSet<unsigned> NotToHoistSet;
+
+ // Find the nearest common dominator for parent values with multiple
+ // back-copies. If a single back-copy dominates, put it in DomPair.second.
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ assert(ParentVNI && "Parent not live at complement def");
+
+ // Don't hoist remats. The complement is probably going to disappear
+ // completely anyway.
+ if (Edit->didRematerialize(ParentVNI))
+ continue;
+
+ MachineBasicBlock *ValMBB = LIS.getMBBFromIndex(VNI->def);
+
+ DomPair &Dom = NearestDom[ParentVNI->id];
+
+ // Keep directly defined parent values. This is either a PHI or an
+ // instruction in the complement range. All other copies of ParentVNI
+ // should be eliminated.
+ if (VNI->def == ParentVNI->def) {
+ DEBUG(dbgs() << "Direct complement def at " << VNI->def << '\n');
+ Dom = DomPair(ValMBB, VNI->def);
+ continue;
+ }
+ // Skip the singly mapped values. There is nothing to gain from hoisting a
+ // single back-copy.
+ if (Values.lookup(std::make_pair(0, ParentVNI->id)).getPointer()) {
+ DEBUG(dbgs() << "Single complement def at " << VNI->def << '\n');
+ continue;
+ }
+
+ if (!Dom.first) {
+ // First time we see ParentVNI. VNI dominates itself.
+ Dom = DomPair(ValMBB, VNI->def);
+ } else if (Dom.first == ValMBB) {
+ // Two defs in the same block. Pick the earlier def.
+ if (!Dom.second.isValid() || VNI->def < Dom.second)
+ Dom.second = VNI->def;
+ } else {
+ // Different basic blocks. Check if one dominates.
+ MachineBasicBlock *Near =
+ MDT.findNearestCommonDominator(Dom.first, ValMBB);
+ if (Near == ValMBB)
+ // Def ValMBB dominates.
+ Dom = DomPair(ValMBB, VNI->def);
+ else if (Near != Dom.first)
+ // None dominate. Hoist to common dominator, need new def.
+ Dom = DomPair(Near, SlotIndex());
+ Costs[ParentVNI->id] += MBFI.getBlockFreq(ValMBB);
+ }
+
+ DEBUG(dbgs() << "Multi-mapped complement " << VNI->id << '@' << VNI->def
+ << " for parent " << ParentVNI->id << '@' << ParentVNI->def
+ << " hoist to BB#" << Dom.first->getNumber() << ' '
+ << Dom.second << '\n');
+ }
+
+ // Insert the hoisted copies.
+ for (unsigned i = 0, e = Parent->getNumValNums(); i != e; ++i) {
+ DomPair &Dom = NearestDom[i];
+ if (!Dom.first || Dom.second.isValid())
+ continue;
+ // This value needs a hoisted copy inserted at the end of Dom.first.
+ VNInfo *ParentVNI = Parent->getValNumInfo(i);
+ MachineBasicBlock *DefMBB = LIS.getMBBFromIndex(ParentVNI->def);
+ // Get a less loopy dominator than Dom.first.
+ Dom.first = findShallowDominator(Dom.first, DefMBB);
+ if (SpillMode == SM_Speed &&
+ MBFI.getBlockFreq(Dom.first) > Costs[ParentVNI->id]) {
+ NotToHoistSet.insert(ParentVNI->id);
+ continue;
+ }
+ SlotIndex Last = LIS.getMBBEndIdx(Dom.first).getPrevSlot();
+ Dom.second =
+ defFromParent(0, ParentVNI, Last, *Dom.first,
+ SA.getLastSplitPointIter(Dom.first))->def;
+ }
+
+ // Remove redundant back-copies that are now known to be dominated by another
+ // def with the same value.
+ SmallVector<VNInfo*, 8> BackCopies;
+ for (VNInfo *VNI : LI->valnos) {
+ if (VNI->isUnused())
+ continue;
+ VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def);
+ const DomPair &Dom = NearestDom[ParentVNI->id];
+ if (!Dom.first || Dom.second == VNI->def ||
+ NotToHoistSet.count(ParentVNI->id))
+ continue;
+ BackCopies.push_back(VNI);
+ forceRecompute(0, ParentVNI);
+ }
+
+ // If it is not beneficial to hoist all the BackCopies, simply remove
+ // redundant BackCopies in speed mode.
+ if (SpillMode == SM_Speed && !NotToHoistSet.empty())
+ computeRedundantBackCopies(NotToHoistSet, BackCopies);
+
+ removeBackCopies(BackCopies);
+}
+
+
+/// transferValues - Transfer all possible values to the new live ranges.
+/// Values that were rematerialized are left alone, they need LRCalc.extend().
+bool SplitEditor::transferValues() {
+ bool Skipped = false;
+ RegAssignMap::const_iterator AssignI = RegAssign.begin();
+ for (const LiveRange::Segment &S : Edit->getParent()) {
+ DEBUG(dbgs() << " blit " << S << ':');
+ VNInfo *ParentVNI = S.valno;
+ // RegAssign has holes where RegIdx 0 should be used.
+ SlotIndex Start = S.start;
+ AssignI.advanceTo(Start);
+ do {
+ unsigned RegIdx;
+ SlotIndex End = S.end;
+ if (!AssignI.valid()) {
+ RegIdx = 0;
+ } else if (AssignI.start() <= Start) {
+ RegIdx = AssignI.value();
+ if (AssignI.stop() < End) {
+ End = AssignI.stop();
+ ++AssignI;
+ }
+ } else {
+ RegIdx = 0;
+ End = std::min(End, AssignI.start());
+ }
+
+ // The interval [Start;End) is continuously mapped to RegIdx, ParentVNI.
+ DEBUG(dbgs() << " [" << Start << ';' << End << ")=" << RegIdx);
+ LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
+
+ // Check for a simply defined value that can be blitted directly.
+ ValueForcePair VFP = Values.lookup(std::make_pair(RegIdx, ParentVNI->id));
+ if (VNInfo *VNI = VFP.getPointer()) {
+ DEBUG(dbgs() << ':' << VNI->id);
+ LR.addSegment(LiveInterval::Segment(Start, End, VNI));
+ Start = End;
+ continue;
+ }
+
+ // Skip values with forced recomputation.
+ if (VFP.getInt()) {
+ DEBUG(dbgs() << "(recalc)");
+ Skipped = true;
+ Start = End;
+ continue;
+ }
+
+ LiveRangeCalc &LRC = getLRCalc(RegIdx);
+
+ // This value has multiple defs in RegIdx, but it wasn't rematerialized,
+ // so the live range is accurate. Add live-in blocks in [Start;End) to the
+ // LiveInBlocks.
+ MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
+ SlotIndex BlockStart, BlockEnd;
+ std::tie(BlockStart, BlockEnd) = LIS.getSlotIndexes()->getMBBRange(&*MBB);
+
+ // The first block may be live-in, or it may have its own def.
+ if (Start != BlockStart) {
+ VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
+ assert(VNI && "Missing def for complex mapped value");
+ DEBUG(dbgs() << ':' << VNI->id << "*BB#" << MBB->getNumber());
+ // MBB has its own def. Is it also live-out?
+ if (BlockEnd <= End)
+ LRC.setLiveOutValue(&*MBB, VNI);
+
+ // Skip to the next block for live-in.
+ ++MBB;
+ BlockStart = BlockEnd;
+ }
+
+ // Handle the live-in blocks covered by [Start;End).
+ assert(Start <= BlockStart && "Expected live-in block");
+ while (BlockStart < End) {
+ DEBUG(dbgs() << ">BB#" << MBB->getNumber());
+ BlockEnd = LIS.getMBBEndIdx(&*MBB);
+ if (BlockStart == ParentVNI->def) {
+ // This block has the def of a parent PHI, so it isn't live-in.
+ assert(ParentVNI->isPHIDef() && "Non-phi defined at block start?");
+ VNInfo *VNI = LR.extendInBlock(BlockStart, std::min(BlockEnd, End));
+ assert(VNI && "Missing def for complex mapped parent PHI");
+ if (End >= BlockEnd)
+ LRC.setLiveOutValue(&*MBB, VNI); // Live-out as well.
+ } else {
+ // This block needs a live-in value. The last block covered may not
+ // be live-out.
+ if (End < BlockEnd)
+ LRC.addLiveInBlock(LR, MDT[&*MBB], End);
+ else {
+ // Live-through, and we don't know the value.
+ LRC.addLiveInBlock(LR, MDT[&*MBB]);
+ LRC.setLiveOutValue(&*MBB, nullptr);
+ }
+ }
+ BlockStart = BlockEnd;
+ ++MBB;
+ }
+ Start = End;
+ } while (Start != S.end);
+ DEBUG(dbgs() << '\n');
+ }
+
+ LRCalc[0].calculateValues();
+ if (SpillMode)
+ LRCalc[1].calculateValues();
+
+ return Skipped;
+}
+
+void SplitEditor::extendPHIKillRanges() {
+ // Extend live ranges to be live-out for successor PHI values.
+ for (const VNInfo *PHIVNI : Edit->getParent().valnos) {
+ if (PHIVNI->isUnused() || !PHIVNI->isPHIDef())
+ continue;
+ unsigned RegIdx = RegAssign.lookup(PHIVNI->def);
+ LiveRange &LR = LIS.getInterval(Edit->get(RegIdx));
+
+ // Check whether PHI is dead.
+ const LiveRange::Segment *Segment = LR.getSegmentContaining(PHIVNI->def);
+ assert(Segment != nullptr && "Missing segment for VNI");
+ if (Segment->end == PHIVNI->def.getDeadSlot()) {
+ // This is a dead PHI. Remove it.
+ LR.removeSegment(*Segment, true);
+ continue;
+ }
+
+ LiveRangeCalc &LRC = getLRCalc(RegIdx);
+ MachineBasicBlock *MBB = LIS.getMBBFromIndex(PHIVNI->def);
+ for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ SlotIndex End = LIS.getMBBEndIdx(*PI);
+ SlotIndex LastUse = End.getPrevSlot();
+ // The predecessor may not have a live-out value. That is OK, like an
+ // undef PHI operand.
+ if (Edit->getParent().liveAt(LastUse)) {
+ assert(RegAssign.lookup(LastUse) == RegIdx &&
+ "Different register assignment in phi predecessor");
+ LRC.extend(LR, End);
+ }
+ }
+ }
+}
+
+/// rewriteAssigned - Rewrite all uses of Edit->getReg().
+void SplitEditor::rewriteAssigned(bool ExtendRanges) {
+ for (MachineRegisterInfo::reg_iterator RI = MRI.reg_begin(Edit->getReg()),
+ RE = MRI.reg_end(); RI != RE;) {
+ MachineOperand &MO = *RI;
+ MachineInstr *MI = MO.getParent();
+ ++RI;
+ // LiveDebugVariables should have handled all DBG_VALUE instructions.
+ if (MI->isDebugValue()) {
+ DEBUG(dbgs() << "Zapping " << *MI);
+ MO.setReg(0);
+ continue;
+ }
+
+ // <undef> operands don't really read the register, so it doesn't matter
+ // which register we choose. When the use operand is tied to a def, we must
+ // use the same register as the def, so just do that always.
+ SlotIndex Idx = LIS.getInstructionIndex(*MI);
+ if (MO.isDef() || MO.isUndef())
+ Idx = Idx.getRegSlot(MO.isEarlyClobber());
+
+ // Rewrite to the mapped register at Idx.
+ unsigned RegIdx = RegAssign.lookup(Idx);
+ LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
+ MO.setReg(LI->reg);
+ DEBUG(dbgs() << " rewr BB#" << MI->getParent()->getNumber() << '\t'
+ << Idx << ':' << RegIdx << '\t' << *MI);
+
+ // Extend liveness to Idx if the instruction reads reg.
+ if (!ExtendRanges || MO.isUndef())
+ continue;
+
+ // Skip instructions that don't read Reg.
+ if (MO.isDef()) {
+ if (!MO.getSubReg() && !MO.isEarlyClobber())
+ continue;
+ // We may want to extend a live range for a partial redef, or for a use
+ // tied to an early clobber.
+ Idx = Idx.getPrevSlot();
+ if (!Edit->getParent().liveAt(Idx))
+ continue;
+ } else
+ Idx = Idx.getRegSlot(true);
+
+ getLRCalc(RegIdx).extend(*LI, Idx.getNextSlot());
+ }
+}
+
+void SplitEditor::deleteRematVictims() {
+ SmallVector<MachineInstr*, 8> Dead;
+ for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I){
+ LiveInterval *LI = &LIS.getInterval(*I);
+ for (const LiveRange::Segment &S : LI->segments) {
+ // Dead defs end at the dead slot.
+ if (S.end != S.valno->def.getDeadSlot())
+ continue;
+ if (S.valno->isPHIDef())
+ continue;
+ MachineInstr *MI = LIS.getInstructionFromIndex(S.valno->def);
+ assert(MI && "Missing instruction for dead def");
+ MI->addRegisterDead(LI->reg, &TRI);
+
+ if (!MI->allDefsAreDead())
+ continue;
+
+ DEBUG(dbgs() << "All defs dead: " << *MI);
+ Dead.push_back(MI);
+ }
+ }
+
+ if (Dead.empty())
+ return;
+
+ Edit->eliminateDeadDefs(Dead, None, &AA);
+}
+
+void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) {
+ ++NumFinished;
+
+ // At this point, the live intervals in Edit contain VNInfos corresponding to
+ // the inserted copies.
+
+ // Add the original defs from the parent interval.
+ for (const VNInfo *ParentVNI : Edit->getParent().valnos) {
+ if (ParentVNI->isUnused())
+ continue;
+ unsigned RegIdx = RegAssign.lookup(ParentVNI->def);
+ defValue(RegIdx, ParentVNI, ParentVNI->def);
+
+ // Force rematted values to be recomputed everywhere.
+ // The new live ranges may be truncated.
+ if (Edit->didRematerialize(ParentVNI))
+ for (unsigned i = 0, e = Edit->size(); i != e; ++i)
+ forceRecompute(i, ParentVNI);
+ }
+
+ // Hoist back-copies to the complement interval when in spill mode.
+ switch (SpillMode) {
+ case SM_Partition:
+ // Leave all back-copies as is.
+ break;
+ case SM_Size:
+ case SM_Speed:
+ // hoistCopies will behave differently between size and speed.
+ hoistCopies();
+ }
+
+ // Transfer the simply mapped values, check if any are skipped.
+ bool Skipped = transferValues();
+
+ // Rewrite virtual registers, possibly extending ranges.
+ rewriteAssigned(Skipped);
+
+ if (Skipped)
+ extendPHIKillRanges();
+ else
+ ++NumSimple;
+
+ // Delete defs that were rematted everywhere.
+ if (Skipped)
+ deleteRematVictims();
+
+ // Get rid of unused values and set phi-kill flags.
+ for (LiveRangeEdit::iterator I = Edit->begin(), E = Edit->end(); I != E; ++I) {
+ LiveInterval &LI = LIS.getInterval(*I);
+ LI.RenumberValues();
+ }
+
+ // Provide a reverse mapping from original indices to Edit ranges.
+ if (LRMap) {
+ LRMap->clear();
+ for (unsigned i = 0, e = Edit->size(); i != e; ++i)
+ LRMap->push_back(i);
+ }
+
+ // Now check if any registers were separated into multiple components.
+ ConnectedVNInfoEqClasses ConEQ(LIS);
+ for (unsigned i = 0, e = Edit->size(); i != e; ++i) {
+ // Don't use iterators, they are invalidated by create() below.
+ unsigned VReg = Edit->get(i);
+ LiveInterval &LI = LIS.getInterval(VReg);
+ SmallVector<LiveInterval*, 8> SplitLIs;
+ LIS.splitSeparateComponents(LI, SplitLIs);
+ unsigned Original = VRM.getOriginal(VReg);
+ for (LiveInterval *SplitLI : SplitLIs)
+ VRM.setIsSplitFromReg(SplitLI->reg, Original);
+
+ // The new intervals all map back to i.
+ if (LRMap)
+ LRMap->resize(Edit->size(), i);
+ }
+
+ // Calculate spill weight and allocation hints for new intervals.
+ Edit->calculateRegClassAndHint(VRM.getMachineFunction(), SA.Loops, MBFI);
+
+ assert(!LRMap || LRMap->size() == Edit->size());
+}
+
+
+//===----------------------------------------------------------------------===//
+// Single Block Splitting
+//===----------------------------------------------------------------------===//
+
+bool SplitAnalysis::shouldSplitSingleBlock(const BlockInfo &BI,
+ bool SingleInstrs) const {
+ // Always split for multiple instructions.
+ if (!BI.isOneInstr())
+ return true;
+ // Don't split for single instructions unless explicitly requested.
+ if (!SingleInstrs)
+ return false;
+ // Splitting a live-through range always makes progress.
+ if (BI.LiveIn && BI.LiveOut)
+ return true;
+ // No point in isolating a copy. It has no register class constraints.
+ if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike())
+ return false;
+ // Finally, don't isolate an end point that was created by earlier splits.
+ return isOriginalEndpoint(BI.FirstInstr);
+}
+
+void SplitEditor::splitSingleBlock(const SplitAnalysis::BlockInfo &BI) {
+ openIntv();
+ SlotIndex LastSplitPoint = SA.getLastSplitPoint(BI.MBB->getNumber());
+ SlotIndex SegStart = enterIntvBefore(std::min(BI.FirstInstr,
+ LastSplitPoint));
+ if (!BI.LiveOut || BI.LastInstr < LastSplitPoint) {
+ useIntv(SegStart, leaveIntvAfter(BI.LastInstr));
+ } else {
+ // The last use is after the last valid split point.
+ SlotIndex SegStop = leaveIntvBefore(LastSplitPoint);
+ useIntv(SegStart, SegStop);
+ overlapIntv(SegStop, BI.LastInstr);
+ }
+}
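
Taken together with shouldSplitSingleBlock(), this gives the usual per-block splitting loop. A hedged sketch of how an allocator might drive it, not part of the imported file; SA, SE, LREdit and VirtReg are the caller's SplitAnalysis, SplitEditor, LiveRangeEdit and live interval, and getUseBlocks() is assumed to be the accessor SplitAnalysis exposes for its UseBlocks array.

SA.analyze(&VirtReg);
SE.reset(LREdit, SplitEditor::SM_Size);
for (const SplitAnalysis::BlockInfo &BI : SA.getUseBlocks())
  if (SA.shouldSplitSingleBlock(BI, /*SingleInstrs=*/false))
    SE.splitSingleBlock(BI);
if (!LREdit.empty()) {
  SmallVector<unsigned, 8> IntvMap;   // Maps the new intervals back to openIntv indexes.
  SE.finish(&IntvMap);
}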
+
+
+//===----------------------------------------------------------------------===//
+// Global Live Range Splitting Support
+//===----------------------------------------------------------------------===//
+
+// These methods support a method of global live range splitting that uses a
+// global algorithm to decide intervals for CFG edges. They will insert split
+// points and color intervals in basic blocks while avoiding interference.
+//
+// Note that splitSingleBlock is also useful for blocks where both CFG edges
+// are on the stack.
+
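For the live-through case handled next, the caller typically turns the per-edge-bundle decision produced by SpillPlacement into per-block calls. A rough sketch, not part of the imported file; RegBundles is the bit vector filled in by SpillPlacement::finish(), Bundles an EdgeBundles analysis, and MainIntv, LeaveBefore and EnterAfter are placeholders for the candidate interval index and its interference bounds.

for (unsigned Number = 0, E = MF.getNumBlockIDs(); Number != E; ++Number) {
  bool RegIn  = RegBundles.test(Bundles.getBundle(Number, /*Out=*/false));
  bool RegOut = RegBundles.test(Bundles.getBundle(Number, /*Out=*/true));
  if (!RegIn && !RegOut)
    continue;                 // Live-through on the stack; nothing to insert here.
  SE.splitLiveThroughBlock(Number,
                           RegIn  ? MainIntv : 0, LeaveBefore,
                           RegOut ? MainIntv : 0, EnterAfter);
}
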
+void SplitEditor::splitLiveThroughBlock(unsigned MBBNum,
+ unsigned IntvIn, SlotIndex LeaveBefore,
+ unsigned IntvOut, SlotIndex EnterAfter){
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(MBBNum);
+
+ DEBUG(dbgs() << "BB#" << MBBNum << " [" << Start << ';' << Stop
+ << ") intf " << LeaveBefore << '-' << EnterAfter
+ << ", live-through " << IntvIn << " -> " << IntvOut);
+
+ assert((IntvIn || IntvOut) && "Use splitSingleBlock for isolated blocks");
+
+ assert((!LeaveBefore || LeaveBefore < Stop) && "Interference after block");
+ assert((!IntvIn || !LeaveBefore || LeaveBefore > Start) && "Impossible intf");
+ assert((!EnterAfter || EnterAfter >= Start) && "Interference before block");
+
+ MachineBasicBlock *MBB = VRM.getMachineFunction().getBlockNumbered(MBBNum);
+
+ if (!IntvOut) {
+ DEBUG(dbgs() << ", spill on entry.\n");
+ //
+ // <<<<<<<<< Possible LeaveBefore interference.
+ // |-----------| Live through.
+ // -____________ Spill on entry.
+ //
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvAtTop(*MBB);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ (void)Idx;
+ return;
+ }
+
+ if (!IntvIn) {
+ DEBUG(dbgs() << ", reload on exit.\n");
+ //
+ // >>>>>>> Possible EnterAfter interference.
+ // |-----------| Live through.
+ // ___________-- Reload on exit.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAtEnd(*MBB);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ (void)Idx;
+ return;
+ }
+
+ if (IntvIn == IntvOut && !LeaveBefore && !EnterAfter) {
+ DEBUG(dbgs() << ", straight through.\n");
+ //
+ // |-----------| Live through.
+ // ------------- Straight through, same intv, no interference.
+ //
+ selectIntv(IntvOut);
+ useIntv(Start, Stop);
+ return;
+ }
+
+ // We cannot legally insert splits after LSP.
+ SlotIndex LSP = SA.getLastSplitPoint(MBBNum);
+ assert((!IntvOut || !EnterAfter || EnterAfter < LSP) && "Impossible intf");
+
+ if (IntvIn != IntvOut && (!LeaveBefore || !EnterAfter ||
+ LeaveBefore.getBaseIndex() > EnterAfter.getBoundaryIndex())) {
+ DEBUG(dbgs() << ", switch avoiding interference.\n");
+ //
+ // >>>> <<<< Non-overlapping EnterAfter/LeaveBefore interference.
+ // |-----------| Live through.
+ // ------======= Switch intervals between interference.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx;
+ if (LeaveBefore && LeaveBefore < LSP) {
+ Idx = enterIntvBefore(LeaveBefore);
+ useIntv(Idx, Stop);
+ } else {
+ Idx = enterIntvAtEnd(*MBB);
+ }
+ selectIntv(IntvIn);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ return;
+ }
+
+ DEBUG(dbgs() << ", create local intv for interference.\n");
+ //
+ // >>><><><><<<< Overlapping EnterAfter/LeaveBefore interference.
+ // |-----------| Live through.
+ // ==---------== Switch intervals before/after interference.
+ //
+ assert(LeaveBefore <= EnterAfter && "Missed case");
+
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAfter(EnterAfter);
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+ selectIntv(IntvIn);
+ Idx = leaveIntvBefore(LeaveBefore);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+}
+
+
+void SplitEditor::splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvIn, SlotIndex LeaveBefore) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
+ << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
+ << ", reg-in " << IntvIn << ", leave before " << LeaveBefore
+ << (BI.LiveOut ? ", stack-out" : ", killed in block"));
+
+ assert(IntvIn && "Must have register in");
+ assert(BI.LiveIn && "Must be live-in");
+ assert((!LeaveBefore || LeaveBefore > Start) && "Bad interference");
+
+ if (!BI.LiveOut && (!LeaveBefore || LeaveBefore >= BI.LastInstr)) {
+ DEBUG(dbgs() << " before interference.\n");
+ //
+ // <<< Interference after kill.
+ // |---o---x | Killed in block.
+ // ========= Use IntvIn everywhere.
+ //
+ selectIntv(IntvIn);
+ useIntv(Start, BI.LastInstr);
+ return;
+ }
+
+ SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
+
+ if (!LeaveBefore || LeaveBefore > BI.LastInstr.getBoundaryIndex()) {
+ //
+ // <<< Possible interference after last use.
+ // |---o---o---| Live-out on stack.
+ // =========____ Leave IntvIn after last use.
+ //
+ // < Interference after last use.
+ // |---o---o--o| Live-out on stack, late last use.
+ // ============ Copy to stack after LSP, overlap IntvIn.
+ // \_____ Stack interval is live-out.
+ //
+ if (BI.LastInstr < LSP) {
+ DEBUG(dbgs() << ", spill after last use before interference.\n");
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvAfter(BI.LastInstr);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ } else {
+ DEBUG(dbgs() << ", spill before last split point.\n");
+ selectIntv(IntvIn);
+ SlotIndex Idx = leaveIntvBefore(LSP);
+ overlapIntv(Idx, BI.LastInstr);
+ useIntv(Start, Idx);
+ assert((!LeaveBefore || Idx <= LeaveBefore) && "Interference");
+ }
+ return;
+ }
+
+ // The interference is overlapping somewhere we wanted to use IntvIn. That
+ // means we need to create a local interval that can be allocated a
+ // different register.
+ unsigned LocalIntv = openIntv();
+ (void)LocalIntv;
+ DEBUG(dbgs() << ", creating local interval " << LocalIntv << ".\n");
+
+ if (!BI.LiveOut || BI.LastInstr < LSP) {
+ //
+ // <<<<<<< Interference overlapping uses.
+ // |---o---o---| Live-out on stack.
+ // =====----____ Leave IntvIn before interference, then spill.
+ //
+ SlotIndex To = leaveIntvAfter(BI.LastInstr);
+ SlotIndex From = enterIntvBefore(LeaveBefore);
+ useIntv(From, To);
+ selectIntv(IntvIn);
+ useIntv(Start, From);
+ assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+ return;
+ }
+
+ // <<<<<<< Interference overlapping uses.
+ // |---o---o--o| Live-out on stack, late last use.
+ // =====------- Copy to stack before LSP, overlap LocalIntv.
+ // \_____ Stack interval is live-out.
+ //
+ SlotIndex To = leaveIntvBefore(LSP);
+ overlapIntv(To, BI.LastInstr);
+ SlotIndex From = enterIntvBefore(std::min(To, LeaveBefore));
+ useIntv(From, To);
+ selectIntv(IntvIn);
+ useIntv(Start, From);
+ assert((!LeaveBefore || From <= LeaveBefore) && "Interference");
+}
+
+void SplitEditor::splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvOut, SlotIndex EnterAfter) {
+ SlotIndex Start, Stop;
+ std::tie(Start, Stop) = LIS.getSlotIndexes()->getMBBRange(BI.MBB);
+
+ DEBUG(dbgs() << "BB#" << BI.MBB->getNumber() << " [" << Start << ';' << Stop
+ << "), uses " << BI.FirstInstr << '-' << BI.LastInstr
+ << ", reg-out " << IntvOut << ", enter after " << EnterAfter
+ << (BI.LiveIn ? ", stack-in" : ", defined in block"));
+
+ SlotIndex LSP = SA.getLastSplitPoint(BI.MBB->getNumber());
+
+ assert(IntvOut && "Must have register out");
+ assert(BI.LiveOut && "Must be live-out");
+ assert((!EnterAfter || EnterAfter < LSP) && "Bad interference");
+
+ if (!BI.LiveIn && (!EnterAfter || EnterAfter <= BI.FirstInstr)) {
+ DEBUG(dbgs() << " after interference.\n");
+ //
+ // >>>> Interference before def.
+ // | o---o---| Defined in block.
+ // ========= Use IntvOut everywhere.
+ //
+ selectIntv(IntvOut);
+ useIntv(BI.FirstInstr, Stop);
+ return;
+ }
+
+ if (!EnterAfter || EnterAfter < BI.FirstInstr.getBaseIndex()) {
+ DEBUG(dbgs() << ", reload after interference.\n");
+ //
+ // >>>> Interference before def.
+ // |---o---o---| Live-through, stack-in.
+ // ____========= Enter IntvOut before first use.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvBefore(std::min(LSP, BI.FirstInstr));
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+ return;
+ }
+
+ // The interference is overlapping somewhere we wanted to use IntvOut. That
+ // means we need to create a local interval that can be allocated a
+ // different register.
+ DEBUG(dbgs() << ", interference overlaps uses.\n");
+ //
+ // >>>>>>> Interference overlapping uses.
+ // |---o---o---| Live-through, stack-in.
+ // ____---====== Create local interval for interference range.
+ //
+ selectIntv(IntvOut);
+ SlotIndex Idx = enterIntvAfter(EnterAfter);
+ useIntv(Idx, Stop);
+ assert((!EnterAfter || Idx >= EnterAfter) && "Interference");
+
+ openIntv();
+ SlotIndex From = enterIntvBefore(std::min(Idx, BI.FirstInstr));
+ useIntv(From, Idx);
+}
diff --git a/contrib/llvm/lib/CodeGen/SplitKit.h b/contrib/llvm/lib/CodeGen/SplitKit.h
new file mode 100644
index 000000000000..a9684942885e
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/SplitKit.h
@@ -0,0 +1,503 @@
+//===-------- SplitKit.h - Toolkit for splitting live ranges ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the SplitAnalysis class as well as mutator functions for
+// live range splitting.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_CODEGEN_SPLITKIT_H
+#define LLVM_LIB_CODEGEN_SPLITKIT_H
+
+#include "LiveRangeCalc.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+
+namespace llvm {
+
+class ConnectedVNInfoEqClasses;
+class LiveInterval;
+class LiveIntervals;
+class LiveRangeEdit;
+class MachineBlockFrequencyInfo;
+class MachineInstr;
+class MachineLoopInfo;
+class MachineRegisterInfo;
+class TargetInstrInfo;
+class TargetRegisterInfo;
+class VirtRegMap;
+class VNInfo;
+class raw_ostream;
+
+/// Determines the latest safe point in a block in which we can insert a split,
+/// spill or other instruction related to CurLI.
+class LLVM_LIBRARY_VISIBILITY InsertPointAnalysis {
+private:
+ const LiveIntervals &LIS;
+
+ /// Last legal insert point in each basic block in the current function.
+ /// The first entry is the first terminator, the second entry is the
+ /// last valid point to insert a split or spill for a variable that is
+ /// live into a landing pad successor.
+ SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastInsertPoint;
+
+ SlotIndex computeLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB);
+
+public:
+ InsertPointAnalysis(const LiveIntervals &lis, unsigned BBNum);
+
+ /// Return the base index of the last valid insert point for \p CurLI in \p MBB.
+ SlotIndex getLastInsertPoint(const LiveInterval &CurLI,
+ const MachineBasicBlock &MBB) {
+ unsigned Num = MBB.getNumber();
+ // Inline the common simple case.
+ if (LastInsertPoint[Num].first.isValid() &&
+ !LastInsertPoint[Num].second.isValid())
+ return LastInsertPoint[Num].first;
+ return computeLastInsertPoint(CurLI, MBB);
+ }
+
+  /// Returns the last insert point as an iterator for \p CurLI in \p MBB.
+ MachineBasicBlock::iterator getLastInsertPointIter(const LiveInterval &CurLI,
+ MachineBasicBlock &MBB);
+};
+
+/// SplitAnalysis - Analyze a LiveInterval, looking for live range splitting
+/// opportunities.
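+///
+/// A rough usage sketch (hypothetical; VRM, LIS, Loops and LI stand for the
+/// caller's VirtRegMap, LiveIntervals, MachineLoopInfo and LiveInterval):
+///
+///   SplitAnalysis SA(VRM, LIS, Loops);
+///   SA.analyze(&LI);
+///   for (const SplitAnalysis::BlockInfo &BI : SA.getUseBlocks())
+///     if (SA.shouldSplitSingleBlock(BI, /*SingleInstrs=*/false)) {
+///       // BI.MBB is a candidate for a single-block split.
+///     }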
+class LLVM_LIBRARY_VISIBILITY SplitAnalysis {
+public:
+ const MachineFunction &MF;
+ const VirtRegMap &VRM;
+ const LiveIntervals &LIS;
+ const MachineLoopInfo &Loops;
+ const TargetInstrInfo &TII;
+
+ /// Additional information about basic blocks where the current variable is
+ /// live. Such a block will look like one of these templates:
+ ///
+ /// 1. | o---x | Internal to block. Variable is only live in this block.
+ /// 2. |---x | Live-in, kill.
+ /// 3. | o---| Def, live-out.
+ /// 4. |---x o---| Live-in, kill, def, live-out. Counted by NumGapBlocks.
+ /// 5. |---o---o---| Live-through with uses or defs.
+ /// 6. |-----------| Live-through without uses. Counted by NumThroughBlocks.
+ ///
+ /// Two BlockInfo entries are created for template 4. One for the live-in
+ /// segment, and one for the live-out segment. These entries look as if the
+ /// block were split in the middle where the live range isn't live.
+ ///
+ /// Live-through blocks without any uses don't get BlockInfo entries. They
+ /// are simply listed in ThroughBlocks instead.
+ ///
+ struct BlockInfo {
+ MachineBasicBlock *MBB;
+ SlotIndex FirstInstr; ///< First instr accessing current reg.
+ SlotIndex LastInstr; ///< Last instr accessing current reg.
+ SlotIndex FirstDef; ///< First non-phi valno->def, or SlotIndex().
+ bool LiveIn; ///< Current reg is live in.
+ bool LiveOut; ///< Current reg is live out.
+
+ /// isOneInstr - Returns true when this BlockInfo describes a single
+ /// instruction.
+ bool isOneInstr() const {
+ return SlotIndex::isSameInstr(FirstInstr, LastInstr);
+ }
+ };
+
+private:
+ // Current live interval.
+ const LiveInterval *CurLI;
+
+ /// Insert Point Analysis.
+ InsertPointAnalysis IPA;
+
+ // Sorted slot indexes of using instructions.
+ SmallVector<SlotIndex, 8> UseSlots;
+
+ /// UseBlocks - Blocks where CurLI has uses.
+ SmallVector<BlockInfo, 8> UseBlocks;
+
+ /// NumGapBlocks - Number of duplicate entries in UseBlocks for blocks where
+ /// the live range has a gap.
+ unsigned NumGapBlocks;
+
+ /// ThroughBlocks - Block numbers where CurLI is live through without uses.
+ BitVector ThroughBlocks;
+
+ /// NumThroughBlocks - Number of live-through blocks.
+ unsigned NumThroughBlocks;
+
+ /// DidRepairRange - analyze was forced to shrinkToUses().
+ bool DidRepairRange;
+
+  // Summarize statistics by counting instructions using CurLI.
+ void analyzeUses();
+
+ /// calcLiveBlockInfo - Compute per-block information about CurLI.
+ bool calcLiveBlockInfo();
+
+public:
+ SplitAnalysis(const VirtRegMap &vrm, const LiveIntervals &lis,
+ const MachineLoopInfo &mli);
+
+ /// analyze - set CurLI to the specified interval, and analyze how it may be
+ /// split.
+ void analyze(const LiveInterval *li);
+
+ /// didRepairRange() - Returns true if CurLI was invalid and has been repaired
+ /// by analyze(). This really shouldn't happen, but sometimes the coalescer
+ /// can create live ranges that end in mid-air.
+ bool didRepairRange() const { return DidRepairRange; }
+
+ /// clear - clear all data structures so SplitAnalysis is ready to analyze a
+ /// new interval.
+ void clear();
+
+ /// getParent - Return the last analyzed interval.
+ const LiveInterval &getParent() const { return *CurLI; }
+
+ /// isOriginalEndpoint - Return true if the original live range was killed or
+ /// (re-)defined at Idx. Idx should be the 'def' slot for a normal kill/def,
+ /// and 'use' for an early-clobber def.
+ /// This can be used to recognize code inserted by earlier live range
+ /// splitting.
+ bool isOriginalEndpoint(SlotIndex Idx) const;
+
+ /// getUseSlots - Return an array of SlotIndexes of instructions using CurLI.
+  /// This includes both use and def operands, at most one entry per instruction.
+ ArrayRef<SlotIndex> getUseSlots() const { return UseSlots; }
+
+ /// getUseBlocks - Return an array of BlockInfo objects for the basic blocks
+ /// where CurLI has uses.
+ ArrayRef<BlockInfo> getUseBlocks() const { return UseBlocks; }
+
+ /// getNumThroughBlocks - Return the number of through blocks.
+ unsigned getNumThroughBlocks() const { return NumThroughBlocks; }
+
+ /// isThroughBlock - Return true if CurLI is live through MBB without uses.
+ bool isThroughBlock(unsigned MBB) const { return ThroughBlocks.test(MBB); }
+
+ /// getThroughBlocks - Return the set of through blocks.
+ const BitVector &getThroughBlocks() const { return ThroughBlocks; }
+
+ /// getNumLiveBlocks - Return the number of blocks where CurLI is live.
+ unsigned getNumLiveBlocks() const {
+ return getUseBlocks().size() - NumGapBlocks + getNumThroughBlocks();
+ }
+
+ /// countLiveBlocks - Return the number of blocks where li is live. This is
+ /// guaranteed to return the same number as getNumLiveBlocks() after calling
+ /// analyze(li).
+ unsigned countLiveBlocks(const LiveInterval *li) const;
+
+ typedef SmallPtrSet<const MachineBasicBlock*, 16> BlockPtrSet;
+
+ /// shouldSplitSingleBlock - Returns true if it would help to create a local
+ /// live range for the instructions in BI. There is normally no benefit to
+ /// creating a live range for a single instruction, but it does enable
+ /// register class inflation if the instruction has a restricted register
+ /// class.
+ ///
+ /// @param BI The block to be isolated.
+ /// @param SingleInstrs True when single instructions should be isolated.
+ bool shouldSplitSingleBlock(const BlockInfo &BI, bool SingleInstrs) const;
+
+ SlotIndex getLastSplitPoint(unsigned Num) {
+ return IPA.getLastInsertPoint(*CurLI, *MF.getBlockNumbered(Num));
+ }
+
+ MachineBasicBlock::iterator getLastSplitPointIter(MachineBasicBlock *BB) {
+ return IPA.getLastInsertPointIter(*CurLI, *BB);
+ }
+};
+
+
+/// SplitEditor - Edit machine code and LiveIntervals for live range
+/// splitting.
+///
+/// - Create a SplitEditor from a SplitAnalysis.
+/// - Start a new live interval with openIntv.
+/// - Mark the places where the new interval is entered using enterIntv*
+/// - Mark the ranges where the new interval is used with useIntv*
+/// - Mark the places where the interval is exited with leaveIntv*.
+/// - Repeat from openIntv for each additional interval.
+/// - Rewrite instructions with finish().
+///
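+/// A minimal sketch of that sequence (hypothetical; SA, AA, LIS, VRM, MDT,
+/// MBFI, LREdit, UseIdx and LastUseIdx stand for caller-provided objects):
+///
+///   SplitEditor SE(SA, AA, LIS, VRM, MDT, MBFI);
+///   SE.reset(LREdit);
+///   SE.openIntv();
+///   SlotIndex Start = SE.enterIntvBefore(UseIdx);
+///   SlotIndex Stop = SE.leaveIntvAfter(LastUseIdx);
+///   SE.useIntv(Start, Stop);
+///   SE.finish();
+///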
+class LLVM_LIBRARY_VISIBILITY SplitEditor {
+ SplitAnalysis &SA;
+ AliasAnalysis &AA;
+ LiveIntervals &LIS;
+ VirtRegMap &VRM;
+ MachineRegisterInfo &MRI;
+ MachineDominatorTree &MDT;
+ const TargetInstrInfo &TII;
+ const TargetRegisterInfo &TRI;
+ const MachineBlockFrequencyInfo &MBFI;
+
+public:
+
+ /// ComplementSpillMode - Select how the complement live range should be
+ /// created. SplitEditor automatically creates interval 0 to contain
+ /// anything that isn't added to another interval. This complement interval
+ /// can get quite complicated, and it can sometimes be an advantage to allow
+ /// it to overlap the other intervals. If it is going to spill anyway, no
+ /// registers are wasted by keeping a value in two places at the same time.
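+  /// For example (hypothetical), a caller that expects the complement
+  /// interval to be spilled anyway might pass an overlapping mode, e.g.
+  /// reset(LREdit, SM_Speed), instead of relying on the default SM_Partition.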
+ enum ComplementSpillMode {
+    /// SM_Partition (default) - Try to create the complement interval so it
+ /// doesn't overlap any other intervals, and the original interval is
+ /// partitioned. This may require a large number of back copies and extra
+ /// PHI-defs. Only segments marked with overlapIntv will be overlapping.
+ SM_Partition,
+
+ /// SM_Size - Overlap intervals to minimize the number of inserted COPY
+ /// instructions. Copies to the complement interval are hoisted to their
+ /// common dominator, so only one COPY is required per value in the
+ /// complement interval. This also means that no extra PHI-defs need to be
+ /// inserted in the complement interval.
+ SM_Size,
+
+ /// SM_Speed - Overlap intervals to minimize the expected execution
+ /// frequency of the inserted copies. This is very similar to SM_Size, but
+ /// the complement interval may get some extra PHI-defs.
+ SM_Speed
+ };
+
+private:
+
+ /// Edit - The current parent register and new intervals created.
+ LiveRangeEdit *Edit;
+
+ /// Index into Edit of the currently open interval.
+ /// The index 0 is used for the complement, so the first interval started by
+ /// openIntv will be 1.
+ unsigned OpenIdx;
+
+ /// The current spill mode, selected by reset().
+ ComplementSpillMode SpillMode;
+
+ typedef IntervalMap<SlotIndex, unsigned> RegAssignMap;
+
+ /// Allocator for the interval map. This will eventually be shared with
+ /// SlotIndexes and LiveIntervals.
+ RegAssignMap::Allocator Allocator;
+
+ /// RegAssign - Map of the assigned register indexes.
+ /// Edit.get(RegAssign.lookup(Idx)) is the register that should be live at
+ /// Idx.
+ RegAssignMap RegAssign;
+
+ typedef PointerIntPair<VNInfo*, 1> ValueForcePair;
+ typedef DenseMap<std::pair<unsigned, unsigned>, ValueForcePair> ValueMap;
+
+ /// Values - keep track of the mapping from parent values to values in the new
+ /// intervals. Given a pair (RegIdx, ParentVNI->id), Values contains:
+ ///
+ /// 1. No entry - the value is not mapped to Edit.get(RegIdx).
+ /// 2. (Null, false) - the value is mapped to multiple values in
+ /// Edit.get(RegIdx). Each value is represented by a minimal live range at
+ /// its def. The full live range can be inferred exactly from the range
+ /// of RegIdx in RegAssign.
+ /// 3. (Null, true). As above, but the ranges in RegAssign are too large, and
+ /// the live range must be recomputed using LiveRangeCalc::extend().
+ /// 4. (VNI, false) The value is mapped to a single new value.
+ /// The new value has no live ranges anywhere.
+ ValueMap Values;
+
+ /// LRCalc - Cache for computing live ranges and SSA update. Each instance
+ /// can only handle non-overlapping live ranges, so use a separate
+ /// LiveRangeCalc instance for the complement interval when in spill mode.
+ LiveRangeCalc LRCalc[2];
+
+ /// getLRCalc - Return the LRCalc to use for RegIdx. In spill mode, the
+ /// complement interval can overlap the other intervals, so it gets its own
+ /// LRCalc instance. When not in spill mode, all intervals can share one.
+ LiveRangeCalc &getLRCalc(unsigned RegIdx) {
+ return LRCalc[SpillMode != SM_Partition && RegIdx != 0];
+ }
+
+ /// defValue - define a value in RegIdx from ParentVNI at Idx.
+ /// Idx does not have to be ParentVNI->def, but it must be contained within
+ /// ParentVNI's live range in ParentLI. The new value is added to the value
+ /// map.
+ /// Return the new LI value.
+ VNInfo *defValue(unsigned RegIdx, const VNInfo *ParentVNI, SlotIndex Idx);
+
+ /// forceRecompute - Force the live range of ParentVNI in RegIdx to be
+ /// recomputed by LiveRangeCalc::extend regardless of the number of defs.
+ /// This is used for values whose live range doesn't match RegAssign exactly.
+ /// They could have rematerialized, or back-copies may have been moved.
+ void forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI);
+
+ /// defFromParent - Define Reg from ParentVNI at UseIdx using either
+ /// rematerialization or a COPY from parent. Return the new value.
+ VNInfo *defFromParent(unsigned RegIdx,
+ VNInfo *ParentVNI,
+ SlotIndex UseIdx,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I);
+
+  /// removeBackCopies - Remove the copy instructions that define the values
+  /// in the vector in the complement interval.
+ void removeBackCopies(SmallVectorImpl<VNInfo*> &Copies);
+
+  /// findShallowDominator - Returns the least busy dominator of MBB that is
+ /// also dominated by DefMBB. Busy is measured by loop depth.
+ MachineBasicBlock *findShallowDominator(MachineBasicBlock *MBB,
+ MachineBasicBlock *DefMBB);
+
+ /// Find out all the backCopies dominated by others.
+ void computeRedundantBackCopies(DenseSet<unsigned> &NotToHoistSet,
+ SmallVectorImpl<VNInfo *> &BackCopies);
+
+  /// Hoist back-copies to the complement interval. It tries to hoist all
+  /// the back-copies to one BB if that is beneficial, or else simply removes
+  /// redundant back-copies dominated by others.
+ void hoistCopies();
+
+ /// transferValues - Transfer values to the new ranges.
+ /// Return true if any ranges were skipped.
+ bool transferValues();
+
+ /// extendPHIKillRanges - Extend the ranges of all values killed by original
+ /// parent PHIDefs.
+ void extendPHIKillRanges();
+
+ /// rewriteAssigned - Rewrite all uses of Edit.getReg() to assigned registers.
+ void rewriteAssigned(bool ExtendRanges);
+
+ /// deleteRematVictims - Delete defs that are dead after rematerializing.
+ void deleteRematVictims();
+
+public:
+  /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
+  /// Newly created intervals will be appended to the LiveRangeEdit passed to
+  /// reset().
+ SplitEditor(SplitAnalysis &SA, AliasAnalysis &AA, LiveIntervals&,
+ VirtRegMap&, MachineDominatorTree&,
+ MachineBlockFrequencyInfo &);
+
+ /// reset - Prepare for a new split.
+ void reset(LiveRangeEdit&, ComplementSpillMode = SM_Partition);
+
+ /// Create a new virtual register and live interval.
+ /// Return the interval index, starting from 1. Interval index 0 is the
+ /// implicit complement interval.
+ unsigned openIntv();
+
+ /// currentIntv - Return the current interval index.
+ unsigned currentIntv() const { return OpenIdx; }
+
+ /// selectIntv - Select a previously opened interval index.
+ void selectIntv(unsigned Idx);
+
+ /// enterIntvBefore - Enter the open interval before the instruction at Idx.
+ /// If the parent interval is not live before Idx, a COPY is not inserted.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvBefore(SlotIndex Idx);
+
+ /// enterIntvAfter - Enter the open interval after the instruction at Idx.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvAfter(SlotIndex Idx);
+
+ /// enterIntvAtEnd - Enter the open interval at the end of MBB.
+ /// Use the open interval from the inserted copy to the MBB end.
+ /// Return the beginning of the new live range.
+ SlotIndex enterIntvAtEnd(MachineBasicBlock &MBB);
+
+  /// useIntv - indicate that all instructions in MBB should use the open
+  /// interval.
+ void useIntv(const MachineBasicBlock &MBB);
+
+  /// useIntv - indicate that all instructions in range should use the open
+  /// interval.
+ void useIntv(SlotIndex Start, SlotIndex End);
+
+ /// leaveIntvAfter - Leave the open interval after the instruction at Idx.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvAfter(SlotIndex Idx);
+
+ /// leaveIntvBefore - Leave the open interval before the instruction at Idx.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvBefore(SlotIndex Idx);
+
+ /// leaveIntvAtTop - Leave the interval at the top of MBB.
+ /// Add liveness from the MBB top to the copy.
+ /// Return the end of the live range.
+ SlotIndex leaveIntvAtTop(MachineBasicBlock &MBB);
+
+ /// overlapIntv - Indicate that all instructions in range should use the open
+ /// interval, but also let the complement interval be live.
+ ///
+ /// This doubles the register pressure, but is sometimes required to deal with
+ /// register uses after the last valid split point.
+ ///
+ /// The Start index should be a return value from a leaveIntv* call, and End
+ /// should be in the same basic block. The parent interval must have the same
+ /// value across the range.
+ ///
+ void overlapIntv(SlotIndex Start, SlotIndex End);
+
+ /// finish - after all the new live ranges have been created, compute the
+ /// remaining live range, and rewrite instructions to use the new registers.
+ /// @param LRMap When not null, this vector will map each live range in Edit
+ /// back to the indices returned by openIntv.
+ /// There may be extra indices created by dead code elimination.
+ void finish(SmallVectorImpl<unsigned> *LRMap = nullptr);
+
+ /// dump - print the current interval mapping to dbgs().
+ void dump() const;
+
+ // ===--- High level methods ---===
+
+ /// splitSingleBlock - Split CurLI into a separate live interval around the
+ /// uses in a single block. This is intended to be used as part of a larger
+ /// split, and doesn't call finish().
+ void splitSingleBlock(const SplitAnalysis::BlockInfo &BI);
+
+ /// splitLiveThroughBlock - Split CurLI in the given block such that it
+ /// enters the block in IntvIn and leaves it in IntvOut. There may be uses in
+ /// the block, but they will be ignored when placing split points.
+ ///
+ /// @param MBBNum Block number.
+ /// @param IntvIn Interval index entering the block.
+ /// @param LeaveBefore When set, leave IntvIn before this point.
+ /// @param IntvOut Interval index leaving the block.
+ /// @param EnterAfter When set, enter IntvOut after this point.
+ void splitLiveThroughBlock(unsigned MBBNum,
+ unsigned IntvIn, SlotIndex LeaveBefore,
+ unsigned IntvOut, SlotIndex EnterAfter);
+
+ /// splitRegInBlock - Split CurLI in the given block such that it enters the
+ /// block in IntvIn and leaves it on the stack (or not at all). Split points
+ /// are placed in a way that avoids putting uses in the stack interval. This
+ /// may require creating a local interval when there is interference.
+ ///
+ /// @param BI Block descriptor.
+ /// @param IntvIn Interval index entering the block. Not 0.
+ /// @param LeaveBefore When set, leave IntvIn before this point.
+ void splitRegInBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvIn, SlotIndex LeaveBefore);
+
+ /// splitRegOutBlock - Split CurLI in the given block such that it enters the
+ /// block on the stack (or isn't live-in at all) and leaves it in IntvOut.
+ /// Split points are placed to avoid interference and such that the uses are
+ /// not in the stack interval. This may require creating a local interval
+ /// when there is interference.
+ ///
+ /// @param BI Block descriptor.
+ /// @param IntvOut Interval index leaving the block.
+ /// @param EnterAfter When set, enter IntvOut after this point.
+ void splitRegOutBlock(const SplitAnalysis::BlockInfo &BI,
+ unsigned IntvOut, SlotIndex EnterAfter);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/contrib/llvm/lib/CodeGen/StackColoring.cpp b/contrib/llvm/lib/CodeGen/StackColoring.cpp
new file mode 100644
index 000000000000..87cd470d5690
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackColoring.cpp
@@ -0,0 +1,1126 @@
+//===-- StackColoring.cpp -------------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements the stack-coloring optimization that looks for
+// lifetime marker machine instructions (LIFETIME_START and LIFETIME_END),
+// which represent the possible lifetime of stack slots. It attempts to
+// merge disjoint stack slots and reduce the used stack space.
+// NOTE: This pass is not StackSlotColoring, which optimizes spill slots.
+//
+// TODO: In the future we plan to improve stack coloring in the following ways:
+// 1. Allow merging multiple small slots into a single larger slot at different
+// offsets.
+// 2. Merge this pass with StackSlotColoring and allow merging of allocas with
+// spill slots.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stackcoloring"
+
+static cl::opt<bool>
+DisableColoring("no-stack-coloring",
+ cl::init(false), cl::Hidden,
+ cl::desc("Disable stack coloring"));
+
+/// The user may write code that uses allocas outside of the declared lifetime
+/// zone. This can happen when the user returns a reference to a local
+/// data-structure. We can detect these cases and decide not to optimize the
+/// code. If this flag is enabled, we try to save the user. This option
+/// is treated as overriding LifetimeStartOnFirstUse below.
+static cl::opt<bool>
+ProtectFromEscapedAllocas("protect-from-escaped-allocas",
+ cl::init(false), cl::Hidden,
+ cl::desc("Do not optimize lifetime zones that "
+ "are broken"));
+
+/// Enable enhanced dataflow scheme for lifetime analysis (treat first
+/// use of stack slot as start of slot lifetime, as opposed to looking
+/// for LIFETIME_START marker). See "Implementation notes" below for
+/// more info.
+static cl::opt<bool>
+LifetimeStartOnFirstUse("stackcoloring-lifetime-start-on-first-use",
+ cl::init(true), cl::Hidden,
+ cl::desc("Treat stack lifetimes as starting on first use, not on START marker."));
+
+
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
+STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
+STATISTIC(StackSlotMerged, "Number of stack slots merged.");
+STATISTIC(EscapedAllocas, "Number of allocas that escaped the lifetime region");
+
+//
+// Implementation Notes:
+// ---------------------
+//
+// Consider the following motivating example:
+//
+// int foo() {
+// char b1[1024], b2[1024];
+// if (...) {
+// char b3[1024];
+// <uses of b1, b3>;
+// return x;
+// } else {
+// char b4[1024], b5[1024];
+// <uses of b2, b4, b5>;
+// return y;
+// }
+// }
+//
+// In the code above, "b3" and "b4" are declared in distinct lexical
+// scopes, meaning that it is easy to prove that they can share the
+// same stack slot. Variables "b1" and "b2" are declared in the same
+// scope, meaning that from a lexical point of view, their lifetimes
+// overlap. From a control flow point of view, however, the two
+// variables are accessed in disjoint regions of the CFG, thus it
+// should be possible for them to share the same stack slot. An ideal
+// stack allocation for the function above would look like:
+//
+// slot 0: b1, b2
+// slot 1: b3, b4
+// slot 2: b5
+//
+// Achieving this allocation is tricky, however, due to the way
+// lifetime markers are inserted. Here is a simplified view of the
+// control flow graph for the code above:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------ block 2 -------+
+// 2| LIFETIME_START b3 | 5| LIFETIME_START b4, b5 |
+// 3| <uses of b1, b3> | 6| <uses of b2, b4, b5> |
+// 4| LIFETIME_END b3 | 7| LIFETIME_END b4, b5 |
+// +-----------------------+ +-----------------------+
+// \. /.
+// +------ block 3 -------+
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +-----------------------+
+//
+// If we create live intervals for the variables above strictly based
+// on the lifetime markers, we'll get the set of intervals on the
+// left. If we ignore the lifetime start markers and instead treat a
+// variable's lifetime as beginning with the first reference to the
+// var, then we get the intervals on the right.
+//
+// LIFETIME_START First Use
+// b1: [0,9] [3,4] [8,9]
+// b2: [0,9] [6,9]
+// b3: [2,4] [3,4]
+// b4: [5,7] [6,7]
+// b5: [5,7] [6,7]
+//
+// For the intervals on the left, the best we can do is overlap two
+// variables (b3 and b4, for example); this gives us a stack size of
+// 4*1024 bytes, not ideal. When treating first-use as the start of a
+// lifetime, we can additionally overlap b1 and b5, giving us a 3*1024
+// byte stack (better).
+//
+// Relying entirely on first-use of stack slots is problematic,
+// however, due to the fact that optimizations can sometimes migrate
+// uses of a variable outside of its lifetime start/end region. Here
+// is an example:
+//
+// int bar() {
+// char b1[1024], b2[1024];
+// if (...) {
+// <uses of b2>
+// return y;
+// } else {
+// <uses of b1>
+// while (...) {
+// char b3[1024];
+// <uses of b3>
+// }
+// }
+// }
+//
+// Before optimization, the control flow graph for the code above
+// might look like the following:
+//
+// +------ block 0 -------+
+// 0| LIFETIME_START b1, b2 |
+// 1| <test 'if' condition> |
+// +-----------------------+
+// ./ \.
+// +------ block 1 -------+ +------- block 2 -------+
+// 2| <uses of b2> | 3| <uses of b1> |
+// +-----------------------+ +-----------------------+
+// | |
+// | +------- block 3 -------+ <-\.
+// | 4| <while condition> | |
+// | +-----------------------+ |
+// | / | |
+// | / +------- block 4 -------+
+// \ / 5| LIFETIME_START b3 | |
+// \ / 6| <uses of b3> | |
+// \ / 7| LIFETIME_END b3 | |
+// \ | +------------------------+ |
+// \ | \ /
+// +------ block 5 -----+ \---------------
+// 8| <cleanupcode> |
+// 9| LIFETIME_END b1, b2 |
+// 10| return |
+// +---------------------+
+//
+// During optimization, however, it can happen that an instruction
+// computing an address in "b3" (for example, a loop-invariant GEP) is
+// hoisted up out of the loop from block 4 to block 2. [Note that
+// this is not an actual load from the stack, only an instruction that
+// computes the address to be loaded]. If this happens, there is now a
+// path leading from the first use of b3 to the return instruction
+// that does not encounter the b3 LIFETIME_END, hence b3's lifetime is
+// now larger than if we were computing live intervals strictly based
+// on lifetime markers. In the example above, this lengthened lifetime
+// would mean that it would appear illegal to overlap b3 with b2.
+//
+// To deal with such cases, the code in ::collectMarkers() below
+// tries to identify "degenerate" slots -- those slots where on a single
+// forward pass through the CFG we encounter a first reference to slot
+// K before we hit the slot K lifetime start marker. For such slots,
+// we fall back on using the lifetime start marker as the beginning of
+// the variable's lifetime. NB: with this implementation, slots can
+// appear degenerate in cases where there is unstructured control flow:
+//
+// if (q) goto mid;
+// if (x > 9) {
+// int b[100];
+// memcpy(&b[0], ...);
+// mid: b[k] = ...;
+// abc(&b);
+// }
+//
+// If, in the RPO ordering chosen to walk the CFG, we happen to visit the b[k]
+// assignment before visiting the memcpy block (which will contain the
+// lifetime start for "b"), then it will appear that 'b' has a degenerate
+// lifetime.
+//
+
+//===----------------------------------------------------------------------===//
+// StackColoring Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+/// StackColoring - A machine pass for merging disjoint stack allocations,
+/// marked by the LIFETIME_START and LIFETIME_END pseudo instructions.
+class StackColoring : public MachineFunctionPass {
+ MachineFrameInfo *MFI;
+ MachineFunction *MF;
+
+ /// A class representing liveness information for a single basic block.
+ /// Each bit in the BitVector represents the liveness property
+ /// for a different stack slot.
+ struct BlockLifetimeInfo {
+    /// Which slots BEGIN in each basic block.
+    BitVector Begin;
+    /// Which slots END in each basic block.
+    BitVector End;
+ /// Which slots are marked as LIVE_IN, coming into each basic block.
+ BitVector LiveIn;
+ /// Which slots are marked as LIVE_OUT, coming out of each basic block.
+ BitVector LiveOut;
+ };
+
+ /// Maps active slots (per bit) for each basic block.
+ typedef DenseMap<const MachineBasicBlock*, BlockLifetimeInfo> LivenessMap;
+ LivenessMap BlockLiveness;
+
+  /// Maps basic blocks to their serial number.
+  DenseMap<const MachineBasicBlock*, int> BasicBlocks;
+  /// Maps serial numbers to basic blocks.
+  SmallVector<const MachineBasicBlock*, 8> BasicBlockNumbering;
+
+ /// Maps liveness intervals for each slot.
+ SmallVector<std::unique_ptr<LiveInterval>, 16> Intervals;
+ /// VNInfo is used for the construction of LiveIntervals.
+ VNInfo::Allocator VNInfoAllocator;
+ /// SlotIndex analysis object.
+ SlotIndexes *Indexes;
+ /// The stack protector object.
+ StackProtector *SP;
+
+ /// The list of lifetime markers found. These markers are to be removed
+ /// once the coloring is done.
+ SmallVector<MachineInstr*, 8> Markers;
+
+ /// Record the FI slots for which we have seen some sort of
+ /// lifetime marker (either start or end).
+ BitVector InterestingSlots;
+
+ /// FI slots that need to be handled conservatively (for these
+ /// slots lifetime-start-on-first-use is disabled).
+ BitVector ConservativeSlots;
+
+ /// Number of iterations taken during data flow analysis.
+ unsigned NumIterations;
+
+public:
+ static char ID;
+ StackColoring() : MachineFunctionPass(ID) {
+ initializeStackColoringPass(*PassRegistry::getPassRegistry());
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// Debug.
+ void dump() const;
+ void dumpIntervals() const;
+ void dumpBB(MachineBasicBlock *MBB) const;
+ void dumpBV(const char *tag, const BitVector &BV) const;
+
+ /// Removes all of the lifetime marker instructions from the function.
+ /// \returns true if any markers were removed.
+ bool removeAllMarkers();
+
+ /// Scan the machine function and find all of the lifetime markers.
+ /// Record the findings in the BEGIN and END vectors.
+ /// \returns the number of markers found.
+ unsigned collectMarkers(unsigned NumSlot);
+
+  /// Perform the dataflow calculation and calculate the lifetime for each of
+  /// the slots, based on the BEGIN/END vectors. Set the LiveIn and LiveOut
+  /// sets that represent which stack slots are live coming into and out of
+  /// each basic block.
+ void calculateLocalLiveness();
+
+ /// Returns TRUE if we're using the first-use-begins-lifetime method for
+ /// this slot (if FALSE, then the start marker is treated as start of lifetime).
+ bool applyFirstUse(int Slot) {
+ if (!LifetimeStartOnFirstUse || ProtectFromEscapedAllocas)
+ return false;
+ if (ConservativeSlots.test(Slot))
+ return false;
+ return true;
+ }
+
+ /// Examines the specified instruction and returns TRUE if the instruction
+ /// represents the start or end of an interesting lifetime. The slot or slots
+ /// starting or ending are added to the vector "slots" and "isStart" is set
+ /// accordingly.
+ /// \returns True if inst contains a lifetime start or end
+ bool isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart);
+
+ /// Construct the LiveIntervals for the slots.
+ void calculateLiveIntervals(unsigned NumSlots);
+
+ /// Go over the machine function and change instructions which use stack
+ /// slots to use the joint slots.
+ void remapInstructions(DenseMap<int, int> &SlotRemap);
+
+ /// The input program may contain instructions which are not inside lifetime
+ /// markers. This can happen due to a bug in the compiler or due to a bug in
+ /// user code (for example, returning a reference to a local variable).
+ /// This procedure checks all of the instructions in the function and
+ /// invalidates lifetime ranges which do not contain all of the instructions
+ /// which access that frame slot.
+ void removeInvalidSlotRanges();
+
+ /// Map entries which point to other entries to their destination.
+ /// A->B->C becomes A->C.
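+  /// For example (informally), {1 -> 3, 3 -> 5} becomes {1 -> 5, 3 -> 5}.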
+ void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
+
+ /// Used in collectMarkers
+ typedef DenseMap<const MachineBasicBlock*, BitVector> BlockBitVecMap;
+};
+} // end anonymous namespace
+
+char StackColoring::ID = 0;
+char &llvm::StackColoringID = StackColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(StackProtector)
+INITIALIZE_PASS_END(StackColoring,
+ "stack-coloring", "Merge disjoint stack slots", false, false)
+
+void StackColoring::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<SlotIndexes>();
+ AU.addRequired<StackProtector>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+#ifndef NDEBUG
+
+LLVM_DUMP_METHOD void StackColoring::dumpBV(const char *tag,
+ const BitVector &BV) const {
+ DEBUG(dbgs() << tag << " : { ");
+ for (unsigned I = 0, E = BV.size(); I != E; ++I)
+ DEBUG(dbgs() << BV.test(I) << " ");
+ DEBUG(dbgs() << "}\n");
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpBB(MachineBasicBlock *MBB) const {
+ LivenessMap::const_iterator BI = BlockLiveness.find(MBB);
+ assert(BI != BlockLiveness.end() && "Block not found");
+ const BlockLifetimeInfo &BlockInfo = BI->second;
+
+ dumpBV("BEGIN", BlockInfo.Begin);
+ dumpBV("END", BlockInfo.End);
+ dumpBV("LIVE_IN", BlockInfo.LiveIn);
+ dumpBV("LIVE_OUT", BlockInfo.LiveOut);
+}
+
+LLVM_DUMP_METHOD void StackColoring::dump() const {
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+ DEBUG(dbgs() << "Inspecting block #" << MBB->getNumber() << " ["
+ << MBB->getName() << "]\n");
+ DEBUG(dumpBB(MBB));
+ }
+}
+
+LLVM_DUMP_METHOD void StackColoring::dumpIntervals() const {
+ for (unsigned I = 0, E = Intervals.size(); I != E; ++I) {
+ DEBUG(dbgs() << "Interval[" << I << "]:\n");
+ DEBUG(Intervals[I]->dump());
+ }
+}
+
+#endif // not NDEBUG
+
+static inline int getStartOrEndSlot(const MachineInstr &MI)
+{
+ assert((MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) &&
+ "Expected LIFETIME_START or LIFETIME_END op");
+ const MachineOperand &MO = MI.getOperand(0);
+ int Slot = MO.getIndex();
+ if (Slot >= 0)
+ return Slot;
+ return -1;
+}
+
+//
+// At the moment the only way to end a variable lifetime is with
+// a LIFETIME_END op (which can't contain a start). If things
+// change and the IR allows for a single inst that both begins
+// and ends lifetime(s), this interface will need to be reworked.
+//
+bool StackColoring::isLifetimeStartOrEnd(const MachineInstr &MI,
+ SmallVector<int, 4> &slots,
+ bool &isStart)
+{
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ return false;
+ if (!InterestingSlots.test(Slot))
+ return false;
+ slots.push_back(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ isStart = false;
+ return true;
+ }
+ if (! applyFirstUse(Slot)) {
+ isStart = true;
+ return true;
+ }
+ } else if (LifetimeStartOnFirstUse && !ProtectFromEscapedAllocas) {
+ if (! MI.isDebugValue()) {
+ bool found = false;
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot<0)
+ continue;
+ if (InterestingSlots.test(Slot) && applyFirstUse(Slot)) {
+ slots.push_back(Slot);
+ found = true;
+ }
+ }
+ if (found) {
+ isStart = true;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+unsigned StackColoring::collectMarkers(unsigned NumSlot)
+{
+ unsigned MarkersFound = 0;
+ BlockBitVecMap SeenStartMap;
+ InterestingSlots.clear();
+ InterestingSlots.resize(NumSlot);
+ ConservativeSlots.clear();
+ ConservativeSlots.resize(NumSlot);
+
+ // number of start and end lifetime ops for each slot
+ SmallVector<int, 8> NumStartLifetimes(NumSlot, 0);
+ SmallVector<int, 8> NumEndLifetimes(NumSlot, 0);
+
+ // Step 1: collect markers and populate the "InterestingSlots"
+ // and "ConservativeSlots" sets.
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+
+ // Compute the set of slots for which we've seen a START marker but have
+ // not yet seen an END marker at this point in the walk (e.g. on entry
+ // to this bb).
+ BitVector BetweenStartEnd;
+ BetweenStartEnd.resize(NumSlot);
+ for (MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(),
+ PE = MBB->pred_end(); PI != PE; ++PI) {
+ BlockBitVecMap::const_iterator I = SeenStartMap.find(*PI);
+ if (I != SeenStartMap.end()) {
+ BetweenStartEnd |= I->second;
+ }
+ }
+
+ // Walk the instructions in the block to look for start/end ops.
+ for (MachineInstr &MI : *MBB) {
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START ||
+ MI.getOpcode() == TargetOpcode::LIFETIME_END) {
+ int Slot = getStartOrEndSlot(MI);
+ if (Slot < 0)
+ continue;
+ InterestingSlots.set(Slot);
+ if (MI.getOpcode() == TargetOpcode::LIFETIME_START) {
+ BetweenStartEnd.set(Slot);
+ NumStartLifetimes[Slot] += 1;
+ } else {
+ BetweenStartEnd.reset(Slot);
+ NumEndLifetimes[Slot] += 1;
+ }
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs() << "Found a lifetime ");
+ DEBUG(dbgs() << (MI.getOpcode() == TargetOpcode::LIFETIME_START
+ ? "start"
+ : "end"));
+ DEBUG(dbgs() << " marker for slot #" << Slot);
+ DEBUG(dbgs() << " with allocation: " << Allocation->getName()
+ << "\n");
+ }
+ Markers.push_back(&MI);
+ MarkersFound += 1;
+ } else {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isFI())
+ continue;
+ int Slot = MO.getIndex();
+ if (Slot < 0)
+ continue;
+ if (! BetweenStartEnd.test(Slot)) {
+ ConservativeSlots.set(Slot);
+ }
+ }
+ }
+ }
+ BitVector &SeenStart = SeenStartMap[MBB];
+ SeenStart |= BetweenStartEnd;
+ }
+ if (!MarkersFound) {
+ return 0;
+ }
+
+ // PR27903: slots with multiple start or end lifetime ops are not
+ // safe to enable for "lifetime-start-on-first-use".
+ for (unsigned slot = 0; slot < NumSlot; ++slot)
+ if (NumStartLifetimes[slot] > 1 || NumEndLifetimes[slot] > 1)
+ ConservativeSlots.set(slot);
+ DEBUG(dumpBV("Conservative slots", ConservativeSlots));
+
+ // Step 2: compute begin/end sets for each block
+
+ // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
+ // deterministic numbering, and because we'll need a post-order iteration
+ // later for solving the liveness dataflow problem.
+ for (MachineBasicBlock *MBB : depth_first(MF)) {
+
+ // Assign a serial number to this basic block.
+ BasicBlocks[MBB] = BasicBlockNumbering.size();
+ BasicBlockNumbering.push_back(MBB);
+
+ // Keep a reference to avoid repeated lookups.
+ BlockLifetimeInfo &BlockInfo = BlockLiveness[MBB];
+
+ BlockInfo.Begin.resize(NumSlot);
+ BlockInfo.End.resize(NumSlot);
+
+ SmallVector<int, 4> slots;
+ for (MachineInstr &MI : *MBB) {
+ bool isStart = false;
+ slots.clear();
+ if (isLifetimeStartOrEnd(MI, slots, isStart)) {
+ if (!isStart) {
+ assert(slots.size() == 1 && "unexpected: MI ends multiple slots");
+ int Slot = slots[0];
+ if (BlockInfo.Begin.test(Slot)) {
+ BlockInfo.Begin.reset(Slot);
+ }
+ BlockInfo.End.set(Slot);
+ } else {
+ for (auto Slot : slots) {
+ DEBUG(dbgs() << "Found a use of slot #" << Slot);
+ DEBUG(dbgs() << " at BB#" << MBB->getNumber() << " index ");
+ DEBUG(Indexes->getInstructionIndex(MI).print(dbgs()));
+ const AllocaInst *Allocation = MFI->getObjectAllocation(Slot);
+ if (Allocation) {
+ DEBUG(dbgs() << " with allocation: "<< Allocation->getName());
+ }
+ DEBUG(dbgs() << "\n");
+ if (BlockInfo.End.test(Slot)) {
+ BlockInfo.End.reset(Slot);
+ }
+ BlockInfo.Begin.set(Slot);
+ }
+ }
+ }
+ }
+ }
+
+ // Update statistics.
+ NumMarkerSeen += MarkersFound;
+ return MarkersFound;
+}
+
+void StackColoring::calculateLocalLiveness()
+{
+ unsigned NumIters = 0;
+ bool changed = true;
+ while (changed) {
+ changed = false;
+ ++NumIters;
+
+ for (const MachineBasicBlock *BB : BasicBlockNumbering) {
+
+ // Use an iterator to avoid repeated lookups.
+ LivenessMap::iterator BI = BlockLiveness.find(BB);
+ assert(BI != BlockLiveness.end() && "Block not found");
+ BlockLifetimeInfo &BlockInfo = BI->second;
+
+ // Compute LiveIn by unioning together the LiveOut sets of all preds.
+ BitVector LocalLiveIn;
+ for (MachineBasicBlock::const_pred_iterator PI = BB->pred_begin(),
+ PE = BB->pred_end(); PI != PE; ++PI) {
+ LivenessMap::const_iterator I = BlockLiveness.find(*PI);
+ assert(I != BlockLiveness.end() && "Predecessor not found");
+ LocalLiveIn |= I->second.LiveOut;
+ }
+
+ // Compute LiveOut by subtracting out lifetimes that end in this
+ // block, then adding in lifetimes that begin in this block. If
+ // we have both BEGIN and END markers in the same basic block
+ // then we know that the BEGIN marker comes after the END,
+ // because we already handle the case where the BEGIN comes
+ // before the END when collecting the markers (and building the
+ // BEGIN/END vectors).
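+      // Informally, the transfer function computed below is
+      //   LiveOut(BB) = (LiveIn(BB) - End(BB)) | Begin(BB)
+      // where LiveIn(BB) is the union of LiveOut over all predecessors of BB.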
+ BitVector LocalLiveOut = LocalLiveIn;
+ LocalLiveOut.reset(BlockInfo.End);
+ LocalLiveOut |= BlockInfo.Begin;
+
+ // Update block LiveIn set, noting whether it has changed.
+ if (LocalLiveIn.test(BlockInfo.LiveIn)) {
+ changed = true;
+ BlockInfo.LiveIn |= LocalLiveIn;
+ }
+
+ // Update block LiveOut set, noting whether it has changed.
+ if (LocalLiveOut.test(BlockInfo.LiveOut)) {
+ changed = true;
+ BlockInfo.LiveOut |= LocalLiveOut;
+ }
+ }
+ }// while changed.
+
+ NumIterations = NumIters;
+}
+
+void StackColoring::calculateLiveIntervals(unsigned NumSlots) {
+ SmallVector<SlotIndex, 16> Starts;
+ SmallVector<SlotIndex, 16> Finishes;
+
+ // For each block, find which slots are active within this block
+ // and update the live intervals.
+ for (const MachineBasicBlock &MBB : *MF) {
+ Starts.clear();
+ Starts.resize(NumSlots);
+ Finishes.clear();
+ Finishes.resize(NumSlots);
+
+ // Create the interval for the basic blocks containing lifetime begin/end.
+ for (const MachineInstr &MI : MBB) {
+
+ SmallVector<int, 4> slots;
+ bool IsStart = false;
+ if (!isLifetimeStartOrEnd(MI, slots, IsStart))
+ continue;
+ SlotIndex ThisIndex = Indexes->getInstructionIndex(MI);
+ for (auto Slot : slots) {
+ if (IsStart) {
+ if (!Starts[Slot].isValid() || Starts[Slot] > ThisIndex)
+ Starts[Slot] = ThisIndex;
+ } else {
+ if (!Finishes[Slot].isValid() || Finishes[Slot] < ThisIndex)
+ Finishes[Slot] = ThisIndex;
+ }
+ }
+ }
+
+ // Create the interval of the blocks that we previously found to be 'alive'.
+ BlockLifetimeInfo &MBBLiveness = BlockLiveness[&MBB];
+ for (int pos = MBBLiveness.LiveIn.find_first(); pos != -1;
+ pos = MBBLiveness.LiveIn.find_next(pos)) {
+ Starts[pos] = Indexes->getMBBStartIdx(&MBB);
+ }
+ for (int pos = MBBLiveness.LiveOut.find_first(); pos != -1;
+ pos = MBBLiveness.LiveOut.find_next(pos)) {
+ Finishes[pos] = Indexes->getMBBEndIdx(&MBB);
+ }
+
+ for (unsigned i = 0; i < NumSlots; ++i) {
+ //
+ // When LifetimeStartOnFirstUse is turned on, data flow analysis
+ // is forward (from starts to ends), not bidirectional. A
+ // consequence of this is that we can wind up in situations
+ // where Starts[i] is invalid but Finishes[i] is valid and vice
+ // versa. Example:
+ //
+ // LIFETIME_START x
+ // if (...) {
+ // <use of x>
+ // throw ...;
+ // }
+ // LIFETIME_END x
+ // return 2;
+ //
+ //
+ // Here the slot for "x" will not be live into the block
+ // containing the "return 2" (since lifetimes start with first
+ // use, not at the dominating LIFETIME_START marker).
+ //
+ if (Starts[i].isValid() && !Finishes[i].isValid()) {
+ Finishes[i] = Indexes->getMBBEndIdx(&MBB);
+ }
+ if (!Starts[i].isValid())
+ continue;
+
+ assert(Starts[i] && Finishes[i] && "Invalid interval");
+ VNInfo *ValNum = Intervals[i]->getValNumInfo(0);
+ SlotIndex S = Starts[i];
+ SlotIndex F = Finishes[i];
+ if (S < F) {
+ // We have a single consecutive region.
+ Intervals[i]->addSegment(LiveInterval::Segment(S, F, ValNum));
+ } else {
+ // We have two non-consecutive regions. This happens when
+ // LIFETIME_START appears after the LIFETIME_END marker.
+ SlotIndex NewStart = Indexes->getMBBStartIdx(&MBB);
+ SlotIndex NewFin = Indexes->getMBBEndIdx(&MBB);
+ Intervals[i]->addSegment(LiveInterval::Segment(NewStart, F, ValNum));
+ Intervals[i]->addSegment(LiveInterval::Segment(S, NewFin, ValNum));
+ }
+ }
+ }
+}
+
+bool StackColoring::removeAllMarkers() {
+ unsigned Count = 0;
+ for (MachineInstr *MI : Markers) {
+ MI->eraseFromParent();
+ Count++;
+ }
+ Markers.clear();
+
+ DEBUG(dbgs()<<"Removed "<<Count<<" markers.\n");
+ return Count;
+}
+
+void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
+ unsigned FixedInstr = 0;
+ unsigned FixedMemOp = 0;
+ unsigned FixedDbg = 0;
+ MachineModuleInfo *MMI = &MF->getMMI();
+
+ // Remap debug information that refers to stack slots.
+ for (auto &VI : MMI->getVariableDbgInfo()) {
+ if (!VI.Var)
+ continue;
+ if (SlotRemap.count(VI.Slot)) {
+ DEBUG(dbgs() << "Remapping debug info for ["
+ << cast<DILocalVariable>(VI.Var)->getName() << "].\n");
+ VI.Slot = SlotRemap[VI.Slot];
+ FixedDbg++;
+ }
+ }
+
+ // Keep a list of *allocas* which need to be remapped.
+ DenseMap<const AllocaInst*, const AllocaInst*> Allocas;
+ for (const std::pair<int, int> &SI : SlotRemap) {
+ const AllocaInst *From = MFI->getObjectAllocation(SI.first);
+ const AllocaInst *To = MFI->getObjectAllocation(SI.second);
+ assert(To && From && "Invalid allocation object");
+ Allocas[From] = To;
+
+ // AA might be used later for instruction scheduling, and we need it to be
+    // able to deduce the correct aliasing relationships between pointers
+ // derived from the alloca being remapped and the target of that remapping.
+ // The only safe way, without directly informing AA about the remapping
+ // somehow, is to directly update the IR to reflect the change being made
+ // here.
+ Instruction *Inst = const_cast<AllocaInst *>(To);
+ if (From->getType() != To->getType()) {
+ BitCastInst *Cast = new BitCastInst(Inst, From->getType());
+ Cast->insertAfter(Inst);
+ Inst = Cast;
+ }
+
+ // Allow the stack protector to adjust its value map to account for the
+ // upcoming replacement.
+ SP->adjustForColoring(From, To);
+
+ // The new alloca might not be valid in a llvm.dbg.declare for this
+ // variable, so undef out the use to make the verifier happy.
+ AllocaInst *FromAI = const_cast<AllocaInst *>(From);
+ if (FromAI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(FromAI, UndefValue::get(FromAI->getType()));
+ for (auto &Use : FromAI->uses()) {
+ if (BitCastInst *BCI = dyn_cast<BitCastInst>(Use.get()))
+ if (BCI->isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(BCI, UndefValue::get(BCI->getType()));
+ }
+
+ // Note that this will not replace uses in MMOs (which we'll update below),
+ // or anywhere else (which is why we won't delete the original
+ // instruction).
+ FromAI->replaceAllUsesWith(Inst);
+ }
+
+ // Remap all instructions to the new stack slots.
+ for (MachineBasicBlock &BB : *MF)
+ for (MachineInstr &I : BB) {
+ // Skip lifetime markers. We'll remove them soon.
+ if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
+ I.getOpcode() == TargetOpcode::LIFETIME_END)
+ continue;
+
+ // Update the MachineMemOperand to use the new alloca.
+ for (MachineMemOperand *MMO : I.memoperands()) {
+ // FIXME: In order to enable the use of TBAA when using AA in CodeGen,
+ // we'll also need to update the TBAA nodes in MMOs with values
+ // derived from the merged allocas. When doing this, we'll need to use
+ // the same variant of GetUnderlyingObjects that is used by the
+ // instruction scheduler (that can look through ptrtoint/inttoptr
+ // pairs).
+
+ // We've replaced IR-level uses of the remapped allocas, so we only
+ // need to replace direct uses here.
+ const AllocaInst *AI = dyn_cast_or_null<AllocaInst>(MMO->getValue());
+ if (!AI)
+ continue;
+
+ if (!Allocas.count(AI))
+ continue;
+
+ MMO->setValue(Allocas[AI]);
+ FixedMemOp++;
+ }
+
+ // Update all of the machine instruction operands.
+ for (MachineOperand &MO : I.operands()) {
+ if (!MO.isFI())
+ continue;
+ int FromSlot = MO.getIndex();
+
+ // Don't touch arguments.
+ if (FromSlot<0)
+ continue;
+
+ // Only look at mapped slots.
+ if (!SlotRemap.count(FromSlot))
+ continue;
+
+ // In a debug build, check that the instruction that we are modifying is
+ // inside the expected live range. If the instruction is not inside
+ // the calculated range then it means that the alloca usage moved
+ // outside of the lifetime markers, or that the user has a bug.
+ // NOTE: Alloca address calculations which happen outside the lifetime
+        // zone are okay, despite the fact that we don't have a good way
+ // for validating all of the usages of the calculation.
+#ifndef NDEBUG
+ bool TouchesMemory = I.mayLoad() || I.mayStore();
+ // If we *don't* protect the user from escaped allocas, don't bother
+ // validating the instructions.
+ if (!I.isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ const LiveInterval *Interval = &*Intervals[FromSlot];
+ assert(Interval->find(Index) != Interval->end() &&
+ "Found instruction usage outside of live range.");
+ }
+#endif
+
+ // Fix the machine instructions.
+ int ToSlot = SlotRemap[FromSlot];
+ MO.setIndex(ToSlot);
+ FixedInstr++;
+ }
+ }
+
+ // Update the location of C++ catch objects for the MSVC personality routine.
+ if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
+ for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
+ for (WinEHHandlerType &H : TBME.HandlerArray)
+ if (H.CatchObj.FrameIndex != INT_MAX &&
+ SlotRemap.count(H.CatchObj.FrameIndex))
+ H.CatchObj.FrameIndex = SlotRemap[H.CatchObj.FrameIndex];
+
+ DEBUG(dbgs()<<"Fixed "<<FixedMemOp<<" machine memory operands.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedDbg<<" debug locations.\n");
+ DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
+}
+
+void StackColoring::removeInvalidSlotRanges() {
+ for (MachineBasicBlock &BB : *MF)
+ for (MachineInstr &I : BB) {
+ if (I.getOpcode() == TargetOpcode::LIFETIME_START ||
+ I.getOpcode() == TargetOpcode::LIFETIME_END || I.isDebugValue())
+ continue;
+
+ // Some intervals are suspicious! In some cases we find address
+      // calculations outside of the lifetime zone, but no actual memory
+ // read or write. Memory accesses outside of the lifetime zone are a clear
+ // violation, but address calculations are okay. This can happen when
+ // GEPs are hoisted outside of the lifetime zone.
+ // So, in here we only check instructions which can read or write memory.
+ if (!I.mayLoad() && !I.mayStore())
+ continue;
+
+ // Check all of the machine operands.
+ for (const MachineOperand &MO : I.operands()) {
+ if (!MO.isFI())
+ continue;
+
+ int Slot = MO.getIndex();
+
+ if (Slot<0)
+ continue;
+
+ if (Intervals[Slot]->empty())
+ continue;
+
+ // Check that the used slot is inside the calculated lifetime range.
+ // If it is not, warn about it and invalidate the range.
+ LiveInterval *Interval = &*Intervals[Slot];
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ if (Interval->find(Index) == Interval->end()) {
+ Interval->clear();
+ DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
+ EscapedAllocas++;
+ }
+ }
+ }
+}
+
+void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
+ unsigned NumSlots) {
+ // Expunge slot remap map.
+ for (unsigned i=0; i < NumSlots; ++i) {
+ // If we are remapping i
+ if (SlotRemap.count(i)) {
+ int Target = SlotRemap[i];
+ // As long as our target is mapped to something else, follow it.
+ while (SlotRemap.count(Target)) {
+ Target = SlotRemap[Target];
+ SlotRemap[i] = Target;
+ }
+ }
+ }
+}
+
+bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
+ DEBUG(dbgs() << "********** Stack Coloring **********\n"
+ << "********** Function: "
+ << ((const Value*)Func.getFunction())->getName() << '\n');
+ MF = &Func;
+ MFI = MF->getFrameInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ SP = &getAnalysis<StackProtector>();
+ BlockLiveness.clear();
+ BasicBlocks.clear();
+ BasicBlockNumbering.clear();
+ Markers.clear();
+ Intervals.clear();
+ VNInfoAllocator.Reset();
+
+ unsigned NumSlots = MFI->getObjectIndexEnd();
+
+ // If there are no stack slots then there are no markers to remove.
+ if (!NumSlots)
+ return false;
+
+ SmallVector<int, 8> SortedSlots;
+ SortedSlots.reserve(NumSlots);
+ Intervals.reserve(NumSlots);
+
+ unsigned NumMarkers = collectMarkers(NumSlots);
+
+ unsigned TotalSize = 0;
+ DEBUG(dbgs()<<"Found "<<NumMarkers<<" markers and "<<NumSlots<<" slots\n");
+ DEBUG(dbgs()<<"Slot structure:\n");
+
+ for (int i=0; i < MFI->getObjectIndexEnd(); ++i) {
+ DEBUG(dbgs()<<"Slot #"<<i<<" - "<<MFI->getObjectSize(i)<<" bytes.\n");
+ TotalSize += MFI->getObjectSize(i);
+ }
+
+ DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
+
+  // Don't continue if there are not enough lifetime markers, the stack is
+  // too small, or we were told not to optimize the slots.
+ if (NumMarkers < 2 || TotalSize < 16 || DisableColoring ||
+ skipFunction(*Func.getFunction())) {
+ DEBUG(dbgs()<<"Will not try to merge slots.\n");
+ return removeAllMarkers();
+ }
+
+ for (unsigned i=0; i < NumSlots; ++i) {
+ std::unique_ptr<LiveInterval> LI(new LiveInterval(i, 0));
+ LI->getNextValue(Indexes->getZeroIndex(), VNInfoAllocator);
+ Intervals.push_back(std::move(LI));
+ SortedSlots.push_back(i);
+ }
+
+ // Calculate the liveness of each block.
+ calculateLocalLiveness();
+ DEBUG(dbgs() << "Dataflow iterations: " << NumIterations << "\n");
+ DEBUG(dump());
+
+ // Propagate the liveness information.
+ calculateLiveIntervals(NumSlots);
+ DEBUG(dumpIntervals());
+
+ // Search for allocas which are used outside of the declared lifetime
+ // markers.
+ if (ProtectFromEscapedAllocas)
+ removeInvalidSlotRanges();
+
+ // Maps old slots to new slots.
+ DenseMap<int, int> SlotRemap;
+ unsigned RemovedSlots = 0;
+ unsigned ReducedSize = 0;
+
+ // Do not bother looking at empty intervals.
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (Intervals[SortedSlots[I]]->empty())
+ SortedSlots[I] = -1;
+ }
+
+ // This is a simple greedy algorithm for merging allocas. First, sort the
+ // slots, placing the largest slots first. Next, perform an n^2 scan and look
+  // for disjoint slots. When you find disjoint slots, merge the smaller one
+ // into the bigger one and update the live interval. Remove the small alloca
+ // and continue.
+
+ // Sort the slots according to their size. Place unused slots at the end.
+ // Use stable sort to guarantee deterministic code generation.
+ std::stable_sort(SortedSlots.begin(), SortedSlots.end(),
+ [this](int LHS, int RHS) {
+    // We use -1 to denote an uninteresting slot. Place these slots at the end.
+ if (LHS == -1) return false;
+ if (RHS == -1) return true;
+ // Sort according to size.
+ return MFI->getObjectSize(LHS) > MFI->getObjectSize(RHS);
+ });
+
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
+ for (unsigned I = 0; I < NumSlots; ++I) {
+ if (SortedSlots[I] == -1)
+ continue;
+
+ for (unsigned J=I+1; J < NumSlots; ++J) {
+ if (SortedSlots[J] == -1)
+ continue;
+
+ int FirstSlot = SortedSlots[I];
+ int SecondSlot = SortedSlots[J];
+ LiveInterval *First = &*Intervals[FirstSlot];
+ LiveInterval *Second = &*Intervals[SecondSlot];
+ assert (!First->empty() && !Second->empty() && "Found an empty range");
+
+ // Merge disjoint slots.
+ if (!First->overlaps(*Second)) {
+ Changed = true;
+ First->MergeSegmentsInAsValue(*Second, First->getValNumInfo(0));
+ SlotRemap[SecondSlot] = FirstSlot;
+ SortedSlots[J] = -1;
+ DEBUG(dbgs()<<"Merging #"<<FirstSlot<<" and slots #"<<
+ SecondSlot<<" together.\n");
+ unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
+ MFI->getObjectAlignment(SecondSlot));
+
+ assert(MFI->getObjectSize(FirstSlot) >=
+ MFI->getObjectSize(SecondSlot) &&
+ "Merging a small object into a larger one");
+
+ RemovedSlots+=1;
+ ReducedSize += MFI->getObjectSize(SecondSlot);
+ MFI->setObjectAlignment(FirstSlot, MaxAlignment);
+ MFI->RemoveStackObject(SecondSlot);
+ }
+ }
+ }
+ }// While changed.
+
+ // Record statistics.
+ StackSpaceSaved += ReducedSize;
+ StackSlotMerged += RemovedSlots;
+ DEBUG(dbgs()<<"Merge "<<RemovedSlots<<" slots. Saved "<<
+ ReducedSize<<" bytes\n");
+
+ // Scan the entire function and update all machine operands that use frame
+ // indices to use the remapped frame index.
+ expungeSlotMap(SlotRemap, NumSlots);
+ remapInstructions(SlotRemap);
+
+ return removeAllMarkers();
+}
diff --git a/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp b/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
new file mode 100644
index 000000000000..87e4eb66c9c9
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackMapLivenessAnalysis.cpp
@@ -0,0 +1,172 @@
+//===-- StackMapLivenessAnalysis.cpp - StackMap Live Out Analysis ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the StackMap Liveness analysis pass. The pass calculates
+// the liveness for each basic block in a function and attaches the register
+// live-out information to a stackmap or patchpoint intrinsic if present.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stackmaps"
+
+static cl::opt<bool> EnablePatchPointLiveness(
+ "enable-patchpoint-liveness", cl::Hidden, cl::init(true),
+ cl::desc("Enable PatchPoint Liveness Analysis Pass"));
+
+STATISTIC(NumStackMapFuncVisited, "Number of functions visited");
+STATISTIC(NumStackMapFuncSkipped, "Number of functions skipped");
+STATISTIC(NumBBsVisited, "Number of basic blocks visited");
+STATISTIC(NumBBsHaveNoStackmap, "Number of basic blocks with no stackmap");
+STATISTIC(NumStackMaps, "Number of StackMaps visited");
+
+namespace {
+/// \brief This pass calculates the liveness information for each basic block in
+/// a function and attaches the register live-out information to a patchpoint
+/// intrinsic if present.
+///
+/// This pass can be disabled via the -enable-patchpoint-liveness=false flag.
+/// The pass skips functions that don't have any patchpoint intrinsics. The
+/// information provided by this pass is optional and not required by the
+/// aforementioned intrinsic in order to function.
+class StackMapLiveness : public MachineFunctionPass {
+ const TargetRegisterInfo *TRI;
+ LivePhysRegs LiveRegs;
+
+public:
+ static char ID;
+
+ /// \brief Default construct and initialize the pass.
+ StackMapLiveness();
+
+ /// \brief Tell the pass manager which passes we depend on and what
+ /// information we preserve.
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+
+ /// \brief Calculate the liveness information for the given machine function.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ /// \brief Performs the actual liveness calculation for the function.
+ bool calculateLiveness(MachineFunction &MF);
+
+ /// \brief Add the current register live set to the instruction.
+ void addLiveOutSetToMI(MachineFunction &MF, MachineInstr &MI);
+
+ /// \brief Create a register mask and initialize it with the registers from
+ /// the register live set.
+ uint32_t *createRegisterMask(MachineFunction &MF) const;
+};
+} // namespace
+
+char StackMapLiveness::ID = 0;
+char &llvm::StackMapLivenessID = StackMapLiveness::ID;
+INITIALIZE_PASS(StackMapLiveness, "stackmap-liveness",
+ "StackMap Liveness Analysis", false, false)
+
+/// Default construct and initialize the pass.
+StackMapLiveness::StackMapLiveness() : MachineFunctionPass(ID) {
+ initializeStackMapLivenessPass(*PassRegistry::getPassRegistry());
+}
+
+/// Tell the pass manager which passes we depend on and what information we
+/// preserve.
+void StackMapLiveness::getAnalysisUsage(AnalysisUsage &AU) const {
+ // We preserve all information.
+ AU.setPreservesAll();
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+/// Calculate the liveness information for the given machine function.
+bool StackMapLiveness::runOnMachineFunction(MachineFunction &MF) {
+ if (!EnablePatchPointLiveness)
+ return false;
+
+ DEBUG(dbgs() << "********** COMPUTING STACKMAP LIVENESS: " << MF.getName()
+ << " **********\n");
+ TRI = MF.getSubtarget().getRegisterInfo();
+ ++NumStackMapFuncVisited;
+
+ // Skip this function if there are no patchpoints to process.
+ if (!MF.getFrameInfo()->hasPatchPoint()) {
+ ++NumStackMapFuncSkipped;
+ return false;
+ }
+ return calculateLiveness(MF);
+}
+
+/// Performs the actual liveness calculation for the function.
+bool StackMapLiveness::calculateLiveness(MachineFunction &MF) {
+ bool HasChanged = false;
+ // For all basic blocks in the function.
+ for (auto &MBB : MF) {
+ DEBUG(dbgs() << "****** BB " << MBB.getName() << " ******\n");
+ LiveRegs.init(TRI);
+ // FIXME: This should probably be addLiveOuts().
+ LiveRegs.addLiveOutsNoPristines(MBB);
+ bool HasStackMap = false;
+ // Reverse iterate over all instructions and add the current live register
+ // set to an instruction if we encounter a patchpoint instruction.
+ for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
+ if (I->getOpcode() == TargetOpcode::PATCHPOINT) {
+ addLiveOutSetToMI(MF, *I);
+ HasChanged = true;
+ HasStackMap = true;
+ ++NumStackMaps;
+ }
+ DEBUG(dbgs() << " " << LiveRegs << " " << *I);
+ LiveRegs.stepBackward(*I);
+ }
+ ++NumBBsVisited;
+ if (!HasStackMap)
+ ++NumBBsHaveNoStackmap;
+ }
+ return HasChanged;
+}
+
+/// Add the current register live set to the instruction.
+void StackMapLiveness::addLiveOutSetToMI(MachineFunction &MF,
+ MachineInstr &MI) {
+ uint32_t *Mask = createRegisterMask(MF);
+ MachineOperand MO = MachineOperand::CreateRegLiveOut(Mask);
+ MI.addOperand(MF, MO);
+}
+
+/// Create a register mask and initialize it with the registers from the
+/// register live set.
+uint32_t *StackMapLiveness::createRegisterMask(MachineFunction &MF) const {
+ // The mask is owned and cleaned up by the Machine Function.
+ uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+ for (auto Reg : LiveRegs)
+ Mask[Reg / 32] |= 1U << (Reg % 32);
+
+ // Give the target a chance to adjust the mask.
+ TRI->adjustStackMapLiveOutMask(Mask);
+
+ return Mask;
+}
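The mask built by createRegisterMask packs one bit per physical register into 32-bit words. A short standalone sketch of that encoding, together with the decoding loop used later by StackMaps::parseRegisterLiveOutMask; the register numbers and register count below are made up for illustration.

#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const unsigned NumRegs = 96;                     // hypothetical target
  std::vector<uint32_t> Mask((NumRegs + 31) / 32, 0);

  // Mark a few registers live-out, exactly as the pass fills its mask.
  const unsigned LiveRegs[] = {3, 35, 64};
  for (unsigned Reg : LiveRegs)
    Mask[Reg / 32] |= 1U << (Reg % 32);

  // Decoding mirrors the bit test in parseRegisterLiveOutMask.
  for (unsigned Reg = 0; Reg != NumRegs; ++Reg)
    if ((Mask[Reg / 32] >> (Reg % 32)) & 1)
      std::printf("register %u is live-out\n", Reg);
}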
diff --git a/contrib/llvm/lib/CodeGen/StackMaps.cpp b/contrib/llvm/lib/CodeGen/StackMaps.cpp
new file mode 100644
index 000000000000..d91bb8066aed
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackMaps.cpp
@@ -0,0 +1,552 @@
+//===---------------------------- StackMaps.cpp ---------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOpcodes.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <iterator>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "stackmaps"
+
+static cl::opt<int> StackMapVersion(
+ "stackmap-version", cl::init(1),
+ cl::desc("Specify the stackmap encoding version (default = 1)"));
+
+const char *StackMaps::WSMP = "Stack Maps: ";
+
+PatchPointOpers::PatchPointOpers(const MachineInstr *MI)
+ : MI(MI), HasDef(MI->getOperand(0).isReg() && MI->getOperand(0).isDef() &&
+ !MI->getOperand(0).isImplicit()),
+ IsAnyReg(MI->getOperand(getMetaIdx(CCPos)).getImm() ==
+ CallingConv::AnyReg) {
+#ifndef NDEBUG
+ unsigned CheckStartIdx = 0, e = MI->getNumOperands();
+ while (CheckStartIdx < e && MI->getOperand(CheckStartIdx).isReg() &&
+ MI->getOperand(CheckStartIdx).isDef() &&
+ !MI->getOperand(CheckStartIdx).isImplicit())
+ ++CheckStartIdx;
+
+ assert(getMetaIdx() == CheckStartIdx &&
+ "Unexpected additional definition in Patchpoint intrinsic.");
+#endif
+}
+
+unsigned PatchPointOpers::getNextScratchIdx(unsigned StartIdx) const {
+ if (!StartIdx)
+ StartIdx = getVarIdx();
+
+ // Find the next scratch register (implicit def and early clobber)
+ unsigned ScratchIdx = StartIdx, e = MI->getNumOperands();
+ while (ScratchIdx < e &&
+ !(MI->getOperand(ScratchIdx).isReg() &&
+ MI->getOperand(ScratchIdx).isDef() &&
+ MI->getOperand(ScratchIdx).isImplicit() &&
+ MI->getOperand(ScratchIdx).isEarlyClobber()))
+ ++ScratchIdx;
+
+ assert(ScratchIdx != e && "No scratch register available");
+ return ScratchIdx;
+}
+
+StackMaps::StackMaps(AsmPrinter &AP) : AP(AP) {
+ if (StackMapVersion != 1)
+ llvm_unreachable("Unsupported stackmap version!");
+}
+
+/// Go up the super-register chain until we hit a valid dwarf register number.
+static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) {
+ int RegNum = TRI->getDwarfRegNum(Reg, false);
+ for (MCSuperRegIterator SR(Reg, TRI); SR.isValid() && RegNum < 0; ++SR)
+ RegNum = TRI->getDwarfRegNum(*SR, false);
+
+ assert(RegNum >= 0 && "Invalid Dwarf register number.");
+ return (unsigned)RegNum;
+}
+
+MachineInstr::const_mop_iterator
+StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE, LocationVec &Locs,
+ LiveOutVec &LiveOuts) const {
+ const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo();
+ if (MOI->isImm()) {
+ switch (MOI->getImm()) {
+ default:
+ llvm_unreachable("Unrecognized operand type.");
+ case StackMaps::DirectMemRefOp: {
+ auto &DL = AP.MF->getDataLayout();
+
+ unsigned Size = DL.getPointerSizeInBits();
+ assert((Size % 8) == 0 && "Need pointer size in bytes.");
+ Size /= 8;
+ unsigned Reg = (++MOI)->getReg();
+ int64_t Imm = (++MOI)->getImm();
+ Locs.emplace_back(StackMaps::Location::Direct, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
+ break;
+ }
+ case StackMaps::IndirectMemRefOp: {
+ int64_t Size = (++MOI)->getImm();
+ assert(Size > 0 && "Need a valid size for indirect memory locations.");
+ unsigned Reg = (++MOI)->getReg();
+ int64_t Imm = (++MOI)->getImm();
+ Locs.emplace_back(StackMaps::Location::Indirect, Size,
+ getDwarfRegNum(Reg, TRI), Imm);
+ break;
+ }
+ case StackMaps::ConstantOp: {
+ ++MOI;
+ assert(MOI->isImm() && "Expected constant operand.");
+ int64_t Imm = MOI->getImm();
+ Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, Imm);
+ break;
+ }
+ }
+ return ++MOI;
+ }
+
+ // The physical register number will ultimately be encoded as a DWARF regno.
+ // The stack map also records the size of a spill slot that can hold the
+ // register content. (The runtime can track the actual size of the data type
+ // if it needs to.)
+ if (MOI->isReg()) {
+ // Skip implicit registers (this includes our scratch registers)
+ if (MOI->isImplicit())
+ return ++MOI;
+
+ assert(TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) &&
+ "Virtreg operands should have been rewritten before now.");
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(MOI->getReg());
+ assert(!MOI->getSubReg() && "Physical subreg still around.");
+
+ unsigned Offset = 0;
+ unsigned DwarfRegNum = getDwarfRegNum(MOI->getReg(), TRI);
+ unsigned LLVMRegNum = TRI->getLLVMRegNum(DwarfRegNum, false);
+ unsigned SubRegIdx = TRI->getSubRegIndex(LLVMRegNum, MOI->getReg());
+ if (SubRegIdx)
+ Offset = TRI->getSubRegIdxOffset(SubRegIdx);
+
+ Locs.emplace_back(Location::Register, RC->getSize(), DwarfRegNum, Offset);
+ return ++MOI;
+ }
+
+ if (MOI->isRegLiveOut())
+ LiveOuts = parseRegisterLiveOutMask(MOI->getRegLiveOut());
+
+ return ++MOI;
+}
+
+void StackMaps::print(raw_ostream &OS) {
+ const TargetRegisterInfo *TRI =
+ AP.MF ? AP.MF->getSubtarget().getRegisterInfo() : nullptr;
+ OS << WSMP << "callsites:\n";
+ for (const auto &CSI : CSInfos) {
+ const LocationVec &CSLocs = CSI.Locations;
+ const LiveOutVec &LiveOuts = CSI.LiveOuts;
+
+ OS << WSMP << "callsite " << CSI.ID << "\n";
+ OS << WSMP << " has " << CSLocs.size() << " locations\n";
+
+ unsigned Idx = 0;
+ for (const auto &Loc : CSLocs) {
+ OS << WSMP << "\t\tLoc " << Idx << ": ";
+ switch (Loc.Type) {
+ case Location::Unprocessed:
+ OS << "<Unprocessed operand>";
+ break;
+ case Location::Register:
+ OS << "Register ";
+ if (TRI)
+ OS << TRI->getName(Loc.Reg);
+ else
+ OS << Loc.Reg;
+ break;
+ case Location::Direct:
+ OS << "Direct ";
+ if (TRI)
+ OS << TRI->getName(Loc.Reg);
+ else
+ OS << Loc.Reg;
+ if (Loc.Offset)
+ OS << " + " << Loc.Offset;
+ break;
+ case Location::Indirect:
+ OS << "Indirect ";
+ if (TRI)
+ OS << TRI->getName(Loc.Reg);
+ else
+ OS << Loc.Reg;
+ OS << "+" << Loc.Offset;
+ break;
+ case Location::Constant:
+ OS << "Constant " << Loc.Offset;
+ break;
+ case Location::ConstantIndex:
+ OS << "Constant Index " << Loc.Offset;
+ break;
+ }
+ OS << "\t[encoding: .byte " << Loc.Type << ", .byte " << Loc.Size
+ << ", .short " << Loc.Reg << ", .int " << Loc.Offset << "]\n";
+ Idx++;
+ }
+
+ OS << WSMP << "\thas " << LiveOuts.size() << " live-out registers\n";
+
+ Idx = 0;
+ for (const auto &LO : LiveOuts) {
+ OS << WSMP << "\t\tLO " << Idx << ": ";
+ if (TRI)
+ OS << TRI->getName(LO.Reg);
+ else
+ OS << LO.Reg;
+ OS << "\t[encoding: .short " << LO.DwarfRegNum << ", .byte 0, .byte "
+ << LO.Size << "]\n";
+ Idx++;
+ }
+ }
+}
+
+/// Create a live-out register record for the given register Reg.
+StackMaps::LiveOutReg
+StackMaps::createLiveOutReg(unsigned Reg, const TargetRegisterInfo *TRI) const {
+ unsigned DwarfRegNum = getDwarfRegNum(Reg, TRI);
+ unsigned Size = TRI->getMinimalPhysRegClass(Reg)->getSize();
+ return LiveOutReg(Reg, DwarfRegNum, Size);
+}
+
+/// Parse the register live-out mask and return a vector of live-out registers
+/// that need to be recorded in the stackmap.
+StackMaps::LiveOutVec
+StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
+ assert(Mask && "No register mask specified");
+ const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo();
+ LiveOutVec LiveOuts;
+
+ // Create a LiveOutReg for each bit that is set in the register mask.
+ for (unsigned Reg = 0, NumRegs = TRI->getNumRegs(); Reg != NumRegs; ++Reg)
+ if ((Mask[Reg / 32] >> Reg % 32) & 1)
+ LiveOuts.push_back(createLiveOutReg(Reg, TRI));
+
+ // We don't need to keep track of a register if its super-register is already
+ // in the list. Merge entries that refer to the same dwarf register and use
+ // the maximum size that needs to be spilled.
+
+ std::sort(LiveOuts.begin(), LiveOuts.end(),
+ [](const LiveOutReg &LHS, const LiveOutReg &RHS) {
+ // Only sort by the dwarf register number.
+ return LHS.DwarfRegNum < RHS.DwarfRegNum;
+ });
+
+ for (auto I = LiveOuts.begin(), E = LiveOuts.end(); I != E; ++I) {
+ for (auto II = std::next(I); II != E; ++II) {
+ if (I->DwarfRegNum != II->DwarfRegNum) {
+ // Skip all the now invalid entries.
+ I = --II;
+ break;
+ }
+ I->Size = std::max(I->Size, II->Size);
+ if (TRI->isSuperRegister(I->Reg, II->Reg))
+ I->Reg = II->Reg;
+ II->Reg = 0; // mark for deletion.
+ }
+ }
+
+ LiveOuts.erase(
+ std::remove_if(LiveOuts.begin(), LiveOuts.end(),
+ [](const LiveOutReg &LO) { return LO.Reg == 0; }),
+ LiveOuts.end());
+
+ return LiveOuts;
+}
+
+void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
+ MachineInstr::const_mop_iterator MOI,
+ MachineInstr::const_mop_iterator MOE,
+ bool recordResult) {
+
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCSymbol *MILabel = OutContext.createTempSymbol();
+ AP.OutStreamer->EmitLabel(MILabel);
+
+ LocationVec Locations;
+ LiveOutVec LiveOuts;
+
+ if (recordResult) {
+ assert(PatchPointOpers(&MI).hasDef() && "Stackmap has no return value.");
+ parseOperand(MI.operands_begin(), std::next(MI.operands_begin()), Locations,
+ LiveOuts);
+ }
+
+ // Parse operands.
+ while (MOI != MOE) {
+ MOI = parseOperand(MOI, MOE, Locations, LiveOuts);
+ }
+
+ // Move large constants into the constant pool.
+ for (auto &Loc : Locations) {
+ // Constants are encoded as sign-extended integers.
+ // -1 is directly encoded as .long 0xFFFFFFFF with no constant pool.
+ if (Loc.Type == Location::Constant && !isInt<32>(Loc.Offset)) {
+ Loc.Type = Location::ConstantIndex;
+ // ConstPool is intentionally a MapVector of 'uint64_t's (as
+ // opposed to 'int64_t's). We should never be in a situation
+ // where we have to insert either the tombstone or the empty
+ // keys into a map, and for a DenseMap<uint64_t, T> these are
+ // (uint64_t)0 and (uint64_t)-1. They can be and are
+ // represented using 32 bit integers.
+ assert((uint64_t)Loc.Offset != DenseMapInfo<uint64_t>::getEmptyKey() &&
+ (uint64_t)Loc.Offset !=
+ DenseMapInfo<uint64_t>::getTombstoneKey() &&
+ "empty and tombstone keys should fit in 32 bits!");
+ auto Result = ConstPool.insert(std::make_pair(Loc.Offset, Loc.Offset));
+ Loc.Offset = Result.first - ConstPool.begin();
+ }
+ }
+
+ // Create an expression to calculate the offset of the callsite from function
+ // entry.
+ const MCExpr *CSOffsetExpr = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(MILabel, OutContext),
+ MCSymbolRefExpr::create(AP.CurrentFnSymForSize, OutContext), OutContext);
+
+ CSInfos.emplace_back(CSOffsetExpr, ID, std::move(Locations),
+ std::move(LiveOuts));
+
+ // Record the stack size of the current function.
+ const MachineFrameInfo *MFI = AP.MF->getFrameInfo();
+ const TargetRegisterInfo *RegInfo = AP.MF->getSubtarget().getRegisterInfo();
+ bool HasDynamicFrameSize =
+ MFI->hasVarSizedObjects() || RegInfo->needsStackRealignment(*(AP.MF));
+ FnStackSize[AP.CurrentFnSym] =
+ HasDynamicFrameSize ? UINT64_MAX : MFI->getStackSize();
+}
+
+void StackMaps::recordStackMap(const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::STACKMAP && "expected stackmap");
+
+ int64_t ID = MI.getOperand(0).getImm();
+ recordStackMapOpers(MI, ID, std::next(MI.operands_begin(), 2),
+ MI.operands_end());
+}
+
+void StackMaps::recordPatchPoint(const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::PATCHPOINT && "expected patchpoint");
+
+ PatchPointOpers opers(&MI);
+ int64_t ID = opers.getMetaOper(PatchPointOpers::IDPos).getImm();
+
+ auto MOI = std::next(MI.operands_begin(), opers.getStackMapStartIdx());
+ recordStackMapOpers(MI, ID, MOI, MI.operands_end(),
+ opers.isAnyReg() && opers.hasDef());
+
+#ifndef NDEBUG
+ // verify anyregcc
+ auto &Locations = CSInfos.back().Locations;
+ if (opers.isAnyReg()) {
+ unsigned NArgs = opers.getMetaOper(PatchPointOpers::NArgPos).getImm();
+ for (unsigned i = 0, e = (opers.hasDef() ? NArgs + 1 : NArgs); i != e; ++i)
+ assert(Locations[i].Type == Location::Register &&
+ "anyreg arg must be in reg.");
+ }
+#endif
+}
+
+void StackMaps::recordStatepoint(const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
+
+ StatepointOpers opers(&MI);
+ // Record all the deopt and gc operands (they're contiguous and run from the
+ // initial index to the end of the operand list)
+ const unsigned StartIdx = opers.getVarIdx();
+ recordStackMapOpers(MI, opers.getID(), MI.operands_begin() + StartIdx,
+ MI.operands_end(), false);
+}
+
+/// Emit the stackmap header.
+///
+/// Header {
+/// uint8 : Stack Map Version (currently 1)
+/// uint8 : Reserved (expected to be 0)
+/// uint16 : Reserved (expected to be 0)
+/// }
+/// uint32 : NumFunctions
+/// uint32 : NumConstants
+/// uint32 : NumRecords
+void StackMaps::emitStackmapHeader(MCStreamer &OS) {
+ // Header.
+ OS.EmitIntValue(StackMapVersion, 1); // Version.
+ OS.EmitIntValue(0, 1); // Reserved.
+ OS.EmitIntValue(0, 2); // Reserved.
+
+ // Num functions.
+ DEBUG(dbgs() << WSMP << "#functions = " << FnStackSize.size() << '\n');
+ OS.EmitIntValue(FnStackSize.size(), 4);
+ // Num constants.
+ DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
+ OS.EmitIntValue(ConstPool.size(), 4);
+ // Num callsites.
+ DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
+ OS.EmitIntValue(CSInfos.size(), 4);
+}
+
+/// Emit the function frame record for each function.
+///
+/// StkSizeRecord[NumFunctions] {
+/// uint64 : Function Address
+/// uint64 : Stack Size
+/// }
+void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
+ // Function Frame records.
+ DEBUG(dbgs() << WSMP << "functions:\n");
+ for (auto const &FR : FnStackSize) {
+ DEBUG(dbgs() << WSMP << "function addr: " << FR.first
+ << " frame size: " << FR.second);
+ OS.EmitSymbolValue(FR.first, 8);
+ OS.EmitIntValue(FR.second, 8);
+ }
+}
+
+/// Emit the constant pool.
+///
+/// int64 : Constants[NumConstants]
+void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
+ // Constant pool entries.
+ DEBUG(dbgs() << WSMP << "constants:\n");
+ for (const auto &ConstEntry : ConstPool) {
+ DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
+ OS.EmitIntValue(ConstEntry.second, 8);
+ }
+}
+
+/// Emit the callsite info for each callsite.
+///
+/// StkMapRecord[NumRecords] {
+/// uint64 : PatchPoint ID
+/// uint32 : Instruction Offset
+/// uint16 : Reserved (record flags)
+/// uint16 : NumLocations
+/// Location[NumLocations] {
+/// uint8 : Register | Direct | Indirect | Constant | ConstantIndex
+/// uint8 : Size in Bytes
+/// uint16 : Dwarf RegNum
+/// int32 : Offset
+/// }
+/// uint16 : Padding
+/// uint16 : NumLiveOuts
+/// LiveOuts[NumLiveOuts] {
+/// uint16 : Dwarf RegNum
+/// uint8 : Reserved
+/// uint8 : Size in Bytes
+/// }
+///   uint32 : Padding (only if required to align to 8 bytes)
+/// }
+///
+/// Location Encoding, Type, Value:
+/// 0x1, Register, Reg (value in register)
+/// 0x2, Direct, Reg + Offset (frame index)
+/// 0x3, Indirect, [Reg + Offset] (spilled value)
+/// 0x4, Constant, Offset (small constant)
+/// 0x5, ConstIndex, Constants[Offset] (large constant)
+void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
+ DEBUG(print(dbgs()));
+ // Callsite entries.
+ for (const auto &CSI : CSInfos) {
+ const LocationVec &CSLocs = CSI.Locations;
+ const LiveOutVec &LiveOuts = CSI.LiveOuts;
+
+ // Verify stack map entry. It's better to communicate a problem to the
+ // runtime than crash in case of in-process compilation. Currently, we do
+ // simple overflow checks, but we may eventually communicate other
+ // compilation errors this way.
+ if (CSLocs.size() > UINT16_MAX || LiveOuts.size() > UINT16_MAX) {
+ OS.EmitIntValue(UINT64_MAX, 8); // Invalid ID.
+ OS.EmitValue(CSI.CSOffsetExpr, 4);
+ OS.EmitIntValue(0, 2); // Reserved.
+ OS.EmitIntValue(0, 2); // 0 locations.
+ OS.EmitIntValue(0, 2); // padding.
+ OS.EmitIntValue(0, 2); // 0 live-out registers.
+ OS.EmitIntValue(0, 4); // padding.
+ continue;
+ }
+
+ OS.EmitIntValue(CSI.ID, 8);
+ OS.EmitValue(CSI.CSOffsetExpr, 4);
+
+ // Reserved for flags.
+ OS.EmitIntValue(0, 2);
+ OS.EmitIntValue(CSLocs.size(), 2);
+
+ for (const auto &Loc : CSLocs) {
+ OS.EmitIntValue(Loc.Type, 1);
+ OS.EmitIntValue(Loc.Size, 1);
+ OS.EmitIntValue(Loc.Reg, 2);
+ OS.EmitIntValue(Loc.Offset, 4);
+ }
+
+ // Num live-out registers and padding to align to 4 bytes.
+ OS.EmitIntValue(0, 2);
+ OS.EmitIntValue(LiveOuts.size(), 2);
+
+ for (const auto &LO : LiveOuts) {
+ OS.EmitIntValue(LO.DwarfRegNum, 2);
+ OS.EmitIntValue(0, 1);
+ OS.EmitIntValue(LO.Size, 1);
+ }
+ // Emit alignment to 8 bytes.
+ OS.EmitValueToAlignment(8);
+ }
+}
+
+/// Serialize the stackmap data.
+void StackMaps::serializeToStackMapSection() {
+ (void)WSMP;
+ // Bail out if there's no stack map data.
+ assert((!CSInfos.empty() || ConstPool.empty()) &&
+ "Expected empty constant pool too!");
+ assert((!CSInfos.empty() || FnStackSize.empty()) &&
+ "Expected empty function record too!");
+ if (CSInfos.empty())
+ return;
+
+ MCContext &OutContext = AP.OutStreamer->getContext();
+ MCStreamer &OS = *AP.OutStreamer;
+
+ // Create the section.
+ MCSection *StackMapSection =
+ OutContext.getObjectFileInfo()->getStackMapSection();
+ OS.SwitchSection(StackMapSection);
+
+ // Emit a dummy symbol to force section inclusion.
+ OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps")));
+
+ // Serialize data.
+ DEBUG(dbgs() << "********** Stack Map Output **********\n");
+ emitStackmapHeader(OS);
+ emitFunctionFrameRecords(OS);
+ emitConstantPoolEntries(OS);
+ emitCallsiteEntries(OS);
+ OS.AddBlankLine();
+
+ // Clean up.
+ CSInfos.clear();
+ ConstPool.clear();
+}
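A hedged sketch of the large-constant handling in recordStackMapOpers: constant locations that do not fit in a signed 32-bit field are moved into a deduplicating constant pool and the location's offset is rewritten to the pool index. This uses a plain std::vector plus std::map in place of LLVM's MapVector; the Location struct and sample values are illustrative only.

#include <cstdint>
#include <cstdio>
#include <limits>
#include <map>
#include <vector>

struct Location {
  enum Kind { Constant, ConstantIndex } Type = Constant;
  int64_t Offset = 0;
};

int main() {
  std::vector<int64_t> ConstPool;          // emitted later as int64 entries
  std::map<int64_t, size_t> PoolIndex;     // dedup map: value -> pool slot

  std::vector<Location> Locs = {{Location::Constant, 42},
                                {Location::Constant, int64_t(1) << 40},
                                {Location::Constant, int64_t(1) << 40}};

  for (auto &Loc : Locs) {
    if (Loc.Type != Location::Constant)
      continue;
    // Small constants stay encoded inline; only >32-bit values hit the pool.
    if (Loc.Offset >= std::numeric_limits<int32_t>::min() &&
        Loc.Offset <= std::numeric_limits<int32_t>::max())
      continue;
    auto It = PoolIndex.find(Loc.Offset);
    if (It == PoolIndex.end()) {
      It = PoolIndex.emplace(Loc.Offset, ConstPool.size()).first;
      ConstPool.push_back(Loc.Offset);
    }
    Loc.Type = Location::ConstantIndex;
    Loc.Offset = int64_t(It->second);      // now an index into ConstPool
  }

  std::printf("pool size: %zu, loc1 index: %lld\n", ConstPool.size(),
              (long long)Locs[1].Offset);  // pool size: 1, loc1 index: 0
}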
diff --git a/contrib/llvm/lib/CodeGen/StackProtector.cpp b/contrib/llvm/lib/CodeGen/StackProtector.cpp
new file mode 100644
index 000000000000..89868e43aba4
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackProtector.cpp
@@ -0,0 +1,471 @@
+//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass inserts stack protectors into functions which need them. A variable
+// with a random value in it is stored onto the stack before the local variables
+// are allocated. Upon exiting the block, the stored value is checked. If it's
+// changed, then there was some sort of violation and the program aborts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/StackProtector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
+#define DEBUG_TYPE "stack-protector"
+
+STATISTIC(NumFunProtected, "Number of functions protected");
+STATISTIC(NumAddrTaken, "Number of local variables that have their address"
+ " taken.");
+
+static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
+ cl::init(true), cl::Hidden);
+
+char StackProtector::ID = 0;
+INITIALIZE_PASS(StackProtector, "stack-protector", "Insert stack protectors",
+ false, true)
+
+FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
+ return new StackProtector(TM);
+}
+
+StackProtector::SSPLayoutKind
+StackProtector::getSSPLayout(const AllocaInst *AI) const {
+ return AI ? Layout.lookup(AI) : SSPLK_None;
+}
+
+void StackProtector::adjustForColoring(const AllocaInst *From,
+ const AllocaInst *To) {
+ // When coloring replaces one alloca with another, transfer the SSPLayoutKind
+ // tag from the remapped to the target alloca. The remapped alloca should
+ // have a size smaller than or equal to the replacement alloca.
+ SSPLayoutMap::iterator I = Layout.find(From);
+ if (I != Layout.end()) {
+ SSPLayoutKind Kind = I->second;
+ Layout.erase(I);
+
+ // Transfer the tag, but make sure that SSPLK_AddrOf does not overwrite
+ // SSPLK_SmallArray or SSPLK_LargeArray, and make sure that
+ // SSPLK_SmallArray does not overwrite SSPLK_LargeArray.
+ I = Layout.find(To);
+ if (I == Layout.end())
+ Layout.insert(std::make_pair(To, Kind));
+ else if (I->second != SSPLK_LargeArray && Kind != SSPLK_AddrOf)
+ I->second = Kind;
+ }
+}
+
+bool StackProtector::runOnFunction(Function &Fn) {
+ F = &Fn;
+ M = F->getParent();
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+ TLI = TM->getSubtargetImpl(Fn)->getTargetLowering();
+ HasPrologue = false;
+ HasIRCheck = false;
+
+ Attribute Attr = Fn.getFnAttribute("stack-protector-buffer-size");
+ if (Attr.isStringAttribute() &&
+ Attr.getValueAsString().getAsInteger(10, SSPBufferSize))
+ return false; // Invalid integer string
+
+ if (!RequiresStackProtector())
+ return false;
+
+ // TODO(etienneb): Functions with funclets are not correctly supported now.
+ // Do nothing if this is funclet-based personality.
+ if (Fn.hasPersonalityFn()) {
+ EHPersonality Personality = classifyEHPersonality(Fn.getPersonalityFn());
+ if (isFuncletEHPersonality(Personality))
+ return false;
+ }
+
+ ++NumFunProtected;
+ return InsertStackProtectors();
+}
+
+/// \param [out] IsLarge is set to true if a protectable array is found and
+/// it is "large" ( >= ssp-buffer-size). In the case of a structure with
+/// multiple arrays, this gets set if any of them is large.
+bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
+ bool Strong,
+ bool InStruct) const {
+ if (!Ty)
+ return false;
+ if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) {
+ if (!AT->getElementType()->isIntegerTy(8)) {
+ // If we're on a non-Darwin platform or we're inside of a structure, don't
+ // add stack protectors unless the array is a character array.
+ // However, in strong mode any array, regardless of type and size,
+ // triggers a protector.
+ if (!Strong && (InStruct || !Trip.isOSDarwin()))
+ return false;
+ }
+
+ // If an array has at least SSPBufferSize bytes of allocated space, then we
+ // emit stack protectors.
+ if (SSPBufferSize <= M->getDataLayout().getTypeAllocSize(AT)) {
+ IsLarge = true;
+ return true;
+ }
+
+ if (Strong)
+ // Require a protector for all arrays in strong mode
+ return true;
+ }
+
+ const StructType *ST = dyn_cast<StructType>(Ty);
+ if (!ST)
+ return false;
+
+ bool NeedsProtector = false;
+ for (StructType::element_iterator I = ST->element_begin(),
+ E = ST->element_end();
+ I != E; ++I)
+ if (ContainsProtectableArray(*I, IsLarge, Strong, true)) {
+ // If the element is a protectable array and is large (>= SSPBufferSize)
+ // then we are done. If the protectable array is not large, then
+ // keep looking in case a subsequent element is a large array.
+ if (IsLarge)
+ return true;
+ NeedsProtector = true;
+ }
+
+ return NeedsProtector;
+}
+
+bool StackProtector::HasAddressTaken(const Instruction *AI) {
+ for (const User *U : AI->users()) {
+ if (const StoreInst *SI = dyn_cast<StoreInst>(U)) {
+ if (AI == SI->getValueOperand())
+ return true;
+ } else if (const PtrToIntInst *SI = dyn_cast<PtrToIntInst>(U)) {
+ if (AI == SI->getOperand(0))
+ return true;
+ } else if (isa<CallInst>(U)) {
+ return true;
+ } else if (isa<InvokeInst>(U)) {
+ return true;
+ } else if (const SelectInst *SI = dyn_cast<SelectInst>(U)) {
+ if (HasAddressTaken(SI))
+ return true;
+ } else if (const PHINode *PN = dyn_cast<PHINode>(U)) {
+ // Keep track of what PHI nodes we have already visited to ensure
+ // they are only visited once.
+ if (VisitedPHIs.insert(PN).second)
+ if (HasAddressTaken(PN))
+ return true;
+ } else if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) {
+ if (HasAddressTaken(GEP))
+ return true;
+ } else if (const BitCastInst *BI = dyn_cast<BitCastInst>(U)) {
+ if (HasAddressTaken(BI))
+ return true;
+ }
+ }
+ return false;
+}
+
+/// \brief Check whether or not this function needs a stack protector based
+/// upon the stack protector level.
+///
+/// We use two heuristics: a standard (ssp) and strong (sspstrong).
+/// The standard heuristic adds a guard variable to functions that call alloca
+/// with either a variable size or a size >= SSPBufferSize, to functions with
+/// character buffers larger than SSPBufferSize, and to functions with
+/// aggregates containing character buffers larger than SSPBufferSize. The
+/// strong heuristic adds a guard variable to functions that call alloca
+/// regardless of size, to functions with any buffer regardless of type and
+/// size, to functions with aggregates that contain any buffer regardless of
+/// type and size, and to functions that contain stack-based variables that
+/// have had their address taken.
+bool StackProtector::RequiresStackProtector() {
+ bool Strong = false;
+ bool NeedsProtector = false;
+ for (const BasicBlock &BB : *F)
+ for (const Instruction &I : BB)
+ if (const CallInst *CI = dyn_cast<CallInst>(&I))
+ if (CI->getCalledFunction() ==
+ Intrinsic::getDeclaration(F->getParent(),
+ Intrinsic::stackprotector))
+ HasPrologue = true;
+
+ if (F->hasFnAttribute(Attribute::SafeStack))
+ return false;
+
+ if (F->hasFnAttribute(Attribute::StackProtectReq)) {
+ NeedsProtector = true;
+ Strong = true; // Use the same heuristic as strong to determine SSPLayout
+ } else if (F->hasFnAttribute(Attribute::StackProtectStrong))
+ Strong = true;
+ else if (HasPrologue)
+ NeedsProtector = true;
+ else if (!F->hasFnAttribute(Attribute::StackProtect))
+ return false;
+
+ for (const BasicBlock &BB : *F) {
+ for (const Instruction &I : BB) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ if (AI->isArrayAllocation()) {
+ // SSP-Strong: Enable protectors for any call to alloca, regardless
+ // of size.
+ if (Strong)
+ return true;
+
+ if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
+ if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
+ // A call to alloca with size >= SSPBufferSize requires
+ // stack protectors.
+ Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ NeedsProtector = true;
+ } else if (Strong) {
+ // Require protectors for all alloca calls in strong mode.
+ Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+ NeedsProtector = true;
+ }
+ } else {
+ // A call to alloca with a variable size requires protectors.
+ Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+ NeedsProtector = true;
+ }
+ continue;
+ }
+
+ bool IsLarge = false;
+ if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
+ Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
+ : SSPLK_SmallArray));
+ NeedsProtector = true;
+ continue;
+ }
+
+ if (Strong && HasAddressTaken(AI)) {
+ ++NumAddrTaken;
+ Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+ NeedsProtector = true;
+ }
+ }
+ }
+ }
+
+ return NeedsProtector;
+}
+
+/// Create a stack guard loading and populate whether SelectionDAG SSP is
+/// supported.
+static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M,
+ IRBuilder<> &B,
+ bool *SupportsSelectionDAGSP = nullptr) {
+ if (Value *Guard = TLI->getIRStackGuard(B))
+ return B.CreateLoad(Guard, true, "StackGuard");
+
+ // Use SelectionDAG SSP handling, since there isn't an IR guard.
+ //
+ // This is somewhat awkward, since we optionally report here whether
+ // SelectionDAG-based stack protection should be performed. The reason is
+ // that the answer is strictly defined as !TLI->getIRStackGuard(B), and
+ // getIRStackGuard is also mutating. There is no way to get this bit without
+ // mutating the IR, so it has to be computed at exactly this point.
+ //
+ // We could have defined a new function TLI::supportsSelectionDAGSP(), but
+ // that would put more burden on the backends' overriding work, especially
+ // when it conveys the same information getIRStackGuard() already gives.
+ if (SupportsSelectionDAGSP)
+ *SupportsSelectionDAGSP = true;
+ TLI->insertSSPDeclarations(*M);
+ return B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackguard));
+}
+
+/// Insert code into the entry block that stores the stack guard
+/// variable onto the stack:
+///
+/// entry:
+/// StackGuardSlot = alloca i8*
+/// StackGuard = <stack guard>
+/// call void @llvm.stackprotector(StackGuard, StackGuardSlot)
+///
+/// Returns true if the platform/triple supports the stackprotectorcreate pseudo
+/// node.
+static bool CreatePrologue(Function *F, Module *M, ReturnInst *RI,
+ const TargetLoweringBase *TLI, AllocaInst *&AI) {
+ bool SupportsSelectionDAGSP = false;
+ IRBuilder<> B(&F->getEntryBlock().front());
+ PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext());
+ AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot");
+
+ Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP);
+ B.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackprotector),
+ {GuardSlot, AI});
+ return SupportsSelectionDAGSP;
+}
+
+/// InsertStackProtectors - Insert code into the prologue and epilogue of the
+/// function.
+///
+/// - The prologue code loads and stores the stack guard onto the stack.
+/// - The epilogue checks the value stored in the prologue against the original
+/// value. It calls __stack_chk_fail if they differ.
+bool StackProtector::InsertStackProtectors() {
+ bool SupportsSelectionDAGSP =
+ EnableSelectionDAGSP && !TM->Options.EnableFastISel;
+ AllocaInst *AI = nullptr; // Place on stack that stores the stack guard.
+
+ for (Function::iterator I = F->begin(), E = F->end(); I != E;) {
+ BasicBlock *BB = &*I++;
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator());
+ if (!RI)
+ continue;
+
+ // Generate prologue instrumentation if not already generated.
+ if (!HasPrologue) {
+ HasPrologue = true;
+ SupportsSelectionDAGSP &= CreatePrologue(F, M, RI, TLI, AI);
+ }
+
+ // SelectionDAG based code generation. Nothing else needs to be done here.
+ // The epilogue instrumentation is postponed to SelectionDAG.
+ if (SupportsSelectionDAGSP)
+ break;
+
+ // Set HasIRCheck to true, so that SelectionDAG will not generate its own
+ // version. SelectionDAG calls 'shouldEmitSDCheck' to check whether the
+ // instrumentation has already been generated.
+ HasIRCheck = true;
+
+ // Generate epilogue instrumentation. The epilogue instrumentation can be
+ // function-based or inlined depending on which mechanism the target is
+ // providing.
+ if (Value* GuardCheck = TLI->getSSPStackGuardCheck(*M)) {
+ // Generate the function-based epilogue instrumentation.
+ // The target provides a guard check function, generate a call to it.
+ IRBuilder<> B(RI);
+ LoadInst *Guard = B.CreateLoad(AI, true, "Guard");
+ CallInst *Call = B.CreateCall(GuardCheck, {Guard});
+ llvm::Function *Function = cast<llvm::Function>(GuardCheck);
+ Call->setAttributes(Function->getAttributes());
+ Call->setCallingConv(Function->getCallingConv());
+ } else {
+ // Generate the epilogue with inline instrumentation.
+ // If we do not support SelectionDAG based tail calls, generate IR level
+ // tail calls.
+ //
+ // For each block with a return instruction, convert this:
+ //
+ // return:
+ // ...
+ // ret ...
+ //
+ // into this:
+ //
+ // return:
+ // ...
+ // %1 = <stack guard>
+ // %2 = load StackGuardSlot
+ // %3 = cmp i1 %1, %2
+ // br i1 %3, label %SP_return, label %CallStackCheckFailBlk
+ //
+ // SP_return:
+ // ret ...
+ //
+ // CallStackCheckFailBlk:
+ // call void @__stack_chk_fail()
+ // unreachable
+
+ // Create the FailBB. We duplicate the BB every time since the MI tail
+ // merge pass will merge together all of the various BBs into one, including
+ // the fail BB generated by the stack protector pseudo instruction.
+ BasicBlock *FailBB = CreateFailBB();
+
+ // Split the basic block before the return instruction.
+ BasicBlock *NewBB = BB->splitBasicBlock(RI->getIterator(), "SP_return");
+
+ // Update the dominator tree if we need to.
+ if (DT && DT->isReachableFromEntry(BB)) {
+ DT->addNewBlock(NewBB, BB);
+ DT->addNewBlock(FailBB, BB);
+ }
+
+ // Remove default branch instruction to the new BB.
+ BB->getTerminator()->eraseFromParent();
+
+ // Move the newly created basic block to the point right after the old
+ // basic block so that it's in the "fall through" position.
+ NewBB->moveAfter(BB);
+
+ // Generate the stack protector instructions in the old basic block.
+ IRBuilder<> B(BB);
+ Value *Guard = getStackGuard(TLI, M, B);
+ LoadInst *LI2 = B.CreateLoad(AI, true);
+ Value *Cmp = B.CreateICmpEQ(Guard, LI2);
+ auto SuccessProb =
+ BranchProbabilityInfo::getBranchProbStackProtector(true);
+ auto FailureProb =
+ BranchProbabilityInfo::getBranchProbStackProtector(false);
+ MDNode *Weights = MDBuilder(F->getContext())
+ .createBranchWeights(SuccessProb.getNumerator(),
+ FailureProb.getNumerator());
+ B.CreateCondBr(Cmp, NewBB, FailBB, Weights);
+ }
+ }
+
+ // Return if we didn't modify any basic blocks, i.e., there are no return
+ // statements in the function.
+ return HasPrologue;
+}
+
+/// CreateFailBB - Create a basic block to jump to when the stack protector
+/// check fails.
+BasicBlock *StackProtector::CreateFailBB() {
+ LLVMContext &Context = F->getContext();
+ BasicBlock *FailBB = BasicBlock::Create(Context, "CallStackCheckFailBlk", F);
+ IRBuilder<> B(FailBB);
+ B.SetCurrentDebugLocation(DebugLoc::get(0, 0, F->getSubprogram()));
+ if (Trip.isOSOpenBSD()) {
+ Constant *StackChkFail =
+ M->getOrInsertFunction("__stack_smash_handler",
+ Type::getVoidTy(Context),
+ Type::getInt8PtrTy(Context), nullptr);
+
+ B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
+ } else {
+ Constant *StackChkFail =
+ M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
+ nullptr);
+ B.CreateCall(StackChkFail, {});
+ }
+ B.CreateUnreachable();
+ return FailBB;
+}
+
+bool StackProtector::shouldEmitSDCheck(const BasicBlock &BB) const {
+ return HasPrologue && !HasIRCheck && dyn_cast<ReturnInst>(BB.getTerminator());
+}
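A condensed sketch of the ssp/sspstrong decision described in RequiresStackProtector, modelling allocas as plain records instead of LLVM IR. The Alloca struct and the 8-byte default buffer size are illustrative assumptions, and the Darwin special case and recursive aggregate scan are deliberately omitted.

#include <cstdio>
#include <vector>

struct Alloca {
  bool VariableSize;   // alloca with a non-constant element count
  unsigned Bytes;      // total allocated size for fixed-size allocas
  bool IsArray;        // array type (any element type)
  bool IsCharArray;    // i8 array (or aggregate containing one)
  bool AddressTaken;   // address escapes via store/ptrtoint/call/...
};

static bool requiresProtector(const std::vector<Alloca> &Allocas, bool Strong,
                              unsigned SSPBufferSize = 8) {
  for (const Alloca &AI : Allocas) {
    if (AI.VariableSize)
      return true;                               // variable-size alloca
    if (Strong && (AI.IsArray || AI.AddressTaken))
      return true;                               // sspstrong: any buffer or escape
    if (AI.IsCharArray && AI.Bytes >= SSPBufferSize)
      return true;                               // ssp: large character buffer
  }
  return false;
}

int main() {
  std::printf("%d\n", requiresProtector({{false, 4, true, true, false}}, false));  // 0
  std::printf("%d\n", requiresProtector({{false, 64, true, true, false}}, false)); // 1
  std::printf("%d\n", requiresProtector({{false, 4, true, false, true}}, true));   // 1
}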
diff --git a/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
new file mode 100644
index 000000000000..d996714a414a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -0,0 +1,469 @@
+//===-- StackSlotColoring.cpp - Stack slot coloring pass. -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the stack slot coloring pass.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <vector>
+using namespace llvm;
+
+#define DEBUG_TYPE "stackslotcoloring"
+
+static cl::opt<bool>
+DisableSharing("no-stack-slot-sharing",
+ cl::init(false), cl::Hidden,
+ cl::desc("Suppress slot sharing during stack coloring"));
+
+static cl::opt<int> DCELimit("ssc-dce-limit", cl::init(-1), cl::Hidden);
+
+STATISTIC(NumEliminated, "Number of stack slots eliminated due to coloring");
+STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated");
+
+namespace {
+ class StackSlotColoring : public MachineFunctionPass {
+ LiveStacks* LS;
+ MachineFrameInfo *MFI;
+ const TargetInstrInfo *TII;
+ const MachineBlockFrequencyInfo *MBFI;
+
+ // SSIntervals - Spill slot intervals.
+ std::vector<LiveInterval*> SSIntervals;
+
+ // SSRefs - Keep a list of MachineMemOperands for each spill slot.
+ // MachineMemOperands can be shared between instructions, so we need
+ // to be careful that renames like [FI0, FI1] -> [FI1, FI2] do not
+ // become FI0 -> FI1 -> FI2.
+ SmallVector<SmallVector<MachineMemOperand *, 8>, 16> SSRefs;
+
+ // OrigAlignments - Alignments of stack objects before coloring.
+ SmallVector<unsigned, 16> OrigAlignments;
+
+ // OrigSizes - Sizes of stack objects before coloring.
+ SmallVector<unsigned, 16> OrigSizes;
+
+ // AllColors - If index is set, it's a spill slot, i.e. color.
+ // FIXME: This assumes PEI locates spill slots with smaller indices
+ // closest to the stack pointer / frame pointer. Therefore, smaller
+ // index == better color.
+ BitVector AllColors;
+
+ // NextColor - Next "color" that's not yet used.
+ int NextColor;
+
+ // UsedColors - "Colors" that have been assigned.
+ BitVector UsedColors;
+
+ // Assignments - Color to intervals mapping.
+ SmallVector<SmallVector<LiveInterval*,4>, 16> Assignments;
+
+ public:
+ static char ID; // Pass identification
+ StackSlotColoring() :
+ MachineFunctionPass(ID), NextColor(-1) {
+ initializeStackSlotColoringPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveStacks>();
+ AU.addRequired<MachineBlockFrequencyInfo>();
+ AU.addPreserved<MachineBlockFrequencyInfo>();
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ private:
+ void InitializeSlots();
+ void ScanForSpillSlotRefs(MachineFunction &MF);
+ bool OverlapWithAssignments(LiveInterval *li, int Color) const;
+ int ColorSlot(LiveInterval *li);
+ bool ColorSlots(MachineFunction &MF);
+ void RewriteInstruction(MachineInstr &MI, SmallVectorImpl<int> &SlotMapping,
+ MachineFunction &MF);
+ bool RemoveDeadStores(MachineBasicBlock* MBB);
+ };
+} // end anonymous namespace
+
+char StackSlotColoring::ID = 0;
+char &llvm::StackSlotColoringID = StackSlotColoring::ID;
+
+INITIALIZE_PASS_BEGIN(StackSlotColoring, "stack-slot-coloring",
+ "Stack Slot Coloring", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(StackSlotColoring, "stack-slot-coloring",
+ "Stack Slot Coloring", false, false)
+
+namespace {
+ // IntervalSorter - Comparison predicate that sorts live intervals by
+ // their weight.
+ struct IntervalSorter {
+ bool operator()(LiveInterval* LHS, LiveInterval* RHS) const {
+ return LHS->weight > RHS->weight;
+ }
+ };
+}
+
+/// ScanForSpillSlotRefs - Scan all the machine instructions for spill slot
+/// references and update spill slot weights.
+void StackSlotColoring::ScanForSpillSlotRefs(MachineFunction &MF) {
+ SSRefs.resize(MFI->getObjectIndexEnd());
+
+ // FIXME: Need the equivalent of MachineRegisterInfo for frameindex operands.
+ for (MachineFunction::iterator MBBI = MF.begin(), E = MF.end();
+ MBBI != E; ++MBBI) {
+ MachineBasicBlock *MBB = &*MBBI;
+ for (MachineBasicBlock::iterator MII = MBB->begin(), EE = MBB->end();
+ MII != EE; ++MII) {
+ MachineInstr &MI = *MII;
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isFI())
+ continue;
+ int FI = MO.getIndex();
+ if (FI < 0)
+ continue;
+ if (!LS->hasInterval(FI))
+ continue;
+ LiveInterval &li = LS->getInterval(FI);
+ if (!MI.isDebugValue())
+ li.weight += LiveIntervals::getSpillWeight(false, true, MBFI, MI);
+ }
+ for (MachineInstr::mmo_iterator MMOI = MI.memoperands_begin(),
+ EE = MI.memoperands_end();
+ MMOI != EE; ++MMOI) {
+ MachineMemOperand *MMO = *MMOI;
+ if (const FixedStackPseudoSourceValue *FSV =
+ dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ MMO->getPseudoValue())) {
+ int FI = FSV->getFrameIndex();
+ if (FI >= 0)
+ SSRefs[FI].push_back(MMO);
+ }
+ }
+ }
+ }
+}
+
+/// InitializeSlots - Process all spill stack slot liveintervals and add them
+/// to a sorted (by weight) list.
+void StackSlotColoring::InitializeSlots() {
+ int LastFI = MFI->getObjectIndexEnd();
+ OrigAlignments.resize(LastFI);
+ OrigSizes.resize(LastFI);
+ AllColors.resize(LastFI);
+ UsedColors.resize(LastFI);
+ Assignments.resize(LastFI);
+
+ typedef std::iterator_traits<LiveStacks::iterator>::value_type Pair;
+ SmallVector<Pair *, 16> Intervals;
+ Intervals.reserve(LS->getNumIntervals());
+ for (auto &I : *LS)
+ Intervals.push_back(&I);
+ std::sort(Intervals.begin(), Intervals.end(),
+ [](Pair *LHS, Pair *RHS) { return LHS->first < RHS->first; });
+
+ // Gather all spill slots into a list.
+ DEBUG(dbgs() << "Spill slot intervals:\n");
+ for (auto *I : Intervals) {
+ LiveInterval &li = I->second;
+ DEBUG(li.dump());
+ int FI = TargetRegisterInfo::stackSlot2Index(li.reg);
+ if (MFI->isDeadObjectIndex(FI))
+ continue;
+ SSIntervals.push_back(&li);
+ OrigAlignments[FI] = MFI->getObjectAlignment(FI);
+ OrigSizes[FI] = MFI->getObjectSize(FI);
+ AllColors.set(FI);
+ }
+ DEBUG(dbgs() << '\n');
+
+ // Sort them by weight.
+ std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+ // Get first "color".
+ NextColor = AllColors.find_first();
+}
+
+/// OverlapWithAssignments - Return true if LiveInterval overlaps with any
+/// LiveIntervals that have already been assigned to the specified color.
+bool
+StackSlotColoring::OverlapWithAssignments(LiveInterval *li, int Color) const {
+ const SmallVectorImpl<LiveInterval *> &OtherLIs = Assignments[Color];
+ for (unsigned i = 0, e = OtherLIs.size(); i != e; ++i) {
+ LiveInterval *OtherLI = OtherLIs[i];
+ if (OtherLI->overlaps(*li))
+ return true;
+ }
+ return false;
+}
+
+/// ColorSlot - Assign a "color" (stack slot) to the specified stack slot.
+///
+int StackSlotColoring::ColorSlot(LiveInterval *li) {
+ int Color = -1;
+ bool Share = false;
+ if (!DisableSharing) {
+ // Check if it's possible to reuse any of the used colors.
+ Color = UsedColors.find_first();
+ while (Color != -1) {
+ if (!OverlapWithAssignments(li, Color)) {
+ Share = true;
+ ++NumEliminated;
+ break;
+ }
+ Color = UsedColors.find_next(Color);
+ }
+ }
+
+ // Assign it to the first available color (assumed to be the best) if it's
+ // not possible to share a used color with other objects.
+ if (!Share) {
+ assert(NextColor != -1 && "No more spill slots?");
+ Color = NextColor;
+ UsedColors.set(Color);
+ NextColor = AllColors.find_next(NextColor);
+ }
+
+ // Record the assignment.
+ Assignments[Color].push_back(li);
+ int FI = TargetRegisterInfo::stackSlot2Index(li->reg);
+ DEBUG(dbgs() << "Assigning fi#" << FI << " to fi#" << Color << "\n");
+
+ // Change size and alignment of the allocated slot. If there are multiple
+ // objects sharing the same slot, then make sure the size and alignment
+ // are large enough for all.
+ unsigned Align = OrigAlignments[FI];
+ if (!Share || Align > MFI->getObjectAlignment(Color))
+ MFI->setObjectAlignment(Color, Align);
+ int64_t Size = OrigSizes[FI];
+ if (!Share || Size > MFI->getObjectSize(Color))
+ MFI->setObjectSize(Color, Size);
+ return Color;
+}
+
+/// ColorSlots - Color all spill stack slots and rewrite all frameindex machine
+/// operands in the function.
+bool StackSlotColoring::ColorSlots(MachineFunction &MF) {
+ unsigned NumObjs = MFI->getObjectIndexEnd();
+ SmallVector<int, 16> SlotMapping(NumObjs, -1);
+ SmallVector<float, 16> SlotWeights(NumObjs, 0.0);
+ SmallVector<SmallVector<int, 4>, 16> RevMap(NumObjs);
+ BitVector UsedColors(NumObjs);
+
+ DEBUG(dbgs() << "Color spill slot intervals:\n");
+ bool Changed = false;
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+ LiveInterval *li = SSIntervals[i];
+ int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+ int NewSS = ColorSlot(li);
+ assert(NewSS >= 0 && "Stack coloring failed?");
+ SlotMapping[SS] = NewSS;
+ RevMap[NewSS].push_back(SS);
+ SlotWeights[NewSS] += li->weight;
+ UsedColors.set(NewSS);
+ Changed |= (SS != NewSS);
+ }
+
+ DEBUG(dbgs() << "\nSpill slots after coloring:\n");
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i) {
+ LiveInterval *li = SSIntervals[i];
+ int SS = TargetRegisterInfo::stackSlot2Index(li->reg);
+ li->weight = SlotWeights[SS];
+ }
+ // Sort them by new weight.
+ std::stable_sort(SSIntervals.begin(), SSIntervals.end(), IntervalSorter());
+
+#ifndef NDEBUG
+ for (unsigned i = 0, e = SSIntervals.size(); i != e; ++i)
+ DEBUG(SSIntervals[i]->dump());
+ DEBUG(dbgs() << '\n');
+#endif
+
+ if (!Changed)
+ return false;
+
+ // Rewrite all MachineMemOperands.
+ for (unsigned SS = 0, SE = SSRefs.size(); SS != SE; ++SS) {
+ int NewFI = SlotMapping[SS];
+ if (NewFI == -1 || (NewFI == (int)SS))
+ continue;
+
+ const PseudoSourceValue *NewSV = MF.getPSVManager().getFixedStack(NewFI);
+ SmallVectorImpl<MachineMemOperand *> &RefMMOs = SSRefs[SS];
+ for (unsigned i = 0, e = RefMMOs.size(); i != e; ++i)
+ RefMMOs[i]->setValue(NewSV);
+ }
+
+ // Rewrite all MO_FrameIndex operands. Look for dead stores.
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB)
+ RewriteInstruction(MI, SlotMapping, MF);
+ RemoveDeadStores(&MBB);
+ }
+
+ // Delete unused stack slots.
+ while (NextColor != -1) {
+ DEBUG(dbgs() << "Removing unused stack object fi#" << NextColor << "\n");
+ MFI->RemoveStackObject(NextColor);
+ NextColor = AllColors.find_next(NextColor);
+ }
+
+ return true;
+}
+
+/// RewriteInstruction - Rewrite specified instruction by replacing references
+/// to old frame index with new one.
+void StackSlotColoring::RewriteInstruction(MachineInstr &MI,
+ SmallVectorImpl<int> &SlotMapping,
+ MachineFunction &MF) {
+ // Update the operands.
+ for (unsigned i = 0, ee = MI.getNumOperands(); i != ee; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isFI())
+ continue;
+ int OldFI = MO.getIndex();
+ if (OldFI < 0)
+ continue;
+ int NewFI = SlotMapping[OldFI];
+ if (NewFI == -1 || NewFI == OldFI)
+ continue;
+ MO.setIndex(NewFI);
+ }
+
+ // The MachineMemOperands have already been updated.
+}
+
+
+/// RemoveDeadStores - Scan through a basic block and look for loads followed
+/// by stores. If they're both using the same stack slot, then the store is
+/// definitely dead. This could obviously be much more aggressive (consider
+/// pairs with instructions between them), but such extensions might have a
+/// considerable compile time impact.
+bool StackSlotColoring::RemoveDeadStores(MachineBasicBlock* MBB) {
+ // FIXME: This could be much more aggressive, but we need to investigate
+ // the compile time impact of doing so.
+ bool changed = false;
+
+ SmallVector<MachineInstr*, 4> toErase;
+
+ for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end();
+ I != E; ++I) {
+ if (DCELimit != -1 && (int)NumDead >= DCELimit)
+ break;
+
+ int FirstSS, SecondSS;
+ if (TII->isStackSlotCopy(*I, FirstSS, SecondSS) && FirstSS == SecondSS &&
+ FirstSS != -1) {
+ ++NumDead;
+ changed = true;
+ toErase.push_back(&*I);
+ continue;
+ }
+
+ MachineBasicBlock::iterator NextMI = std::next(I);
+ if (NextMI == MBB->end()) continue;
+
+ unsigned LoadReg = 0;
+ unsigned StoreReg = 0;
+ if (!(LoadReg = TII->isLoadFromStackSlot(*I, FirstSS)))
+ continue;
+ if (!(StoreReg = TII->isStoreToStackSlot(*NextMI, SecondSS)))
+ continue;
+ if (FirstSS != SecondSS || LoadReg != StoreReg || FirstSS == -1) continue;
+
+ ++NumDead;
+ changed = true;
+
+ if (NextMI->findRegisterUseOperandIdx(LoadReg, true, nullptr) != -1) {
+ ++NumDead;
+ toErase.push_back(&*I);
+ }
+
+ toErase.push_back(&*NextMI);
+ ++I;
+ }
+
+ for (SmallVectorImpl<MachineInstr *>::iterator I = toErase.begin(),
+ E = toErase.end(); I != E; ++I)
+ (*I)->eraseFromParent();
+
+ return changed;
+}
+
+
+bool StackSlotColoring::runOnMachineFunction(MachineFunction &MF) {
+ DEBUG({
+ dbgs() << "********** Stack Slot Coloring **********\n"
+ << "********** Function: " << MF.getName() << '\n';
+ });
+
+ MFI = MF.getFrameInfo();
+ TII = MF.getSubtarget().getInstrInfo();
+ LS = &getAnalysis<LiveStacks>();
+ MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+
+ bool Changed = false;
+
+ unsigned NumSlots = LS->getNumIntervals();
+ if (NumSlots == 0)
+ // Nothing to do!
+ return false;
+
+ // If there are calls to setjmp or sigsetjmp, don't perform stack slot
+ // coloring. The stack could be modified before the longjmp is executed,
+ // resulting in the wrong value being used afterwards. (See
+ // <rdar://problem/8007500>.)
+ if (MF.exposesReturnsTwice())
+ return false;
+
+ // Gather spill slot references
+ ScanForSpillSlotRefs(MF);
+ InitializeSlots();
+ Changed = ColorSlots(MF);
+
+ NextColor = -1;
+ SSIntervals.clear();
+ for (unsigned i = 0, e = SSRefs.size(); i != e; ++i)
+ SSRefs[i].clear();
+ SSRefs.clear();
+ OrigAlignments.clear();
+ OrigSizes.clear();
+ AllColors.clear();
+ UsedColors.clear();
+ for (unsigned i = 0, e = Assignments.size(); i != e; ++i)
+ Assignments[i].clear();
+ Assignments.clear();
+
+ return Changed;
+}
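A small, hedged sketch of the first-fit sharing strategy used by ColorSlot: reuse the lowest already-used colour whose assigned intervals are all disjoint from the candidate, otherwise open a new colour. The Interval type and sample segments stand in for LLVM's LiveInterval; weight-based ordering is assumed to have happened already.

#include <cstdio>
#include <utility>
#include <vector>

using Interval = std::vector<std::pair<int, int>>; // [start, end) segments

static bool overlaps(const Interval &A, const Interval &B) {
  for (auto &RA : A)
    for (auto &RB : B)
      if (RA.first < RB.second && RB.first < RA.second)
        return true;
  return false;
}

int main() {
  // Slot live intervals, already sorted by decreasing weight.
  std::vector<Interval> Slots = {{{0, 10}}, {{5, 15}}, {{12, 20}}, {{16, 30}}};

  std::vector<std::vector<int>> Assignments; // color -> slots sharing it
  std::vector<int> Color(Slots.size(), -1);

  for (int S = 0, E = (int)Slots.size(); S != E; ++S) {
    int C = -1;
    // First-fit: try every used color in order, like UsedColors.find_first/next.
    for (int U = 0, UE = (int)Assignments.size(); U != UE; ++U) {
      bool Clash = false;
      for (int Other : Assignments[U])
        if (overlaps(Slots[S], Slots[Other])) { Clash = true; break; }
      if (!Clash) { C = U; break; }
    }
    if (C == -1) {                       // no shareable color: open a new one
      C = (int)Assignments.size();
      Assignments.emplace_back();
    }
    Assignments[C].push_back(S);
    Color[S] = C;
    std::printf("fi#%d -> color %d\n", S, C); // prints 0, 1, 0, 1
  }
}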
diff --git a/contrib/llvm/lib/CodeGen/TailDuplication.cpp b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
new file mode 100644
index 000000000000..2b1fb127497a
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TailDuplication.cpp
@@ -0,0 +1,65 @@
+//===-- TailDuplication.cpp - Duplicate blocks into predecessors' tails ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass duplicates basic blocks ending in unconditional branches into
+// the tails of their predecessors, using the TailDuplicator utility class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailduplication"
+
+namespace {
+/// Perform tail duplication. Delegates to TailDuplicator.
+class TailDuplicatePass : public MachineFunctionPass {
+ TailDuplicator Duplicator;
+
+public:
+ static char ID;
+ explicit TailDuplicatePass() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+char TailDuplicatePass::ID = 0;
+}
+
+char &llvm::TailDuplicateID = TailDuplicatePass::ID;
+
+INITIALIZE_PASS(TailDuplicatePass, "tailduplication", "Tail Duplication", false,
+ false)
+
+bool TailDuplicatePass::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(*MF.getFunction()))
+ return false;
+
+ auto MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ auto MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
+
+ Duplicator.initMF(MF, MMI, MBPI);
+
+ bool MadeChange = false;
+ while (Duplicator.tailDuplicateBlocks(MF))
+ MadeChange = true;
+
+ return MadeChange;
+}
+
+void TailDuplicatePass::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineBranchProbabilityInfo>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
diff --git a/contrib/llvm/lib/CodeGen/TailDuplicator.cpp b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
new file mode 100644
index 000000000000..847a09349a59
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -0,0 +1,932 @@
+//===-- TailDuplicator.cpp - Duplicate blocks into predecessors' tails ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This utility class duplicates basic blocks ending in unconditional branches
+// into the tails of their predecessors.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TailDuplicator.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "tailduplication"
+
+STATISTIC(NumTails, "Number of tails duplicated");
+STATISTIC(NumTailDups, "Number of tail duplicated blocks");
+STATISTIC(NumTailDupAdded,
+ "Number of instructions added due to tail duplication");
+STATISTIC(NumTailDupRemoved,
+ "Number of instructions removed due to tail duplication");
+STATISTIC(NumDeadBlocks, "Number of dead blocks removed");
+STATISTIC(NumAddedPHIs, "Number of phis added");
+
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDuplicateSize(
+ "tail-dup-size",
+ cl::desc("Maximum instructions to consider tail duplicating"), cl::init(2),
+ cl::Hidden);
+
+static cl::opt<bool>
+ TailDupVerify("tail-dup-verify",
+ cl::desc("Verify sanity of PHI instructions during taildup"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> TailDupLimit("tail-dup-limit", cl::init(~0U),
+ cl::Hidden);
+
+namespace llvm {
+
+void TailDuplicator::initMF(MachineFunction &MF, const MachineModuleInfo *MMIin,
+ const MachineBranchProbabilityInfo *MBPIin) {
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+ MRI = &MF.getRegInfo();
+ MMI = MMIin;
+ MBPI = MBPIin;
+
+ assert(MBPI != nullptr && "Machine Branch Probability Info required");
+
+ PreRegAlloc = MRI->isSSA();
+}
+
+static void VerifyPHIs(MachineFunction &MF, bool CheckExtra) {
+ for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E; ++I) {
+ MachineBasicBlock *MBB = &*I;
+ SmallSetVector<MachineBasicBlock *, 8> Preds(MBB->pred_begin(),
+ MBB->pred_end());
+ MachineBasicBlock::iterator MI = MBB->begin();
+ while (MI != MBB->end()) {
+ if (!MI->isPHI())
+ break;
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+ bool Found = false;
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (PHIBB == PredBB) {
+ Found = true;
+ break;
+ }
+ }
+ if (!Found) {
+ dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+ dbgs() << " missing input from predecessor BB#"
+ << PredBB->getNumber() << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2) {
+ MachineBasicBlock *PHIBB = MI->getOperand(i + 1).getMBB();
+ if (CheckExtra && !Preds.count(PHIBB)) {
+ dbgs() << "Warning: malformed PHI in BB#" << MBB->getNumber() << ": "
+ << *MI;
+ dbgs() << " extra input from predecessor BB#" << PHIBB->getNumber()
+ << '\n';
+ llvm_unreachable(nullptr);
+ }
+ if (PHIBB->getNumber() < 0) {
+ dbgs() << "Malformed PHI in BB#" << MBB->getNumber() << ": " << *MI;
+ dbgs() << " non-existing BB#" << PHIBB->getNumber() << '\n';
+ llvm_unreachable(nullptr);
+ }
+ }
+ ++MI;
+ }
+ }
+}
+
+/// Tail duplicate the block and clean up.
+bool TailDuplicator::tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple,
+ MachineBasicBlock *MBB) {
+ // Save the successors list.
+ SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
+ MBB->succ_end());
+
+ SmallVector<MachineBasicBlock *, 8> TDBBs;
+ SmallVector<MachineInstr *, 16> Copies;
+ if (!tailDuplicate(MF, IsSimple, MBB, TDBBs, Copies))
+ return false;
+
+ ++NumTails;
+
+ SmallVector<MachineInstr *, 8> NewPHIs;
+ MachineSSAUpdater SSAUpdate(MF, &NewPHIs);
+
+ // TailBB's immediate successors are now successors of those predecessors
+ // which duplicated TailBB. Add the predecessors as sources to the PHI
+ // instructions.
+ bool isDead = MBB->pred_empty() && !MBB->hasAddressTaken();
+ if (PreRegAlloc)
+ updateSuccessorsPHIs(MBB, isDead, TDBBs, Succs);
+
+ // If it is dead, remove it.
+ if (isDead) {
+ NumTailDupRemoved += MBB->size();
+ removeDeadBlock(MBB);
+ ++NumDeadBlocks;
+ }
+
+ // Update SSA form.
+ if (!SSAUpdateVRs.empty()) {
+ for (unsigned i = 0, e = SSAUpdateVRs.size(); i != e; ++i) {
+ unsigned VReg = SSAUpdateVRs[i];
+ SSAUpdate.Initialize(VReg);
+
+ // If the original definition is still around, add it as an available
+ // value.
+ MachineInstr *DefMI = MRI->getVRegDef(VReg);
+ MachineBasicBlock *DefBB = nullptr;
+ if (DefMI) {
+ DefBB = DefMI->getParent();
+ SSAUpdate.AddAvailableValue(DefBB, VReg);
+ }
+
+ // Add the new vregs as available values.
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(VReg);
+ for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = LI->second[j].first;
+ unsigned SrcReg = LI->second[j].second;
+ SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
+ }
+
+ // Rewrite uses that are outside of the original def's block.
+ MachineRegisterInfo::use_iterator UI = MRI->use_begin(VReg);
+ while (UI != MRI->use_end()) {
+ MachineOperand &UseMO = *UI;
+ MachineInstr *UseMI = UseMO.getParent();
+ ++UI;
+ if (UseMI->isDebugValue()) {
+ // SSAUpdate can replace the use with an undef. That creates
+ // a debug instruction that is a kill.
+ // FIXME: Should it be SSAUpdate's job to delete debug instructions
+ // instead of replacing the use with undef?
+ UseMI->eraseFromParent();
+ continue;
+ }
+ if (UseMI->getParent() == DefBB && !UseMI->isPHI())
+ continue;
+ SSAUpdate.RewriteUse(UseMO);
+ }
+ }
+
+ SSAUpdateVRs.clear();
+ SSAUpdateVals.clear();
+ }
+
+ // Eliminate some of the copies inserted by tail duplication to maintain
+ // SSA form.
+ for (unsigned i = 0, e = Copies.size(); i != e; ++i) {
+ MachineInstr *Copy = Copies[i];
+ if (!Copy->isCopy())
+ continue;
+ unsigned Dst = Copy->getOperand(0).getReg();
+ unsigned Src = Copy->getOperand(1).getReg();
+ if (MRI->hasOneNonDBGUse(Src) &&
+ MRI->constrainRegClass(Src, MRI->getRegClass(Dst))) {
+ // Copy is the only use. Do trivial copy propagation here.
+ MRI->replaceRegWith(Dst, Src);
+ Copy->eraseFromParent();
+ }
+ }
+
+ if (NewPHIs.size())
+ NumAddedPHIs += NewPHIs.size();
+
+ return true;
+}
+
+/// Look for small blocks that are unconditionally branched to and do not fall
+/// through. Tail-duplicate their instructions into their predecessors to
+/// eliminate (dynamic) branches.
+bool TailDuplicator::tailDuplicateBlocks(MachineFunction &MF) {
+ bool MadeChange = false;
+
+ if (PreRegAlloc && TailDupVerify) {
+ DEBUG(dbgs() << "\n*** Before tail-duplicating\n");
+ VerifyPHIs(MF, true);
+ }
+
+ for (MachineFunction::iterator I = ++MF.begin(), E = MF.end(); I != E;) {
+ MachineBasicBlock *MBB = &*I++;
+
+ if (NumTails == TailDupLimit)
+ break;
+
+ bool IsSimple = isSimpleBB(MBB);
+
+ if (!shouldTailDuplicate(MF, IsSimple, *MBB))
+ continue;
+
+ MadeChange |= tailDuplicateAndUpdate(MF, IsSimple, MBB);
+ }
+
+ if (PreRegAlloc && TailDupVerify)
+ VerifyPHIs(MF, false);
+
+ return MadeChange;
+}
+
+static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+ const MachineRegisterInfo *MRI) {
+ for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
+ if (UseMI.isDebugValue())
+ continue;
+ if (UseMI.getParent() != BB)
+ return true;
+ }
+ return false;
+}
+
+static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
+ for (unsigned i = 1, e = MI->getNumOperands(); i != e; i += 2)
+ if (MI->getOperand(i + 1).getMBB() == SrcBB)
+ return i;
+ return 0;
+}
+
+// Remember which registers are used by phis in this block. This is
+// used to determine which registers are live out while modifying the
+// block (which is why we need to copy the information).
+static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
+ DenseSet<unsigned> *UsedByPhi) {
+ for (const auto &MI : BB) {
+ if (!MI.isPHI())
+ break;
+ for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+ unsigned SrcReg = MI.getOperand(i).getReg();
+ UsedByPhi->insert(SrcReg);
+ }
+ }
+}
+
+/// Add a (definition, source virtual register) pair for SSA update.
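+/// For example (register and block names are illustrative), if %vreg5 from
+/// the tail block was re-defined as %vreg9 when duplicating into PredBB1 and
+/// as %vreg12 when duplicating into PredBB2, then SSAUpdateVals[%vreg5] ends
+/// up holding {(PredBB1, %vreg9), (PredBB2, %vreg12)} and %vreg5 is queued in
+/// SSAUpdateVRs for the later MachineSSAUpdater pass over its uses.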
+void TailDuplicator::addSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+ MachineBasicBlock *BB) {
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(OrigReg);
+ if (LI != SSAUpdateVals.end())
+ LI->second.push_back(std::make_pair(BB, NewReg));
+ else {
+ AvailableValsTy Vals;
+ Vals.push_back(std::make_pair(BB, NewReg));
+ SSAUpdateVals.insert(std::make_pair(OrigReg, Vals));
+ SSAUpdateVRs.push_back(OrigReg);
+ }
+}
+
+/// Process PHI node in TailBB by turning it into a copy in PredBB. Remember the
+/// source register that's contributed by PredBB and update the SSA update map.
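+/// Schematically (names are illustrative): for a PHI in TailBB such as
+///   %vreg2 = PHI %vreg7, <PredBB>, %vreg8, <OtherBB>
+/// this records %vreg2 -> %vreg7 in LocalVRMap, queues a COPY of %vreg7 into
+/// a fresh register at the end of PredBB (materialized later by
+/// appendCopies), and, if Remove is set, strips the <PredBB> entry from the
+/// PHI.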
+void TailDuplicator::processPHI(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
+ SmallVectorImpl<std::pair<unsigned, RegSubRegPair>> &Copies,
+ const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
+ unsigned DefReg = MI->getOperand(0).getReg();
+ unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
+ assert(SrcOpIdx && "Unable to find matching PHI source?");
+ unsigned SrcReg = MI->getOperand(SrcOpIdx).getReg();
+ unsigned SrcSubReg = MI->getOperand(SrcOpIdx).getSubReg();
+ const TargetRegisterClass *RC = MRI->getRegClass(DefReg);
+ LocalVRMap.insert(std::make_pair(DefReg, RegSubRegPair(SrcReg, SrcSubReg)));
+
+ // Insert a copy from the source at the end of the block. The def register
+ // is the available value live out of the block.
+ unsigned NewDef = MRI->createVirtualRegister(RC);
+ Copies.push_back(std::make_pair(NewDef, RegSubRegPair(SrcReg, SrcSubReg)));
+ if (isDefLiveOut(DefReg, TailBB, MRI) || RegsUsedByPhi.count(DefReg))
+ addSSAUpdateEntry(DefReg, NewDef, PredBB);
+
+ if (!Remove)
+ return;
+
+ // Remove PredBB from the PHI node.
+ MI->RemoveOperand(SrcOpIdx + 1);
+ MI->RemoveOperand(SrcOpIdx);
+ if (MI->getNumOperands() == 1)
+ MI->eraseFromParent();
+}
+
+/// Duplicate a TailBB instruction to PredBB and update
+/// the source operands due to earlier PHI translation.
+void TailDuplicator::duplicateInstruction(
+ MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
+ MachineFunction &MF,
+ DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
+ const DenseSet<unsigned> &UsedByPhi) {
+ MachineInstr *NewMI = TII->duplicate(*MI, MF);
+ if (PreRegAlloc) {
+ for (unsigned i = 0, e = NewMI->getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = NewMI->getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ continue;
+ if (MO.isDef()) {
+ const TargetRegisterClass *RC = MRI->getRegClass(Reg);
+ unsigned NewReg = MRI->createVirtualRegister(RC);
+ MO.setReg(NewReg);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ if (isDefLiveOut(Reg, TailBB, MRI) || UsedByPhi.count(Reg))
+ addSSAUpdateEntry(Reg, NewReg, PredBB);
+ } else {
+ auto VI = LocalVRMap.find(Reg);
+ if (VI != LocalVRMap.end()) {
+ // Need to make sure that the register class of the mapped register
+ // will satisfy the constraints of the class of the register being
+ // replaced.
+ auto *OrigRC = MRI->getRegClass(Reg);
+ auto *MappedRC = MRI->getRegClass(VI->second.Reg);
+ const TargetRegisterClass *ConstrRC;
+ if (VI->second.SubReg != 0) {
+ ConstrRC = TRI->getMatchingSuperRegClass(MappedRC, OrigRC,
+ VI->second.SubReg);
+ if (ConstrRC) {
+ // The actual constraining (as in "find appropriate new class")
+ // is done by getMatchingSuperRegClass, so now we only need to
+ // change the class of the mapped register.
+ MRI->setRegClass(VI->second.Reg, ConstrRC);
+ }
+ } else {
+ // For mapped registers that do not have sub-registers, simply
+ // restrict their class to match the original one.
+ ConstrRC = MRI->constrainRegClass(VI->second.Reg, OrigRC);
+ }
+
+ if (ConstrRC) {
+ // If the class constraining succeeded, we can simply replace
+ // the old register with the mapped one.
+ MO.setReg(VI->second.Reg);
+ // We have Reg -> VI.Reg:VI.SubReg, so if Reg is used with a
+ // sub-register, we need to compose the sub-register indices.
+ MO.setSubReg(TRI->composeSubRegIndices(MO.getSubReg(),
+ VI->second.SubReg));
+ } else {
+ // The direct replacement is not possible, due to failing register
+ // class constraints. An explicit COPY is necessary. Create one
+ // that can be reused.
+ auto *NewRC = MI->getRegClassConstraint(i, TII, TRI);
+ if (NewRC == nullptr)
+ NewRC = OrigRC;
+ unsigned NewReg = MRI->createVirtualRegister(NewRC);
+ BuildMI(*PredBB, MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), NewReg)
+ .addReg(VI->second.Reg, 0, VI->second.SubReg);
+ LocalVRMap.erase(VI);
+ LocalVRMap.insert(std::make_pair(Reg, RegSubRegPair(NewReg, 0)));
+ MO.setReg(NewReg);
+ // The composed VI.Reg:VI.SubReg is replaced with NewReg, which
+ // is equivalent to the whole register Reg. Hence, Reg:subreg
+ // is same as NewReg:subreg, so keep the sub-register index
+ // unchanged.
+ }
+ // Clear any kill flags from this operand. The new register could
+ // have uses after this one, so kills are not valid here.
+ MO.setIsKill(false);
+ }
+ }
+ }
+ }
+ PredBB->insert(PredBB->instr_end(), NewMI);
+}
+
+/// After FromBB is tail duplicated into its predecessor blocks, the successors
+/// have gained new predecessors. Update the PHI instructions in them
+/// accordingly.
+void TailDuplicator::updateSuccessorsPHIs(
+ MachineBasicBlock *FromBB, bool isDead,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallSetVector<MachineBasicBlock *, 8> &Succs) {
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator SI = Succs.begin(),
+ SE = Succs.end();
+ SI != SE; ++SI) {
+ MachineBasicBlock *SuccBB = *SI;
+ for (MachineBasicBlock::iterator II = SuccBB->begin(), EE = SuccBB->end();
+ II != EE; ++II) {
+ if (!II->isPHI())
+ break;
+ MachineInstrBuilder MIB(*FromBB->getParent(), II);
+ unsigned Idx = 0;
+ for (unsigned i = 1, e = II->getNumOperands(); i != e; i += 2) {
+ MachineOperand &MO = II->getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ Idx = i;
+ break;
+ }
+ }
+
+ assert(Idx != 0);
+ MachineOperand &MO0 = II->getOperand(Idx);
+ unsigned Reg = MO0.getReg();
+ if (isDead) {
+ // Folded into the previous BB.
+ // There could be duplicate PHI source entries. FIXME: Should sdisel or an
+ // earlier pass have fixed this?
+ for (unsigned i = II->getNumOperands() - 2; i != Idx; i -= 2) {
+ MachineOperand &MO = II->getOperand(i + 1);
+ if (MO.getMBB() == FromBB) {
+ II->RemoveOperand(i + 1);
+ II->RemoveOperand(i);
+ }
+ }
+ } else
+ Idx = 0;
+
+ // If Idx is set, the operands at Idx and Idx+1 must be removed.
+ // We reuse the location to avoid expensive RemoveOperand calls.
+
+ DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ SSAUpdateVals.find(Reg);
+ if (LI != SSAUpdateVals.end()) {
+ // This register is defined in the tail block.
+ for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = LI->second[j].first;
+ // If we didn't duplicate a bb into a particular predecessor, we
+ // might still have added an entry to SSAUpdateVals to correctly
+ // recompute SSA. In that case, avoid adding a dummy extra argument
+ // to this PHI.
+ if (!SrcBB->isSuccessor(SuccBB))
+ continue;
+
+ unsigned SrcReg = LI->second[j].second;
+ if (Idx != 0) {
+ II->getOperand(Idx).setReg(SrcReg);
+ II->getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(SrcReg).addMBB(SrcBB);
+ }
+ }
+ } else {
+ // Live in tail block, must also be live in predecessors.
+ for (unsigned j = 0, ee = TDBBs.size(); j != ee; ++j) {
+ MachineBasicBlock *SrcBB = TDBBs[j];
+ if (Idx != 0) {
+ II->getOperand(Idx).setReg(Reg);
+ II->getOperand(Idx + 1).setMBB(SrcBB);
+ Idx = 0;
+ } else {
+ MIB.addReg(Reg).addMBB(SrcBB);
+ }
+ }
+ }
+ if (Idx != 0) {
+ II->RemoveOperand(Idx + 1);
+ II->RemoveOperand(Idx);
+ }
+ }
+ }
+}
+
+/// Determine if it is profitable to duplicate this block.
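+/// The instruction budget is TailDuplicateSize (2 by default), dropped to 1
+/// for functions optimized for size unless -tail-dup-size was given
+/// explicitly, and raised to 20 before register allocation for blocks ending
+/// in an indirect branch; see the heuristics below.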
+bool TailDuplicator::shouldTailDuplicate(const MachineFunction &MF,
+ bool IsSimple,
+ MachineBasicBlock &TailBB) {
+ // Only duplicate blocks that end with unconditional branches.
+ if (TailBB.canFallThrough())
+ return false;
+
+ // Don't try to tail-duplicate single-block loops.
+ if (TailBB.isSuccessor(&TailBB))
+ return false;
+
+ // Set the limit on the cost to duplicate. When optimizing for size,
+ // duplicate only one instruction, because one branch instruction can be
+ // eliminated to compensate for the duplication.
+ unsigned MaxDuplicateCount;
+ if (TailDuplicateSize.getNumOccurrences() == 0 &&
+ // FIXME: Use Function::optForSize().
+ MF.getFunction()->hasFnAttribute(Attribute::OptimizeForSize))
+ MaxDuplicateCount = 1;
+ else
+ MaxDuplicateCount = TailDuplicateSize;
+
+ // If the target has hardware branch prediction that can handle indirect
+ // branches, duplicating them can often make them predictable when there
+ // are common paths through the code. The limit needs to be high enough
+ // to allow undoing the effects of tail merging and other optimizations
+ // that rearrange the predecessors of the indirect branch.
+
+ bool HasIndirectbr = false;
+ if (!TailBB.empty())
+ HasIndirectbr = TailBB.back().isIndirectBranch();
+
+ if (HasIndirectbr && PreRegAlloc)
+ MaxDuplicateCount = 20;
+
+ // Check the instructions in the block to determine whether tail-duplication
+ // is invalid or unlikely to be profitable.
+ unsigned InstrCount = 0;
+ for (MachineInstr &MI : TailBB) {
+ // Non-duplicable things shouldn't be tail-duplicated.
+ if (MI.isNotDuplicable())
+ return false;
+
+ // Convergent instructions can be duplicated only if doing so doesn't add
+ // new control dependencies, which is exactly what tail duplication would do
+ // here, so reject the block.
+ if (MI.isConvergent())
+ return false;
+
+ // Do not duplicate 'return' instructions if this is a pre-regalloc run.
+ // A return may expand into a lot more instructions (e.g. reload of callee
+ // saved registers) after PEI.
+ if (PreRegAlloc && MI.isReturn())
+ return false;
+
+ // Avoid duplicating calls before register allocation. Calls present a
+ // barrier to register allocation, so duplicating them may end up increasing
+ // spills.
+ if (PreRegAlloc && MI.isCall())
+ return false;
+
+ if (!MI.isPHI() && !MI.isDebugValue())
+ InstrCount += 1;
+
+ if (InstrCount > MaxDuplicateCount)
+ return false;
+ }
+
+ // Check if any of the successors of TailBB has a PHI node in which the
+ // value corresponding to TailBB uses a subregister.
+ // If a phi node uses a register paired with a subregister, the actual
+ // "value type" of the phi may differ from the type of the register without
+ // any subregisters. Due to a bug, tail duplication may add a new operand
+ // without the necessary subregister, producing invalid code. This is
+ // demonstrated by test/CodeGen/Hexagon/tail-dup-subreg-abort.ll.
+ // Disable tail duplication for this case for now, until the problem is
+ // fixed.
+ for (auto SB : TailBB.successors()) {
+ for (auto &I : *SB) {
+ if (!I.isPHI())
+ break;
+ unsigned Idx = getPHISrcRegOpIdx(&I, &TailBB);
+ assert(Idx != 0);
+ MachineOperand &PU = I.getOperand(Idx);
+ if (PU.getSubReg() != 0)
+ return false;
+ }
+ }
+
+ if (HasIndirectbr && PreRegAlloc)
+ return true;
+
+ if (IsSimple)
+ return true;
+
+ if (!PreRegAlloc)
+ return true;
+
+ return canCompletelyDuplicateBB(TailBB);
+}
+
+/// True if this BB has only one unconditional jump.
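+/// That is, the block has exactly one successor, at least one predecessor,
+/// and contains nothing besides debug values and (optionally) a single
+/// unconditional branch.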
+bool TailDuplicator::isSimpleBB(MachineBasicBlock *TailBB) {
+ if (TailBB->succ_size() != 1)
+ return false;
+ if (TailBB->pred_empty())
+ return false;
+ MachineBasicBlock::iterator I = TailBB->getFirstNonDebugInstr();
+ if (I == TailBB->end())
+ return true;
+ return I->isUnconditionalBranch();
+}
+
+static bool bothUsedInPHI(const MachineBasicBlock &A,
+ const SmallPtrSet<MachineBasicBlock *, 8> &SuccsB) {
+ for (MachineBasicBlock *BB : A.successors())
+ if (SuccsB.count(BB) && !BB->empty() && BB->begin()->isPHI())
+ return true;
+
+ return false;
+}
+
+bool TailDuplicator::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
+ for (MachineBasicBlock *PredBB : BB.predecessors()) {
+ if (PredBB->succ_size() > 1)
+ return false;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ return false;
+
+ if (!PredCond.empty())
+ return false;
+ }
+ return true;
+}
+
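+/// Duplicate a "simple" block (see isSimpleBB) into those predecessors whose
+/// branches can be rewritten, without copying any instructions: each such
+/// predecessor's branch to TailBB is retargeted at TailBB's single successor.
+/// Schematically,
+///   Pred -> Tail -> Succ    becomes    Pred -> Succ
+/// with Tail left in place until it becomes dead.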
+bool TailDuplicator::duplicateSimpleBB(
+ MachineBasicBlock *TailBB, SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ const DenseSet<unsigned> &UsedByPhi,
+ SmallVectorImpl<MachineInstr *> &Copies) {
+ SmallPtrSet<MachineBasicBlock *, 8> Succs(TailBB->succ_begin(),
+ TailBB->succ_end());
+ SmallVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
+ TailBB->pred_end());
+ bool Changed = false;
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+
+ if (PredBB->hasEHPadSuccessor())
+ continue;
+
+ if (bothUsedInPHI(*PredBB, Succs))
+ continue;
+
+ MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ continue;
+
+ Changed = true;
+ DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From simple Succ: " << *TailBB);
+
+ MachineBasicBlock *NewTarget = *TailBB->succ_begin();
+ MachineBasicBlock *NextBB = &*std::next(PredBB->getIterator());
+
+ // Make PredFBB explicit.
+ if (PredCond.empty())
+ PredFBB = PredTBB;
+
+ // Make fall through explicit.
+ if (!PredTBB)
+ PredTBB = NextBB;
+ if (!PredFBB)
+ PredFBB = NextBB;
+
+ // Redirect branches targeting TailBB to its single successor.
+ if (PredFBB == TailBB)
+ PredFBB = NewTarget;
+ if (PredTBB == TailBB)
+ PredTBB = NewTarget;
+
+ // Make the branch unconditional if possible.
+ if (PredTBB == PredFBB) {
+ PredCond.clear();
+ PredFBB = nullptr;
+ }
+
+ // Avoid adding fall through branches.
+ if (PredFBB == NextBB)
+ PredFBB = nullptr;
+ if (PredTBB == NextBB && PredFBB == nullptr)
+ PredTBB = nullptr;
+
+ TII->RemoveBranch(*PredBB);
+
+ if (!PredBB->isSuccessor(NewTarget))
+ PredBB->replaceSuccessor(TailBB, NewTarget);
+ else {
+ PredBB->removeSuccessor(TailBB, true);
+ assert(PredBB->succ_size() <= 1);
+ }
+
+ if (PredTBB)
+ TII->InsertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+
+ TDBBs.push_back(PredBB);
+ }
+ return Changed;
+}
+
+/// If it is profitable, duplicate TailBB's contents in each
+/// of its predecessors.
+bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple,
+ MachineBasicBlock *TailBB,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallVectorImpl<MachineInstr *> &Copies) {
+ DEBUG(dbgs() << "\n*** Tail-duplicating BB#" << TailBB->getNumber() << '\n');
+
+ DenseSet<unsigned> UsedByPhi;
+ getRegsUsedByPHIs(*TailBB, &UsedByPhi);
+
+ if (IsSimple)
+ return duplicateSimpleBB(TailBB, TDBBs, UsedByPhi, Copies);
+
+ // Iterate through all the unique predecessors and tail-duplicate this
+ // block into them, if possible. Copying the list ahead of time also
+ // avoids trouble with the predecessor list reallocating.
+ bool Changed = false;
+ SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
+ TailBB->pred_end());
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+
+ assert(TailBB != PredBB &&
+ "Single-block loop should have been rejected earlier!");
+ // EH edges are ignored by AnalyzeBranch.
+ if (PredBB->succ_size() > 1)
+ continue;
+
+ MachineBasicBlock *PredTBB, *PredFBB;
+ SmallVector<MachineOperand, 4> PredCond;
+ if (TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
+ continue;
+ if (!PredCond.empty())
+ continue;
+ // Don't duplicate into a fall-through predecessor (at least for now).
+ if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+ continue;
+
+ DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
+ << "From Succ: " << *TailBB);
+
+ TDBBs.push_back(PredBB);
+
+ // Remove PredBB's unconditional branch.
+ TII->RemoveBranch(*PredBB);
+
+ // Clone the contents of TailBB into PredBB.
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ // Use instr_iterator here to properly handle bundles, e.g.
+ // ARM Thumb2 IT block.
+ MachineBasicBlock::instr_iterator I = TailBB->instr_begin();
+ while (I != TailBB->instr_end()) {
+ MachineInstr *MI = &*I;
+ ++I;
+ if (MI->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ } else {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ duplicateInstruction(MI, TailBB, PredBB, MF, LocalVRMap, UsedByPhi);
+ }
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+
+ // Simplify the branches of PredBB; analyzeBranch with AllowModify set may
+ // clean them up.
+ TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
+
+ NumTailDupAdded += TailBB->size() - 1; // subtract one for removed branch
+
+ // Update the CFG.
+ PredBB->removeSuccessor(PredBB->succ_begin());
+ assert(PredBB->succ_empty() &&
+ "TailDuplicate called on block with multiple successors!");
+ for (MachineBasicBlock::succ_iterator I = TailBB->succ_begin(),
+ E = TailBB->succ_end();
+ I != E; ++I)
+ PredBB->addSuccessor(*I, MBPI->getEdgeProbability(TailBB, I));
+
+ Changed = true;
+ ++NumTailDups;
+ }
+
+ // If TailBB was duplicated into all its predecessors except for the prior
+ // block, which falls through unconditionally, move the contents of this
+ // block into the prior block.
+ MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
+ MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
+ SmallVector<MachineOperand, 4> PriorCond;
+ // This has to check PrevBB->succ_size() because EH edges are ignored by
+ // AnalyzeBranch.
+ if (PrevBB->succ_size() == 1 &&
+ !TII->analyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
+ PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
+ !TailBB->hasAddressTaken()) {
+ DEBUG(dbgs() << "\nMerging into block: " << *PrevBB
+ << "From MBB: " << *TailBB);
+ if (PreRegAlloc) {
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PrevBB, LocalVRMap, CopyInfos, UsedByPhi, true);
+ }
+
+ // Now copy the non-PHI instructions.
+ while (I != TailBB->end()) {
+ // Replace def of virtual registers with new registers, and update
+ // uses with PHI source register or the new registers.
+ MachineInstr *MI = &*I++;
+ assert(!MI->isBundle() && "Not expecting bundles before regalloc!");
+ duplicateInstruction(MI, TailBB, PrevBB, MF, LocalVRMap, UsedByPhi);
+ MI->eraseFromParent();
+ }
+ appendCopies(PrevBB, CopyInfos, Copies);
+ } else {
+ // No PHIs to worry about, just splice the instructions over.
+ PrevBB->splice(PrevBB->end(), TailBB, TailBB->begin(), TailBB->end());
+ }
+ PrevBB->removeSuccessor(PrevBB->succ_begin());
+ assert(PrevBB->succ_empty());
+ PrevBB->transferSuccessors(TailBB);
+ TDBBs.push_back(PrevBB);
+ Changed = true;
+ }
+
+ // If this is after register allocation, there are no phis to fix.
+ if (!PreRegAlloc)
+ return Changed;
+
+ // If we made no changes so far, we are safe.
+ if (!Changed)
+ return Changed;
+
+ // Handle the nasty case in which we duplicated a block that is part of a loop
+ // into some but not all of its predecessors. For example:
+ //    1 -> 2 <-> 3                 |
+ //          \                      |
+ //           \---> rest            |
+ // if we duplicate 2 into 1 but not into 3, we end up with
+ //    12 -> 3 <-> 2 -> rest        |
+ //      \             /            |
+ //       \----->-----/             |
+ // If there was a "var = phi(1, 3)" in 2, it has to be ultimately replaced
+ // with a phi in 3 (which now dominates 2).
+ // What we do here is introduce a copy in 3 of the register defined by the
+ // phi, just like when we are duplicating 2 into 3, but we don't copy any
+ // real instructions or remove the 3 -> 2 edge from the phi in 2.
+ for (SmallSetVector<MachineBasicBlock *, 8>::iterator PI = Preds.begin(),
+ PE = Preds.end();
+ PI != PE; ++PI) {
+ MachineBasicBlock *PredBB = *PI;
+ if (std::find(TDBBs.begin(), TDBBs.end(), PredBB) != TDBBs.end())
+ continue;
+
+ // EH edges
+ if (PredBB->succ_size() != 1)
+ continue;
+
+ DenseMap<unsigned, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ MachineBasicBlock::iterator I = TailBB->begin();
+ // Process PHI instructions first.
+ while (I != TailBB->end() && I->isPHI()) {
+ // Replace the uses of the def of the PHI with the register coming
+ // from PredBB.
+ MachineInstr *MI = &*I++;
+ processPHI(MI, TailBB, PredBB, LocalVRMap, CopyInfos, UsedByPhi, false);
+ }
+ appendCopies(PredBB, CopyInfos, Copies);
+ }
+
+ return Changed;
+}
+
+/// At the end of the block \p MBB generate COPY instructions between registers
+/// described by \p CopyInfos. Append resulting instructions to \p Copies.
+void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
+ SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
+ SmallVectorImpl<MachineInstr*> &Copies) {
+ MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
+ const MCInstrDesc &CopyD = TII->get(TargetOpcode::COPY);
+ for (auto &CI : CopyInfos) {
+ auto C = BuildMI(*MBB, Loc, DebugLoc(), CopyD, CI.first)
+ .addReg(CI.second.Reg, 0, CI.second.SubReg);
+ Copies.push_back(C);
+ }
+}
+
+/// Remove the specified dead machine basic block from the function, updating
+/// the CFG.
+void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) {
+ assert(MBB->pred_empty() && "MBB must be dead!");
+ DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+
+ // Remove all successors.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_end() - 1);
+
+ // Remove the block.
+ MBB->eraseFromParent();
+}
+
+} // End llvm namespace
diff --git a/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
new file mode 100644
index 000000000000..cac7e63af328
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -0,0 +1,104 @@
+//===----- TargetFrameLoweringImpl.cpp - Implement target frame interface --==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the layout of a stack frame on the target machine.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cstdlib>
+using namespace llvm;
+
+TargetFrameLowering::~TargetFrameLowering() {
+}
+
+/// The default implementation just looks at attribute "no-frame-pointer-elim".
+bool TargetFrameLowering::noFramePointerElim(const MachineFunction &MF) const {
+ auto Attr = MF.getFunction()->getFnAttribute("no-frame-pointer-elim");
+ return Attr.getValueAsString() == "true";
+}
+
+/// Returns the displacement from the frame register to the stack
+/// frame of the specified index, along with the frame register used
+/// (in output arg FrameReg). This is the default implementation which
+/// is overridden for some targets.
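+/// For example (illustrative numbers), an object at offset -8 in a 32-byte
+/// frame with a zero local-area offset and no offset adjustment yields
+/// -8 + 32 - 0 + 0 = 24 bytes from the frame register.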
+int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI, unsigned &FrameReg) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+
+ // By default, assume all frame indices are referenced via whatever
+ // getFrameRegister() says. The target can override this if it's doing
+ // something different.
+ FrameReg = RI->getFrameRegister(MF);
+
+ return MFI->getObjectOffset(FI) + MFI->getStackSize() -
+ getOffsetOfLocalArea() + MFI->getOffsetAdjustment();
+}
+
+bool TargetFrameLowering::needsFrameIndexResolution(
+ const MachineFunction &MF) const {
+ return MF.getFrameInfo()->hasStackObjects();
+}
+
+void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
+ BitVector &SavedRegs,
+ RegScavenger *RS) const {
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // Resize before the early returns. Some backends expect that
+ // SavedRegs.size() == TRI.getNumRegs() after this call even if there are no
+ // saved registers.
+ SavedRegs.resize(TRI.getNumRegs());
+
+ // When interprocedural register allocation is enabled, caller-saved
+ // registers are preferred over callee-saved registers.
+ if (MF.getTarget().Options.EnableIPRA && isSafeForNoCSROpt(MF.getFunction()))
+ return;
+
+ // Get the callee saved register list...
+ const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+
+ // Early exit if there are no callee saved registers.
+ if (!CSRegs || CSRegs[0] == 0)
+ return;
+
+ // In Naked functions we aren't going to save any registers.
+ if (MF.getFunction()->hasFnAttribute(Attribute::Naked))
+ return;
+
+ // Functions which call __builtin_unwind_init get all their registers saved.
+ bool CallsUnwindInit = MF.getMMI().callsUnwindInit();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (unsigned i = 0; CSRegs[i]; ++i) {
+ unsigned Reg = CSRegs[i];
+ if (CallsUnwindInit || MRI.isPhysRegModified(Reg))
+ SavedRegs.set(Reg);
+ }
+}
+
+unsigned TargetFrameLowering::getStackAlignmentSkew(
+ const MachineFunction &MF) const {
+ // When an HHVM function is called, the stack is skewed as the return address
+ // is removed from the stack before we enter the function.
+ if (LLVM_UNLIKELY(MF.getFunction()->getCallingConv() == CallingConv::HHVM))
+ return MF.getTarget().getPointerSize();
+
+ return 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
new file mode 100644
index 000000000000..e7330c60ed23
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -0,0 +1,1195 @@
+//===-- TargetInstrInfo.cpp - Target Instruction Information --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetInstrInfo class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cctype>
+
+using namespace llvm;
+
+static cl::opt<bool> DisableHazardRecognizer(
+ "disable-sched-hazard", cl::Hidden, cl::init(false),
+ cl::desc("Disable hazard detection during preRA scheduling"));
+
+TargetInstrInfo::~TargetInstrInfo() {
+}
+
+const TargetRegisterClass*
+TargetInstrInfo::getRegClass(const MCInstrDesc &MCID, unsigned OpNum,
+ const TargetRegisterInfo *TRI,
+ const MachineFunction &MF) const {
+ if (OpNum >= MCID.getNumOperands())
+ return nullptr;
+
+ short RegClass = MCID.OpInfo[OpNum].RegClass;
+ if (MCID.OpInfo[OpNum].isLookupPtrRegClass())
+ return TRI->getPointerRegClass(MF, RegClass);
+
+ // Instructions like INSERT_SUBREG do not have fixed register classes.
+ if (RegClass < 0)
+ return nullptr;
+
+ // Otherwise just look it up normally.
+ return TRI->getRegClass(RegClass);
+}
+
+/// insertNoop - Insert a noop into the instruction stream at the specified
+/// point.
+void TargetInstrInfo::insertNoop(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI) const {
+ llvm_unreachable("Target didn't implement insertNoop!");
+}
+
+/// Measure the specified inline asm to determine an approximation of its
+/// length.
+/// Comments (which run till the next SeparatorString or newline) do not
+/// count as an instruction.
+/// Any other non-whitespace text is considered an instruction, with
+/// multiple instructions separated by SeparatorString or newlines.
+/// Variable-length instructions are not handled here; this function
+/// may be overridden in the target code to do that.
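+/// For example, assuming the target's separator string is "\n" and its
+/// comment string is "#", the string "nop\n# spill\nnop" counts as two
+/// instructions and is reported as 2 * MAI.getMaxInstLength() bytes.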
+unsigned TargetInstrInfo::getInlineAsmLength(const char *Str,
+ const MCAsmInfo &MAI) const {
+ // Count the number of instructions in the asm.
+ bool atInsnStart = true;
+ unsigned InstCount = 0;
+ for (; *Str; ++Str) {
+ if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
+ strlen(MAI.getSeparatorString())) == 0) {
+ atInsnStart = true;
+ } else if (strncmp(Str, MAI.getCommentString(),
+ strlen(MAI.getCommentString())) == 0) {
+ // Stop counting as an instruction after a comment until the next
+ // separator.
+ atInsnStart = false;
+ }
+
+ if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ ++InstCount;
+ atInsnStart = false;
+ }
+ }
+
+ return InstCount * MAI.getMaxInstLength();
+}
+
+/// ReplaceTailWithBranchTo - Delete the instruction Tail and everything
+/// after it, replacing it with an unconditional branch to NewDest.
+void
+TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
+ MachineBasicBlock *NewDest) const {
+ MachineBasicBlock *MBB = Tail->getParent();
+
+ // Remove all the old successors of MBB from the CFG.
+ while (!MBB->succ_empty())
+ MBB->removeSuccessor(MBB->succ_begin());
+
+ // Save off the debug loc before erasing the instruction.
+ DebugLoc DL = Tail->getDebugLoc();
+
+ // Remove all the dead instructions from the end of MBB.
+ MBB->erase(Tail, MBB->end());
+
+ // If MBB isn't immediately before NewDest, insert a branch to it.
+ if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(NewDest))
+ InsertBranch(*MBB, NewDest, nullptr, SmallVector<MachineOperand, 0>(), DL);
+ MBB->addSuccessor(NewDest);
+}
+
+MachineInstr *TargetInstrInfo::commuteInstructionImpl(MachineInstr &MI,
+ bool NewMI, unsigned Idx1,
+ unsigned Idx2) const {
+ const MCInstrDesc &MCID = MI.getDesc();
+ bool HasDef = MCID.getNumDefs();
+ if (HasDef && !MI.getOperand(0).isReg())
+ // No idea how to commute this instruction. Target should implement its own.
+ return nullptr;
+
+ unsigned CommutableOpIdx1 = Idx1; (void)CommutableOpIdx1;
+ unsigned CommutableOpIdx2 = Idx2; (void)CommutableOpIdx2;
+ assert(findCommutedOpIndices(MI, CommutableOpIdx1, CommutableOpIdx2) &&
+ CommutableOpIdx1 == Idx1 && CommutableOpIdx2 == Idx2 &&
+ "TargetInstrInfo::CommuteInstructionImpl(): not commutable operands.");
+ assert(MI.getOperand(Idx1).isReg() && MI.getOperand(Idx2).isReg() &&
+ "This only knows how to commute register operands so far");
+
+ unsigned Reg0 = HasDef ? MI.getOperand(0).getReg() : 0;
+ unsigned Reg1 = MI.getOperand(Idx1).getReg();
+ unsigned Reg2 = MI.getOperand(Idx2).getReg();
+ unsigned SubReg0 = HasDef ? MI.getOperand(0).getSubReg() : 0;
+ unsigned SubReg1 = MI.getOperand(Idx1).getSubReg();
+ unsigned SubReg2 = MI.getOperand(Idx2).getSubReg();
+ bool Reg1IsKill = MI.getOperand(Idx1).isKill();
+ bool Reg2IsKill = MI.getOperand(Idx2).isKill();
+ bool Reg1IsUndef = MI.getOperand(Idx1).isUndef();
+ bool Reg2IsUndef = MI.getOperand(Idx2).isUndef();
+ bool Reg1IsInternal = MI.getOperand(Idx1).isInternalRead();
+ bool Reg2IsInternal = MI.getOperand(Idx2).isInternalRead();
+ // If the destination is tied to either of the commuted source registers,
+ // then it must be updated.
+ if (HasDef && Reg0 == Reg1 &&
+ MI.getDesc().getOperandConstraint(Idx1, MCOI::TIED_TO) == 0) {
+ Reg2IsKill = false;
+ Reg0 = Reg2;
+ SubReg0 = SubReg2;
+ } else if (HasDef && Reg0 == Reg2 &&
+ MI.getDesc().getOperandConstraint(Idx2, MCOI::TIED_TO) == 0) {
+ Reg1IsKill = false;
+ Reg0 = Reg1;
+ SubReg0 = SubReg1;
+ }
+
+ MachineInstr *CommutedMI = nullptr;
+ if (NewMI) {
+ // Create a new instruction.
+ MachineFunction &MF = *MI.getParent()->getParent();
+ CommutedMI = MF.CloneMachineInstr(&MI);
+ } else {
+ CommutedMI = &MI;
+ }
+
+ if (HasDef) {
+ CommutedMI->getOperand(0).setReg(Reg0);
+ CommutedMI->getOperand(0).setSubReg(SubReg0);
+ }
+ CommutedMI->getOperand(Idx2).setReg(Reg1);
+ CommutedMI->getOperand(Idx1).setReg(Reg2);
+ CommutedMI->getOperand(Idx2).setSubReg(SubReg1);
+ CommutedMI->getOperand(Idx1).setSubReg(SubReg2);
+ CommutedMI->getOperand(Idx2).setIsKill(Reg1IsKill);
+ CommutedMI->getOperand(Idx1).setIsKill(Reg2IsKill);
+ CommutedMI->getOperand(Idx2).setIsUndef(Reg1IsUndef);
+ CommutedMI->getOperand(Idx1).setIsUndef(Reg2IsUndef);
+ CommutedMI->getOperand(Idx2).setIsInternalRead(Reg1IsInternal);
+ CommutedMI->getOperand(Idx1).setIsInternalRead(Reg2IsInternal);
+ return CommutedMI;
+}
+
+MachineInstr *TargetInstrInfo::commuteInstruction(MachineInstr &MI, bool NewMI,
+ unsigned OpIdx1,
+ unsigned OpIdx2) const {
+ // If OpIdx1 or OpIdx2 is not specified, then this method is free to choose
+ // any commutable operand, which is done in findCommutedOpIndices() method
+ // called below.
+ if ((OpIdx1 == CommuteAnyOperandIndex || OpIdx2 == CommuteAnyOperandIndex) &&
+ !findCommutedOpIndices(MI, OpIdx1, OpIdx2)) {
+ assert(MI.isCommutable() &&
+ "Precondition violation: MI must be commutable.");
+ return nullptr;
+ }
+ return commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2);
+}
+
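+/// Reconcile the caller-requested operand indices (either of which may be
+/// CommuteAnyOperandIndex, meaning "pick one for me") with the operands that
+/// are actually commutable. For example, a request of
+/// (CommuteAnyOperandIndex, 2) against commutable operands (1, 2) fixes
+/// ResultIdx1 to 1; a request of (1, 3) against the same pair fails.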
+bool TargetInstrInfo::fixCommutedOpIndices(unsigned &ResultIdx1,
+ unsigned &ResultIdx2,
+ unsigned CommutableOpIdx1,
+ unsigned CommutableOpIdx2) {
+ if (ResultIdx1 == CommuteAnyOperandIndex &&
+ ResultIdx2 == CommuteAnyOperandIndex) {
+ ResultIdx1 = CommutableOpIdx1;
+ ResultIdx2 = CommutableOpIdx2;
+ } else if (ResultIdx1 == CommuteAnyOperandIndex) {
+ if (ResultIdx2 == CommutableOpIdx1)
+ ResultIdx1 = CommutableOpIdx2;
+ else if (ResultIdx2 == CommutableOpIdx2)
+ ResultIdx1 = CommutableOpIdx1;
+ else
+ return false;
+ } else if (ResultIdx2 == CommuteAnyOperandIndex) {
+ if (ResultIdx1 == CommutableOpIdx1)
+ ResultIdx2 = CommutableOpIdx2;
+ else if (ResultIdx1 == CommutableOpIdx2)
+ ResultIdx2 = CommutableOpIdx1;
+ else
+ return false;
+ } else
+ // Check that the result operand indices match the given commutable
+ // operand indices.
+ return (ResultIdx1 == CommutableOpIdx1 && ResultIdx2 == CommutableOpIdx2) ||
+ (ResultIdx1 == CommutableOpIdx2 && ResultIdx2 == CommutableOpIdx1);
+
+ return true;
+}
+
+bool TargetInstrInfo::findCommutedOpIndices(MachineInstr &MI,
+ unsigned &SrcOpIdx1,
+ unsigned &SrcOpIdx2) const {
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MCID.isCommutable())
+ return false;
+
+ // This assumes v0 = op v1, v2 and commuting would swap v1 and v2. If this
+ // is not true, then the target must implement this.
+ unsigned CommutableOpIdx1 = MCID.getNumDefs();
+ unsigned CommutableOpIdx2 = CommutableOpIdx1 + 1;
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+
+ if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+}
+
+bool TargetInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
+ if (!MI.isTerminator()) return false;
+
+ // Conditional branch is a special case.
+ if (MI.isBranch() && !MI.isBarrier())
+ return true;
+ if (!MI.isPredicable())
+ return true;
+ return !isPredicated(MI);
+}
+
+bool TargetInstrInfo::PredicateInstruction(
+ MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
+ bool MadeChange = false;
+
+ assert(!MI.isBundle() &&
+ "TargetInstrInfo::PredicateInstruction() can't handle bundles");
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ if (!MI.isPredicable())
+ return false;
+
+ for (unsigned j = 0, i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ if (MCID.OpInfo[i].isPredicate()) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg()) {
+ MO.setReg(Pred[j].getReg());
+ MadeChange = true;
+ } else if (MO.isImm()) {
+ MO.setImm(Pred[j].getImm());
+ MadeChange = true;
+ } else if (MO.isMBB()) {
+ MO.setMBB(Pred[j].getMBB());
+ MadeChange = true;
+ }
+ ++j;
+ }
+ }
+ return MadeChange;
+}
+
+bool TargetInstrInfo::hasLoadFromStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
+ if ((*o)->isLoad()) {
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ (*o)->getPseudoValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
+bool TargetInstrInfo::hasStoreToStackSlot(const MachineInstr &MI,
+ const MachineMemOperand *&MMO,
+ int &FrameIndex) const {
+ for (MachineInstr::mmo_iterator o = MI.memoperands_begin(),
+ oe = MI.memoperands_end();
+ o != oe; ++o) {
+ if ((*o)->isStore()) {
+ if (const FixedStackPseudoSourceValue *Value =
+ dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ (*o)->getPseudoValue())) {
+ FrameIndex = Value->getFrameIndex();
+ MMO = *o;
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
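+/// Compute the byte size and byte offset within a spill slot of class RC that
+/// a SubIdx sub-register occupies. For example (illustrative values), a
+/// 32-bit sub-register at bit offset 0 of an 8-byte register class yields
+/// Size = 4 and Offset = 0 on a little-endian target, and Offset = 8 - 4 = 4
+/// on a big-endian one; the function returns false when the sub-register is
+/// not a whole number of bytes in size or offset.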
+bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
+ unsigned SubIdx, unsigned &Size,
+ unsigned &Offset,
+ const MachineFunction &MF) const {
+ if (!SubIdx) {
+ Size = RC->getSize();
+ Offset = 0;
+ return true;
+ }
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ unsigned BitSize = TRI->getSubRegIdxSize(SubIdx);
+ // Convert bit size to byte size to be consistent with
+ // MCRegisterClass::getSize().
+ if (BitSize % 8)
+ return false;
+
+ int BitOffset = TRI->getSubRegIdxOffset(SubIdx);
+ if (BitOffset < 0 || BitOffset % 8)
+ return false;
+
+ Size = BitSize /= 8;
+ Offset = (unsigned)BitOffset / 8;
+
+ assert(RC->getSize() >= (Offset + Size) && "bad subregister range");
+
+ if (!MF.getDataLayout().isLittleEndian()) {
+ Offset = RC->getSize() - (Offset + Size);
+ }
+ return true;
+}
+
+void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ unsigned DestReg, unsigned SubIdx,
+ const MachineInstr &Orig,
+ const TargetRegisterInfo &TRI) const {
+ MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
+ MI->substituteRegister(MI->getOperand(0).getReg(), DestReg, SubIdx, TRI);
+ MBB.insert(I, MI);
+}
+
+bool TargetInstrInfo::produceSameValue(const MachineInstr &MI0,
+ const MachineInstr &MI1,
+ const MachineRegisterInfo *MRI) const {
+ return MI0.isIdenticalTo(MI1, MachineInstr::IgnoreVRegDefs);
+}
+
+MachineInstr *TargetInstrInfo::duplicate(MachineInstr &Orig,
+ MachineFunction &MF) const {
+ assert(!Orig.isNotDuplicable() && "Instruction cannot be duplicated");
+ return MF.CloneMachineInstr(&Orig);
+}
+
+// If the COPY instruction in MI can be folded to a stack operation, return
+// the register class to use.
+static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI,
+ unsigned FoldIdx) {
+ assert(MI.isCopy() && "MI must be a COPY instruction");
+ if (MI.getNumOperands() != 2)
+ return nullptr;
+ assert(FoldIdx < 2 && "FoldIdx refers to a nonexistent operand");
+
+ const MachineOperand &FoldOp = MI.getOperand(FoldIdx);
+ const MachineOperand &LiveOp = MI.getOperand(1 - FoldIdx);
+
+ if (FoldOp.getSubReg() || LiveOp.getSubReg())
+ return nullptr;
+
+ unsigned FoldReg = FoldOp.getReg();
+ unsigned LiveReg = LiveOp.getReg();
+
+ assert(TargetRegisterInfo::isVirtualRegister(FoldReg) &&
+ "Cannot fold physregs");
+
+ const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ const TargetRegisterClass *RC = MRI.getRegClass(FoldReg);
+
+ if (TargetRegisterInfo::isPhysicalRegister(LiveOp.getReg()))
+ return RC->contains(LiveOp.getReg()) ? RC : nullptr;
+
+ if (RC->hasSubClassEq(MRI.getRegClass(LiveReg)))
+ return RC;
+
+ // FIXME: Allow folding when register classes are memory compatible.
+ return nullptr;
+}
+
+void TargetInstrInfo::getNoopForMachoTarget(MCInst &NopInst) const {
+ llvm_unreachable("Not a MachO target");
+}
+
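+/// Fold a frame index into a STACKMAP or PATCHPOINT. Only operands past the
+/// fixed header (ID and shadow bytes for STACKMAP, everything up to
+/// getVarIdx() for PATCHPOINT) may be folded; each folded register operand is
+/// rewritten as the tuple (IndirectMemRefOp, spill size, frame index, offset).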
+static MachineInstr *foldPatchpoint(MachineFunction &MF, MachineInstr &MI,
+ ArrayRef<unsigned> Ops, int FrameIndex,
+ const TargetInstrInfo &TII) {
+ unsigned StartIdx = 0;
+ switch (MI.getOpcode()) {
+ case TargetOpcode::STACKMAP:
+ StartIdx = 2; // Skip ID, nShadowBytes.
+ break;
+ case TargetOpcode::PATCHPOINT: {
+ // For PatchPoint, the call args are not foldable.
+ PatchPointOpers opers(&MI);
+ StartIdx = opers.getVarIdx();
+ break;
+ }
+ default:
+ llvm_unreachable("unexpected stackmap opcode");
+ }
+
+ // Return nullptr if any operands requested for folding are not foldable (not
+ // part of the stackmap's live values).
+ for (unsigned Op : Ops) {
+ if (Op < StartIdx)
+ return nullptr;
+ }
+
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ // No need to fold the return value, the metadata, or the function arguments.
+ for (unsigned i = 0; i < StartIdx; ++i)
+ MIB.addOperand(MI.getOperand(i));
+
+ for (unsigned i = StartIdx; i < MI.getNumOperands(); ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (std::find(Ops.begin(), Ops.end(), i) != Ops.end()) {
+ unsigned SpillSize;
+ unsigned SpillOffset;
+ // Compute the spill slot size and offset.
+ const TargetRegisterClass *RC =
+ MF.getRegInfo().getRegClass(MO.getReg());
+ bool Valid =
+ TII.getStackSlotRange(RC, MO.getSubReg(), SpillSize, SpillOffset, MF);
+ if (!Valid)
+ report_fatal_error("cannot spill patchpoint subregister operand");
+ MIB.addImm(StackMaps::IndirectMemRefOp);
+ MIB.addImm(SpillSize);
+ MIB.addFrameIndex(FrameIndex);
+ MIB.addImm(SpillOffset);
+ }
+ else
+ MIB.addOperand(MO);
+ }
+ return NewMI;
+}
+
+/// foldMemoryOperand - Attempt to fold a load or store of the specified stack
+/// slot into the specified machine instruction for the specified operand(s).
+/// If this is possible, a new instruction is returned with the specified
+/// operand folded, otherwise NULL is returned. The client is responsible for
+/// removing the old instruction and adding the new one in the instruction
+/// stream.
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
+ ArrayRef<unsigned> Ops, int FI,
+ LiveIntervals *LIS) const {
+ auto Flags = MachineMemOperand::MONone;
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ if (MI.getOperand(Ops[i]).isDef())
+ Flags |= MachineMemOperand::MOStore;
+ else
+ Flags |= MachineMemOperand::MOLoad;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ assert(MBB && "foldMemoryOperand needs an inserted instruction");
+ MachineFunction &MF = *MBB->getParent();
+
+ MachineInstr *NewMI = nullptr;
+
+ if (MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ // Fold stackmap/patchpoint.
+ NewMI = foldPatchpoint(MF, MI, Ops, FI, *this);
+ if (NewMI)
+ MBB->insert(MI, NewMI);
+ } else {
+ // Ask the target to do the actual folding.
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, FI, LIS);
+ }
+
+ if (NewMI) {
+ NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ // Add a memory operand; foldMemoryOperandImpl doesn't do that.
+ assert((!(Flags & MachineMemOperand::MOStore) ||
+ NewMI->mayStore()) &&
+ "Folded a def to a non-store!");
+ assert((!(Flags & MachineMemOperand::MOLoad) ||
+ NewMI->mayLoad()) &&
+ "Folded a use to a non-load!");
+ const MachineFrameInfo &MFI = *MF.getFrameInfo();
+ assert(MFI.getObjectOffset(FI) != -1);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), Flags, MFI.getObjectSize(FI),
+ MFI.getObjectAlignment(FI));
+ NewMI->addMemOperand(MF, MMO);
+
+ return NewMI;
+ }
+
+ // Straight COPY may fold as load/store.
+ if (!MI.isCopy() || Ops.size() != 1)
+ return nullptr;
+
+ const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]);
+ if (!RC)
+ return nullptr;
+
+ const MachineOperand &MO = MI.getOperand(1 - Ops[0]);
+ MachineBasicBlock::iterator Pos = MI;
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ if (Flags == MachineMemOperand::MOStore)
+ storeRegToStackSlot(*MBB, Pos, MO.getReg(), MO.isKill(), FI, RC, TRI);
+ else
+ loadRegFromStackSlot(*MBB, Pos, MO.getReg(), FI, RC, TRI);
+ return &*--Pos;
+}
+
+bool TargetInstrInfo::hasReassociableOperands(
+ const MachineInstr &Inst, const MachineBasicBlock *MBB) const {
+ const MachineOperand &Op1 = Inst.getOperand(1);
+ const MachineOperand &Op2 = Inst.getOperand(2);
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ // We need virtual register definitions for the operands that we will
+ // reassociate.
+ MachineInstr *MI1 = nullptr;
+ MachineInstr *MI2 = nullptr;
+ if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
+ MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+ if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
+ MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+ // And they need to be in the trace (otherwise, they won't have a depth).
+ return MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB;
+}
+
+bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
+ bool &Commuted) const {
+ const MachineBasicBlock *MBB = Inst.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+ MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+ MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+ unsigned AssocOpcode = Inst.getOpcode();
+
+ // If only one operand has the same opcode and it's the second source operand,
+ // the operands must be commuted.
+ Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+ if (Commuted)
+ std::swap(MI1, MI2);
+
+ // 1. The previous instruction must be the same type as Inst.
+ // 2. The previous instruction must have virtual register definitions for its
+ // operands in the same basic block as Inst.
+ // 3. The previous instruction's result must only be used by Inst.
+ return MI1->getOpcode() == AssocOpcode &&
+ hasReassociableOperands(*MI1, MBB) &&
+ MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
+}
+
+// 1. The operation must be associative and commutative.
+// 2. The instruction must have virtual register definitions for its
+// operands in the same basic block.
+// 3. The instruction must have a reassociable sibling.
+bool TargetInstrInfo::isReassociationCandidate(const MachineInstr &Inst,
+ bool &Commuted) const {
+ return isAssociativeAndCommutative(Inst) &&
+ hasReassociableOperands(Inst, Inst.getParent()) &&
+ hasReassociableSibling(Inst, Commuted);
+}
+
+// The concept of the reassociation pass is that these operations can benefit
+// from this kind of transformation:
+//
+// A = ? op ?
+// B = A op X (Prev)
+// C = B op Y (Root)
+// -->
+// A = ? op ?
+// B = X op Y
+// C = A op B
+//
+// breaking the dependency between A and B, allowing them to be executed in
+// parallel (or back-to-back in a pipeline) instead of depending on each other.
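+//
+// As a purely illustrative instance with integer adds:
+//
+// t1 = a + b
+// t2 = t1 + c (Prev)
+// t3 = t2 + d (Root)
+// -->
+// t1 = a + b
+// t4 = c + d
+// t3 = t1 + t4
+//
+// so the last two adds no longer form a serial dependency chain.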
+
+// FIXME: This has the potential to be expensive (compile time) while not
+// improving the code at all. Some ways to limit the overhead:
+// 1. Track successful transforms; bail out if hit rate gets too low.
+// 2. Only enable at -O3 or some other non-default optimization level.
+// 3. Pre-screen pattern candidates here: if an operand of the previous
+// instruction is known to not increase the critical path, then don't match
+// that pattern.
+bool TargetInstrInfo::getMachineCombinerPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ bool Commute;
+ if (isReassociationCandidate(Root, Commute)) {
+ // We found a sequence of instructions that may be suitable for a
+ // reassociation of operands to increase ILP. Specify each commutation
+ // possibility for the Prev instruction in the sequence and let the
+ // machine combiner decide if changing the operands is worthwhile.
+ if (Commute) {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_AX_YB);
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XA_YB);
+ } else {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_AX_BY);
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XA_BY);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/// Return true when a code sequence can improve loop throughput.
+bool
+TargetInstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const {
+ return false;
+}
+
+/// Attempt the reassociation transformation to reduce critical path length.
+/// See the above comments before getMachineCombinerPatterns().
+void TargetInstrInfo::reassociateOps(
+ MachineInstr &Root, MachineInstr &Prev,
+ MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineFunction *MF = Root.getParent()->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+ // This array encodes the operand index for each parameter because the
+ // operands may be commuted. Each row corresponds to a pattern value,
+ // and each column specifies the index of A, B, X, Y.
+ unsigned OpIdx[4][4] = {
+ { 1, 1, 2, 2 },
+ { 1, 2, 2, 1 },
+ { 2, 1, 1, 2 },
+ { 2, 2, 1, 1 }
+ };
+
+ int Row;
+ switch (Pattern) {
+ case MachineCombinerPattern::REASSOC_AX_BY: Row = 0; break;
+ case MachineCombinerPattern::REASSOC_AX_YB: Row = 1; break;
+ case MachineCombinerPattern::REASSOC_XA_BY: Row = 2; break;
+ case MachineCombinerPattern::REASSOC_XA_YB: Row = 3; break;
+ default: llvm_unreachable("unexpected MachineCombinerPattern");
+ }
+
+ MachineOperand &OpA = Prev.getOperand(OpIdx[Row][0]);
+ MachineOperand &OpB = Root.getOperand(OpIdx[Row][1]);
+ MachineOperand &OpX = Prev.getOperand(OpIdx[Row][2]);
+ MachineOperand &OpY = Root.getOperand(OpIdx[Row][3]);
+ MachineOperand &OpC = Root.getOperand(0);
+
+ unsigned RegA = OpA.getReg();
+ unsigned RegB = OpB.getReg();
+ unsigned RegX = OpX.getReg();
+ unsigned RegY = OpY.getReg();
+ unsigned RegC = OpC.getReg();
+
+ if (TargetRegisterInfo::isVirtualRegister(RegA))
+ MRI.constrainRegClass(RegA, RC);
+ if (TargetRegisterInfo::isVirtualRegister(RegB))
+ MRI.constrainRegClass(RegB, RC);
+ if (TargetRegisterInfo::isVirtualRegister(RegX))
+ MRI.constrainRegClass(RegX, RC);
+ if (TargetRegisterInfo::isVirtualRegister(RegY))
+ MRI.constrainRegClass(RegY, RC);
+ if (TargetRegisterInfo::isVirtualRegister(RegC))
+ MRI.constrainRegClass(RegC, RC);
+
+ // Create a new virtual register for the result of (X op Y) instead of
+ // recycling RegB because the MachineCombiner's computation of the critical
+ // path requires a new register definition rather than an existing one.
+ unsigned NewVR = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVR, 0));
+
+ unsigned Opcode = Root.getOpcode();
+ bool KillA = OpA.isKill();
+ bool KillX = OpX.isKill();
+ bool KillY = OpY.isKill();
+
+ // Create new instructions for insertion.
+ MachineInstrBuilder MIB1 =
+ BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), NewVR)
+ .addReg(RegX, getKillRegState(KillX))
+ .addReg(RegY, getKillRegState(KillY));
+ MachineInstrBuilder MIB2 =
+ BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
+ .addReg(RegA, getKillRegState(KillA))
+ .addReg(NewVR, getKillRegState(true));
+
+ setSpecialOperandAttr(Root, Prev, *MIB1, *MIB2);
+
+ // Record new instructions for insertion and old instructions for deletion.
+ InsInstrs.push_back(MIB1);
+ InsInstrs.push_back(MIB2);
+ DelInstrs.push_back(&Prev);
+ DelInstrs.push_back(&Root);
+}
+
+void TargetInstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstIdxForVirtReg) const {
+ MachineRegisterInfo &MRI = Root.getParent()->getParent()->getRegInfo();
+
+ // Select the previous instruction in the sequence based on the input pattern.
+ MachineInstr *Prev = nullptr;
+ switch (Pattern) {
+ case MachineCombinerPattern::REASSOC_AX_BY:
+ case MachineCombinerPattern::REASSOC_XA_BY:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+ break;
+ case MachineCombinerPattern::REASSOC_AX_YB:
+ case MachineCombinerPattern::REASSOC_XA_YB:
+ Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+ break;
+ default:
+ break;
+ }
+
+ assert(Prev && "Unknown pattern for machine combiner");
+
+ reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
+}
+
+/// foldMemoryOperand - Same as the previous version except it allows folding
+/// of any load and store from / to any address, not just from a specific
+/// stack slot.
+MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
+ ArrayRef<unsigned> Ops,
+ MachineInstr &LoadMI,
+ LiveIntervals *LIS) const {
+ assert(LoadMI.canFoldAsLoad() && "LoadMI isn't foldable!");
+#ifndef NDEBUG
+ for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+ assert(MI.getOperand(Ops[i]).isUse() && "Folding load into def!");
+#endif
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineFunction &MF = *MBB.getParent();
+
+ // Ask the target to do the actual folding.
+ MachineInstr *NewMI = nullptr;
+ int FrameIndex = 0;
+
+ if ((MI.getOpcode() == TargetOpcode::STACKMAP ||
+ MI.getOpcode() == TargetOpcode::PATCHPOINT) &&
+ isLoadFromStackSlot(LoadMI, FrameIndex)) {
+ // Fold stackmap/patchpoint.
+ NewMI = foldPatchpoint(MF, MI, Ops, FrameIndex, *this);
+ if (NewMI)
+ NewMI = &*MBB.insert(MI, NewMI);
+ } else {
+ // Ask the target to do the actual folding.
+ NewMI = foldMemoryOperandImpl(MF, MI, Ops, MI, LoadMI, LIS);
+ }
+
+ if (!NewMI) return nullptr;
+
+ // Copy the memoperands from the load to the folded instruction.
+ if (MI.memoperands_empty()) {
+ NewMI->setMemRefs(LoadMI.memoperands_begin(), LoadMI.memoperands_end());
+ } else {
+ // Handle the rare case of folding multiple loads.
+ NewMI->setMemRefs(MI.memoperands_begin(), MI.memoperands_end());
+ for (MachineInstr::mmo_iterator I = LoadMI.memoperands_begin(),
+ E = LoadMI.memoperands_end();
+ I != E; ++I) {
+ NewMI->addMemOperand(MF, *I);
+ }
+ }
+ return NewMI;
+}
+
+bool TargetInstrInfo::isReallyTriviallyReMaterializableGeneric(
+ const MachineInstr &MI, AliasAnalysis *AA) const {
+ const MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ // Remat clients assume operand 0 is the defined register.
+ if (!MI.getNumOperands() || !MI.getOperand(0).isReg())
+ return false;
+ unsigned DefReg = MI.getOperand(0).getReg();
+
+ // A sub-register definition can only be rematerialized if the instruction
+ // doesn't read the other parts of the register. Otherwise it is really a
+ // read-modify-write operation on the full virtual register which cannot be
+ // moved safely.
+ if (TargetRegisterInfo::isVirtualRegister(DefReg) &&
+ MI.getOperand(0).getSubReg() && MI.readsVirtualRegister(DefReg))
+ return false;
+
+ // A load from a fixed stack slot can be rematerialized. This may be
+ // redundant with subsequent checks, but it's target-independent,
+ // simple, and a common case.
+ int FrameIdx = 0;
+ if (isLoadFromStackSlot(MI, FrameIdx) &&
+ MF.getFrameInfo()->isImmutableObjectIndex(FrameIdx))
+ return true;
+
+ // Avoid instructions obviously unsafe for remat.
+ if (MI.isNotDuplicable() || MI.mayStore() || MI.hasUnmodeledSideEffects())
+ return false;
+
+ // Don't remat inline asm. We have no idea how expensive it is
+ // even if it's side effect free.
+ if (MI.isInlineAsm())
+ return false;
+
+ // Avoid instructions which load from potentially varying memory.
+ if (MI.mayLoad() && !MI.isInvariantLoad(AA))
+ return false;
+
+ // If any of the registers accessed are non-constant, conservatively assume
+ // the instruction is not rematerializable.
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg()) continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == 0)
+ continue;
+
+ // Check for a well-behaved physical register.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+ if (MO.isUse()) {
+ // If the physreg has no defs anywhere, it's just an ambient register
+ // and we can freely move its uses. Alternatively, if it's allocatable,
+ // it could get allocated to something with a def during allocation.
+ if (!MRI.isConstantPhysReg(Reg, MF))
+ return false;
+ } else {
+ // A physreg def. We can't remat it.
+ return false;
+ }
+ continue;
+ }
+
+ // Only allow one virtual-register def. There may be multiple defs of the
+ // same virtual register, though.
+ if (MO.isDef() && Reg != DefReg)
+ return false;
+
+ // Don't allow any virtual-register uses. Rematting an instruction with
+ // virtual register uses would lengthen the live ranges of the uses, which
+ // is not necessarily a good idea, certainly not "trivial".
+ if (MO.isUse())
+ return false;
+ }
+
+ // Everything checked out.
+ return true;
+}
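+
+// To illustrate the checks above: a load from an immutable stack slot or an
+// instruction with no register uses and a single virtual-register def (an
+// immediate move, for instance) is accepted, while an instruction that reads
+// another virtual register is rejected because rematerializing it would
+// extend that register's live range.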
+
+int TargetInstrInfo::getSPAdjust(const MachineInstr &MI) const {
+ const MachineFunction *MF = MI.getParent()->getParent();
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ bool StackGrowsDown =
+ TFI->getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
+
+ unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
+ unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
+
+ if (MI.getOpcode() != FrameSetupOpcode &&
+ MI.getOpcode() != FrameDestroyOpcode)
+ return 0;
+
+ int SPAdj = MI.getOperand(0).getImm();
+ SPAdj = TFI->alignSPAdjust(SPAdj);
+
+ if ((!StackGrowsDown && MI.getOpcode() == FrameSetupOpcode) ||
+ (StackGrowsDown && MI.getOpcode() == FrameDestroyOpcode))
+ SPAdj = -SPAdj;
+
+ return SPAdj;
+}
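+
+// For example, on a target whose stack grows down, a call frame setup with an
+// immediate of 16 yields an adjustment of +16 and the matching frame destroy
+// yields -16 (ignoring any rounding applied by alignSPAdjust); the signs are
+// flipped when the stack grows up.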
+
+/// isSchedulingBoundary - Test if the given instruction should be
+/// considered a scheduling boundary. This primarily includes labels
+/// and terminators.
+bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
+ const MachineBasicBlock *MBB,
+ const MachineFunction &MF) const {
+ // Terminators and labels can't be scheduled around.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // Don't attempt to schedule around any instruction that defines
+ // a stack-oriented pointer, as it's unlikely to be profitable. This
+ // saves compile time, because it doesn't require every single
+ // stack slot reference to depend on the instruction that does the
+ // modification.
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ return MI.modifiesRegister(TLI.getStackPointerRegisterToSaveRestore(), TRI);
+}
+
+// Provide a global flag for disabling the PreRA hazard recognizer that targets
+// may choose to honor.
+bool TargetInstrInfo::usePreRAHazardRecognizer() const {
+ return !DisableHazardRecognizer;
+}
+
+// Default implementation of CreateTargetRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetHazardRecognizer(const TargetSubtargetInfo *STI,
+ const ScheduleDAG *DAG) const {
+ // Dummy hazard recognizer allows all instructions to issue.
+ return new ScheduleHazardRecognizer();
+}
+
+// Default implementation of CreateTargetMIHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetMIHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return (ScheduleHazardRecognizer *)
+ new ScoreboardHazardRecognizer(II, DAG, "misched");
+}
+
+// Default implementation of CreateTargetPostRAHazardRecognizer.
+ScheduleHazardRecognizer *TargetInstrInfo::
+CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
+ const ScheduleDAG *DAG) const {
+ return (ScheduleHazardRecognizer *)
+ new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched");
+}
+
+//===----------------------------------------------------------------------===//
+// SelectionDAG latency interface.
+//===----------------------------------------------------------------------===//
+
+int
+TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ SDNode *DefNode, unsigned DefIdx,
+ SDNode *UseNode, unsigned UseIdx) const {
+ if (!ItinData || ItinData->isEmpty())
+ return -1;
+
+ if (!DefNode->isMachineOpcode())
+ return -1;
+
+ unsigned DefClass = get(DefNode->getMachineOpcode()).getSchedClass();
+ if (!UseNode->isMachineOpcode())
+ return ItinData->getOperandCycle(DefClass, DefIdx);
+ unsigned UseClass = get(UseNode->getMachineOpcode()).getSchedClass();
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+int TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ SDNode *N) const {
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ if (!N->isMachineOpcode())
+ return 1;
+
+ return ItinData->getStageLatency(get(N->getMachineOpcode()).getSchedClass());
+}
+
+//===----------------------------------------------------------------------===//
+// MachineInstr latency interface.
+//===----------------------------------------------------------------------===//
+
+unsigned TargetInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
+ const MachineInstr &MI) const {
+ if (!ItinData || ItinData->isEmpty())
+ return 1;
+
+ unsigned Class = MI.getDesc().getSchedClass();
+ int UOps = ItinData->Itineraries[Class].NumMicroOps;
+ if (UOps >= 0)
+ return UOps;
+
+ // The # of u-ops is dynamically determined. The specific target should
+ // override this function to return the right number.
+ return 1;
+}
+
+/// Return the default expected latency for a def based on its opcode.
+unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel &SchedModel,
+ const MachineInstr &DefMI) const {
+ if (DefMI.isTransient())
+ return 0;
+ if (DefMI.mayLoad())
+ return SchedModel.LoadLatency;
+ if (isHighLatencyDef(DefMI.getOpcode()))
+ return SchedModel.HighLatency;
+ return 1;
+}
+
+unsigned TargetInstrInfo::getPredicationCost(const MachineInstr &) const {
+ return 0;
+}
+
+unsigned TargetInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &MI,
+ unsigned *PredCost) const {
+ // Default to one cycle for no itinerary. However, an "empty" itinerary may
+ // still have a MinLatency property, which getStageLatency checks.
+ if (!ItinData)
+ return MI.mayLoad() ? 2 : 1;
+
+ return ItinData->getStageLatency(MI.getDesc().getSchedClass());
+}
+
+bool TargetInstrInfo::hasLowDefLatency(const TargetSchedModel &SchedModel,
+ const MachineInstr &DefMI,
+ unsigned DefIdx) const {
+ const InstrItineraryData *ItinData = SchedModel.getInstrItineraries();
+ if (!ItinData || ItinData->isEmpty())
+ return false;
+
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ int DefCycle = ItinData->getOperandCycle(DefClass, DefIdx);
+ return (DefCycle != -1 && DefCycle <= 1);
+}
+
+/// Both DefMI and UseMI must be valid. By default, call directly to the
+/// itinerary. This may be overridden by the target.
+int TargetInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr &DefMI,
+ unsigned DefIdx,
+ const MachineInstr &UseMI,
+ unsigned UseIdx) const {
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ unsigned UseClass = UseMI.getDesc().getSchedClass();
+ return ItinData->getOperandLatency(DefClass, DefIdx, UseClass, UseIdx);
+}
+
+/// If we can determine the operand latency from the def only, without itinerary
+/// lookup, do so. Otherwise return -1.
+int TargetInstrInfo::computeDefOperandLatency(
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI) const {
+
+ // Let the target hook getInstrLatency handle missing itineraries.
+ if (!ItinData)
+ return getInstrLatency(ItinData, DefMI);
+
+ if (ItinData->isEmpty())
+ return defaultDefLatency(ItinData->SchedModel, DefMI);
+
+ // Otherwise, an operand latency lookup is required.
+ return -1;
+}
+
+unsigned TargetInstrInfo::computeOperandLatency(
+ const InstrItineraryData *ItinData, const MachineInstr &DefMI,
+ unsigned DefIdx, const MachineInstr *UseMI, unsigned UseIdx) const {
+
+ int DefLatency = computeDefOperandLatency(ItinData, DefMI);
+ if (DefLatency >= 0)
+ return DefLatency;
+
+ assert(ItinData && !ItinData->isEmpty() && "computeDefOperandLatency fail");
+
+ int OperLatency = 0;
+ if (UseMI)
+ OperLatency = getOperandLatency(ItinData, DefMI, DefIdx, *UseMI, UseIdx);
+ else {
+ unsigned DefClass = DefMI.getDesc().getSchedClass();
+ OperLatency = ItinData->getOperandCycle(DefClass, DefIdx);
+ }
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = getInstrLatency(ItinData, DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ InstrLatency = std::max(InstrLatency,
+ defaultDefLatency(ItinData->SchedModel, DefMI));
+ return InstrLatency;
+}
+
+bool TargetInstrInfo::getRegSequenceInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ SmallVectorImpl<RegSubRegPairAndIdx> &InputRegs) const {
+ assert((MI.isRegSequence() ||
+ MI.isRegSequenceLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isRegSequence())
+ return getRegSequenceLikeInputs(MI, DefIdx, InputRegs);
+
+ // We are looking at:
+ // Def = REG_SEQUENCE v0, sub0, v1, sub1, ...
+ assert(DefIdx == 0 && "REG_SEQUENCE only has one def");
+ for (unsigned OpIdx = 1, EndOpIdx = MI.getNumOperands(); OpIdx != EndOpIdx;
+ OpIdx += 2) {
+ const MachineOperand &MOReg = MI.getOperand(OpIdx);
+ const MachineOperand &MOSubIdx = MI.getOperand(OpIdx + 1);
+ assert(MOSubIdx.isImm() &&
+ "One of the subindex of the reg_sequence is not an immediate");
+ // Record Reg:SubReg, SubIdx.
+ InputRegs.push_back(RegSubRegPairAndIdx(MOReg.getReg(), MOReg.getSubReg(),
+ (unsigned)MOSubIdx.getImm()));
+ }
+ return true;
+}
+
+bool TargetInstrInfo::getExtractSubregInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPairAndIdx &InputReg) const {
+ assert((MI.isExtractSubreg() ||
+ MI.isExtractSubregLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isExtractSubreg())
+ return getExtractSubregLikeInputs(MI, DefIdx, InputReg);
+
+ // We are looking at:
+ // Def = EXTRACT_SUBREG v0.sub1, sub0.
+ assert(DefIdx == 0 && "EXTRACT_SUBREG only has one def");
+ const MachineOperand &MOReg = MI.getOperand(1);
+ const MachineOperand &MOSubIdx = MI.getOperand(2);
+ assert(MOSubIdx.isImm() &&
+ "The subindex of the extract_subreg is not an immediate");
+
+ InputReg.Reg = MOReg.getReg();
+ InputReg.SubReg = MOReg.getSubReg();
+ InputReg.SubIdx = (unsigned)MOSubIdx.getImm();
+ return true;
+}
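+
+// For example, given "Def = EXTRACT_SUBREG v0.sub1, sub0" as in the comment
+// above, InputReg is filled in as (v0, sub1, sub0): the source register, its
+// sub-register index, and the extracted sub-register index.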
+
+bool TargetInstrInfo::getInsertSubregInputs(
+ const MachineInstr &MI, unsigned DefIdx,
+ RegSubRegPair &BaseReg, RegSubRegPairAndIdx &InsertedReg) const {
+ assert((MI.isInsertSubreg() ||
+ MI.isInsertSubregLike()) && "Instruction does not have the proper type");
+
+ if (!MI.isInsertSubreg())
+ return getInsertSubregLikeInputs(MI, DefIdx, BaseReg, InsertedReg);
+
+ // We are looking at:
+ // Def = INSERT_SUBREG v0, v1, sub0.
+ assert(DefIdx == 0 && "INSERT_SUBREG only has one def");
+ const MachineOperand &MOBaseReg = MI.getOperand(1);
+ const MachineOperand &MOInsertedReg = MI.getOperand(2);
+ const MachineOperand &MOSubIdx = MI.getOperand(3);
+ assert(MOSubIdx.isImm() &&
+ "One of the subindex of the reg_sequence is not an immediate");
+ BaseReg.Reg = MOBaseReg.getReg();
+ BaseReg.SubReg = MOBaseReg.getSubReg();
+
+ InsertedReg.Reg = MOInsertedReg.getReg();
+ InsertedReg.SubReg = MOInsertedReg.getSubReg();
+ InsertedReg.SubIdx = (unsigned)MOSubIdx.getImm();
+ return true;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
new file mode 100644
index 000000000000..6d3fe8ca6473
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -0,0 +1,1842 @@
+//===-- TargetLoweringBase.cpp - Implement the TargetLoweringBase class ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements the TargetLoweringBase class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineJumpTableInfo.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cctype>
+using namespace llvm;
+
+static cl::opt<bool> JumpIsExpensiveOverride(
+ "jump-is-expensive", cl::init(false),
+ cl::desc("Do not create extra branches to split comparison logic."),
+ cl::Hidden);
+
+// Although this default value is arbitrary, it is not random. It is assumed
+// that a condition that evaluates the same way more often than this percentage
+// is best represented as control flow. Therefore, the default value N should be
+// set such that the win from N% correctly predicted executions is greater than
+// the loss from (100 - N)% mispredicted executions for the majority of intended
+// targets.
+static cl::opt<int> MinPercentageForPredictableBranch(
+ "min-predictable-branch", cl::init(99),
+ cl::desc("Minimum percentage (0-100) that a condition must be either true "
+ "or false to assume that the condition is predictable"),
+ cl::Hidden);
+
+/// InitLibcallNames - Set default libcall names.
+///
+static void InitLibcallNames(const char **Names, const Triple &TT) {
+ Names[RTLIB::SHL_I16] = "__ashlhi3";
+ Names[RTLIB::SHL_I32] = "__ashlsi3";
+ Names[RTLIB::SHL_I64] = "__ashldi3";
+ Names[RTLIB::SHL_I128] = "__ashlti3";
+ Names[RTLIB::SRL_I16] = "__lshrhi3";
+ Names[RTLIB::SRL_I32] = "__lshrsi3";
+ Names[RTLIB::SRL_I64] = "__lshrdi3";
+ Names[RTLIB::SRL_I128] = "__lshrti3";
+ Names[RTLIB::SRA_I16] = "__ashrhi3";
+ Names[RTLIB::SRA_I32] = "__ashrsi3";
+ Names[RTLIB::SRA_I64] = "__ashrdi3";
+ Names[RTLIB::SRA_I128] = "__ashrti3";
+ Names[RTLIB::MUL_I8] = "__mulqi3";
+ Names[RTLIB::MUL_I16] = "__mulhi3";
+ Names[RTLIB::MUL_I32] = "__mulsi3";
+ Names[RTLIB::MUL_I64] = "__muldi3";
+ Names[RTLIB::MUL_I128] = "__multi3";
+ Names[RTLIB::MULO_I32] = "__mulosi4";
+ Names[RTLIB::MULO_I64] = "__mulodi4";
+ Names[RTLIB::MULO_I128] = "__muloti4";
+ Names[RTLIB::SDIV_I8] = "__divqi3";
+ Names[RTLIB::SDIV_I16] = "__divhi3";
+ Names[RTLIB::SDIV_I32] = "__divsi3";
+ Names[RTLIB::SDIV_I64] = "__divdi3";
+ Names[RTLIB::SDIV_I128] = "__divti3";
+ Names[RTLIB::UDIV_I8] = "__udivqi3";
+ Names[RTLIB::UDIV_I16] = "__udivhi3";
+ Names[RTLIB::UDIV_I32] = "__udivsi3";
+ Names[RTLIB::UDIV_I64] = "__udivdi3";
+ Names[RTLIB::UDIV_I128] = "__udivti3";
+ Names[RTLIB::SREM_I8] = "__modqi3";
+ Names[RTLIB::SREM_I16] = "__modhi3";
+ Names[RTLIB::SREM_I32] = "__modsi3";
+ Names[RTLIB::SREM_I64] = "__moddi3";
+ Names[RTLIB::SREM_I128] = "__modti3";
+ Names[RTLIB::UREM_I8] = "__umodqi3";
+ Names[RTLIB::UREM_I16] = "__umodhi3";
+ Names[RTLIB::UREM_I32] = "__umodsi3";
+ Names[RTLIB::UREM_I64] = "__umoddi3";
+ Names[RTLIB::UREM_I128] = "__umodti3";
+
+ Names[RTLIB::NEG_I32] = "__negsi2";
+ Names[RTLIB::NEG_I64] = "__negdi2";
+ Names[RTLIB::ADD_F32] = "__addsf3";
+ Names[RTLIB::ADD_F64] = "__adddf3";
+ Names[RTLIB::ADD_F80] = "__addxf3";
+ Names[RTLIB::ADD_F128] = "__addtf3";
+ Names[RTLIB::ADD_PPCF128] = "__gcc_qadd";
+ Names[RTLIB::SUB_F32] = "__subsf3";
+ Names[RTLIB::SUB_F64] = "__subdf3";
+ Names[RTLIB::SUB_F80] = "__subxf3";
+ Names[RTLIB::SUB_F128] = "__subtf3";
+ Names[RTLIB::SUB_PPCF128] = "__gcc_qsub";
+ Names[RTLIB::MUL_F32] = "__mulsf3";
+ Names[RTLIB::MUL_F64] = "__muldf3";
+ Names[RTLIB::MUL_F80] = "__mulxf3";
+ Names[RTLIB::MUL_F128] = "__multf3";
+ Names[RTLIB::MUL_PPCF128] = "__gcc_qmul";
+ Names[RTLIB::DIV_F32] = "__divsf3";
+ Names[RTLIB::DIV_F64] = "__divdf3";
+ Names[RTLIB::DIV_F80] = "__divxf3";
+ Names[RTLIB::DIV_F128] = "__divtf3";
+ Names[RTLIB::DIV_PPCF128] = "__gcc_qdiv";
+ Names[RTLIB::REM_F32] = "fmodf";
+ Names[RTLIB::REM_F64] = "fmod";
+ Names[RTLIB::REM_F80] = "fmodl";
+ Names[RTLIB::REM_F128] = "fmodl";
+ Names[RTLIB::REM_PPCF128] = "fmodl";
+ Names[RTLIB::FMA_F32] = "fmaf";
+ Names[RTLIB::FMA_F64] = "fma";
+ Names[RTLIB::FMA_F80] = "fmal";
+ Names[RTLIB::FMA_F128] = "fmal";
+ Names[RTLIB::FMA_PPCF128] = "fmal";
+ Names[RTLIB::POWI_F32] = "__powisf2";
+ Names[RTLIB::POWI_F64] = "__powidf2";
+ Names[RTLIB::POWI_F80] = "__powixf2";
+ Names[RTLIB::POWI_F128] = "__powitf2";
+ Names[RTLIB::POWI_PPCF128] = "__powitf2";
+ Names[RTLIB::SQRT_F32] = "sqrtf";
+ Names[RTLIB::SQRT_F64] = "sqrt";
+ Names[RTLIB::SQRT_F80] = "sqrtl";
+ Names[RTLIB::SQRT_F128] = "sqrtl";
+ Names[RTLIB::SQRT_PPCF128] = "sqrtl";
+ Names[RTLIB::LOG_F32] = "logf";
+ Names[RTLIB::LOG_F64] = "log";
+ Names[RTLIB::LOG_F80] = "logl";
+ Names[RTLIB::LOG_F128] = "logl";
+ Names[RTLIB::LOG_PPCF128] = "logl";
+ Names[RTLIB::LOG2_F32] = "log2f";
+ Names[RTLIB::LOG2_F64] = "log2";
+ Names[RTLIB::LOG2_F80] = "log2l";
+ Names[RTLIB::LOG2_F128] = "log2l";
+ Names[RTLIB::LOG2_PPCF128] = "log2l";
+ Names[RTLIB::LOG10_F32] = "log10f";
+ Names[RTLIB::LOG10_F64] = "log10";
+ Names[RTLIB::LOG10_F80] = "log10l";
+ Names[RTLIB::LOG10_F128] = "log10l";
+ Names[RTLIB::LOG10_PPCF128] = "log10l";
+ Names[RTLIB::EXP_F32] = "expf";
+ Names[RTLIB::EXP_F64] = "exp";
+ Names[RTLIB::EXP_F80] = "expl";
+ Names[RTLIB::EXP_F128] = "expl";
+ Names[RTLIB::EXP_PPCF128] = "expl";
+ Names[RTLIB::EXP2_F32] = "exp2f";
+ Names[RTLIB::EXP2_F64] = "exp2";
+ Names[RTLIB::EXP2_F80] = "exp2l";
+ Names[RTLIB::EXP2_F128] = "exp2l";
+ Names[RTLIB::EXP2_PPCF128] = "exp2l";
+ Names[RTLIB::SIN_F32] = "sinf";
+ Names[RTLIB::SIN_F64] = "sin";
+ Names[RTLIB::SIN_F80] = "sinl";
+ Names[RTLIB::SIN_F128] = "sinl";
+ Names[RTLIB::SIN_PPCF128] = "sinl";
+ Names[RTLIB::COS_F32] = "cosf";
+ Names[RTLIB::COS_F64] = "cos";
+ Names[RTLIB::COS_F80] = "cosl";
+ Names[RTLIB::COS_F128] = "cosl";
+ Names[RTLIB::COS_PPCF128] = "cosl";
+ Names[RTLIB::POW_F32] = "powf";
+ Names[RTLIB::POW_F64] = "pow";
+ Names[RTLIB::POW_F80] = "powl";
+ Names[RTLIB::POW_F128] = "powl";
+ Names[RTLIB::POW_PPCF128] = "powl";
+ Names[RTLIB::CEIL_F32] = "ceilf";
+ Names[RTLIB::CEIL_F64] = "ceil";
+ Names[RTLIB::CEIL_F80] = "ceill";
+ Names[RTLIB::CEIL_F128] = "ceill";
+ Names[RTLIB::CEIL_PPCF128] = "ceill";
+ Names[RTLIB::TRUNC_F32] = "truncf";
+ Names[RTLIB::TRUNC_F64] = "trunc";
+ Names[RTLIB::TRUNC_F80] = "truncl";
+ Names[RTLIB::TRUNC_F128] = "truncl";
+ Names[RTLIB::TRUNC_PPCF128] = "truncl";
+ Names[RTLIB::RINT_F32] = "rintf";
+ Names[RTLIB::RINT_F64] = "rint";
+ Names[RTLIB::RINT_F80] = "rintl";
+ Names[RTLIB::RINT_F128] = "rintl";
+ Names[RTLIB::RINT_PPCF128] = "rintl";
+ Names[RTLIB::NEARBYINT_F32] = "nearbyintf";
+ Names[RTLIB::NEARBYINT_F64] = "nearbyint";
+ Names[RTLIB::NEARBYINT_F80] = "nearbyintl";
+ Names[RTLIB::NEARBYINT_F128] = "nearbyintl";
+ Names[RTLIB::NEARBYINT_PPCF128] = "nearbyintl";
+ Names[RTLIB::ROUND_F32] = "roundf";
+ Names[RTLIB::ROUND_F64] = "round";
+ Names[RTLIB::ROUND_F80] = "roundl";
+ Names[RTLIB::ROUND_F128] = "roundl";
+ Names[RTLIB::ROUND_PPCF128] = "roundl";
+ Names[RTLIB::FLOOR_F32] = "floorf";
+ Names[RTLIB::FLOOR_F64] = "floor";
+ Names[RTLIB::FLOOR_F80] = "floorl";
+ Names[RTLIB::FLOOR_F128] = "floorl";
+ Names[RTLIB::FLOOR_PPCF128] = "floorl";
+ Names[RTLIB::FMIN_F32] = "fminf";
+ Names[RTLIB::FMIN_F64] = "fmin";
+ Names[RTLIB::FMIN_F80] = "fminl";
+ Names[RTLIB::FMIN_F128] = "fminl";
+ Names[RTLIB::FMIN_PPCF128] = "fminl";
+ Names[RTLIB::FMAX_F32] = "fmaxf";
+ Names[RTLIB::FMAX_F64] = "fmax";
+ Names[RTLIB::FMAX_F80] = "fmaxl";
+ Names[RTLIB::FMAX_F128] = "fmaxl";
+ Names[RTLIB::FMAX_PPCF128] = "fmaxl";
+ Names[RTLIB::COPYSIGN_F32] = "copysignf";
+ Names[RTLIB::COPYSIGN_F64] = "copysign";
+ Names[RTLIB::COPYSIGN_F80] = "copysignl";
+ Names[RTLIB::COPYSIGN_F128] = "copysignl";
+ Names[RTLIB::COPYSIGN_PPCF128] = "copysignl";
+ Names[RTLIB::FPEXT_F32_PPCF128] = "__gcc_stoq";
+ Names[RTLIB::FPEXT_F64_PPCF128] = "__gcc_dtoq";
+ Names[RTLIB::FPEXT_F64_F128] = "__extenddftf2";
+ Names[RTLIB::FPEXT_F32_F128] = "__extendsftf2";
+ Names[RTLIB::FPEXT_F32_F64] = "__extendsfdf2";
+ if (TT.isOSDarwin()) {
+ // For f16/f32 conversions, Darwin uses the standard naming scheme, instead
+ // of the gnueabi-style __gnu_*_ieee.
+ // FIXME: What about other targets?
+ Names[RTLIB::FPEXT_F16_F32] = "__extendhfsf2";
+ Names[RTLIB::FPROUND_F32_F16] = "__truncsfhf2";
+ } else {
+ Names[RTLIB::FPEXT_F16_F32] = "__gnu_h2f_ieee";
+ Names[RTLIB::FPROUND_F32_F16] = "__gnu_f2h_ieee";
+ }
+ Names[RTLIB::FPROUND_F64_F16] = "__truncdfhf2";
+ Names[RTLIB::FPROUND_F80_F16] = "__truncxfhf2";
+ Names[RTLIB::FPROUND_F128_F16] = "__trunctfhf2";
+ Names[RTLIB::FPROUND_PPCF128_F16] = "__trunctfhf2";
+ Names[RTLIB::FPROUND_F64_F32] = "__truncdfsf2";
+ Names[RTLIB::FPROUND_F80_F32] = "__truncxfsf2";
+ Names[RTLIB::FPROUND_F128_F32] = "__trunctfsf2";
+ Names[RTLIB::FPROUND_PPCF128_F32] = "__gcc_qtos";
+ Names[RTLIB::FPROUND_F80_F64] = "__truncxfdf2";
+ Names[RTLIB::FPROUND_F128_F64] = "__trunctfdf2";
+ Names[RTLIB::FPROUND_PPCF128_F64] = "__gcc_qtod";
+ Names[RTLIB::FPTOSINT_F32_I32] = "__fixsfsi";
+ Names[RTLIB::FPTOSINT_F32_I64] = "__fixsfdi";
+ Names[RTLIB::FPTOSINT_F32_I128] = "__fixsfti";
+ Names[RTLIB::FPTOSINT_F64_I32] = "__fixdfsi";
+ Names[RTLIB::FPTOSINT_F64_I64] = "__fixdfdi";
+ Names[RTLIB::FPTOSINT_F64_I128] = "__fixdfti";
+ Names[RTLIB::FPTOSINT_F80_I32] = "__fixxfsi";
+ Names[RTLIB::FPTOSINT_F80_I64] = "__fixxfdi";
+ Names[RTLIB::FPTOSINT_F80_I128] = "__fixxfti";
+ Names[RTLIB::FPTOSINT_F128_I32] = "__fixtfsi";
+ Names[RTLIB::FPTOSINT_F128_I64] = "__fixtfdi";
+ Names[RTLIB::FPTOSINT_F128_I128] = "__fixtfti";
+ Names[RTLIB::FPTOSINT_PPCF128_I32] = "__gcc_qtou";
+ Names[RTLIB::FPTOSINT_PPCF128_I64] = "__fixtfdi";
+ Names[RTLIB::FPTOSINT_PPCF128_I128] = "__fixtfti";
+ Names[RTLIB::FPTOUINT_F32_I32] = "__fixunssfsi";
+ Names[RTLIB::FPTOUINT_F32_I64] = "__fixunssfdi";
+ Names[RTLIB::FPTOUINT_F32_I128] = "__fixunssfti";
+ Names[RTLIB::FPTOUINT_F64_I32] = "__fixunsdfsi";
+ Names[RTLIB::FPTOUINT_F64_I64] = "__fixunsdfdi";
+ Names[RTLIB::FPTOUINT_F64_I128] = "__fixunsdfti";
+ Names[RTLIB::FPTOUINT_F80_I32] = "__fixunsxfsi";
+ Names[RTLIB::FPTOUINT_F80_I64] = "__fixunsxfdi";
+ Names[RTLIB::FPTOUINT_F80_I128] = "__fixunsxfti";
+ Names[RTLIB::FPTOUINT_F128_I32] = "__fixunstfsi";
+ Names[RTLIB::FPTOUINT_F128_I64] = "__fixunstfdi";
+ Names[RTLIB::FPTOUINT_F128_I128] = "__fixunstfti";
+ Names[RTLIB::FPTOUINT_PPCF128_I32] = "__fixunstfsi";
+ Names[RTLIB::FPTOUINT_PPCF128_I64] = "__fixunstfdi";
+ Names[RTLIB::FPTOUINT_PPCF128_I128] = "__fixunstfti";
+ Names[RTLIB::SINTTOFP_I32_F32] = "__floatsisf";
+ Names[RTLIB::SINTTOFP_I32_F64] = "__floatsidf";
+ Names[RTLIB::SINTTOFP_I32_F80] = "__floatsixf";
+ Names[RTLIB::SINTTOFP_I32_F128] = "__floatsitf";
+ Names[RTLIB::SINTTOFP_I32_PPCF128] = "__gcc_itoq";
+ Names[RTLIB::SINTTOFP_I64_F32] = "__floatdisf";
+ Names[RTLIB::SINTTOFP_I64_F64] = "__floatdidf";
+ Names[RTLIB::SINTTOFP_I64_F80] = "__floatdixf";
+ Names[RTLIB::SINTTOFP_I64_F128] = "__floatditf";
+ Names[RTLIB::SINTTOFP_I64_PPCF128] = "__floatditf";
+ Names[RTLIB::SINTTOFP_I128_F32] = "__floattisf";
+ Names[RTLIB::SINTTOFP_I128_F64] = "__floattidf";
+ Names[RTLIB::SINTTOFP_I128_F80] = "__floattixf";
+ Names[RTLIB::SINTTOFP_I128_F128] = "__floattitf";
+ Names[RTLIB::SINTTOFP_I128_PPCF128] = "__floattitf";
+ Names[RTLIB::UINTTOFP_I32_F32] = "__floatunsisf";
+ Names[RTLIB::UINTTOFP_I32_F64] = "__floatunsidf";
+ Names[RTLIB::UINTTOFP_I32_F80] = "__floatunsixf";
+ Names[RTLIB::UINTTOFP_I32_F128] = "__floatunsitf";
+ Names[RTLIB::UINTTOFP_I32_PPCF128] = "__gcc_utoq";
+ Names[RTLIB::UINTTOFP_I64_F32] = "__floatundisf";
+ Names[RTLIB::UINTTOFP_I64_F64] = "__floatundidf";
+ Names[RTLIB::UINTTOFP_I64_F80] = "__floatundixf";
+ Names[RTLIB::UINTTOFP_I64_F128] = "__floatunditf";
+ Names[RTLIB::UINTTOFP_I64_PPCF128] = "__floatunditf";
+ Names[RTLIB::UINTTOFP_I128_F32] = "__floatuntisf";
+ Names[RTLIB::UINTTOFP_I128_F64] = "__floatuntidf";
+ Names[RTLIB::UINTTOFP_I128_F80] = "__floatuntixf";
+ Names[RTLIB::UINTTOFP_I128_F128] = "__floatuntitf";
+ Names[RTLIB::UINTTOFP_I128_PPCF128] = "__floatuntitf";
+ Names[RTLIB::OEQ_F32] = "__eqsf2";
+ Names[RTLIB::OEQ_F64] = "__eqdf2";
+ Names[RTLIB::OEQ_F128] = "__eqtf2";
+ Names[RTLIB::OEQ_PPCF128] = "__gcc_qeq";
+ Names[RTLIB::UNE_F32] = "__nesf2";
+ Names[RTLIB::UNE_F64] = "__nedf2";
+ Names[RTLIB::UNE_F128] = "__netf2";
+ Names[RTLIB::UNE_PPCF128] = "__gcc_qne";
+ Names[RTLIB::OGE_F32] = "__gesf2";
+ Names[RTLIB::OGE_F64] = "__gedf2";
+ Names[RTLIB::OGE_F128] = "__getf2";
+ Names[RTLIB::OGE_PPCF128] = "__gcc_qge";
+ Names[RTLIB::OLT_F32] = "__ltsf2";
+ Names[RTLIB::OLT_F64] = "__ltdf2";
+ Names[RTLIB::OLT_F128] = "__lttf2";
+ Names[RTLIB::OLT_PPCF128] = "__gcc_qlt";
+ Names[RTLIB::OLE_F32] = "__lesf2";
+ Names[RTLIB::OLE_F64] = "__ledf2";
+ Names[RTLIB::OLE_F128] = "__letf2";
+ Names[RTLIB::OLE_PPCF128] = "__gcc_qle";
+ Names[RTLIB::OGT_F32] = "__gtsf2";
+ Names[RTLIB::OGT_F64] = "__gtdf2";
+ Names[RTLIB::OGT_F128] = "__gttf2";
+ Names[RTLIB::OGT_PPCF128] = "__gcc_qgt";
+ Names[RTLIB::UO_F32] = "__unordsf2";
+ Names[RTLIB::UO_F64] = "__unorddf2";
+ Names[RTLIB::UO_F128] = "__unordtf2";
+ Names[RTLIB::UO_PPCF128] = "__gcc_qunord";
+ Names[RTLIB::O_F32] = "__unordsf2";
+ Names[RTLIB::O_F64] = "__unorddf2";
+ Names[RTLIB::O_F128] = "__unordtf2";
+ Names[RTLIB::O_PPCF128] = "__gcc_qunord";
+ Names[RTLIB::MEMCPY] = "memcpy";
+ Names[RTLIB::MEMMOVE] = "memmove";
+ Names[RTLIB::MEMSET] = "memset";
+ Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_4] = "__sync_val_compare_and_swap_4";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_8] = "__sync_val_compare_and_swap_8";
+ Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_16] = "__sync_val_compare_and_swap_16";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_1] = "__sync_lock_test_and_set_1";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_2] = "__sync_lock_test_and_set_2";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_4] = "__sync_lock_test_and_set_4";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_8] = "__sync_lock_test_and_set_8";
+ Names[RTLIB::SYNC_LOCK_TEST_AND_SET_16] = "__sync_lock_test_and_set_16";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_1] = "__sync_fetch_and_add_1";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_2] = "__sync_fetch_and_add_2";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_4] = "__sync_fetch_and_add_4";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_8] = "__sync_fetch_and_add_8";
+ Names[RTLIB::SYNC_FETCH_AND_ADD_16] = "__sync_fetch_and_add_16";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_1] = "__sync_fetch_and_sub_1";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_2] = "__sync_fetch_and_sub_2";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_4] = "__sync_fetch_and_sub_4";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_8] = "__sync_fetch_and_sub_8";
+ Names[RTLIB::SYNC_FETCH_AND_SUB_16] = "__sync_fetch_and_sub_16";
+ Names[RTLIB::SYNC_FETCH_AND_AND_1] = "__sync_fetch_and_and_1";
+ Names[RTLIB::SYNC_FETCH_AND_AND_2] = "__sync_fetch_and_and_2";
+ Names[RTLIB::SYNC_FETCH_AND_AND_4] = "__sync_fetch_and_and_4";
+ Names[RTLIB::SYNC_FETCH_AND_AND_8] = "__sync_fetch_and_and_8";
+ Names[RTLIB::SYNC_FETCH_AND_AND_16] = "__sync_fetch_and_and_16";
+ Names[RTLIB::SYNC_FETCH_AND_OR_1] = "__sync_fetch_and_or_1";
+ Names[RTLIB::SYNC_FETCH_AND_OR_2] = "__sync_fetch_and_or_2";
+ Names[RTLIB::SYNC_FETCH_AND_OR_4] = "__sync_fetch_and_or_4";
+ Names[RTLIB::SYNC_FETCH_AND_OR_8] = "__sync_fetch_and_or_8";
+ Names[RTLIB::SYNC_FETCH_AND_OR_16] = "__sync_fetch_and_or_16";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_1] = "__sync_fetch_and_xor_1";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_2] = "__sync_fetch_and_xor_2";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_4] = "__sync_fetch_and_xor_4";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_8] = "__sync_fetch_and_xor_8";
+ Names[RTLIB::SYNC_FETCH_AND_XOR_16] = "__sync_fetch_and_xor_16";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_1] = "__sync_fetch_and_nand_1";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_2] = "__sync_fetch_and_nand_2";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_4] = "__sync_fetch_and_nand_4";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_8] = "__sync_fetch_and_nand_8";
+ Names[RTLIB::SYNC_FETCH_AND_NAND_16] = "__sync_fetch_and_nand_16";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_1] = "__sync_fetch_and_max_1";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_2] = "__sync_fetch_and_max_2";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_4] = "__sync_fetch_and_max_4";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_8] = "__sync_fetch_and_max_8";
+ Names[RTLIB::SYNC_FETCH_AND_MAX_16] = "__sync_fetch_and_max_16";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_1] = "__sync_fetch_and_umax_1";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_2] = "__sync_fetch_and_umax_2";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_4] = "__sync_fetch_and_umax_4";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_8] = "__sync_fetch_and_umax_8";
+ Names[RTLIB::SYNC_FETCH_AND_UMAX_16] = "__sync_fetch_and_umax_16";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_1] = "__sync_fetch_and_min_1";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_2] = "__sync_fetch_and_min_2";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_4] = "__sync_fetch_and_min_4";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_8] = "__sync_fetch_and_min_8";
+ Names[RTLIB::SYNC_FETCH_AND_MIN_16] = "__sync_fetch_and_min_16";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_1] = "__sync_fetch_and_umin_1";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_2] = "__sync_fetch_and_umin_2";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_4] = "__sync_fetch_and_umin_4";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_8] = "__sync_fetch_and_umin_8";
+ Names[RTLIB::SYNC_FETCH_AND_UMIN_16] = "__sync_fetch_and_umin_16";
+
+ Names[RTLIB::ATOMIC_LOAD] = "__atomic_load";
+ Names[RTLIB::ATOMIC_LOAD_1] = "__atomic_load_1";
+ Names[RTLIB::ATOMIC_LOAD_2] = "__atomic_load_2";
+ Names[RTLIB::ATOMIC_LOAD_4] = "__atomic_load_4";
+ Names[RTLIB::ATOMIC_LOAD_8] = "__atomic_load_8";
+ Names[RTLIB::ATOMIC_LOAD_16] = "__atomic_load_16";
+
+ Names[RTLIB::ATOMIC_STORE] = "__atomic_store";
+ Names[RTLIB::ATOMIC_STORE_1] = "__atomic_store_1";
+ Names[RTLIB::ATOMIC_STORE_2] = "__atomic_store_2";
+ Names[RTLIB::ATOMIC_STORE_4] = "__atomic_store_4";
+ Names[RTLIB::ATOMIC_STORE_8] = "__atomic_store_8";
+ Names[RTLIB::ATOMIC_STORE_16] = "__atomic_store_16";
+
+ Names[RTLIB::ATOMIC_EXCHANGE] = "__atomic_exchange";
+ Names[RTLIB::ATOMIC_EXCHANGE_1] = "__atomic_exchange_1";
+ Names[RTLIB::ATOMIC_EXCHANGE_2] = "__atomic_exchange_2";
+ Names[RTLIB::ATOMIC_EXCHANGE_4] = "__atomic_exchange_4";
+ Names[RTLIB::ATOMIC_EXCHANGE_8] = "__atomic_exchange_8";
+ Names[RTLIB::ATOMIC_EXCHANGE_16] = "__atomic_exchange_16";
+
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE] = "__atomic_compare_exchange";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_1] = "__atomic_compare_exchange_1";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_2] = "__atomic_compare_exchange_2";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_4] = "__atomic_compare_exchange_4";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_8] = "__atomic_compare_exchange_8";
+ Names[RTLIB::ATOMIC_COMPARE_EXCHANGE_16] = "__atomic_compare_exchange_16";
+
+ Names[RTLIB::ATOMIC_FETCH_ADD_1] = "__atomic_fetch_add_1";
+ Names[RTLIB::ATOMIC_FETCH_ADD_2] = "__atomic_fetch_add_2";
+ Names[RTLIB::ATOMIC_FETCH_ADD_4] = "__atomic_fetch_add_4";
+ Names[RTLIB::ATOMIC_FETCH_ADD_8] = "__atomic_fetch_add_8";
+ Names[RTLIB::ATOMIC_FETCH_ADD_16] = "__atomic_fetch_add_16";
+ Names[RTLIB::ATOMIC_FETCH_SUB_1] = "__atomic_fetch_sub_1";
+ Names[RTLIB::ATOMIC_FETCH_SUB_2] = "__atomic_fetch_sub_2";
+ Names[RTLIB::ATOMIC_FETCH_SUB_4] = "__atomic_fetch_sub_4";
+ Names[RTLIB::ATOMIC_FETCH_SUB_8] = "__atomic_fetch_sub_8";
+ Names[RTLIB::ATOMIC_FETCH_SUB_16] = "__atomic_fetch_sub_16";
+ Names[RTLIB::ATOMIC_FETCH_AND_1] = "__atomic_fetch_and_1";
+ Names[RTLIB::ATOMIC_FETCH_AND_2] = "__atomic_fetch_and_2";
+ Names[RTLIB::ATOMIC_FETCH_AND_4] = "__atomic_fetch_and_4";
+ Names[RTLIB::ATOMIC_FETCH_AND_8] = "__atomic_fetch_and_8";
+ Names[RTLIB::ATOMIC_FETCH_AND_16] = "__atomic_fetch_and_16";
+ Names[RTLIB::ATOMIC_FETCH_OR_1] = "__atomic_fetch_or_1";
+ Names[RTLIB::ATOMIC_FETCH_OR_2] = "__atomic_fetch_or_2";
+ Names[RTLIB::ATOMIC_FETCH_OR_4] = "__atomic_fetch_or_4";
+ Names[RTLIB::ATOMIC_FETCH_OR_8] = "__atomic_fetch_or_8";
+ Names[RTLIB::ATOMIC_FETCH_OR_16] = "__atomic_fetch_or_16";
+ Names[RTLIB::ATOMIC_FETCH_XOR_1] = "__atomic_fetch_xor_1";
+ Names[RTLIB::ATOMIC_FETCH_XOR_2] = "__atomic_fetch_xor_2";
+ Names[RTLIB::ATOMIC_FETCH_XOR_4] = "__atomic_fetch_xor_4";
+ Names[RTLIB::ATOMIC_FETCH_XOR_8] = "__atomic_fetch_xor_8";
+ Names[RTLIB::ATOMIC_FETCH_XOR_16] = "__atomic_fetch_xor_16";
+ Names[RTLIB::ATOMIC_FETCH_NAND_1] = "__atomic_fetch_nand_1";
+ Names[RTLIB::ATOMIC_FETCH_NAND_2] = "__atomic_fetch_nand_2";
+ Names[RTLIB::ATOMIC_FETCH_NAND_4] = "__atomic_fetch_nand_4";
+ Names[RTLIB::ATOMIC_FETCH_NAND_8] = "__atomic_fetch_nand_8";
+ Names[RTLIB::ATOMIC_FETCH_NAND_16] = "__atomic_fetch_nand_16";
+
+ if (TT.isGNUEnvironment()) {
+ Names[RTLIB::SINCOS_F32] = "sincosf";
+ Names[RTLIB::SINCOS_F64] = "sincos";
+ Names[RTLIB::SINCOS_F80] = "sincosl";
+ Names[RTLIB::SINCOS_F128] = "sincosl";
+ Names[RTLIB::SINCOS_PPCF128] = "sincosl";
+ }
+
+ if (!TT.isOSOpenBSD()) {
+ Names[RTLIB::STACKPROTECTOR_CHECK_FAIL] = "__stack_chk_fail";
+ }
+
+ Names[RTLIB::DEOPTIMIZE] = "__llvm_deoptimize";
+}
+
+/// InitLibcallCallingConvs - Set default libcall CallingConvs.
+///
+static void InitLibcallCallingConvs(CallingConv::ID *CCs) {
+ for (int i = 0; i < RTLIB::UNKNOWN_LIBCALL; ++i) {
+ CCs[i] = CallingConv::C;
+ }
+}
+
+/// getFPEXT - Return the FPEXT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPEXT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f16) {
+ if (RetVT == MVT::f32)
+ return FPEXT_F16_F32;
+ } else if (OpVT == MVT::f32) {
+ if (RetVT == MVT::f64)
+ return FPEXT_F32_F64;
+ if (RetVT == MVT::f128)
+ return FPEXT_F32_F128;
+ if (RetVT == MVT::ppcf128)
+ return FPEXT_F32_PPCF128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::f128)
+ return FPEXT_F64_F128;
+ else if (RetVT == MVT::ppcf128)
+ return FPEXT_F64_PPCF128;
+ }
+
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPROUND - Return the FPROUND_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPROUND(EVT OpVT, EVT RetVT) {
+ if (RetVT == MVT::f16) {
+ if (OpVT == MVT::f32)
+ return FPROUND_F32_F16;
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_F16;
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F16;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F16;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F16;
+ } else if (RetVT == MVT::f32) {
+ if (OpVT == MVT::f64)
+ return FPROUND_F64_F32;
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F32;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F32;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F32;
+ } else if (RetVT == MVT::f64) {
+ if (OpVT == MVT::f80)
+ return FPROUND_F80_F64;
+ if (OpVT == MVT::f128)
+ return FPROUND_F128_F64;
+ if (OpVT == MVT::ppcf128)
+ return FPROUND_PPCF128_F64;
+ }
+
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOSINT - Return the FPTOSINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOSINT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F80_I128;
+ } else if (OpVT == MVT::f128) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_F128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_F128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_F128_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOSINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOSINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOSINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getFPTOUINT - Return the FPTOUINT_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getFPTOUINT(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::f32) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F32_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F32_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F32_I128;
+ } else if (OpVT == MVT::f64) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F64_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F64_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F64_I128;
+ } else if (OpVT == MVT::f80) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F80_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F80_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F80_I128;
+ } else if (OpVT == MVT::f128) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_F128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_F128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_F128_I128;
+ } else if (OpVT == MVT::ppcf128) {
+ if (RetVT == MVT::i32)
+ return FPTOUINT_PPCF128_I32;
+ if (RetVT == MVT::i64)
+ return FPTOUINT_PPCF128_I64;
+ if (RetVT == MVT::i128)
+ return FPTOUINT_PPCF128_I128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getSINTTOFP - Return the SINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getSINTTOFP(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I32_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I32_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I32_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I32_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I64_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I64_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I64_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I64_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f32)
+ return SINTTOFP_I128_F32;
+ if (RetVT == MVT::f64)
+ return SINTTOFP_I128_F64;
+ if (RetVT == MVT::f80)
+ return SINTTOFP_I128_F80;
+ if (RetVT == MVT::f128)
+ return SINTTOFP_I128_F128;
+ if (RetVT == MVT::ppcf128)
+ return SINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+/// getUINTTOFP - Return the UINTTOFP_*_* value for the given types, or
+/// UNKNOWN_LIBCALL if there is none.
+RTLIB::Libcall RTLIB::getUINTTOFP(EVT OpVT, EVT RetVT) {
+ if (OpVT == MVT::i32) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I32_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I32_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I32_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I32_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I32_PPCF128;
+ } else if (OpVT == MVT::i64) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I64_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I64_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I64_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I64_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I64_PPCF128;
+ } else if (OpVT == MVT::i128) {
+ if (RetVT == MVT::f32)
+ return UINTTOFP_I128_F32;
+ if (RetVT == MVT::f64)
+ return UINTTOFP_I128_F64;
+ if (RetVT == MVT::f80)
+ return UINTTOFP_I128_F80;
+ if (RetVT == MVT::f128)
+ return UINTTOFP_I128_F128;
+ if (RetVT == MVT::ppcf128)
+ return UINTTOFP_I128_PPCF128;
+ }
+ return UNKNOWN_LIBCALL;
+}
+
+RTLIB::Libcall RTLIB::getSYNC(unsigned Opc, MVT VT) {
+#define OP_TO_LIBCALL(Name, Enum) \
+ case Name: \
+ switch (VT.SimpleTy) { \
+ default: \
+ return UNKNOWN_LIBCALL; \
+ case MVT::i8: \
+ return Enum##_1; \
+ case MVT::i16: \
+ return Enum##_2; \
+ case MVT::i32: \
+ return Enum##_4; \
+ case MVT::i64: \
+ return Enum##_8; \
+ case MVT::i128: \
+ return Enum##_16; \
+ }
+
+ switch (Opc) {
+ OP_TO_LIBCALL(ISD::ATOMIC_SWAP, SYNC_LOCK_TEST_AND_SET)
+ OP_TO_LIBCALL(ISD::ATOMIC_CMP_SWAP, SYNC_VAL_COMPARE_AND_SWAP)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_ADD, SYNC_FETCH_AND_ADD)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_SUB, SYNC_FETCH_AND_SUB)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_AND, SYNC_FETCH_AND_AND)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_OR, SYNC_FETCH_AND_OR)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_XOR, SYNC_FETCH_AND_XOR)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_NAND, SYNC_FETCH_AND_NAND)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MAX, SYNC_FETCH_AND_MAX)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMAX, SYNC_FETCH_AND_UMAX)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_MIN, SYNC_FETCH_AND_MIN)
+ OP_TO_LIBCALL(ISD::ATOMIC_LOAD_UMIN, SYNC_FETCH_AND_UMIN)
+ }
+
+#undef OP_TO_LIBCALL
+
+ return UNKNOWN_LIBCALL;
+}
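+
+// For example, getSYNC(ISD::ATOMIC_LOAD_ADD, MVT::i32) returns
+// SYNC_FETCH_AND_ADD_4, which InitLibcallNames above maps to
+// "__sync_fetch_and_add_4".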
+
+/// InitCmpLibcallCCs - Set default comparison libcall CC.
+///
+static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
+ memset(CCs, ISD::SETCC_INVALID, sizeof(ISD::CondCode)*RTLIB::UNKNOWN_LIBCALL);
+ CCs[RTLIB::OEQ_F32] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F64] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_F128] = ISD::SETEQ;
+ CCs[RTLIB::OEQ_PPCF128] = ISD::SETEQ;
+ CCs[RTLIB::UNE_F32] = ISD::SETNE;
+ CCs[RTLIB::UNE_F64] = ISD::SETNE;
+ CCs[RTLIB::UNE_F128] = ISD::SETNE;
+ CCs[RTLIB::UNE_PPCF128] = ISD::SETNE;
+ CCs[RTLIB::OGE_F32] = ISD::SETGE;
+ CCs[RTLIB::OGE_F64] = ISD::SETGE;
+ CCs[RTLIB::OGE_F128] = ISD::SETGE;
+ CCs[RTLIB::OGE_PPCF128] = ISD::SETGE;
+ CCs[RTLIB::OLT_F32] = ISD::SETLT;
+ CCs[RTLIB::OLT_F64] = ISD::SETLT;
+ CCs[RTLIB::OLT_F128] = ISD::SETLT;
+ CCs[RTLIB::OLT_PPCF128] = ISD::SETLT;
+ CCs[RTLIB::OLE_F32] = ISD::SETLE;
+ CCs[RTLIB::OLE_F64] = ISD::SETLE;
+ CCs[RTLIB::OLE_F128] = ISD::SETLE;
+ CCs[RTLIB::OLE_PPCF128] = ISD::SETLE;
+ CCs[RTLIB::OGT_F32] = ISD::SETGT;
+ CCs[RTLIB::OGT_F64] = ISD::SETGT;
+ CCs[RTLIB::OGT_F128] = ISD::SETGT;
+ CCs[RTLIB::OGT_PPCF128] = ISD::SETGT;
+ CCs[RTLIB::UO_F32] = ISD::SETNE;
+ CCs[RTLIB::UO_F64] = ISD::SETNE;
+ CCs[RTLIB::UO_F128] = ISD::SETNE;
+ CCs[RTLIB::UO_PPCF128] = ISD::SETNE;
+ CCs[RTLIB::O_F32] = ISD::SETEQ;
+ CCs[RTLIB::O_F64] = ISD::SETEQ;
+ CCs[RTLIB::O_F128] = ISD::SETEQ;
+ CCs[RTLIB::O_PPCF128] = ISD::SETEQ;
+}
+
+/// NOTE: The TargetMachine owns TLOF.
+TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
+ initActions();
+
+ // Perform these initializations only once.
+ MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
+ MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
+ = MaxStoresPerMemmoveOptSize = 4;
+ UseUnderscoreSetJmp = false;
+ UseUnderscoreLongJmp = false;
+ SelectIsExpensive = false;
+ HasMultipleConditionRegisters = false;
+ HasExtractBitsInsn = false;
+ FsqrtIsCheap = false;
+ JumpIsExpensive = JumpIsExpensiveOverride;
+ PredictableSelectIsExpensive = false;
+ MaskAndBranchFoldingIsLegal = false;
+ EnableExtLdPromotion = false;
+ HasFloatingPointExceptions = true;
+ StackPointerRegisterToSaveRestore = 0;
+ BooleanContents = UndefinedBooleanContent;
+ BooleanFloatContents = UndefinedBooleanContent;
+ BooleanVectorContents = UndefinedBooleanContent;
+ SchedPreferenceInfo = Sched::ILP;
+ JumpBufSize = 0;
+ JumpBufAlignment = 0;
+ MinFunctionAlignment = 0;
+ PrefFunctionAlignment = 0;
+ PrefLoopAlignment = 0;
+ GatherAllAliasesMaxDepth = 6;
+ MinStackArgumentAlignment = 1;
+ MinimumJumpTableEntries = 4;
+ // TODO: the default will be switched to 0 in the next commit, along
+ // with the Target-specific changes necessary.
+ MaxAtomicSizeInBitsSupported = 1024;
+
+ MinCmpXchgSizeInBits = 0;
+
+ std::fill(std::begin(LibcallRoutineNames), std::end(LibcallRoutineNames), nullptr);
+
+ InitLibcallNames(LibcallRoutineNames, TM.getTargetTriple());
+ InitCmpLibcallCCs(CmpLibcallCCs);
+ InitLibcallCallingConvs(LibcallCallingConvs);
+}
+
+void TargetLoweringBase::initActions() {
+ // All operations default to being supported.
+ memset(OpActions, 0, sizeof(OpActions));
+ memset(LoadExtActions, 0, sizeof(LoadExtActions));
+ memset(TruncStoreActions, 0, sizeof(TruncStoreActions));
+ memset(IndexedModeActions, 0, sizeof(IndexedModeActions));
+ memset(CondCodeActions, 0, sizeof(CondCodeActions));
+ std::fill(std::begin(RegClassForVT), std::end(RegClassForVT), nullptr);
+ std::fill(std::begin(TargetDAGCombineArray),
+ std::end(TargetDAGCombineArray), 0);
+
+ // Set default actions for various operations.
+ for (MVT VT : MVT::all_valuetypes()) {
+ // Default all indexed load / store to expand.
+ for (unsigned IM = (unsigned)ISD::PRE_INC;
+ IM != (unsigned)ISD::LAST_INDEXED_MODE; ++IM) {
+ setIndexedLoadAction(IM, VT, Expand);
+ setIndexedStoreAction(IM, VT, Expand);
+ }
+
+ // Most backends expect to see the node which just returns the value loaded.
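+ // Expanding ATOMIC_CMP_SWAP_WITH_SUCCESS produces a plain ATOMIC_CMP_SWAP
+ // plus a SETCC comparing the loaded value against the expected value.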
+ setOperationAction(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS, VT, Expand);
+
+ // These operations default to expand.
+ setOperationAction(ISD::FGETSIGN, VT, Expand);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Expand);
+ setOperationAction(ISD::FMINNUM, VT, Expand);
+ setOperationAction(ISD::FMAXNUM, VT, Expand);
+ setOperationAction(ISD::FMINNAN, VT, Expand);
+ setOperationAction(ISD::FMAXNAN, VT, Expand);
+ setOperationAction(ISD::FMAD, VT, Expand);
+ setOperationAction(ISD::SMIN, VT, Expand);
+ setOperationAction(ISD::SMAX, VT, Expand);
+ setOperationAction(ISD::UMIN, VT, Expand);
+ setOperationAction(ISD::UMAX, VT, Expand);
+
+ // Overflow operations default to expand
+ setOperationAction(ISD::SADDO, VT, Expand);
+ setOperationAction(ISD::SSUBO, VT, Expand);
+ setOperationAction(ISD::UADDO, VT, Expand);
+ setOperationAction(ISD::USUBO, VT, Expand);
+ setOperationAction(ISD::SMULO, VT, Expand);
+ setOperationAction(ISD::UMULO, VT, Expand);
+
+ // These default to Expand so they will be expanded to CTLZ/CTTZ by default.
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Expand);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Expand);
+
+ setOperationAction(ISD::BITREVERSE, VT, Expand);
+
+ // These library functions default to expand.
+ setOperationAction(ISD::FROUND, VT, Expand);
+
+ // These operations default to expand for vector types.
+ if (VT.isVector()) {
+ setOperationAction(ISD::FCOPYSIGN, VT, Expand);
+ setOperationAction(ISD::ANY_EXTEND_VECTOR_INREG, VT, Expand);
+ setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Expand);
+ setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Expand);
+ }
+
+ // For most targets @llvm.get.dynamic.area.offset just returns 0.
+ setOperationAction(ISD::GET_DYNAMIC_AREA_OFFSET, VT, Expand);
+ }
+
+ // Most targets ignore the @llvm.prefetch intrinsic.
+ setOperationAction(ISD::PREFETCH, MVT::Other, Expand);
+
+ // Most targets also ignore the @llvm.readcyclecounter intrinsic.
+ setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Expand);
+
+ // ConstantFP nodes default to expand. Targets can either change this to
+ // Legal, in which case all fp constants are legal, or use isFPImmLegal()
+ // to optimize expansions for certain constants.
+ setOperationAction(ISD::ConstantFP, MVT::f16, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f32, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f64, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f80, Expand);
+ setOperationAction(ISD::ConstantFP, MVT::f128, Expand);
+
+ // These library functions default to expand.
+ for (MVT VT : {MVT::f32, MVT::f64, MVT::f128}) {
+ setOperationAction(ISD::FLOG , VT, Expand);
+ setOperationAction(ISD::FLOG2, VT, Expand);
+ setOperationAction(ISD::FLOG10, VT, Expand);
+ setOperationAction(ISD::FEXP , VT, Expand);
+ setOperationAction(ISD::FEXP2, VT, Expand);
+ setOperationAction(ISD::FFLOOR, VT, Expand);
+ setOperationAction(ISD::FNEARBYINT, VT, Expand);
+ setOperationAction(ISD::FCEIL, VT, Expand);
+ setOperationAction(ISD::FRINT, VT, Expand);
+ setOperationAction(ISD::FTRUNC, VT, Expand);
+ setOperationAction(ISD::FROUND, VT, Expand);
+ }
+
+ // Default ISD::TRAP to expand (which turns it into abort).
+ setOperationAction(ISD::TRAP, MVT::Other, Expand);
+
+ // On most systems there is no difference between DEBUGTRAP and TRAP.
+ // Marking it "Expand" here tells the DAG legalizer to replace DEBUGTRAP
+ // with TRAP.
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Expand);
+}
+
+MVT TargetLoweringBase::getScalarShiftAmountTy(const DataLayout &DL,
+ EVT) const {
+ return MVT::getIntegerVT(8 * DL.getPointerSize(0));
+}
+
+EVT TargetLoweringBase::getShiftAmountTy(EVT LHSTy,
+ const DataLayout &DL) const {
+ assert(LHSTy.isInteger() && "Shift amount is not an integer type!");
+ if (LHSTy.isVector())
+ return LHSTy;
+ return getScalarShiftAmountTy(DL, LHSTy);
+}
+
+/// canOpTrap - Returns true if the operation can trap for the value type.
+/// VT must be a legal type.
+bool TargetLoweringBase::canOpTrap(unsigned Op, EVT VT) const {
+ assert(isTypeLegal(VT));
+ switch (Op) {
+ default:
+ return false;
+ case ISD::FDIV:
+ case ISD::FREM:
+ case ISD::SDIV:
+ case ISD::UDIV:
+ case ISD::SREM:
+ case ISD::UREM:
+ return true;
+ }
+}
+
+void TargetLoweringBase::setJumpIsExpensive(bool isExpensive) {
+ // If the command-line option was specified, ignore this request.
+ if (!JumpIsExpensiveOverride.getNumOccurrences())
+ JumpIsExpensive = isExpensive;
+}
+
+TargetLoweringBase::LegalizeKind
+TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
+ // If this is a simple type, use the ComputeRegisterProp mechanism.
+ if (VT.isSimple()) {
+ MVT SVT = VT.getSimpleVT();
+ assert((unsigned)SVT.SimpleTy < array_lengthof(TransformToType));
+ MVT NVT = TransformToType[SVT.SimpleTy];
+ LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
+
+ assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+ ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger) &&
+ "Promote may not follow Expand or Promote");
+
+ if (LA == TypeSplitVector)
+ return LegalizeKind(LA,
+ EVT::getVectorVT(Context, SVT.getVectorElementType(),
+ SVT.getVectorNumElements() / 2));
+ if (LA == TypeScalarizeVector)
+ return LegalizeKind(LA, SVT.getVectorElementType());
+ return LegalizeKind(LA, NVT);
+ }
+
+ // Handle Extended Scalar Types.
+ if (!VT.isVector()) {
+ assert(VT.isInteger() && "Float types must be simple");
+ unsigned BitSize = VT.getSizeInBits();
+ // First promote to a power-of-two size, then expand if necessary.
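+ // For example, an extended i33 is rounded up to i64 and promoted, while a
+ // type that is already a power of two and at least 8 bits wide, such as
+ // i256, falls through below and is expanded into two i128 halves.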
+ if (BitSize < 8 || !isPowerOf2_32(BitSize)) {
+ EVT NVT = VT.getRoundIntegerType(Context);
+ assert(NVT != VT && "Unable to round integer VT");
+ LegalizeKind NextStep = getTypeConversion(Context, NVT);
+ // Avoid multi-step promotion.
+ if (NextStep.first == TypePromoteInteger)
+ return NextStep;
+ // Return rounded integer type.
+ return LegalizeKind(TypePromoteInteger, NVT);
+ }
+
+ return LegalizeKind(TypeExpandInteger,
+ EVT::getIntegerVT(Context, VT.getSizeInBits() / 2));
+ }
+
+ // Handle vector types.
+ unsigned NumElts = VT.getVectorNumElements();
+ EVT EltVT = VT.getVectorElementType();
+
+ // Vectors with only one element are always scalarized.
+ if (NumElts == 1)
+ return LegalizeKind(TypeScalarizeVector, EltVT);
+
+ // Try to widen vector elements until the element type is a power of two and
+ // promote it to a legal type later on, for example:
+ // <3 x i8> -> <4 x i8> -> <4 x i32>
+ if (EltVT.isInteger()) {
+ // Vectors with a number of elements that is not a power of two are always
+ // widened, for example <3 x i8> -> <4 x i8>.
+ if (!VT.isPow2VectorType()) {
+ NumElts = (unsigned)NextPowerOf2(NumElts);
+ EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
+ return LegalizeKind(TypeWidenVector, NVT);
+ }
+
+ // Examine the element type.
+ LegalizeKind LK = getTypeConversion(Context, EltVT);
+
+ // If type is to be expanded, split the vector.
+ // <4 x i140> -> <2 x i140>
+ if (LK.first == TypeExpandInteger)
+ return LegalizeKind(TypeSplitVector,
+ EVT::getVectorVT(Context, EltVT, NumElts / 2));
+
+ // Promote the integer element types until a legal vector type is found
+ // or until the element integer type is too big. If a legal type was not
+ // found, fallback to the usual mechanism of widening/splitting the
+ // vector.
+ EVT OldEltVT = EltVT;
+ while (1) {
+ // Increase the bitwidth of the element to the next pow-of-two
+ // (which is greater than 8 bits).
+ EltVT = EVT::getIntegerVT(Context, 1 + EltVT.getSizeInBits())
+ .getRoundIntegerType(Context);
+
+ // Stop trying when getting a non-simple element type.
+ // Note that vector elements may be greater than legal vector element
+ // types. Example: X86 XMM registers hold 64bit element on 32bit
+ // systems.
+ if (!EltVT.isSimple())
+ break;
+
+ // Build a new vector type and check if it is legal.
+ MVT NVT = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+ // Found a legal promoted vector type.
+ if (NVT != MVT() && ValueTypeActions.getTypeAction(NVT) == TypeLegal)
+ return LegalizeKind(TypePromoteInteger,
+ EVT::getVectorVT(Context, EltVT, NumElts));
+ }
+
+ // Reset the type to the unexpanded type if we did not find a legal vector
+ // type with a promoted vector element type.
+ EltVT = OldEltVT;
+ }
+
+ // Try to widen the vector until a legal type is found.
+ // If there is no wider legal type, split the vector.
+ while (1) {
+ // Round up to the next power of 2.
+ NumElts = (unsigned)NextPowerOf2(NumElts);
+
+ // If there is no simple vector type with this many elements then there
+ // cannot be a larger legal vector type. Note that this assumes that
+ // there are no skipped intermediate vector types in the simple types.
+ if (!EltVT.isSimple())
+ break;
+ MVT LargerVector = MVT::getVectorVT(EltVT.getSimpleVT(), NumElts);
+ if (LargerVector == MVT())
+ break;
+
+ // If this type is legal then widen the vector.
+ if (ValueTypeActions.getTypeAction(LargerVector) == TypeLegal)
+ return LegalizeKind(TypeWidenVector, LargerVector);
+ }
+
+ // Widen odd vectors to next power of two.
+ if (!VT.isPow2VectorType()) {
+ EVT NVT = VT.getPow2VectorType(Context);
+ return LegalizeKind(TypeWidenVector, NVT);
+ }
+
+ // Vectors with illegal element types are expanded.
+ EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+ return LegalizeKind(TypeSplitVector, NVT);
+}
+
+static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT,
+ TargetLoweringBase *TLI) {
+ // Figure out the right, legal destination reg to copy into.
+ unsigned NumElts = VT.getVectorNumElements();
+ MVT EltTy = VT.getVectorElementType();
+
+ unsigned NumVectorRegs = 1;
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
+ // could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(NumElts)) {
+ NumVectorRegs = NumElts;
+ NumElts = 1;
+ }
+
+ // Divide the input until we get to a supported size. This will always
+ // end with a scalar if the target doesn't support vectors.
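+ // For example, on a hypothetical target where v4i32 is the widest legal
+ // i32 vector, v16i32 is halved twice, yielding four v4i32 intermediates.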
+ while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) {
+ NumElts >>= 1;
+ NumVectorRegs <<= 1;
+ }
+
+ NumIntermediates = NumVectorRegs;
+
+ MVT NewVT = MVT::getVectorVT(EltTy, NumElts);
+ if (!TLI->isTypeLegal(NewVT))
+ NewVT = EltTy;
+ IntermediateVT = NewVT;
+
+ unsigned NewVTSize = NewVT.getSizeInBits();
+
+ // Convert sizes such as i33 to i64.
+ if (!isPowerOf2_32(NewVTSize))
+ NewVTSize = NextPowerOf2(NewVTSize);
+
+ MVT DestVT = TLI->getRegisterType(NewVT);
+ RegisterVT = DestVT;
+ if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
+ return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+
+ // Otherwise, promotion or legal types use the same number of registers as
+ // the vector decimated to the appropriate level.
+ return NumVectorRegs;
+}
+
+/// isLegalRC - Return true if the value types that can be represented by the
+/// specified register class are all legal.
+bool TargetLoweringBase::isLegalRC(const TargetRegisterClass *RC) const {
+ for (TargetRegisterClass::vt_iterator I = RC->vt_begin(), E = RC->vt_end();
+ I != E; ++I) {
+ if (isTypeLegal(*I))
+ return true;
+ }
+ return false;
+}
+
+/// Replace/modify any TargetFrameIndex operands with a target-dependent
+/// sequence of memory operands that is recognized by PrologEpilogInserter.
+MachineBasicBlock *
+TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
+ MachineBasicBlock *MBB) const {
+ MachineInstr *MI = &InitialMI;
+ MachineFunction &MF = *MI->getParent()->getParent();
+ MachineFrameInfo &MFI = *MF.getFrameInfo();
+
+ // We're handling multiple types of operands here:
+ // PATCHPOINT MetaArgs - live-in, read only, direct
+ // STATEPOINT Deopt Spill - live-through, read only, indirect
+ // STATEPOINT Deopt Alloca - live-through, read only, direct
+ // (We're currently conservative and mark the deopt slots read/write in
+ // practice.)
+ // STATEPOINT GC Spill - live-through, read/write, indirect
+ // STATEPOINT GC Alloca - live-through, read/write, direct
+ // The live-in vs live-through is handled already (the live through ones are
+ // all stack slots), but we need to handle the different type of stackmap
+ // operands and memory effects here.
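+ // Concretely, each frame-index operand below is rewritten as either
+ //   IndirectMemRefOp, <object size>, <FI>, 0   (statepoint spill slots)
+ // or
+ //   DirectMemRefOp, <FI>, 0                    (patchpoints and allocas).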
+
+ // MI changes inside this loop as we grow operands.
+ for(unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) {
+ MachineOperand &MO = MI->getOperand(OperIdx);
+ if (!MO.isFI())
+ continue;
+
+ // foldMemoryOperand builds a new MI after replacing a single FI operand
+ // with the canonical set of five x86 addressing-mode operands.
+ int FI = MO.getIndex();
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());
+
+ // Copy operands before the frame-index.
+ for (unsigned i = 0; i < OperIdx; ++i)
+ MIB.addOperand(MI->getOperand(i));
+ // Add frame index operands recognized by stackmaps.cpp
+ if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
+ // indirect-mem-ref tag, size, #FI, offset.
+ // Used for spills inserted by StatepointLowering. This codepath is not
+ // used for patchpoints/stackmaps at all, for these spilling is done via
+ // foldMemoryOperand callback only.
+ assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
+ MIB.addImm(StackMaps::IndirectMemRefOp);
+ MIB.addImm(MFI.getObjectSize(FI));
+ MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.addImm(0);
+ } else {
+ // direct-mem-ref tag, #FI, offset.
+ // Used by patchpoint, and direct alloca arguments to statepoints
+ MIB.addImm(StackMaps::DirectMemRefOp);
+ MIB.addOperand(MI->getOperand(OperIdx));
+ MIB.addImm(0);
+ }
+ // Copy the operands after the frame index.
+ for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
+ MIB.addOperand(MI->getOperand(i));
+
+ // Inherit previous memory operands.
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+ assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!");
+
+ // Add a new memory operand for this FI.
+ assert(MFI.getObjectOffset(FI) != -1);
+
+ auto Flags = MachineMemOperand::MOLoad;
+ if (MI->getOpcode() == TargetOpcode::STATEPOINT) {
+ Flags |= MachineMemOperand::MOStore;
+ Flags |= MachineMemOperand::MOVolatile;
+ }
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(MF, FI), Flags,
+ MF.getDataLayout().getPointerSize(), MFI.getObjectAlignment(FI));
+ MIB->addMemOperand(MF, MMO);
+
+ // Replace the instruction and update the operand index.
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ OperIdx += (MIB->getNumOperands() - MI->getNumOperands()) - 1;
+ MI->eraseFromParent();
+ MI = MIB;
+ }
+ return MBB;
+}
+
+/// findRepresentativeClass - Return the largest legal super-reg register class
+/// of the register class for the specified type and its associated "cost".
+// This function is in TargetLowering because it uses RegClassForVT which would
+// need to be moved to TargetRegisterInfo and would necessitate moving
+// isTypeLegal over as well - a massive change that would just require
+// TargetLowering having a TargetRegisterInfo class member that it would use.
+std::pair<const TargetRegisterClass *, uint8_t>
+TargetLoweringBase::findRepresentativeClass(const TargetRegisterInfo *TRI,
+ MVT VT) const {
+ const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy];
+ if (!RC)
+ return std::make_pair(RC, 0);
+
+ // Compute the set of all super-register classes.
+ BitVector SuperRegRC(TRI->getNumRegClasses());
+ for (SuperRegClassIterator RCI(RC, TRI); RCI.isValid(); ++RCI)
+ SuperRegRC.setBitsInMask(RCI.getMask());
+
+ // Find the first legal register class with the largest spill size.
+ const TargetRegisterClass *BestRC = RC;
+ for (int i = SuperRegRC.find_first(); i >= 0; i = SuperRegRC.find_next(i)) {
+ const TargetRegisterClass *SuperRC = TRI->getRegClass(i);
+ // We want the largest possible spill size.
+ if (SuperRC->getSize() <= BestRC->getSize())
+ continue;
+ if (!isLegalRC(SuperRC))
+ continue;
+ BestRC = SuperRC;
+ }
+ return std::make_pair(BestRC, 1);
+}
+
+/// computeRegisterProperties - Once all of the register classes are added,
+/// this allows us to compute derived properties we expose.
+void TargetLoweringBase::computeRegisterProperties(
+ const TargetRegisterInfo *TRI) {
+ static_assert(MVT::LAST_VALUETYPE <= MVT::MAX_ALLOWED_VALUETYPE,
+ "Too many value types for ValueTypeActions to hold!");
+
+ // Everything defaults to needing one register.
+ for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+ NumRegistersForVT[i] = 1;
+ RegisterTypeForVT[i] = TransformToType[i] = (MVT::SimpleValueType)i;
+ }
+ // ...except isVoid, which doesn't need any registers.
+ NumRegistersForVT[MVT::isVoid] = 0;
+
+ // Find the largest integer register class.
+ unsigned LargestIntReg = MVT::LAST_INTEGER_VALUETYPE;
+ for (; RegClassForVT[LargestIntReg] == nullptr; --LargestIntReg)
+ assert(LargestIntReg != MVT::i1 && "No integer registers defined!");
+
+ // Every integer value type larger than this largest register takes twice as
+ // many registers to represent as the previous ValueType.
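+ // For example, if i64 is the largest legal integer type, then i128 takes
+ // two i64 registers.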
+ for (unsigned ExpandedReg = LargestIntReg + 1;
+ ExpandedReg <= MVT::LAST_INTEGER_VALUETYPE; ++ExpandedReg) {
+ NumRegistersForVT[ExpandedReg] = 2*NumRegistersForVT[ExpandedReg-1];
+ RegisterTypeForVT[ExpandedReg] = (MVT::SimpleValueType)LargestIntReg;
+ TransformToType[ExpandedReg] = (MVT::SimpleValueType)(ExpandedReg - 1);
+ ValueTypeActions.setTypeAction((MVT::SimpleValueType)ExpandedReg,
+ TypeExpandInteger);
+ }
+
+ // Inspect all of the ValueType's smaller than the largest integer
+ // register to see which ones need promotion.
+ unsigned LegalIntReg = LargestIntReg;
+ for (unsigned IntReg = LargestIntReg - 1;
+ IntReg >= (unsigned)MVT::i1; --IntReg) {
+ MVT IVT = (MVT::SimpleValueType)IntReg;
+ if (isTypeLegal(IVT)) {
+ LegalIntReg = IntReg;
+ } else {
+ RegisterTypeForVT[IntReg] = TransformToType[IntReg] =
+ (const MVT::SimpleValueType)LegalIntReg;
+ ValueTypeActions.setTypeAction(IVT, TypePromoteInteger);
+ }
+ }
+
+ // ppcf128 type is really two f64's.
+ if (!isTypeLegal(MVT::ppcf128)) {
+ if (isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::ppcf128] = 2*NumRegistersForVT[MVT::f64];
+ RegisterTypeForVT[MVT::ppcf128] = MVT::f64;
+ TransformToType[MVT::ppcf128] = MVT::f64;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeExpandFloat);
+ } else {
+ NumRegistersForVT[MVT::ppcf128] = NumRegistersForVT[MVT::i128];
+ RegisterTypeForVT[MVT::ppcf128] = RegisterTypeForVT[MVT::i128];
+ TransformToType[MVT::ppcf128] = MVT::i128;
+ ValueTypeActions.setTypeAction(MVT::ppcf128, TypeSoftenFloat);
+ }
+ }
+
+ // Decide how to handle f128. If the target does not have native f128 support,
+ // expand it to i128 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f128)) {
+ NumRegistersForVT[MVT::f128] = NumRegistersForVT[MVT::i128];
+ RegisterTypeForVT[MVT::f128] = RegisterTypeForVT[MVT::i128];
+ TransformToType[MVT::f128] = MVT::i128;
+ ValueTypeActions.setTypeAction(MVT::f128, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f64. If the target does not have native f64 support,
+ // expand it to i64 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f64)) {
+ NumRegistersForVT[MVT::f64] = NumRegistersForVT[MVT::i64];
+ RegisterTypeForVT[MVT::f64] = RegisterTypeForVT[MVT::i64];
+ TransformToType[MVT::f64] = MVT::i64;
+ ValueTypeActions.setTypeAction(MVT::f64, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f32. If the target does not have native f32 support,
+ // expand it to i32 and we will be generating soft float library calls.
+ if (!isTypeLegal(MVT::f32)) {
+ NumRegistersForVT[MVT::f32] = NumRegistersForVT[MVT::i32];
+ RegisterTypeForVT[MVT::f32] = RegisterTypeForVT[MVT::i32];
+ TransformToType[MVT::f32] = MVT::i32;
+ ValueTypeActions.setTypeAction(MVT::f32, TypeSoftenFloat);
+ }
+
+ // Decide how to handle f16. If the target does not have native f16 support,
+ // promote it to f32, because there are no f16 library calls (except for
+ // conversions).
+ if (!isTypeLegal(MVT::f16)) {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ }
+
+ // Loop over all of the vector value types to see which need transformations.
+ for (unsigned i = MVT::FIRST_VECTOR_VALUETYPE;
+ i <= (unsigned)MVT::LAST_VECTOR_VALUETYPE; ++i) {
+ MVT VT = (MVT::SimpleValueType) i;
+ if (isTypeLegal(VT))
+ continue;
+
+ MVT EltVT = VT.getVectorElementType();
+ unsigned NElts = VT.getVectorNumElements();
+ bool IsLegalWiderType = false;
+ LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
+ switch (PreferredAction) {
+ case TypePromoteInteger: {
+ // Try to promote the elements of integer vectors. If no legal
+ // promotion was found, fall through to the widen-vector method.
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_INTEGER_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ // Promote vectors of integers to vectors with the same number
+ // of elements, with a wider element type.
+ if (SVT.getVectorElementType().getSizeInBits() > EltVT.getSizeInBits() &&
+ SVT.getVectorNumElements() == NElts && isTypeLegal(SVT)) {
+ TransformToType[i] = SVT;
+ RegisterTypeForVT[i] = SVT;
+ NumRegistersForVT[i] = 1;
+ ValueTypeActions.setTypeAction(VT, TypePromoteInteger);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (IsLegalWiderType)
+ break;
+ }
+ case TypeWidenVector: {
+ // Try to widen the vector.
+ for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
+ MVT SVT = (MVT::SimpleValueType) nVT;
+ if (SVT.getVectorElementType() == EltVT
+ && SVT.getVectorNumElements() > NElts && isTypeLegal(SVT)) {
+ TransformToType[i] = SVT;
+ RegisterTypeForVT[i] = SVT;
+ NumRegistersForVT[i] = 1;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ IsLegalWiderType = true;
+ break;
+ }
+ }
+ if (IsLegalWiderType)
+ break;
+ }
+ case TypeSplitVector:
+ case TypeScalarizeVector: {
+ MVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ NumRegistersForVT[i] = getVectorTypeBreakdownMVT(VT, IntermediateVT,
+ NumIntermediates, RegisterVT, this);
+ RegisterTypeForVT[i] = RegisterVT;
+
+ MVT NVT = VT.getPow2VectorType();
+ if (NVT == VT) {
+ // Type is already a power of 2. The default action is to split.
+ TransformToType[i] = MVT::Other;
+ if (PreferredAction == TypeScalarizeVector)
+ ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
+ else if (PreferredAction == TypeSplitVector)
+ ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+ else
+ // Set type action according to the number of elements.
+ ValueTypeActions.setTypeAction(VT, NElts == 1 ? TypeScalarizeVector
+ : TypeSplitVector);
+ } else {
+ TransformToType[i] = NVT;
+ ValueTypeActions.setTypeAction(VT, TypeWidenVector);
+ }
+ break;
+ }
+ default:
+ llvm_unreachable("Unknown vector legalization action!");
+ }
+ }
+
+ // Determine the 'representative' register class for each value type.
+ // A representative register class is the largest legal register class for
+ // a group of value types (i.e. one that is not a sub-register class of a
+ // larger legal class). For example, on i386 the representative class for
+ // i8, i16, and i32 would be GR32; on x86_64 it would be GR64.
+ for (unsigned i = 0; i != MVT::LAST_VALUETYPE; ++i) {
+ const TargetRegisterClass* RRC;
+ uint8_t Cost;
+ std::tie(RRC, Cost) = findRepresentativeClass(TRI, (MVT::SimpleValueType)i);
+ RepRegClassForVT[i] = RRC;
+ RepRegClassCostForVT[i] = Cost;
+ }
+}
+
+EVT TargetLoweringBase::getSetCCResultType(const DataLayout &DL, LLVMContext &,
+ EVT VT) const {
+ assert(!VT.isVector() && "No default SetCC type for vectors!");
+ return getPointerTy(DL).SimpleTy;
+}
+
+MVT::SimpleValueType TargetLoweringBase::getCmpLibcallReturnType() const {
+ return MVT::i32; // return the default value
+}
+
+/// getVectorTypeBreakdown - Vector types are broken down into some number of
+/// legal first class types. For example, MVT::v8f32 maps to 2 MVT::v4f32
+/// with Altivec or SSE1, or 8 promoted MVT::f64 values with the X86 FP stack.
+/// Similarly, MVT::v2i64 turns into 4 MVT::i32 values with both PPC and X86.
+///
+/// This method returns the number of registers needed, and the VT for each
+/// register. It also returns the VT and quantity of the intermediate values
+/// before they are promoted/expanded.
+///
+unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT,
+ EVT &IntermediateVT,
+ unsigned &NumIntermediates,
+ MVT &RegisterVT) const {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // If there is a wider vector type with the same element type as this one,
+ // or a promoted vector type with the same number of elements but wider
+ // elements, then we should convert to that legal vector type.
+ // This handles things like <2 x float> -> <4 x float> and
+ // <4 x i1> -> <4 x i32>.
+ LegalizeTypeAction TA = getTypeAction(Context, VT);
+ if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
+ EVT RegisterEVT = getTypeToTransformTo(Context, VT);
+ if (isTypeLegal(RegisterEVT)) {
+ IntermediateVT = RegisterEVT;
+ RegisterVT = RegisterEVT.getSimpleVT();
+ NumIntermediates = 1;
+ return 1;
+ }
+ }
+
+ // Figure out the right, legal destination reg to copy into.
+ EVT EltTy = VT.getVectorElementType();
+
+ unsigned NumVectorRegs = 1;
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
+ // could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(NumElts)) {
+ NumVectorRegs = NumElts;
+ NumElts = 1;
+ }
+
+ // Divide the input until we get to a supported size. This will always
+ // end with a scalar if the target doesn't support vectors.
+ while (NumElts > 1 && !isTypeLegal(
+ EVT::getVectorVT(Context, EltTy, NumElts))) {
+ NumElts >>= 1;
+ NumVectorRegs <<= 1;
+ }
+
+ NumIntermediates = NumVectorRegs;
+
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
+ if (!isTypeLegal(NewVT))
+ NewVT = EltTy;
+ IntermediateVT = NewVT;
+
+ MVT DestVT = getRegisterType(Context, NewVT);
+ RegisterVT = DestVT;
+ unsigned NewVTSize = NewVT.getSizeInBits();
+
+ // Convert sizes such as i33 to i64.
+ if (!isPowerOf2_32(NewVTSize))
+ NewVTSize = NextPowerOf2(NewVTSize);
+
+ if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
+ return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+
+ // Otherwise, promotion or legal types use the same number of registers as
+ // the vector decimated to the appropriate level.
+ return NumVectorRegs;
+}
+
+/// Get the EVTs and ArgFlags collections that represent the legalized return
+/// type of the given function. This does not require a DAG or a return value,
+/// and is suitable for use before any DAGs for the function are constructed.
+/// TODO: Move this out of TargetLowering.cpp.
+void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
+ SmallVectorImpl<ISD::OutputArg> &Outs,
+ const TargetLowering &TLI, const DataLayout &DL) {
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(TLI, DL, ReturnType, ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ for (unsigned j = 0, f = NumValues; j != f; ++j) {
+ EVT VT = ValueVTs[j];
+ ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
+
+ if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ ExtendKind = ISD::SIGN_EXTEND;
+ else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ ExtendKind = ISD::ZERO_EXTEND;
+
+ // FIXME: C calling convention requires the return type to be promoted to
+ // at least 32-bit. But this is not necessary for non-C calling
+ // conventions. The frontend should mark functions whose return values
+ // require promoting with signext or zeroext attributes.
+ if (ExtendKind != ISD::ANY_EXTEND && VT.isInteger()) {
+ MVT MinVT = TLI.getRegisterType(ReturnType->getContext(), MVT::i32);
+ if (VT.bitsLT(MinVT))
+ VT = MinVT;
+ }
+
+ unsigned NumParts = TLI.getNumRegisters(ReturnType->getContext(), VT);
+ MVT PartVT = TLI.getRegisterType(ReturnType->getContext(), VT);
+
+ // 'inreg' on function refers to return value
+ ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
+ if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg))
+ Flags.setInReg();
+
+ // Propagate extension type if any
+ if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+ Flags.setSExt();
+ else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+ Flags.setZExt();
+
+ for (unsigned i = 0; i < NumParts; ++i)
+ Outs.push_back(ISD::OutputArg(Flags, PartVT, VT, /*isFixed=*/true, 0, 0));
+ }
+}
+
+/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
+/// function arguments in the caller parameter area. This is the actual
+/// alignment, not its logarithm.
+unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
+ const DataLayout &DL) const {
+ return DL.getABITypeAlignment(Ty);
+}
+
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, EVT VT,
+ unsigned AddrSpace,
+ unsigned Alignment,
+ bool *Fast) const {
+ // Check if the specified alignment is sufficient based on the data layout.
+ // TODO: While using the data layout works in practice, a better solution
+ // would be to implement this check directly (make this a virtual function).
+ // For example, the ABI alignment may change based on software platform while
+ // this function should only be affected by hardware implementation.
+ Type *Ty = VT.getTypeForEVT(Context);
+ if (Alignment >= DL.getABITypeAlignment(Ty)) {
+ // Assume that an access that meets the ABI-specified alignment is fast.
+ if (Fast != nullptr)
+ *Fast = true;
+ return true;
+ }
+
+ // This is a misaligned access.
+ return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Fast);
+}
+
+BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
+ return BranchProbability(MinPercentageForPredictableBranch, 100);
+}
+
+//===----------------------------------------------------------------------===//
+// TargetTransformInfo Helpers
+//===----------------------------------------------------------------------===//
+
+int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
+ enum InstructionOpcodes {
+#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
+#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
+#include "llvm/IR/Instruction.def"
+ };
+ switch (static_cast<InstructionOpcodes>(Opcode)) {
+ case Ret: return 0;
+ case Br: return 0;
+ case Switch: return 0;
+ case IndirectBr: return 0;
+ case Invoke: return 0;
+ case Resume: return 0;
+ case Unreachable: return 0;
+ case CleanupRet: return 0;
+ case CatchRet: return 0;
+ case CatchPad: return 0;
+ case CatchSwitch: return 0;
+ case CleanupPad: return 0;
+ case Add: return ISD::ADD;
+ case FAdd: return ISD::FADD;
+ case Sub: return ISD::SUB;
+ case FSub: return ISD::FSUB;
+ case Mul: return ISD::MUL;
+ case FMul: return ISD::FMUL;
+ case UDiv: return ISD::UDIV;
+ case SDiv: return ISD::SDIV;
+ case FDiv: return ISD::FDIV;
+ case URem: return ISD::UREM;
+ case SRem: return ISD::SREM;
+ case FRem: return ISD::FREM;
+ case Shl: return ISD::SHL;
+ case LShr: return ISD::SRL;
+ case AShr: return ISD::SRA;
+ case And: return ISD::AND;
+ case Or: return ISD::OR;
+ case Xor: return ISD::XOR;
+ case Alloca: return 0;
+ case Load: return ISD::LOAD;
+ case Store: return ISD::STORE;
+ case GetElementPtr: return 0;
+ case Fence: return 0;
+ case AtomicCmpXchg: return 0;
+ case AtomicRMW: return 0;
+ case Trunc: return ISD::TRUNCATE;
+ case ZExt: return ISD::ZERO_EXTEND;
+ case SExt: return ISD::SIGN_EXTEND;
+ case FPToUI: return ISD::FP_TO_UINT;
+ case FPToSI: return ISD::FP_TO_SINT;
+ case UIToFP: return ISD::UINT_TO_FP;
+ case SIToFP: return ISD::SINT_TO_FP;
+ case FPTrunc: return ISD::FP_ROUND;
+ case FPExt: return ISD::FP_EXTEND;
+ case PtrToInt: return ISD::BITCAST;
+ case IntToPtr: return ISD::BITCAST;
+ case BitCast: return ISD::BITCAST;
+ case AddrSpaceCast: return ISD::ADDRSPACECAST;
+ case ICmp: return ISD::SETCC;
+ case FCmp: return ISD::SETCC;
+ case PHI: return 0;
+ case Call: return 0;
+ case Select: return ISD::SELECT;
+ case UserOp1: return 0;
+ case UserOp2: return 0;
+ case VAArg: return 0;
+ case ExtractElement: return ISD::EXTRACT_VECTOR_ELT;
+ case InsertElement: return ISD::INSERT_VECTOR_ELT;
+ case ShuffleVector: return ISD::VECTOR_SHUFFLE;
+ case ExtractValue: return ISD::MERGE_VALUES;
+ case InsertValue: return ISD::MERGE_VALUES;
+ case LandingPad: return 0;
+ }
+
+ llvm_unreachable("Unknown instruction type encountered!");
+}
+
+std::pair<int, MVT>
+TargetLoweringBase::getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const {
+ LLVMContext &C = Ty->getContext();
+ EVT MTy = getValueType(DL, Ty);
+
+ int Cost = 1;
+ // We keep legalizing the type until we find a legal kind. We assume that
+ // the only operation that costs anything is the split. After splitting
+ // we need to handle two types.
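+ // For example, on a hypothetical target where v2i32 is the widest legal
+ // i32 vector, v8i32 is split twice (v8i32 -> v4i32 -> v2i32), giving a
+ // cost of 4 with a legalized type of v2i32.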
+ while (true) {
+ LegalizeKind LK = getTypeConversion(C, MTy);
+
+ if (LK.first == TypeLegal)
+ return std::make_pair(Cost, MTy.getSimpleVT());
+
+ if (LK.first == TypeSplitVector || LK.first == TypeExpandInteger)
+ Cost *= 2;
+
+ // Do not loop with f128 type.
+ if (MTy == LK.second)
+ return std::make_pair(Cost, MTy.getSimpleVT());
+
+ // Keep legalizing the type.
+ MTy = LK.second;
+ }
+}
+
+Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
+ if (!TM.getTargetTriple().isAndroid())
+ return nullptr;
+
+ // Android provides a libc function to retrieve the address of the current
+ // thread's unsafe stack pointer.
+ Module *M = IRB.GetInsertBlock()->getParent()->getParent();
+ Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
+ Value *Fn = M->getOrInsertFunction("__safestack_pointer_address",
+ StackPtrTy->getPointerTo(0), nullptr);
+ return IRB.CreateCall(Fn);
+}
+
+//===----------------------------------------------------------------------===//
+// Loop Strength Reduction hooks
+//===----------------------------------------------------------------------===//
+
+/// isLegalAddressingMode - Return true if the addressing mode represented
+/// by AM is legal for this target, for a load/store of the specified type.
+bool TargetLoweringBase::isLegalAddressingMode(const DataLayout &DL,
+ const AddrMode &AM, Type *Ty,
+ unsigned AS) const {
+ // The default implementation supports a conservative RISC-style r+r and
+ // r+i addressing mode.
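+ // For example, "r", "r+i", "r+r", and "2*r" are accepted below, while
+ // "r+r+i" and "n*r" for n > 2 are rejected.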
+
+ // Allows a sign-extended 16-bit immediate field.
+ if (AM.BaseOffs <= -(1LL << 16) || AM.BaseOffs >= (1LL << 16)-1)
+ return false;
+
+ // No global is ever allowed as a base.
+ if (AM.BaseGV)
+ return false;
+
+ // Only support r+r,
+ switch (AM.Scale) {
+ case 0: // "r+i" or just "i", depending on HasBaseReg.
+ break;
+ case 1:
+ if (AM.HasBaseReg && AM.BaseOffs) // "r+r+i" is not allowed.
+ return false;
+ // Otherwise we have r+r or r+i.
+ break;
+ case 2:
+ if (AM.HasBaseReg || AM.BaseOffs) // 2*r+r or 2*r+i is not allowed.
+ return false;
+ // Allow 2*r as r+r.
+ break;
+ default: // Don't allow n * r
+ return false;
+ }
+
+ return true;
+}
+
+//===----------------------------------------------------------------------===//
+// Stack Protector
+//===----------------------------------------------------------------------===//
+
+// For OpenBSD return its special guard variable. Otherwise return nullptr,
+// so that SelectionDAG handles SSP.
+Value *TargetLoweringBase::getIRStackGuard(IRBuilder<> &IRB) const {
+ if (getTargetMachine().getTargetTriple().isOSOpenBSD()) {
+ Module &M = *IRB.GetInsertBlock()->getParent()->getParent();
+ PointerType *PtrTy = Type::getInt8PtrTy(M.getContext());
+ auto Guard = cast<GlobalValue>(M.getOrInsertGlobal("__guard_local", PtrTy));
+ Guard->setVisibility(GlobalValue::HiddenVisibility);
+ return Guard;
+ }
+ return nullptr;
+}
+
+// Currently only support "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+void TargetLoweringBase::insertSSPDeclarations(Module &M) const {
+ M.getOrInsertGlobal("__stack_chk_guard", Type::getInt8PtrTy(M.getContext()));
+}
+
+// Currently only support "standard" __stack_chk_guard.
+// TODO: add LOAD_STACK_GUARD support.
+Value *TargetLoweringBase::getSDagStackGuard(const Module &M) const {
+ return M.getGlobalVariable("__stack_chk_guard", true);
+}
+
+Value *TargetLoweringBase::getSSPStackGuardCheck(const Module &M) const {
+ return nullptr;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
new file mode 100644
index 000000000000..5f814c957e92
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -0,0 +1,1098 @@
+//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements classes used to handle lowerings specific to common
+// object file formats.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+using namespace dwarf;
+
+//===----------------------------------------------------------------------===//
+// ELF
+//===----------------------------------------------------------------------===//
+
+MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ unsigned Encoding = getPersonalityEncoding();
+ if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect)
+ return getContext().getOrCreateSymbol(StringRef("DW.ref.") +
+ TM.getSymbol(GV, Mang)->getName());
+ if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr)
+ return TM.getSymbol(GV, Mang);
+ report_fatal_error("We do not support this DWARF encoding yet!");
+}
+
+void TargetLoweringObjectFileELF::emitPersonalityValue(
+ MCStreamer &Streamer, const DataLayout &DL, const MCSymbol *Sym) const {
+ SmallString<64> NameData("DW.ref.");
+ NameData += Sym->getName();
+ MCSymbolELF *Label =
+ cast<MCSymbolELF>(getContext().getOrCreateSymbol(NameData));
+ Streamer.EmitSymbolAttribute(Label, MCSA_Hidden);
+ Streamer.EmitSymbolAttribute(Label, MCSA_Weak);
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
+ MCSection *Sec = getContext().getELFNamedSection(".data", Label->getName(),
+ ELF::SHT_PROGBITS, Flags, 0);
+ unsigned Size = DL.getPointerSize();
+ Streamer.SwitchSection(Sec);
+ Streamer.EmitValueToAlignment(DL.getPointerABIAlignment());
+ Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
+ const MCExpr *E = MCConstantExpr::create(Size, getContext());
+ Streamer.emitELFSize(Label, E);
+ Streamer.EmitLabel(Label);
+
+ Streamer.EmitSymbolValue(Sym, Size);
+}
+
+const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+
+ if (Encoding & dwarf::DW_EH_PE_indirect) {
+ MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", Mang, TM);
+
+ // Add information about the stub reference to ELFMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = ELFMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return TargetLoweringObjectFile::
+ getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
+ Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ }
+
+ return TargetLoweringObjectFile::
+ getTTypeGlobalReference(GV, Encoding, Mang, TM, MMI, Streamer);
+}
+
+static SectionKind
+getELFKindForNamedSection(StringRef Name, SectionKind K) {
+ // N.B.: The defaults used here are not the same ones used in MC.
+ // We follow gcc, MC follows gas. For example, given ".section .eh_frame",
+ // both gas and MC will produce a section with no flags. Given
+ // section(".eh_frame") gcc will produce:
+ //
+ // .section .eh_frame,"a",@progbits
+
+ if (Name == getInstrProfCoverageSectionName(false))
+ return SectionKind::getMetadata();
+
+ if (Name.empty() || Name[0] != '.') return K;
+
+ // Some lame default implementation based on some magic section names.
+ if (Name == ".bss" ||
+ Name.startswith(".bss.") ||
+ Name.startswith(".gnu.linkonce.b.") ||
+ Name.startswith(".llvm.linkonce.b.") ||
+ Name == ".sbss" ||
+ Name.startswith(".sbss.") ||
+ Name.startswith(".gnu.linkonce.sb.") ||
+ Name.startswith(".llvm.linkonce.sb."))
+ return SectionKind::getBSS();
+
+ if (Name == ".tdata" ||
+ Name.startswith(".tdata.") ||
+ Name.startswith(".gnu.linkonce.td.") ||
+ Name.startswith(".llvm.linkonce.td."))
+ return SectionKind::getThreadData();
+
+ if (Name == ".tbss" ||
+ Name.startswith(".tbss.") ||
+ Name.startswith(".gnu.linkonce.tb.") ||
+ Name.startswith(".llvm.linkonce.tb."))
+ return SectionKind::getThreadBSS();
+
+ return K;
+}
+
+
+static unsigned getELFSectionType(StringRef Name, SectionKind K) {
+
+ if (Name == ".init_array")
+ return ELF::SHT_INIT_ARRAY;
+
+ if (Name == ".fini_array")
+ return ELF::SHT_FINI_ARRAY;
+
+ if (Name == ".preinit_array")
+ return ELF::SHT_PREINIT_ARRAY;
+
+ if (K.isBSS() || K.isThreadBSS())
+ return ELF::SHT_NOBITS;
+
+ return ELF::SHT_PROGBITS;
+}
+
+static unsigned getELFSectionFlags(SectionKind K) {
+ unsigned Flags = 0;
+
+ if (!K.isMetadata())
+ Flags |= ELF::SHF_ALLOC;
+
+ if (K.isText())
+ Flags |= ELF::SHF_EXECINSTR;
+
+ if (K.isWriteable())
+ Flags |= ELF::SHF_WRITE;
+
+ if (K.isThreadLocal())
+ Flags |= ELF::SHF_TLS;
+
+ if (K.isMergeableCString() || K.isMergeableConst())
+ Flags |= ELF::SHF_MERGE;
+
+ if (K.isMergeableCString())
+ Flags |= ELF::SHF_STRINGS;
+
+ return Flags;
+}
+
+static const Comdat *getELFComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return nullptr;
+
+ if (C->getSelectionKind() != Comdat::Any)
+ report_fatal_error("ELF COMDATs only support SelectionKind::Any, '" +
+ C->getName() + "' cannot be lowered.");
+
+ return C;
+}
+
+MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ StringRef SectionName = GV->getSection();
+
+ // Infer section flags from the section name if we can.
+ Kind = getELFKindForNamedSection(SectionName, Kind);
+
+ StringRef Group = "";
+ unsigned Flags = getELFSectionFlags(Kind);
+ if (const Comdat *C = getELFComdat(GV)) {
+ Group = C->getName();
+ Flags |= ELF::SHF_GROUP;
+ }
+ return getContext().getELFSection(SectionName,
+ getELFSectionType(SectionName, Kind), Flags,
+ /*EntrySize=*/0, Group);
+}
+
+/// Return the section prefix name used by options FunctionsSections and
+/// DataSections.
+static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
+ if (Kind.isText())
+ return ".text";
+ if (Kind.isReadOnly())
+ return ".rodata";
+ if (Kind.isBSS())
+ return ".bss";
+ if (Kind.isThreadData())
+ return ".tdata";
+ if (Kind.isThreadBSS())
+ return ".tbss";
+ if (Kind.isData())
+ return ".data";
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return ".data.rel.ro";
+}
+
+static MCSectionELF *
+selectELFSectionForGlobal(MCContext &Ctx, const GlobalValue *GV,
+ SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM, bool EmitUniqueSection,
+ unsigned Flags, unsigned *NextUniqueID) {
+ unsigned EntrySize = 0;
+ if (Kind.isMergeableCString()) {
+ if (Kind.isMergeable2ByteCString()) {
+ EntrySize = 2;
+ } else if (Kind.isMergeable4ByteCString()) {
+ EntrySize = 4;
+ } else {
+ EntrySize = 1;
+ assert(Kind.isMergeable1ByteCString() && "unknown string width");
+ }
+ } else if (Kind.isMergeableConst()) {
+ if (Kind.isMergeableConst4()) {
+ EntrySize = 4;
+ } else if (Kind.isMergeableConst8()) {
+ EntrySize = 8;
+ } else if (Kind.isMergeableConst16()) {
+ EntrySize = 16;
+ } else {
+ assert(Kind.isMergeableConst32() && "unknown data width");
+ EntrySize = 32;
+ }
+ }
+
+ StringRef Group = "";
+ if (const Comdat *C = getELFComdat(GV)) {
+ Flags |= ELF::SHF_GROUP;
+ Group = C->getName();
+ }
+
+ bool UniqueSectionNames = TM.getUniqueSectionNames();
+ SmallString<128> Name;
+ if (Kind.isMergeableCString()) {
+ // We also need alignment here.
+ // FIXME: this is getting the alignment of the character, not the
+ // alignment of the global!
+ unsigned Align = GV->getParent()->getDataLayout().getPreferredAlignment(
+ cast<GlobalVariable>(GV));
+
+ std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
+ Name = SizeSpec + utostr(Align);
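+ // e.g. ".rodata.str1.1" for 1-byte string entries with preferred
+ // alignment 1.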
+ } else if (Kind.isMergeableConst()) {
+ Name = ".rodata.cst";
+ Name += utostr(EntrySize);
+ } else {
+ Name = getSectionPrefixForGlobal(Kind);
+ }
+ // FIXME: Extend the section prefix to include hotness categories such as
+ // .hot or .unlikely for functions.
+
+ if (EmitUniqueSection && UniqueSectionNames) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GV, Mang, true);
+ }
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniqueSection && !UniqueSectionNames) {
+ UniqueID = *NextUniqueID;
+ (*NextUniqueID)++;
+ }
+ return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags,
+ EntrySize, Group, UniqueID);
+}
+
+MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ unsigned Flags = getELFSectionFlags(Kind);
+
+ // If we have -ffunction-section or -fdata-section then we should emit the
+ // global value to a uniqued section specifically for it.
+ bool EmitUniqueSection = false;
+ if (!(Flags & ELF::SHF_MERGE) && !Kind.isCommon()) {
+ if (Kind.isText())
+ EmitUniqueSection = TM.getFunctionSections();
+ else
+ EmitUniqueSection = TM.getDataSections();
+ }
+ EmitUniqueSection |= GV->hasComdat();
+
+ return selectELFSectionForGlobal(getContext(), GV, Kind, Mang, TM,
+ EmitUniqueSection, Flags, &NextUniqueID);
+}
+
+MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
+ const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+ // If the function can be removed, produce a unique section so that
+ // the table doesn't prevent the removal.
+ const Comdat *C = F.getComdat();
+ bool EmitUniqueSection = TM.getFunctionSections() || C;
+ if (!EmitUniqueSection)
+ return ReadOnlySection;
+
+ return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(),
+ Mang, TM, EmitUniqueSection, ELF::SHF_ALLOC,
+ &NextUniqueID);
+}
+
+bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
+ bool UsesLabelDifference, const Function &F) const {
+ // We can always create relative relocations, so use another section
+ // that can be marked non-executable.
+ return false;
+}
+
+/// Given a mergeable constant with the specified size and relocation
+/// information, return a section that it should be placed in.
+MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
+ if (Kind.isMergeableConst4() && MergeableConst4Section)
+ return MergeableConst4Section;
+ if (Kind.isMergeableConst8() && MergeableConst8Section)
+ return MergeableConst8Section;
+ if (Kind.isMergeableConst16() && MergeableConst16Section)
+ return MergeableConst16Section;
+ if (Kind.isMergeableConst32() && MergeableConst32Section)
+ return MergeableConst32Section;
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
+ return DataRelROSection;
+}
+
+static MCSectionELF *getStaticStructorSection(MCContext &Ctx, bool UseInitArray,
+ bool IsCtor, unsigned Priority,
+ const MCSymbol *KeySym) {
+ std::string Name;
+ unsigned Type;
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE;
+ StringRef COMDAT = KeySym ? KeySym->getName() : "";
+
+ if (KeySym)
+ Flags |= ELF::SHF_GROUP;
+
+ if (UseInitArray) {
+ if (IsCtor) {
+ Type = ELF::SHT_INIT_ARRAY;
+ Name = ".init_array";
+ } else {
+ Type = ELF::SHT_FINI_ARRAY;
+ Name = ".fini_array";
+ }
+ if (Priority != 65535) {
+ Name += '.';
+ Name += utostr(Priority);
+ }
+ } else {
+ // The default scheme is .ctor / .dtor, so we have to invert the priority
+ // numbering.
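+ // For example, a constructor with init priority 101 is placed in
+ // ".ctors.65434" (65535 - 101).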
+ if (IsCtor)
+ Name = ".ctors";
+ else
+ Name = ".dtors";
+ if (Priority != 65535) {
+ Name += '.';
+ Name += utostr(65535 - Priority);
+ }
+ Type = ELF::SHT_PROGBITS;
+ }
+
+ return Ctx.getELFSection(Name, Type, Flags, 0, COMDAT);
+}
+
+MCSection *TargetLoweringObjectFileELF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getStaticStructorSection(getContext(), UseInitArray, true, Priority,
+ KeySym);
+}
+
+MCSection *TargetLoweringObjectFileELF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getStaticStructorSection(getContext(), UseInitArray, false, Priority,
+ KeySym);
+}
+
+const MCExpr *TargetLoweringObjectFileELF::lowerRelativeReference(
+ const GlobalValue *LHS, const GlobalValue *RHS, Mangler &Mang,
+ const TargetMachine &TM) const {
+ // We may only use a PLT-relative relocation to refer to unnamed_addr
+ // functions.
+ if (!LHS->hasGlobalUnnamedAddr() || !LHS->getValueType()->isFunctionTy())
+ return nullptr;
+
+ // Basic sanity checks.
+ if (LHS->getType()->getPointerAddressSpace() != 0 ||
+ RHS->getType()->getPointerAddressSpace() != 0 || LHS->isThreadLocal() ||
+ RHS->isThreadLocal())
+ return nullptr;
+
+ return MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(TM.getSymbol(LHS, Mang), PLTRelativeVariantKind,
+ getContext()),
+ MCSymbolRefExpr::create(TM.getSymbol(RHS, Mang), getContext()),
+ getContext());
+}
+
+void
+TargetLoweringObjectFileELF::InitializeELF(bool UseInitArray_) {
+ UseInitArray = UseInitArray_;
+ if (!UseInitArray)
+ return;
+
+ StaticCtorSection = getContext().getELFSection(
+ ".init_array", ELF::SHT_INIT_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ StaticDtorSection = getContext().getELFSection(
+ ".fini_array", ELF::SHT_FINI_ARRAY, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+}
+
+//===----------------------------------------------------------------------===//
+// MachO
+//===----------------------------------------------------------------------===//
+
+TargetLoweringObjectFileMachO::TargetLoweringObjectFileMachO()
+ : TargetLoweringObjectFile() {
+ SupportIndirectSymViaGOTPCRel = true;
+}
+
+/// emitModuleFlags - Perform code emission for module flags.
+void TargetLoweringObjectFileMachO::
+emitModuleFlags(MCStreamer &Streamer,
+ ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
+ Mangler &Mang, const TargetMachine &TM) const {
+ unsigned VersionVal = 0;
+ unsigned ImageInfoFlags = 0;
+ MDNode *LinkerOptions = nullptr;
+ StringRef SectionVal;
+
+ for (const auto &MFE : ModuleFlags) {
+ // Ignore flags with 'Require' behavior.
+ if (MFE.Behavior == Module::Require)
+ continue;
+
+ StringRef Key = MFE.Key->getString();
+ Metadata *Val = MFE.Val;
+
+ if (Key == "Objective-C Image Info Version") {
+ VersionVal = mdconst::extract<ConstantInt>(Val)->getZExtValue();
+ } else if (Key == "Objective-C Garbage Collection" ||
+ Key == "Objective-C GC Only" ||
+ Key == "Objective-C Is Simulated" ||
+ Key == "Objective-C Class Properties" ||
+ Key == "Objective-C Image Swift Version") {
+ ImageInfoFlags |= mdconst::extract<ConstantInt>(Val)->getZExtValue();
+ } else if (Key == "Objective-C Image Info Section") {
+ SectionVal = cast<MDString>(Val)->getString();
+ } else if (Key == "Linker Options") {
+ LinkerOptions = cast<MDNode>(Val);
+ }
+ }
+
+ // Emit the linker options if present.
+ if (LinkerOptions) {
+ for (const auto &Option : LinkerOptions->operands()) {
+ SmallVector<std::string, 4> StrOptions;
+ for (const auto &Piece : cast<MDNode>(Option)->operands())
+ StrOptions.push_back(cast<MDString>(Piece)->getString());
+ Streamer.EmitLinkerOptions(StrOptions);
+ }
+ }
+
+ // The section is mandatory. If we don't have it, then we don't have GC info.
+ if (SectionVal.empty()) return;
+
+ StringRef Segment, Section;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+ std::string ErrorCode =
+ MCSectionMachO::ParseSectionSpecifier(SectionVal, Segment, Section,
+ TAA, TAAParsed, StubSize);
+ if (!ErrorCode.empty())
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Invalid section specifier '" + Section + "': " +
+ ErrorCode + ".");
+
+ // Get the section.
+ MCSectionMachO *S = getContext().getMachOSection(
+ Segment, Section, TAA, StubSize, SectionKind::getData());
+ Streamer.SwitchSection(S);
+ Streamer.EmitLabel(getContext().
+ getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO")));
+ Streamer.EmitIntValue(VersionVal, 4);
+ Streamer.EmitIntValue(ImageInfoFlags, 4);
+ Streamer.AddBlankLine();
+}
+
+static void checkMachOComdat(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ if (!C)
+ return;
+
+ report_fatal_error("MachO doesn't support COMDATs, '" + C->getName() +
+ "' cannot be lowered.");
+}
+
+MCSection *TargetLoweringObjectFileMachO::getExplicitSectionGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ // Parse the section specifier and create it if valid.
+ StringRef Segment, Section;
+ unsigned TAA = 0, StubSize = 0;
+ bool TAAParsed;
+
+ checkMachOComdat(GV);
+
+ std::string ErrorCode =
+ MCSectionMachO::ParseSectionSpecifier(GV->getSection(), Segment, Section,
+ TAA, TAAParsed, StubSize);
+ if (!ErrorCode.empty()) {
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Global variable '" + GV->getName() +
+ "' has an invalid section specifier '" +
+ GV->getSection() + "': " + ErrorCode + ".");
+ }
+
+ // Get the section.
+ MCSectionMachO *S =
+ getContext().getMachOSection(Segment, Section, TAA, StubSize, Kind);
+
+ // If TAA wasn't set by ParseSectionSpecifier() above,
+ // use the value returned by getMachOSection() as a default.
+ if (!TAAParsed)
+ TAA = S->getTypeAndAttributes();
+
+ // Okay, now that we got the section, verify that the TAA & StubSize agree.
+ // If the user declared multiple globals with different section flags, we need
+ // to reject it here.
+ if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) {
+ // If invalid, report the error with report_fatal_error.
+ report_fatal_error("Global variable '" + GV->getName() +
+ "' section type or attributes does not match previous"
+ " section specifier");
+ }
+
+ return S;
+}
+
+MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ checkMachOComdat(GV);
+
+ // Handle thread local data.
+ if (Kind.isThreadBSS()) return TLSBSSSection;
+ if (Kind.isThreadData()) return TLSDataSection;
+
+ if (Kind.isText())
+ return GV->isWeakForLinker() ? TextCoalSection : TextSection;
+
+ // If this is weak/linkonce, put this in a coalescable section, either in text
+ // or data depending on if it is writable.
+ if (GV->isWeakForLinker()) {
+ if (Kind.isReadOnly())
+ return ConstTextCoalSection;
+ return DataCoalSection;
+ }
+
+ // FIXME: Alignment check should be handled by section classifier.
+ if (Kind.isMergeable1ByteCString() &&
+ GV->getParent()->getDataLayout().getPreferredAlignment(
+ cast<GlobalVariable>(GV)) < 32)
+ return CStringSection;
+
+ // Do not put 16-bit arrays in the UString section if they have an
+ // externally visible label; this runs into issues with certain linker
+ // versions.
+ if (Kind.isMergeable2ByteCString() && !GV->hasExternalLinkage() &&
+ GV->getParent()->getDataLayout().getPreferredAlignment(
+ cast<GlobalVariable>(GV)) < 32)
+ return UStringSection;
+
+ // With MachO, only variables whose corresponding symbol starts with 'l' or
+ // 'L' can be merged, so we only try merging GVs with private linkage.
+ if (GV->hasPrivateLinkage() && Kind.isMergeableConst()) {
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16())
+ return SixteenByteConstantSection;
+ }
+
+ // Otherwise, if it is readonly, but not something we can specially optimize,
+ // just drop it in .const.
+ if (Kind.isReadOnly())
+ return ReadOnlySection;
+
+ // If this is marked const, put it into a const section. But if the dynamic
+ // linker needs to write to it, put it in the data segment.
+ if (Kind.isReadOnlyWithRel())
+ return ConstDataSection;
+
+ // Put zero initialized globals with strong external linkage in the
+ // __DATA,__common section with the .zerofill directive.
+ if (Kind.isBSSExtern())
+ return DataCommonSection;
+
+ // Put zero initialized globals with local linkage in the __DATA,__bss
+ // section with the .zerofill directive (aka .lcomm).
+ if (Kind.isBSSLocal())
+ return DataBSSSection;
+
+ // Otherwise, just drop the variable in the normal data section.
+ return DataSection;
+}
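+
+// A rough sketch of the classification above (the section names are the ones
+// Darwin targets usually configure elsewhere; treat them as assumptions):
+//
+//   weak_odr function                    -> __TEXT,__textcoal_nt
+//   private 4-byte mergeable constant    -> __TEXT,__literal4
+//   zero-initialized internal global     -> __DATA,__bss   (.zerofill)
+//   initialized external global          -> __DATA,__data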
+
+MCSection *TargetLoweringObjectFileMachO::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ unsigned &Align) const {
+ // If this constant requires a relocation, we have to put it in the data
+ // segment, not in the text segment.
+ if (Kind.isData() || Kind.isReadOnlyWithRel())
+ return ConstDataSection;
+
+ if (Kind.isMergeableConst4())
+ return FourByteConstantSection;
+ if (Kind.isMergeableConst8())
+ return EightByteConstantSection;
+ if (Kind.isMergeableConst16())
+ return SixteenByteConstantSection;
+ return ReadOnlySection; // .const
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
+ const GlobalValue *GV, unsigned Encoding, Mangler &Mang,
+ const TargetMachine &TM, MachineModuleInfo *MMI,
+ MCStreamer &Streamer) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+
+ if (Encoding & DW_EH_PE_indirect) {
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ MCSymbol *SSym =
+ getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr", Mang, TM);
+
+ // Add information about the stub reference to MachOMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return TargetLoweringObjectFile::
+ getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
+ Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+ }
+
+ return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, Mang,
+ TM, MMI, Streamer);
+}
+
+MCSymbol *TargetLoweringObjectFileMachO::getCFIPersonalitySymbol(
+ const GlobalValue *GV, Mangler &Mang, const TargetMachine &TM,
+ MachineModuleInfo *MMI) const {
+ // The mach-o version of this method defaults to returning a stub reference.
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+
+ MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr", Mang, TM);
+
+ // Add information about the stub reference to MachOMMI so that the stub
+ // gets emitted by the asmprinter.
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(SSym);
+ if (!StubSym.getPointer()) {
+ MCSymbol *Sym = TM.getSymbol(GV, Mang);
+ StubSym = MachineModuleInfoImpl::StubValueTy(Sym, !GV->hasLocalLinkage());
+ }
+
+ return SSym;
+}
+
+const MCExpr *TargetLoweringObjectFileMachO::getIndirectSymViaGOTPCRel(
+ const MCSymbol *Sym, const MCValue &MV, int64_t Offset,
+ MachineModuleInfo *MMI, MCStreamer &Streamer) const {
+ // Although MachO 32-bit targets do not explicitly have a GOTPCREL relocation
+ // as 64-bit targets do, we replace the GOT equivalent by accessing the final
+ // symbol through a non_lazy_ptr stub instead. One advantage is that it allows
+ // the computation of deltas to final external symbols. Example:
+ //
+ // _extgotequiv:
+ // .long _extfoo
+ //
+ // _delta:
+ // .long _extgotequiv-_delta
+ //
+ // is transformed to:
+ //
+ // _delta:
+ // .long L_extfoo$non_lazy_ptr-(_delta+0)
+ //
+ // .section __IMPORT,__pointers,non_lazy_symbol_pointers
+ // L_extfoo$non_lazy_ptr:
+ // .indirect_symbol _extfoo
+ // .long 0
+ //
+ MachineModuleInfoMachO &MachOMMI =
+ MMI->getObjFileInfo<MachineModuleInfoMachO>();
+ MCContext &Ctx = getContext();
+
+ // The offset must consider the original displacement from the base symbol
+ // since 32-bit targets don't have a GOTPCREL to fold the PC displacement.
+ Offset = -MV.getConstant();
+ const MCSymbol *BaseSym = &MV.getSymB()->getSymbol();
+
+ // Access the final symbol via sym$non_lazy_ptr and generate the appropriate
+ // non_lazy_ptr stubs.
+ SmallString<128> Name;
+ StringRef Suffix = "$non_lazy_ptr";
+ Name += MMI->getModule()->getDataLayout().getPrivateGlobalPrefix();
+ Name += Sym->getName();
+ Name += Suffix;
+ MCSymbol *Stub = Ctx.getOrCreateSymbol(Name);
+
+ MachineModuleInfoImpl::StubValueTy &StubSym = MachOMMI.getGVStubEntry(Stub);
+ if (!StubSym.getPointer())
+ StubSym = MachineModuleInfoImpl::
+ StubValueTy(const_cast<MCSymbol *>(Sym), true /* access indirectly */);
+
+ const MCExpr *BSymExpr =
+ MCSymbolRefExpr::create(BaseSym, MCSymbolRefExpr::VK_None, Ctx);
+ const MCExpr *LHS =
+ MCSymbolRefExpr::create(Stub, MCSymbolRefExpr::VK_None, Ctx);
+
+ if (!Offset)
+ return MCBinaryExpr::createSub(LHS, BSymExpr, Ctx);
+
+ const MCExpr *RHS =
+ MCBinaryExpr::createAdd(BSymExpr, MCConstantExpr::create(Offset, Ctx), Ctx);
+ return MCBinaryExpr::createSub(LHS, RHS, Ctx);
+}
+
+static bool canUsePrivateLabel(const MCAsmInfo &AsmInfo,
+ const MCSection &Section) {
+ if (!AsmInfo.isSectionAtomizableBySymbols(Section))
+ return true;
+
+ // If it is not dead stripped, it is safe to use private labels.
+ const MCSectionMachO &SMO = cast<MCSectionMachO>(Section);
+ if (SMO.hasAttribute(MachO::S_ATTR_NO_DEAD_STRIP))
+ return true;
+
+ return false;
+}
+
+void TargetLoweringObjectFileMachO::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM) const {
+ SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
+ const MCSection *TheSection = SectionForGlobal(GV, GVKind, Mang, TM);
+ bool CannotUsePrivateLabel =
+ !canUsePrivateLabel(*TM.getMCAsmInfo(), *TheSection);
+ Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel);
+}
+
+//===----------------------------------------------------------------------===//
+// COFF
+//===----------------------------------------------------------------------===//
+
+static unsigned
+getCOFFSectionFlags(SectionKind K, const TargetMachine &TM) {
+ unsigned Flags = 0;
+ bool isThumb = TM.getTargetTriple().getArch() == Triple::thumb;
+
+ if (K.isMetadata())
+ Flags |=
+ COFF::IMAGE_SCN_MEM_DISCARDABLE;
+ else if (K.isText())
+ Flags |=
+ COFF::IMAGE_SCN_MEM_EXECUTE |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_CNT_CODE |
+ (isThumb ? COFF::IMAGE_SCN_MEM_16BIT : (COFF::SectionCharacteristics)0);
+ else if (K.isBSS())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+ else if (K.isThreadLocal())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+ else if (K.isReadOnly() || K.isReadOnlyWithRel())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ;
+ else if (K.isWriteable())
+ Flags |=
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE;
+
+ return Flags;
+}
+
+static const GlobalValue *getComdatGVForCOFF(const GlobalValue *GV) {
+ const Comdat *C = GV->getComdat();
+ assert(C && "expected GV to have a Comdat!");
+
+ StringRef ComdatGVName = C->getName();
+ const GlobalValue *ComdatGV = GV->getParent()->getNamedValue(ComdatGVName);
+ if (!ComdatGV)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' does not exist.");
+
+ if (ComdatGV->getComdat() != C)
+ report_fatal_error("Associative COMDAT symbol '" + ComdatGVName +
+ "' is not a key for its COMDAT.");
+
+ return ComdatGV;
+}
+
+static int getSelectionForCOFF(const GlobalValue *GV) {
+ if (const Comdat *C = GV->getComdat()) {
+ const GlobalValue *ComdatKey = getComdatGVForCOFF(GV);
+ if (const auto *GA = dyn_cast<GlobalAlias>(ComdatKey))
+ ComdatKey = GA->getBaseObject();
+ if (ComdatKey == GV) {
+ switch (C->getSelectionKind()) {
+ case Comdat::Any:
+ return COFF::IMAGE_COMDAT_SELECT_ANY;
+ case Comdat::ExactMatch:
+ return COFF::IMAGE_COMDAT_SELECT_EXACT_MATCH;
+ case Comdat::Largest:
+ return COFF::IMAGE_COMDAT_SELECT_LARGEST;
+ case Comdat::NoDuplicates:
+ return COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ case Comdat::SameSize:
+ return COFF::IMAGE_COMDAT_SELECT_SAME_SIZE;
+ }
+ } else {
+ return COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+ }
+ }
+ return 0;
+}
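+
+// Worked example (a sketch, not part of the upstream sources): for IR such as
+//
+//   $foo = comdat any
+//   @foo = global i32 0, comdat
+//
+// @foo is its own COMDAT key, so getSelectionForCOFF returns
+// IMAGE_COMDAT_SELECT_ANY; a global in the same comdat whose key is a
+// different value gets IMAGE_COMDAT_SELECT_ASSOCIATIVE instead.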
+
+MCSection *TargetLoweringObjectFileCOFF::getExplicitSectionGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ int Selection = 0;
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+ StringRef Name = GV->getSection();
+ StringRef COMDATSymName = "";
+ if (GV->hasComdat()) {
+ Selection = getSelectionForCOFF(GV);
+ const GlobalValue *ComdatGV;
+ if (Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE)
+ ComdatGV = getComdatGVForCOFF(GV);
+ else
+ ComdatGV = GV;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
+ COMDATSymName = Sym->getName();
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ } else {
+ Selection = 0;
+ }
+ }
+
+ return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
+ Selection);
+}
+
+static const char *getCOFFSectionNameForUniqueGlobal(SectionKind Kind) {
+ if (Kind.isText())
+ return ".text";
+ if (Kind.isBSS())
+ return ".bss";
+ if (Kind.isThreadLocal())
+ return ".tls$";
+ if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
+ return ".rdata";
+ return ".data";
+}
+
+MCSection *TargetLoweringObjectFileCOFF::SelectSectionForGlobal(
+ const GlobalValue *GV, SectionKind Kind, Mangler &Mang,
+ const TargetMachine &TM) const {
+ // If we have -ffunction-sections then we should emit the global value to a
+ // uniqued section specifically for it.
+ bool EmitUniquedSection;
+ if (Kind.isText())
+ EmitUniquedSection = TM.getFunctionSections();
+ else
+ EmitUniquedSection = TM.getDataSections();
+
+ if ((EmitUniquedSection && !Kind.isCommon()) || GV->hasComdat()) {
+ const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ int Selection = getSelectionForCOFF(GV);
+ if (!Selection)
+ Selection = COFF::IMAGE_COMDAT_SELECT_NODUPLICATES;
+ const GlobalValue *ComdatGV;
+ if (GV->hasComdat())
+ ComdatGV = getComdatGVForCOFF(GV);
+ else
+ ComdatGV = GV;
+
+ unsigned UniqueID = MCContext::GenericSectionID;
+ if (EmitUniquedSection)
+ UniqueID = NextUniqueID++;
+
+ if (!ComdatGV->hasPrivateLinkage()) {
+ MCSymbol *Sym = TM.getSymbol(ComdatGV, Mang);
+ StringRef COMDATSymName = Sym->getName();
+ return getContext().getCOFFSection(Name, Characteristics, Kind,
+ COMDATSymName, Selection, UniqueID);
+ } else {
+ SmallString<256> TmpData;
+ Mang.getNameWithPrefix(TmpData, GV, /*CannotUsePrivateLabel=*/true);
+ return getContext().getCOFFSection(Name, Characteristics, Kind, TmpData,
+ Selection, UniqueID);
+ }
+ }
+
+ if (Kind.isText())
+ return TextSection;
+
+ if (Kind.isThreadLocal())
+ return TLSDataSection;
+
+ if (Kind.isReadOnly() || Kind.isReadOnlyWithRel())
+ return ReadOnlySection;
+
+ // Note: we claim that common symbols are put in BSSSection, but they are
+ // really emitted with the magic .comm directive, which creates a symbol table
+ // entry but not a section.
+ if (Kind.isBSS() || Kind.isCommon())
+ return BSSSection;
+
+ return DataSection;
+}
+
+void TargetLoweringObjectFileCOFF::getNameWithPrefix(
+ SmallVectorImpl<char> &OutName, const GlobalValue *GV, Mangler &Mang,
+ const TargetMachine &TM) const {
+ bool CannotUsePrivateLabel = false;
+ if (GV->hasPrivateLinkage() &&
+ ((isa<Function>(GV) && TM.getFunctionSections()) ||
+ (isa<GlobalVariable>(GV) && TM.getDataSections())))
+ CannotUsePrivateLabel = true;
+
+ Mang.getNameWithPrefix(OutName, GV, CannotUsePrivateLabel);
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getSectionForJumpTable(
+ const Function &F, Mangler &Mang, const TargetMachine &TM) const {
+ // If the function can be removed, produce a unique section so that
+ // the table doesn't prevent the removal.
+ const Comdat *C = F.getComdat();
+ bool EmitUniqueSection = TM.getFunctionSections() || C;
+ if (!EmitUniqueSection)
+ return ReadOnlySection;
+
+ // FIXME: we should produce a symbol for F instead.
+ if (F.hasPrivateLinkage())
+ return ReadOnlySection;
+
+ MCSymbol *Sym = TM.getSymbol(&F, Mang);
+ StringRef COMDATSymName = Sym->getName();
+
+ SectionKind Kind = SectionKind::getReadOnly();
+ const char *Name = getCOFFSectionNameForUniqueGlobal(Kind);
+ unsigned Characteristics = getCOFFSectionFlags(Kind, TM);
+ Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
+ unsigned UniqueID = NextUniqueID++;
+
+ return getContext().getCOFFSection(Name, Characteristics, Kind, COMDATSymName,
+ COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
+}
+
+void TargetLoweringObjectFileCOFF::
+emitModuleFlags(MCStreamer &Streamer,
+ ArrayRef<Module::ModuleFlagEntry> ModuleFlags,
+ Mangler &Mang, const TargetMachine &TM) const {
+ MDNode *LinkerOptions = nullptr;
+
+ for (const auto &MFE : ModuleFlags) {
+ StringRef Key = MFE.Key->getString();
+ if (Key == "Linker Options")
+ LinkerOptions = cast<MDNode>(MFE.Val);
+ }
+
+ if (LinkerOptions) {
+ // Emit the linker options to the linker .drectve section. According to the
+ // spec, this section is a space-separated string containing flags for the
+ // linker.
+ MCSection *Sec = getDrectveSection();
+ Streamer.SwitchSection(Sec);
+ for (const auto &Option : LinkerOptions->operands()) {
+ for (const auto &Piece : cast<MDNode>(Option)->operands()) {
+ // Lead with a space for consistency with our dllexport implementation.
+ std::string Directive(" ");
+ Directive.append(cast<MDString>(Piece)->getString());
+ Streamer.EmitBytes(Directive);
+ }
+ }
+ }
+}
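+
+// Sketch of the expected input/output (module-flag encoding assumed to be the
+// usual clang one): a module carrying
+//
+//   !llvm.module.flags = !{!0}
+//   !0 = !{i32 6, !"Linker Options", !1}
+//   !1 = !{!2}
+//   !2 = !{!"/DEFAULTLIB:msvcrt.lib"}
+//
+// makes the loop above switch to .drectve and emit the bytes
+// " /DEFAULTLIB:msvcrt.lib", which the linker later parses as a flag.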
+
+MCSection *TargetLoweringObjectFileCOFF::getStaticCtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getContext().getAssociativeCOFFSection(
+ cast<MCSectionCOFF>(StaticCtorSection), KeySym, 0);
+}
+
+MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
+ unsigned Priority, const MCSymbol *KeySym) const {
+ return getContext().getAssociativeCOFFSection(
+ cast<MCSectionCOFF>(StaticDtorSection), KeySym, 0);
+}
+
+void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
+ raw_ostream &OS, const GlobalValue *GV, const Mangler &Mang) const {
+ if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
+ return;
+
+ const Triple &TT = getTargetTriple();
+
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << " /EXPORT:";
+ else
+ OS << " -export:";
+
+ if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
+ std::string Flag;
+ raw_string_ostream FlagOS(Flag);
+ Mang.getNameWithPrefix(FlagOS, GV, false);
+ FlagOS.flush();
+ if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
+ OS << Flag.substr(1);
+ else
+ OS << Flag;
+ } else {
+ Mang.getNameWithPrefix(OS, GV, false);
+ }
+
+ if (!GV->getValueType()->isFunctionTy()) {
+ if (TT.isKnownWindowsMSVCEnvironment())
+ OS << ",DATA";
+ else
+ OS << ",data";
+ }
+}
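+
+// Worked example (illustrative; the exact mangling depends on the target): a
+// global such as
+//
+//   @exported_var = dllexport global i32 0
+//
+// produces roughly " /EXPORT:exported_var,DATA" in an MSVC environment and
+// " -export:exported_var,data" under MinGW/Cygwin; functions omit the DATA
+// suffix.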
diff --git a/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
new file mode 100644
index 000000000000..8d2048fa047f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -0,0 +1,49 @@
+//===-- TargetOptionsImpl.cpp - Options that apply to all targets ----------==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the methods in the TargetOptions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetOptions.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+using namespace llvm;
+
+/// DisableFramePointerElim - This returns true if frame pointer elimination
+/// optimization should be disabled for the given machine function.
+bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
+ // Check to see if we should eliminate all frame pointers.
+ if (MF.getSubtarget().getFrameLowering()->noFramePointerElim(MF))
+ return true;
+
+ // Check to see if we should eliminate non-leaf frame pointers.
+ if (MF.getFunction()->hasFnAttribute("no-frame-pointer-elim-non-leaf"))
+ return MF.getFrameInfo()->hasCalls();
+
+ return false;
+}
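+
+// Example IR that exercises the non-leaf path above (a sketch; the attribute
+// spelling is the string checked in the code):
+//
+//   declare void @g()
+//   define void @f() #0 {
+//     call void @g()
+//     ret void
+//   }
+//   attributes #0 = { "no-frame-pointer-elim-non-leaf" }
+//
+// Because @f contains a call, hasCalls() is true and the frame pointer is
+// kept for it.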
+
+/// LessPreciseFPMAD - This flag returns true when the -enable-fp-mad option
+/// is specified on the command line. When this flag is off (default), the
+/// code generator is not allowed to generate mad (multiply add) if the
+/// result is "less precise" than doing those operations individually.
+bool TargetOptions::LessPreciseFPMAD() const {
+ return UnsafeFPMath || LessPreciseFPMADOption;
+}
+
+/// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
+/// that the rounding mode of the FPU can change from its default.
+bool TargetOptions::HonorSignDependentRoundingFPMath() const {
+ return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
new file mode 100644
index 000000000000..b8c820942cb5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -0,0 +1,888 @@
+//===-- TargetPassConfig.cpp - Target independent code generation passes --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines interfaces to access the target independent code
+// generation passes provided by the LLVM backend.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetPassConfig.h"
+
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
+#include "llvm/Analysis/CFLSteensAliasAnalysis.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/RegisterUsageInfo.h"
+#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/SymbolRewriter.h"
+
+using namespace llvm;
+
+static cl::opt<bool> DisablePostRA("disable-post-ra", cl::Hidden,
+ cl::desc("Disable Post Regalloc"));
+static cl::opt<bool> DisableBranchFold("disable-branch-fold", cl::Hidden,
+ cl::desc("Disable branch folding"));
+static cl::opt<bool> DisableTailDuplicate("disable-tail-duplicate", cl::Hidden,
+ cl::desc("Disable tail duplication"));
+static cl::opt<bool> DisableEarlyTailDup("disable-early-taildup", cl::Hidden,
+ cl::desc("Disable pre-register allocation tail duplication"));
+static cl::opt<bool> DisableBlockPlacement("disable-block-placement",
+ cl::Hidden, cl::desc("Disable probability-driven block placement"));
+static cl::opt<bool> EnableBlockPlacementStats("enable-block-placement-stats",
+ cl::Hidden, cl::desc("Collect probability-driven block placement stats"));
+static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden,
+ cl::desc("Disable Stack Slot Coloring"));
+static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden,
+ cl::desc("Disable Machine Dead Code Elimination"));
+static cl::opt<bool> DisableEarlyIfConversion("disable-early-ifcvt", cl::Hidden,
+ cl::desc("Disable Early If-conversion"));
+static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden,
+ cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden,
+ cl::desc("Disable Machine Common Subexpression Elimination"));
+static cl::opt<cl::boolOrDefault> OptimizeRegAlloc(
+ "optimize-regalloc", cl::Hidden,
+ cl::desc("Enable optimized register allocation compilation path."));
+static cl::opt<bool> DisablePostRAMachineLICM("disable-postra-machine-licm",
+ cl::Hidden,
+ cl::desc("Disable Machine LICM"));
+static cl::opt<bool> DisableMachineSink("disable-machine-sink", cl::Hidden,
+ cl::desc("Disable Machine Sinking"));
+static cl::opt<bool> DisableLSR("disable-lsr", cl::Hidden,
+ cl::desc("Disable Loop Strength Reduction Pass"));
+static cl::opt<bool> DisableConstantHoisting("disable-constant-hoisting",
+ cl::Hidden, cl::desc("Disable ConstantHoisting"));
+static cl::opt<bool> DisableCGP("disable-cgp", cl::Hidden,
+ cl::desc("Disable Codegen Prepare"));
+static cl::opt<bool> DisableCopyProp("disable-copyprop", cl::Hidden,
+ cl::desc("Disable Copy Propagation pass"));
+static cl::opt<bool> DisablePartialLibcallInlining("disable-partial-libcall-inlining",
+ cl::Hidden, cl::desc("Disable Partial Libcall Inlining"));
+static cl::opt<bool> EnableImplicitNullChecks(
+ "enable-implicit-null-checks",
+ cl::desc("Fold null checks into faulting memory operations"),
+ cl::init(false));
+static cl::opt<bool> PrintLSR("print-lsr-output", cl::Hidden,
+ cl::desc("Print LLVM IR produced by the loop-reduce pass"));
+static cl::opt<bool> PrintISelInput("print-isel-input", cl::Hidden,
+ cl::desc("Print LLVM IR input to isel pass"));
+static cl::opt<bool> PrintGCInfo("print-gc", cl::Hidden,
+ cl::desc("Dump garbage collector data"));
+static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
+ cl::desc("Verify generated machine code"),
+ cl::init(false),
+ cl::ZeroOrMore);
+
+static cl::opt<std::string>
+PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
+ cl::desc("Print machine instrs"),
+ cl::value_desc("pass-name"), cl::init("option-unspecified"));
+
+// Temporary option to allow experimenting with MachineScheduler as a post-RA
+// scheduler. Targets can "properly" enable this with
+// substitutePass(&PostRASchedulerID, &PostMachineSchedulerID).
+// Targets can return true in targetSchedulesPostRAScheduling() and
+// insert a PostRA scheduling pass wherever they want.
+cl::opt<bool> MISchedPostRA("misched-postra", cl::Hidden,
+ cl::desc("Run MachineScheduler post regalloc (independent of preRA sched)"));
+
+// Experimental option to run live interval analysis early.
+static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden,
+ cl::desc("Run live interval analysis earlier in the pipeline"));
+
+// Experimental option to use CFL-AA in codegen
+enum class CFLAAType { None, Steensgaard, Andersen, Both };
+static cl::opt<CFLAAType> UseCFLAA(
+ "use-cfl-aa-in-codegen", cl::init(CFLAAType::None), cl::Hidden,
+ cl::desc("Enable the new, experimental CFL alias analysis in CodeGen"),
+ cl::values(clEnumValN(CFLAAType::None, "none", "Disable CFL-AA"),
+ clEnumValN(CFLAAType::Steensgaard, "steens",
+ "Enable unification-based CFL-AA"),
+ clEnumValN(CFLAAType::Andersen, "anders",
+ "Enable inclusion-based CFL-AA"),
+ clEnumValN(CFLAAType::Both, "both",
+ "Enable both variants of CFL-AA"),
+ clEnumValEnd));
+
+/// Allow standard passes to be disabled by command line options. This supports
+/// simple binary flags that either suppress the pass or do nothing.
+/// i.e. -disable-mypass=false has no effect.
+/// These should be converted to boolOrDefault in order to use applyOverride.
+static IdentifyingPassPtr applyDisable(IdentifyingPassPtr PassID,
+ bool Override) {
+ if (Override)
+ return IdentifyingPassPtr();
+ return PassID;
+}
+
+/// Allow standard passes to be disabled by the command line, regardless of who
+/// is adding the pass.
+///
+/// StandardID is the pass identified in the standard pass pipeline and provided
+/// to addPass(). It may be a target-specific ID in the case that the target
+/// directly adds its own pass, but in that case we harmlessly fall through.
+///
+/// TargetID is the pass that the target has configured to override StandardID.
+///
+/// StandardID may be a pseudo ID. In that case TargetID is the name of the real
+/// pass to run. This allows multiple options to control a single pass depending
+/// on where in the pipeline that pass is added.
+static IdentifyingPassPtr overridePass(AnalysisID StandardID,
+ IdentifyingPassPtr TargetID) {
+ if (StandardID == &PostRASchedulerID)
+ return applyDisable(TargetID, DisablePostRA);
+
+ if (StandardID == &BranchFolderPassID)
+ return applyDisable(TargetID, DisableBranchFold);
+
+ if (StandardID == &TailDuplicateID)
+ return applyDisable(TargetID, DisableTailDuplicate);
+
+ if (StandardID == &TargetPassConfig::EarlyTailDuplicateID)
+ return applyDisable(TargetID, DisableEarlyTailDup);
+
+ if (StandardID == &MachineBlockPlacementID)
+ return applyDisable(TargetID, DisableBlockPlacement);
+
+ if (StandardID == &StackSlotColoringID)
+ return applyDisable(TargetID, DisableSSC);
+
+ if (StandardID == &DeadMachineInstructionElimID)
+ return applyDisable(TargetID, DisableMachineDCE);
+
+ if (StandardID == &EarlyIfConverterID)
+ return applyDisable(TargetID, DisableEarlyIfConversion);
+
+ if (StandardID == &MachineLICMID)
+ return applyDisable(TargetID, DisableMachineLICM);
+
+ if (StandardID == &MachineCSEID)
+ return applyDisable(TargetID, DisableMachineCSE);
+
+ if (StandardID == &TargetPassConfig::PostRAMachineLICMID)
+ return applyDisable(TargetID, DisablePostRAMachineLICM);
+
+ if (StandardID == &MachineSinkingID)
+ return applyDisable(TargetID, DisableMachineSink);
+
+ if (StandardID == &MachineCopyPropagationID)
+ return applyDisable(TargetID, DisableCopyProp);
+
+ return TargetID;
+}
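+
+// A minimal sketch of how a target plugs into this (the class name is made
+// up; the substitution itself is the one suggested in the MISchedPostRA
+// comment above):
+//
+//   class MyTargetPassConfig : public TargetPassConfig {
+//   public:
+//     MyTargetPassConfig(TargetMachine *TM, PassManagerBase &PM)
+//         : TargetPassConfig(TM, PM) {
+//       // Use the post-RA MachineScheduler instead of the old post-RA list
+//       // scheduler; -disable-post-ra still suppresses it via overridePass().
+//       substitutePass(&PostRASchedulerID, &PostMachineSchedulerID);
+//     }
+//   };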
+
+//===---------------------------------------------------------------------===//
+/// TargetPassConfig
+//===---------------------------------------------------------------------===//
+
+INITIALIZE_PASS(TargetPassConfig, "targetpassconfig",
+ "Target Pass Configuration", false, false)
+char TargetPassConfig::ID = 0;
+
+// Pseudo Pass IDs.
+char TargetPassConfig::EarlyTailDuplicateID = 0;
+char TargetPassConfig::PostRAMachineLICMID = 0;
+
+namespace {
+struct InsertedPass {
+ AnalysisID TargetPassID;
+ IdentifyingPassPtr InsertedPassID;
+ bool VerifyAfter;
+ bool PrintAfter;
+
+ InsertedPass(AnalysisID TargetPassID, IdentifyingPassPtr InsertedPassID,
+ bool VerifyAfter, bool PrintAfter)
+ : TargetPassID(TargetPassID), InsertedPassID(InsertedPassID),
+ VerifyAfter(VerifyAfter), PrintAfter(PrintAfter) {}
+
+ Pass *getInsertedPass() const {
+ assert(InsertedPassID.isValid() && "Illegal Pass ID!");
+ if (InsertedPassID.isInstance())
+ return InsertedPassID.getInstance();
+ Pass *NP = Pass::createPass(InsertedPassID.getID());
+ assert(NP && "Pass ID not registered");
+ return NP;
+ }
+};
+}
+
+namespace llvm {
+class PassConfigImpl {
+public:
+ // List of passes explicitly substituted by this target. Normally this is
+ // empty, but it is a convenient way to suppress or replace specific passes
+ // that are part of a standard pass pipeline without overriding the entire
+ // pipeline. This mechanism allows target options to inherit a standard pass's
+ // user interface. For example, a target may disable a standard pass by
+ // default by substituting a pass ID of zero, and the user may still enable
+ // that standard pass with an explicit command line option.
+ DenseMap<AnalysisID,IdentifyingPassPtr> TargetPasses;
+
+ /// Store pairs of <TargetPassID, InsertedPassID>; the second pass of each
+ /// pair is inserted after every instance of the first one.
+ SmallVector<InsertedPass, 4> InsertedPasses;
+};
+} // namespace llvm
+
+// Out of line virtual method.
+TargetPassConfig::~TargetPassConfig() {
+ delete Impl;
+}
+
+// Out of line constructor provides default values for pass options and
+// registers all common codegen passes.
+TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
+ : ImmutablePass(ID), PM(&pm), StartBefore(nullptr), StartAfter(nullptr),
+ StopAfter(nullptr), Started(true), Stopped(false),
+ AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
+ DisableVerify(false), EnableTailMerge(true) {
+
+ Impl = new PassConfigImpl();
+
+ // Register all target independent codegen passes to activate their PassIDs,
+ // including this pass itself.
+ initializeCodeGen(*PassRegistry::getPassRegistry());
+
+ // Also register alias analysis passes required by codegen passes.
+ initializeBasicAAWrapperPassPass(*PassRegistry::getPassRegistry());
+ initializeAAResultsWrapperPassPass(*PassRegistry::getPassRegistry());
+
+ // Substitute Pseudo Pass IDs for real ones.
+ substitutePass(&EarlyTailDuplicateID, &TailDuplicateID);
+ substitutePass(&PostRAMachineLICMID, &MachineLICMID);
+
+ if (StringRef(PrintMachineInstrs.getValue()).equals(""))
+ TM->Options.PrintMachineCode = true;
+}
+
+CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
+ return TM->getOptLevel();
+}
+
+/// Insert InsertedPassID pass after TargetPassID.
+void TargetPassConfig::insertPass(AnalysisID TargetPassID,
+ IdentifyingPassPtr InsertedPassID,
+ bool VerifyAfter, bool PrintAfter) {
+ assert(((!InsertedPassID.isInstance() &&
+ TargetPassID != InsertedPassID.getID()) ||
+ (InsertedPassID.isInstance() &&
+ TargetPassID != InsertedPassID.getInstance()->getPassID())) &&
+ "Insert a pass after itself!");
+ Impl->InsertedPasses.emplace_back(TargetPassID, InsertedPassID, VerifyAfter,
+ PrintAfter);
+}
+
+/// createPassConfig - Create a pass configuration object to be used by
+/// addPassToEmitX methods for generating a pipeline of CodeGen passes.
+///
+/// Targets may override this to extend TargetPassConfig.
+TargetPassConfig *LLVMTargetMachine::createPassConfig(PassManagerBase &PM) {
+ return new TargetPassConfig(this, PM);
+}
+
+TargetPassConfig::TargetPassConfig()
+ : ImmutablePass(ID), PM(nullptr) {
+ llvm_unreachable("TargetPassConfig should not be constructed on-the-fly");
+}
+
+// Helper to verify the analysis is really immutable.
+void TargetPassConfig::setOpt(bool &Opt, bool Val) {
+ assert(!Initialized && "PassConfig is immutable");
+ Opt = Val;
+}
+
+void TargetPassConfig::substitutePass(AnalysisID StandardID,
+ IdentifyingPassPtr TargetID) {
+ Impl->TargetPasses[StandardID] = TargetID;
+}
+
+IdentifyingPassPtr TargetPassConfig::getPassSubstitution(AnalysisID ID) const {
+ DenseMap<AnalysisID, IdentifyingPassPtr>::const_iterator
+ I = Impl->TargetPasses.find(ID);
+ if (I == Impl->TargetPasses.end())
+ return ID;
+ return I->second;
+}
+
+bool TargetPassConfig::isPassSubstitutedOrOverridden(AnalysisID ID) const {
+ IdentifyingPassPtr TargetID = getPassSubstitution(ID);
+ IdentifyingPassPtr FinalPtr = overridePass(ID, TargetID);
+ return !FinalPtr.isValid() || FinalPtr.isInstance() ||
+ FinalPtr.getID() != ID;
+}
+
+/// Add a pass to the PassManager if that pass is supposed to be run. If the
+/// Started/Stopped flags indicate either that the compilation should start at
+/// a later pass or that it should stop after an earlier pass, then do not add
+/// the pass. Finally, compare the current pass against the StartAfter
+/// and StopAfter options and change the Started/Stopped flags accordingly.
+void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
+ assert(!Initialized && "PassConfig is immutable");
+
+ // Cache the Pass ID here in case the pass manager finds this pass is
+ // redundant with ones already scheduled / available, and deletes it.
+ // Fundamentally, once we add the pass to the manager, we no longer own it
+ // and shouldn't reference it.
+ AnalysisID PassID = P->getPassID();
+
+ if (StartBefore == PassID)
+ Started = true;
+ if (Started && !Stopped) {
+ std::string Banner;
+ // Construct banner message before PM->add() as that may delete the pass.
+ if (AddingMachinePasses && (printAfter || verifyAfter))
+ Banner = std::string("After ") + std::string(P->getPassName());
+ PM->add(P);
+ if (AddingMachinePasses) {
+ if (printAfter)
+ addPrintPass(Banner);
+ if (verifyAfter)
+ addVerifyPass(Banner);
+ }
+
+ // Add the passes after the pass P if there are any.
+ for (auto IP : Impl->InsertedPasses) {
+ if (IP.TargetPassID == PassID)
+ addPass(IP.getInsertedPass(), IP.VerifyAfter, IP.PrintAfter);
+ }
+ } else {
+ delete P;
+ }
+ if (StopAfter == PassID)
+ Stopped = true;
+ if (StartAfter == PassID)
+ Started = true;
+ if (Stopped && !Started)
+ report_fatal_error("Cannot stop compilation after pass that is not run");
+}
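+
+// The Started/Stopped bookkeeping above is what backs llc's pipeline slicing
+// options (spelling assumed to match the llc of this vintage), e.g.:
+//
+//   llc -stop-after=machine-scheduler foo.ll
+//
+// Passes outside the started/stopped window are deleted instead of being
+// added to the PassManager.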
+
+/// Add a CodeGen pass at this point in the pipeline after checking for target
+/// and command line overrides.
+///
+/// addPass cannot return a pointer to the pass instance because it is internal
+/// to the PassManager and the instance we create here may already be freed.
+AnalysisID TargetPassConfig::addPass(AnalysisID PassID, bool verifyAfter,
+ bool printAfter) {
+ IdentifyingPassPtr TargetID = getPassSubstitution(PassID);
+ IdentifyingPassPtr FinalPtr = overridePass(PassID, TargetID);
+ if (!FinalPtr.isValid())
+ return nullptr;
+
+ Pass *P;
+ if (FinalPtr.isInstance())
+ P = FinalPtr.getInstance();
+ else {
+ P = Pass::createPass(FinalPtr.getID());
+ if (!P)
+ llvm_unreachable("Pass ID not registered");
+ }
+ AnalysisID FinalID = P->getPassID();
+ addPass(P, verifyAfter, printAfter); // Ends the lifetime of P.
+
+ return FinalID;
+}
+
+void TargetPassConfig::printAndVerify(const std::string &Banner) {
+ addPrintPass(Banner);
+ addVerifyPass(Banner);
+}
+
+void TargetPassConfig::addPrintPass(const std::string &Banner) {
+ if (TM->shouldPrintMachineCode())
+ PM->add(createMachineFunctionPrinterPass(dbgs(), Banner));
+}
+
+void TargetPassConfig::addVerifyPass(const std::string &Banner) {
+ if (VerifyMachineCode)
+ PM->add(createMachineVerifierPass(Banner));
+}
+
+/// Add common target configurable passes that perform LLVM IR to IR transforms
+/// following machine independent optimization.
+void TargetPassConfig::addIRPasses() {
+ switch (UseCFLAA) {
+ case CFLAAType::Steensgaard:
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ case CFLAAType::Andersen:
+ addPass(createCFLAndersAAWrapperPass());
+ break;
+ case CFLAAType::Both:
+ addPass(createCFLAndersAAWrapperPass());
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
+ // Basic AliasAnalysis support.
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ addPass(createTypeBasedAAWrapperPass());
+ addPass(createScopedNoAliasAAWrapperPass());
+ addPass(createBasicAAWrapperPass());
+
+ // Before running any passes, run the verifier to determine if the input
+ // coming from the front-end and/or optimizer is valid.
+ if (!DisableVerify)
+ addPass(createVerifierPass());
+
+ // Run loop strength reduction before anything else.
+ if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
+ addPass(createLoopStrengthReducePass());
+ if (PrintLSR)
+ addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
+ }
+
+ // Run GC lowering passes for builtin collectors
+ // TODO: add a pass insertion point here
+ addPass(createGCLoweringPass());
+ addPass(createShadowStackGCLoweringPass());
+
+ // Make sure that no unreachable blocks are instruction selected.
+ addPass(createUnreachableBlockEliminationPass());
+
+ // Prepare expensive constants for SelectionDAG.
+ if (getOptLevel() != CodeGenOpt::None && !DisableConstantHoisting)
+ addPass(createConstantHoistingPass());
+
+ if (getOptLevel() != CodeGenOpt::None && !DisablePartialLibcallInlining)
+ addPass(createPartiallyInlineLibCallsPass());
+}
+
+/// Turn exception handling constructs into something the code generators can
+/// handle.
+void TargetPassConfig::addPassesToHandleExceptions() {
+ switch (TM->getMCAsmInfo()->getExceptionHandlingType()) {
+ case ExceptionHandling::SjLj:
+ // SjLj piggy-backs on dwarf for this bit. The cleanups done apply to both.
+ // Dwarf EH prepare needs to be run after SjLj prepare. Otherwise,
+ // catch info can get misplaced when a selector ends up more than one block
+ // removed from the parent invoke(s). This could happen when a landing
+ // pad is shared by multiple invokes and is also a target of a normal
+ // edge from elsewhere.
+ addPass(createSjLjEHPreparePass());
+ // FALLTHROUGH
+ case ExceptionHandling::DwarfCFI:
+ case ExceptionHandling::ARM:
+ addPass(createDwarfEHPass(TM));
+ break;
+ case ExceptionHandling::WinEH:
+ // We support using both GCC-style and MSVC-style exceptions on Windows, so
+ // add both preparation passes. Each pass will only actually run if it
+ // recognizes the personality function.
+ addPass(createWinEHPass(TM));
+ addPass(createDwarfEHPass(TM));
+ break;
+ case ExceptionHandling::None:
+ addPass(createLowerInvokePass());
+
+ // The lower invoke pass may create unreachable code. Remove it.
+ addPass(createUnreachableBlockEliminationPass());
+ break;
+ }
+}
+
+/// Add pass to prepare the LLVM IR for code generation. This should be done
+/// before exception handling preparation passes.
+void TargetPassConfig::addCodeGenPrepare() {
+ if (getOptLevel() != CodeGenOpt::None && !DisableCGP)
+ addPass(createCodeGenPreparePass(TM));
+ addPass(createRewriteSymbolsPass());
+}
+
+/// Add common passes that perform LLVM IR to IR transforms in preparation for
+/// instruction selection.
+void TargetPassConfig::addISelPrepare() {
+ addPreISel();
+
+ // Force codegen to run according to the callgraph.
+ if (TM->Options.EnableIPRA)
+ addPass(new DummyCGSCCPass);
+
+ // Add both the safe stack and the stack protection passes: each of them will
+ // only protect functions that have corresponding attributes.
+ addPass(createSafeStackPass(TM));
+ addPass(createStackProtectorPass(TM));
+
+ if (PrintISelInput)
+ addPass(createPrintFunctionPass(
+ dbgs(), "\n\n*** Final LLVM Code input to ISel ***\n"));
+
+ // All passes which modify the LLVM IR are now complete; run the verifier
+ // to ensure that the IR is valid.
+ if (!DisableVerify)
+ addPass(createVerifierPass());
+}
+
+/// Add the complete set of target-independent postISel code generator passes.
+///
+/// This can be read as the standard order of major LLVM CodeGen stages. Stages
+/// with nontrivial configuration or multiple passes are broken out below in
+/// add%Stage routines.
+///
+/// Any TargetPassConfig::addXX routine may be overridden by the Target. The
+/// addPre/Post methods with empty header implementations allow injecting
+/// target-specific fixups just before or after major stages. Additionally,
+/// targets have the flexibility to change pass order within a stage by
+/// overriding default implementation of add%Stage routines below. Each
+/// technique has maintainability tradeoffs because alternate pass orders are
+/// not well supported. addPre/Post works better if the target pass is easily
+/// tied to a common pass. But if it has subtle dependencies on multiple passes,
+/// the target should override the stage instead.
+///
+/// TODO: We could use a single addPre/Post(ID) hook to allow pass injection
+/// before/after any target-independent pass. But it's currently overkill.
+void TargetPassConfig::addMachinePasses() {
+ AddingMachinePasses = true;
+
+ if (TM->Options.EnableIPRA)
+ addPass(createRegUsageInfoPropPass());
+
+ // Insert a machine instr printer pass after the specified pass.
+ if (!StringRef(PrintMachineInstrs.getValue()).equals("") &&
+ !StringRef(PrintMachineInstrs.getValue()).equals("option-unspecified")) {
+ const PassRegistry *PR = PassRegistry::getPassRegistry();
+ const PassInfo *TPI = PR->getPassInfo(PrintMachineInstrs.getValue());
+ const PassInfo *IPI = PR->getPassInfo(StringRef("machineinstr-printer"));
+ assert (TPI && IPI && "Pass ID not registered!");
+ const char *TID = (const char *)(TPI->getTypeInfo());
+ const char *IID = (const char *)(IPI->getTypeInfo());
+ insertPass(TID, IID);
+ }
+
+ // Print the instruction selected machine code...
+ printAndVerify("After Instruction Selection");
+
+ // Expand pseudo-instructions emitted by ISel.
+ addPass(&ExpandISelPseudosID);
+
+ // Add passes that optimize machine instructions in SSA form.
+ if (getOptLevel() != CodeGenOpt::None) {
+ addMachineSSAOptimization();
+ } else {
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID, false);
+ }
+
+ // Run pre-ra passes.
+ addPreRegAlloc();
+
+ // Run register allocation and passes that are tightly coupled with it,
+ // including phi elimination and scheduling.
+ if (getOptimizeRegAlloc())
+ addOptimizedRegAlloc(createRegAllocPass(true));
+ else
+ addFastRegAlloc(createRegAllocPass(false));
+
+ // Run post-ra passes.
+ addPostRegAlloc();
+
+ // Insert prolog/epilog code. Eliminate abstract frame index references...
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(&ShrinkWrapID);
+
+ // Prolog/Epilog inserter needs a TargetMachine to instantiate. But only
+ // do so if it hasn't been disabled, substituted, or overridden.
+ if (!isPassSubstitutedOrOverridden(&PrologEpilogCodeInserterID))
+ addPass(createPrologEpilogInserterPass(TM));
+
+ /// Add passes that optimize machine instructions after register allocation.
+ if (getOptLevel() != CodeGenOpt::None)
+ addMachineLateOptimization();
+
+ // Expand pseudo instructions before second scheduling pass.
+ addPass(&ExpandPostRAPseudosID);
+
+ // Run pre-sched2 passes.
+ addPreSched2();
+
+ if (EnableImplicitNullChecks)
+ addPass(&ImplicitNullChecksID);
+
+ // Second pass scheduler.
+ // Let Target optionally insert this pass by itself at some other
+ // point.
+ if (getOptLevel() != CodeGenOpt::None &&
+ !TM->targetSchedulesPostRAScheduling()) {
+ if (MISchedPostRA)
+ addPass(&PostMachineSchedulerID);
+ else
+ addPass(&PostRASchedulerID);
+ }
+
+ // GC
+ if (addGCPasses()) {
+ if (PrintGCInfo)
+ addPass(createGCInfoPrinter(dbgs()), false, false);
+ }
+
+ // Basic block placement.
+ if (getOptLevel() != CodeGenOpt::None)
+ addBlockPlacement();
+
+ addPreEmitPass();
+
+ if (TM->Options.EnableIPRA)
+ // Collect register usage information and produce a register mask of
+ // clobbered registers, to be used to optimize call sites.
+ addPass(createRegUsageInfoCollector());
+
+ addPass(&FuncletLayoutID, false);
+
+ addPass(&StackMapLivenessID, false);
+ addPass(&LiveDebugValuesID, false);
+
+ addPass(&XRayInstrumentationID, false);
+ addPass(&PatchableFunctionID, false);
+
+ AddingMachinePasses = false;
+}
+
+/// Add passes that optimize machine instructions in SSA form.
+void TargetPassConfig::addMachineSSAOptimization() {
+ // Pre-ra tail duplication.
+ addPass(&EarlyTailDuplicateID);
+
+ // Optimize PHIs before DCE: removing dead PHI cycles may make more
+ // instructions dead.
+ addPass(&OptimizePHIsID, false);
+
+ // This pass merges large allocas. StackSlotColoring is a different pass
+ // which merges spill slots.
+ addPass(&StackColoringID, false);
+
+ // If the target requests it, assign local variables to stack slots relative
+ // to one another and simplify frame index references where possible.
+ addPass(&LocalStackSlotAllocationID, false);
+
+ // With optimization, dead code should already be eliminated. However
+ // there is one known exception: lowered code for arguments that are only
+ // used by tail calls, where the tail calls reuse the incoming stack
+ // arguments directly (see t11 in test/CodeGen/X86/sibcall.ll).
+ addPass(&DeadMachineInstructionElimID);
+
+ // Allow targets to insert passes that improve instruction level parallelism,
+ // like if-conversion. Such passes will typically need dominator trees and
+ // loop info, just like LICM and CSE below.
+ addILPOpts();
+
+ addPass(&MachineLICMID, false);
+ addPass(&MachineCSEID, false);
+ addPass(&MachineSinkingID);
+
+ addPass(&PeepholeOptimizerID);
+ // Clean-up the dead code that may have been generated by peephole
+ // rewriting.
+ addPass(&DeadMachineInstructionElimID);
+}
+
+//===---------------------------------------------------------------------===//
+/// Register Allocation Pass Configuration
+//===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::getOptimizeRegAlloc() const {
+ switch (OptimizeRegAlloc) {
+ case cl::BOU_UNSET: return getOptLevel() != CodeGenOpt::None;
+ case cl::BOU_TRUE: return true;
+ case cl::BOU_FALSE: return false;
+ }
+ llvm_unreachable("Invalid optimize-regalloc state");
+}
+
+/// RegisterRegAlloc's global Registry tracks allocator registration.
+MachinePassRegistry RegisterRegAlloc::Registry;
+
+/// A dummy default pass factory indicates whether the register allocator is
+/// overridden on the command line.
+LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag);
+static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
+static RegisterRegAlloc
+defaultRegAlloc("default",
+ "pick register allocator based on -O option",
+ useDefaultRegisterAllocator);
+
+/// -regalloc=... command line option.
+static cl::opt<RegisterRegAlloc::FunctionPassCtor, false,
+ RegisterPassParser<RegisterRegAlloc> >
+RegAlloc("regalloc",
+ cl::init(&useDefaultRegisterAllocator),
+ cl::desc("Register allocator to use"));
+
+static void initializeDefaultRegisterAllocatorOnce() {
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+
+ if (!Ctor) {
+ Ctor = RegAlloc;
+ RegisterRegAlloc::setDefault(RegAlloc);
+ }
+}
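+
+// Adding another allocator is a single registration at namespace scope; the
+// names below are hypothetical:
+//
+//   static RegisterRegAlloc
+//     myRegAlloc("myalloc", "my experimental register allocator",
+//                createMyRegisterAllocator);
+//
+// After that, -regalloc=myalloc selects it through the Registry above and
+// takes precedence over createTargetRegisterAllocator().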
+
+
+/// Instantiate the default register allocator pass for this target for either
+/// the optimized or unoptimized allocation path. This will be added to the pass
+/// manager by addFastRegAlloc in the unoptimized case or addOptimizedRegAlloc
+/// in the optimized case.
+///
+/// A target that uses the standard regalloc pass order for fast or optimized
+/// allocation may still override this for per-target regalloc
+/// selection. But -regalloc=... always takes precedence.
+FunctionPass *TargetPassConfig::createTargetRegisterAllocator(bool Optimized) {
+ if (Optimized)
+ return createGreedyRegisterAllocator();
+ else
+ return createFastRegisterAllocator();
+}
+
+/// Find and instantiate the register allocation pass requested by this target
+/// at the current optimization level. Different register allocators are
+/// defined as separate passes because they may require different analyses.
+///
+/// This helper ensures that the regalloc= option is always available,
+/// even for targets that override the default allocator.
+///
+/// FIXME: When MachinePassRegistry register pass IDs instead of function ptrs,
+/// this can be folded into addPass.
+FunctionPass *TargetPassConfig::createRegAllocPass(bool Optimized) {
+ // Initialize the global default.
+ llvm::call_once(InitializeDefaultRegisterAllocatorFlag,
+ initializeDefaultRegisterAllocatorOnce);
+
+ RegisterRegAlloc::FunctionPassCtor Ctor = RegisterRegAlloc::getDefault();
+ if (Ctor != useDefaultRegisterAllocator)
+ return Ctor();
+
+ // With no -regalloc= override, ask the target for a regalloc pass.
+ return createTargetRegisterAllocator(Optimized);
+}
+
+/// Return true if the default global register allocator is in use and
+/// has not been overridden on the command line with '-regalloc=...'
+bool TargetPassConfig::usingDefaultRegAlloc() const {
+ return RegAlloc.getNumOccurrences() == 0;
+}
+
+/// Add the minimum set of target-independent passes that are required for
+/// register allocation. No coalescing or scheduling.
+void TargetPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&PHIEliminationID, false);
+ addPass(&TwoAddressInstructionPassID, false);
+
+ if (RegAllocPass)
+ addPass(RegAllocPass);
+}
+
+/// Add standard target-independent passes that are tightly coupled with
+/// optimized register allocation, including coalescing, machine instruction
+/// scheduling, and register allocation itself.
+void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
+ addPass(&DetectDeadLanesID, false);
+
+ addPass(&ProcessImplicitDefsID, false);
+
+ // LiveVariables currently requires pure SSA form.
+ //
+ // FIXME: Once TwoAddressInstruction pass no longer uses kill flags,
+ // LiveVariables can be removed completely, and LiveIntervals can be directly
+ // computed. (We still either need to regenerate kill flags after regalloc, or
+ // preferably fix the scavenger to not depend on them).
+ addPass(&LiveVariablesID, false);
+
+ // Edge splitting is smarter with machine loop info.
+ addPass(&MachineLoopInfoID, false);
+ addPass(&PHIEliminationID, false);
+
+ // Eventually, we want to run LiveIntervals before PHI elimination.
+ if (EarlyLiveIntervals)
+ addPass(&LiveIntervalsID, false);
+
+ addPass(&TwoAddressInstructionPassID, false);
+ addPass(&RegisterCoalescerID);
+
+ // The machine scheduler may accidentally create disconnected components
+ // when moving subregister definitions around; avoid this by splitting them
+ // into separate vregs beforehand. Splitting can also improve register
+ // allocation quality.
+ addPass(&RenameIndependentSubregsID);
+
+ // PreRA instruction scheduling.
+ addPass(&MachineSchedulerID);
+
+ if (RegAllocPass) {
+ // Add the selected register allocation pass.
+ addPass(RegAllocPass);
+
+ // Allow targets to change the register assignments before rewriting.
+ addPreRewrite();
+
+ // Finally rewrite virtual registers.
+ addPass(&VirtRegRewriterID);
+
+ // Perform stack slot coloring and post-ra machine LICM.
+ //
+ // FIXME: Re-enable coloring with register when it's capable of adding
+ // kill markers.
+ addPass(&StackSlotColoringID);
+
+ // Run post-ra machine LICM to hoist reloads / remats.
+ //
+ // FIXME: can this move into MachineLateOptimization?
+ addPass(&PostRAMachineLICMID);
+ }
+}
+
+//===---------------------------------------------------------------------===//
+/// Post RegAlloc Pass Configuration
+//===---------------------------------------------------------------------===//
+
+/// Add passes that optimize machine instructions after register allocation.
+void TargetPassConfig::addMachineLateOptimization() {
+ // Branch folding must be run after regalloc and prolog/epilog insertion.
+ addPass(&BranchFolderPassID);
+
+ // Tail duplication.
+ // Note that duplicating the tail just increases code size and degrades
+ // performance for targets that require Structured Control Flow.
+ // In addition it can also make the CFG irreducible. Thus we disable it.
+ if (!TM->requiresStructuredCFG())
+ addPass(&TailDuplicateID);
+
+ // Copy propagation.
+ addPass(&MachineCopyPropagationID);
+}
+
+/// Add standard GC passes.
+bool TargetPassConfig::addGCPasses() {
+ addPass(&GCMachineCodeAnalysisID, false);
+ return true;
+}
+
+/// Add standard basic block placement passes.
+void TargetPassConfig::addBlockPlacement() {
+ if (addPass(&MachineBlockPlacementID)) {
+ // Run a separate pass to collect block placement statistics.
+ if (EnableBlockPlacementStats)
+ addPass(&MachineBlockPlacementStatsID);
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
new file mode 100644
index 000000000000..e1d90cb913e5
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -0,0 +1,399 @@
+//===- TargetRegisterInfo.cpp - Target Register Information Implementation ===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TargetRegisterInfo interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define DEBUG_TYPE "target-reg-info"
+
+using namespace llvm;
+
+TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
+ regclass_iterator RCB, regclass_iterator RCE,
+ const char *const *SRINames,
+ const unsigned *SRILaneMasks,
+ unsigned SRICoveringLanes)
+ : InfoDesc(ID), SubRegIndexNames(SRINames),
+ SubRegIndexLaneMasks(SRILaneMasks),
+ RegClassBegin(RCB), RegClassEnd(RCE),
+ CoveringLanes(SRICoveringLanes) {
+}
+
+TargetRegisterInfo::~TargetRegisterInfo() {}
+
+namespace llvm {
+
+Printable PrintReg(unsigned Reg, const TargetRegisterInfo *TRI,
+ unsigned SubIdx) {
+ return Printable([Reg, TRI, SubIdx](raw_ostream &OS) {
+ if (!Reg)
+ OS << "%noreg";
+ else if (TargetRegisterInfo::isStackSlot(Reg))
+ OS << "SS#" << TargetRegisterInfo::stackSlot2Index(Reg);
+ else if (TargetRegisterInfo::isVirtualRegister(Reg))
+ OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Reg);
+ else if (TRI && Reg < TRI->getNumRegs())
+ OS << '%' << TRI->getName(Reg);
+ else
+ OS << "%physreg" << Reg;
+ if (SubIdx) {
+ if (TRI)
+ OS << ':' << TRI->getSubRegIndexName(SubIdx);
+ else
+ OS << ":sub(" << SubIdx << ')';
+ }
+ });
+}
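+
+// Example outputs (a sketch): PrintReg(0) prints "%noreg",
+// PrintReg(TargetRegisterInfo::index2VirtReg(5)) prints "%vreg5", and passing
+// a TRI plus a sub-register index appends ":<subregname>" (or ":sub(N)"
+// without a TRI).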
+
+Printable PrintRegUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+ return Printable([Unit, TRI](raw_ostream &OS) {
+ // Generic printout when TRI is missing.
+ if (!TRI) {
+ OS << "Unit~" << Unit;
+ return;
+ }
+
+ // Check for invalid register units.
+ if (Unit >= TRI->getNumRegUnits()) {
+ OS << "BadUnit~" << Unit;
+ return;
+ }
+
+ // Normal units have at least one root.
+ MCRegUnitRootIterator Roots(Unit, TRI);
+ assert(Roots.isValid() && "Unit has no roots.");
+ OS << TRI->getName(*Roots);
+ for (++Roots; Roots.isValid(); ++Roots)
+ OS << '~' << TRI->getName(*Roots);
+ });
+}
+
+Printable PrintVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
+ return Printable([Unit, TRI](raw_ostream &OS) {
+ if (TRI && TRI->isVirtualRegister(Unit)) {
+ OS << "%vreg" << TargetRegisterInfo::virtReg2Index(Unit);
+ } else {
+ OS << PrintRegUnit(Unit, TRI);
+ }
+ });
+}
+
+Printable PrintLaneMask(LaneBitmask LaneMask) {
+ return Printable([LaneMask](raw_ostream &OS) {
+ OS << format("%08X", LaneMask);
+ });
+}
+
+} // End of llvm namespace
+
+/// getAllocatableClass - Return the maximal subclass of the given register
+/// class that is allocatable, or NULL.
+const TargetRegisterClass *
+TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
+ if (!RC || RC->isAllocatable())
+ return RC;
+
+ for (BitMaskClassIterator It(RC->getSubClassMask(), *this); It.isValid();
+ ++It) {
+ const TargetRegisterClass *SubRC = getRegClass(It.getID());
+ if (SubRC->isAllocatable())
+ return SubRC;
+ }
+ return nullptr;
+}
+
+/// getMinimalPhysRegClass - Returns the Register Class of a physical
+/// register of the given type, picking the smallest sub register class of
+/// the right type that contains this physreg.
+const TargetRegisterClass *
+TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
+ assert(isPhysicalRegister(reg) && "reg must be a physical register");
+
+ // Pick the smallest sub register class of the right type that contains
+ // this physreg.
+ const TargetRegisterClass* BestRC = nullptr;
+ for (regclass_iterator I = regclass_begin(), E = regclass_end(); I != E; ++I){
+ const TargetRegisterClass* RC = *I;
+ if ((VT == MVT::Other || RC->hasType(VT)) && RC->contains(reg) &&
+ (!BestRC || BestRC->hasSubClass(RC)))
+ BestRC = RC;
+ }
+
+ assert(BestRC && "Couldn't find the register class");
+ return BestRC;
+}
+
+/// getAllocatableSetForRC - Set the bits that represent allocatable
+/// registers for the specified register class.
+static void getAllocatableSetForRC(const MachineFunction &MF,
+ const TargetRegisterClass *RC, BitVector &R){
+ assert(RC->isAllocatable() && "invalid for nonallocatable sets");
+ ArrayRef<MCPhysReg> Order = RC->getRawAllocationOrder(MF);
+ for (unsigned i = 0; i != Order.size(); ++i)
+ R.set(Order[i]);
+}
+
+BitVector TargetRegisterInfo::getAllocatableSet(const MachineFunction &MF,
+ const TargetRegisterClass *RC) const {
+ BitVector Allocatable(getNumRegs());
+ if (RC) {
+ // A register class with no allocatable subclass returns an empty set.
+ const TargetRegisterClass *SubClass = getAllocatableClass(RC);
+ if (SubClass)
+ getAllocatableSetForRC(MF, SubClass, Allocatable);
+ } else {
+ for (TargetRegisterInfo::regclass_iterator I = regclass_begin(),
+ E = regclass_end(); I != E; ++I)
+ if ((*I)->isAllocatable())
+ getAllocatableSetForRC(MF, *I, Allocatable);
+ }
+
+ // Mask out the reserved registers
+ BitVector Reserved = getReservedRegs(MF);
+ Allocatable &= Reserved.flip();
+
+ return Allocatable;
+}
+
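+// firstCommonClass scans the sub-class bit masks one 32-bit word at a time and
+// returns the register class of the first common bit found. Since register
+// classes are numbered topologically, that is the largest common sub-class
+// (optionally restricted to classes that support the requested value type).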
+static inline
+const TargetRegisterClass *firstCommonClass(const uint32_t *A,
+ const uint32_t *B,
+ const TargetRegisterInfo *TRI,
+ const MVT::SimpleValueType SVT =
+ MVT::SimpleValueType::Any) {
+ const MVT VT(SVT);
+ for (unsigned I = 0, E = TRI->getNumRegClasses(); I < E; I += 32)
+ if (unsigned Common = *A++ & *B++) {
+ const TargetRegisterClass *RC =
+ TRI->getRegClass(I + countTrailingZeros(Common));
+ if (SVT == MVT::SimpleValueType::Any || RC->hasType(VT))
+ return RC;
+ }
+ return nullptr;
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getCommonSubClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ const MVT::SimpleValueType SVT) const {
+ // First take care of the trivial cases.
+ if (A == B)
+ return A;
+ if (!A || !B)
+ return nullptr;
+
+  // Register classes are ordered topologically, so the largest common
+  // sub-class is the common sub-class with the smallest ID.
+ return firstCommonClass(A->getSubClassMask(), B->getSubClassMask(), this, SVT);
+}
+
+const TargetRegisterClass *
+TargetRegisterInfo::getMatchingSuperRegClass(const TargetRegisterClass *A,
+ const TargetRegisterClass *B,
+ unsigned Idx) const {
+ assert(A && B && "Missing register class");
+ assert(Idx && "Bad sub-register index");
+
+ // Find Idx in the list of super-register indices.
+ for (SuperRegClassIterator RCI(B, this); RCI.isValid(); ++RCI)
+ if (RCI.getSubReg() == Idx)
+ // The bit mask contains all register classes that are projected into B
+ // by Idx. Find a class that is also a sub-class of A.
+ return firstCommonClass(RCI.getMask(), A->getSubClassMask(), this);
+ return nullptr;
+}
+
+const TargetRegisterClass *TargetRegisterInfo::
+getCommonSuperRegClass(const TargetRegisterClass *RCA, unsigned SubA,
+ const TargetRegisterClass *RCB, unsigned SubB,
+ unsigned &PreA, unsigned &PreB) const {
+ assert(RCA && SubA && RCB && SubB && "Invalid arguments");
+
+ // Search all pairs of sub-register indices that project into RCA and RCB
+ // respectively. This is quadratic, but usually the sets are very small. On
+ // most targets like X86, there will only be a single sub-register index
+ // (e.g., sub_16bit projecting into GR16).
+ //
+ // The worst case is a register class like DPR on ARM.
+ // We have indices dsub_0..dsub_7 projecting into that class.
+ //
+ // It is very common that one register class is a sub-register of the other.
+ // Arrange for RCA to be the larger register so the answer will be found in
+ // the first iteration. This makes the search linear for the most common
+ // case.
+ const TargetRegisterClass *BestRC = nullptr;
+ unsigned *BestPreA = &PreA;
+ unsigned *BestPreB = &PreB;
+ if (RCA->getSize() < RCB->getSize()) {
+ std::swap(RCA, RCB);
+ std::swap(SubA, SubB);
+ std::swap(BestPreA, BestPreB);
+ }
+
+  // Also terminate the search once we have found a register class as small as
+  // RCA.
+ unsigned MinSize = RCA->getSize();
+
+ for (SuperRegClassIterator IA(RCA, this, true); IA.isValid(); ++IA) {
+ unsigned FinalA = composeSubRegIndices(IA.getSubReg(), SubA);
+ for (SuperRegClassIterator IB(RCB, this, true); IB.isValid(); ++IB) {
+ // Check if a common super-register class exists for this index pair.
+ const TargetRegisterClass *RC =
+ firstCommonClass(IA.getMask(), IB.getMask(), this);
+ if (!RC || RC->getSize() < MinSize)
+ continue;
+
+ // The indexes must compose identically: PreA+SubA == PreB+SubB.
+ unsigned FinalB = composeSubRegIndices(IB.getSubReg(), SubB);
+ if (FinalA != FinalB)
+ continue;
+
+ // Is RC a better candidate than BestRC?
+ if (BestRC && RC->getSize() >= BestRC->getSize())
+ continue;
+
+ // Yes, RC is the smallest super-register seen so far.
+ BestRC = RC;
+ *BestPreA = IA.getSubReg();
+ *BestPreB = IB.getSubReg();
+
+ // Bail early if we reached MinSize. We won't find a better candidate.
+ if (BestRC->getSize() == MinSize)
+ return BestRC;
+ }
+ }
+ return BestRC;
+}
+
+/// \brief Check if the registers defined by the pair (RegisterClass, SubReg)
+/// share the same register file.
+static bool shareSameRegisterFile(const TargetRegisterInfo &TRI,
+ const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) {
+ // Same register class.
+ if (DefRC == SrcRC)
+ return true;
+
+ // Both operands are sub registers. Check if they share a register class.
+ unsigned SrcIdx, DefIdx;
+ if (SrcSubReg && DefSubReg) {
+ return TRI.getCommonSuperRegClass(SrcRC, SrcSubReg, DefRC, DefSubReg,
+ SrcIdx, DefIdx) != nullptr;
+ }
+
+  // At most one of the registers is a sub-register; make it Src to avoid
+  // duplicating the test.
+ if (!SrcSubReg) {
+ std::swap(DefSubReg, SrcSubReg);
+ std::swap(DefRC, SrcRC);
+ }
+
+  // One of the registers is a sub-register; check if we can get a superclass.
+ if (SrcSubReg)
+ return TRI.getMatchingSuperRegClass(SrcRC, DefRC, SrcSubReg) != nullptr;
+
+ // Plain copy.
+ return TRI.getCommonSubClass(DefRC, SrcRC) != nullptr;
+}
+
+bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
+ unsigned DefSubReg,
+ const TargetRegisterClass *SrcRC,
+ unsigned SrcSubReg) const {
+ // If this source does not incur a cross register bank copy, use it.
+ return shareSameRegisterFile(*this, DefRC, DefSubReg, SrcRC, SrcSubReg);
+}
+
+// Compute target-independent register allocator hints to help eliminate copies.
+void
+TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
+ ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints,
+ const MachineFunction &MF,
+ const VirtRegMap *VRM,
+ const LiveRegMatrix *Matrix) const {
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+
+ // Hints with HintType != 0 were set by target-dependent code.
+ // Such targets must provide their own implementation of
+ // TRI::getRegAllocationHints to interpret those hint types.
+ assert(Hint.first == 0 && "Target must implement TRI::getRegAllocationHints");
+
+ // Target-independent hints are either a physical or a virtual register.
+ unsigned Phys = Hint.second;
+ if (VRM && isVirtualRegister(Phys))
+ Phys = VRM->getPhys(Phys);
+
+ // Check that Phys is a valid hint in VirtReg's register class.
+ if (!isPhysicalRegister(Phys))
+ return;
+ if (MRI.isReserved(Phys))
+ return;
+ // Check that Phys is in the allocation order. We shouldn't heed hints
+ // from VirtReg's register class if they aren't in the allocation order. The
+ // target probably has a reason for removing the register.
+ if (std::find(Order.begin(), Order.end(), Phys) == Order.end())
+ return;
+
+ // All clear, tell the register allocator to prefer this register.
+ Hints.push_back(Phys);
+}
+
+bool TargetRegisterInfo::canRealignStack(const MachineFunction &MF) const {
+ return !MF.getFunction()->hasFnAttribute("no-realign-stack");
+}
+
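+// Stack realignment is requested when the frame's maximum object alignment
+// exceeds the target stack alignment, when the function carries an explicit
+// alignstack attribute, or when the "stackrealign" string attribute is set;
+// the request is only honored if the stack can actually be realigned.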
+bool TargetRegisterInfo::needsStackRealignment(
+ const MachineFunction &MF) const {
+ const MachineFrameInfo *MFI = MF.getFrameInfo();
+ const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
+ const Function *F = MF.getFunction();
+ unsigned StackAlign = TFI->getStackAlignment();
+ bool requiresRealignment = ((MFI->getMaxAlignment() > StackAlign) ||
+ F->hasFnAttribute(Attribute::StackAlignment));
+ if (MF.getFunction()->hasFnAttribute("stackrealign") || requiresRealignment) {
+ if (canRealignStack(MF))
+ return true;
+ DEBUG(dbgs() << "Can't realign function's stack: " << F->getName() << "\n");
+ }
+ return false;
+}
+
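+// Return true if all bits that are set in mask0 are also set in mask1, i.e.
+// the set of registers preserved by mask0 is a subset of those preserved by
+// mask1.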
+bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
+ const uint32_t *mask1) const {
+ unsigned N = (getNumRegs()+31) / 32;
+ for (unsigned I = 0; I < N; ++I)
+ if ((mask0[I] & mask1[I]) != mask0[I])
+ return false;
+ return true;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void
+TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
+ const TargetRegisterInfo *TRI) {
+ dbgs() << PrintReg(Reg, TRI, SubRegIndex) << "\n";
+}
+#endif
diff --git a/contrib/llvm/lib/CodeGen/TargetSchedule.cpp b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
new file mode 100644
index 000000000000..022e912aa84f
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TargetSchedule.cpp
@@ -0,0 +1,300 @@
+//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper around MCSchedModel that allows the interface
+// to benefit from information currently only available in TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSchedModel("schedmodel", cl::Hidden, cl::init(true),
+ cl::desc("Use TargetSchedModel for latency lookup"));
+
+static cl::opt<bool> EnableSchedItins("scheditins", cl::Hidden, cl::init(true),
+ cl::desc("Use InstrItineraryData for latency lookup"));
+
+bool TargetSchedModel::hasInstrSchedModel() const {
+ return EnableSchedModel && SchedModel.hasInstrSchedModel();
+}
+
+bool TargetSchedModel::hasInstrItineraries() const {
+ return EnableSchedItins && !InstrItins.isEmpty();
+}
+
+static unsigned gcd(unsigned Dividend, unsigned Divisor) {
+ // Dividend and Divisor will be naturally swapped as needed.
+  while (Divisor) {
+    unsigned Rem = Dividend % Divisor;
+    Dividend = Divisor;
+    Divisor = Rem;
+  }
+ return Dividend;
+}
+static unsigned lcm(unsigned A, unsigned B) {
+ unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
+ assert((LCM >= A && LCM >= B) && "LCM overflow");
+ return LCM;
+}
+
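+// init() precomputes resource factors so resource usage can be compared with
+// plain integers: ResourceLCM is the least common multiple of the issue width
+// and all resource unit counts. For example, with IssueWidth 2 and resources
+// providing 1 and 3 units, ResourceLCM = lcm(lcm(2, 1), 3) = 6, MicroOpFactor
+// = 6 / 2 = 3, and the per-resource factors are 6 and 2.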
+void TargetSchedModel::init(const MCSchedModel &sm,
+ const TargetSubtargetInfo *sti,
+ const TargetInstrInfo *tii) {
+ SchedModel = sm;
+ STI = sti;
+ TII = tii;
+ STI->initInstrItins(InstrItins);
+
+ unsigned NumRes = SchedModel.getNumProcResourceKinds();
+ ResourceFactors.resize(NumRes);
+ ResourceLCM = SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ if (NumUnits > 0)
+ ResourceLCM = lcm(ResourceLCM, NumUnits);
+ }
+ MicroOpFactor = ResourceLCM / SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ ResourceFactors[Idx] = NumUnits ? (ResourceLCM / NumUnits) : 0;
+ }
+}
+
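+// Itinerary data takes precedence: a negative itinerary count defers to the
+// TargetInstrInfo hook. Otherwise the count comes from the resolved sched
+// class of the per-operand model. Failing both, transient instructions such
+// as COPY are considered free and everything else is one micro-op.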
+unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
+ if (hasInstrItineraries()) {
+ int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
+ return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, *MI);
+ }
+ if (hasInstrSchedModel()) {
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->NumMicroOps;
+ }
+ return MI->isTransient() ? 0 : 1;
+}
+
+// The machine model may explicitly specify an invalid latency, which
+// effectively means infinite latency. Since users of the TargetSchedule API
+// don't know how to handle this, we convert it to a very large latency that is
+// easy to distinguish when debugging the DAG but won't induce overflow.
+static unsigned capLatency(int Cycles) {
+ return Cycles >= 0 ? Cycles : 1000;
+}
+
+/// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
+/// evaluation of predicates that depend on instruction operands or flags.
+const MCSchedClassDesc *TargetSchedModel::
+resolveSchedClass(const MachineInstr *MI) const {
+
+ // Get the definition's scheduling class descriptor from this machine model.
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ if (!SCDesc->isValid())
+ return SCDesc;
+
+#ifndef NDEBUG
+ unsigned NIter = 0;
+#endif
+ while (SCDesc->isVariant()) {
+ assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+
+ SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
+ SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ }
+ return SCDesc;
+}
+
+/// Find the def index of this operand. This index maps to the machine model and
+/// is independent of use operands. Def operands may be reordered with uses or
+/// merged with uses without affecting the def index (e.g. before/after
+/// regalloc). However, an instruction's def operands must never be reordered
+/// with respect to each other.
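+///
+/// For example, for an instruction with operands (%d0<def>, %u0, %d1<def>),
+/// the def index of operand 2 is 1: only the def at operand 0 is counted, and
+/// the use sitting between the two defs is ignored.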
+static unsigned findDefIdx(const MachineInstr *MI, unsigned DefOperIdx) {
+ unsigned DefIdx = 0;
+ for (unsigned i = 0; i != DefOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef())
+ ++DefIdx;
+ }
+ return DefIdx;
+}
+
+/// Find the use index of this operand. This is independent of the instruction's
+/// def operands.
+///
+/// Note that uses are not determined by the operand's isUse property, which
+/// is simply the inverse of isDef. Here we consider any readsReg operand to be
+/// a "use". The machine model allows an operand to be both a Def and Use.
+static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
+ unsigned UseIdx = 0;
+ for (unsigned i = 0; i != UseOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.readsReg())
+ ++UseIdx;
+ }
+ return UseIdx;
+}
+
+// Top-level API for clients that know the operand indices.
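+// The latency is taken from the itinerary data when available, otherwise from
+// the per-operand machine model (the def's write latency minus any read
+// advance of the use), and falls back to TII->defaultDefLatency() when
+// neither model provides an answer.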
+unsigned TargetSchedModel::computeOperandLatency(
+ const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx) const {
+
+ if (!hasInstrSchedModel() && !hasInstrItineraries())
+ return TII->defaultDefLatency(SchedModel, *DefMI);
+
+ if (hasInstrItineraries()) {
+ int OperLatency = 0;
+ if (UseMI) {
+ OperLatency = TII->getOperandLatency(&InstrItins, *DefMI, DefOperIdx,
+ *UseMI, UseOperIdx);
+ }
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
+ }
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = TII->getInstrLatency(&InstrItins, *DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ // Rather than directly querying InstrItins stage latency, we call a TII
+ // hook to allow subtargets to specialize latency. This hook is only
+ // applicable to the InstrItins model. InstrSchedModel should model all
+ // special cases without TII hooks.
+ InstrLatency =
+ std::max(InstrLatency, TII->defaultDefLatency(SchedModel, *DefMI));
+ return InstrLatency;
+ }
+ // hasInstrSchedModel()
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
+ if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(SCDesc, DefIdx);
+ unsigned WriteID = WLEntry->WriteResourceID;
+ unsigned Latency = capLatency(WLEntry->Cycles);
+ if (!UseMI)
+ return Latency;
+
+ // Lookup the use's latency adjustment in SubtargetInfo.
+ const MCSchedClassDesc *UseDesc = resolveSchedClass(UseMI);
+ if (UseDesc->NumReadAdvanceEntries == 0)
+ return Latency;
+ unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
+ int Advance = STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+ if (Advance > 0 && (unsigned)Advance > Latency) // unsigned wrap
+ return 0;
+ return Latency - Advance;
+ }
+ // If DefIdx does not exist in the model (e.g. implicit defs), then return
+ // unit latency (defaultDefLatency may be too conservative).
+#ifndef NDEBUG
+ if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
+ && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()
+ && SchedModel.isComplete()) {
+ errs() << "DefIdx " << DefIdx << " exceeds machine model writes for "
+ << *DefMI << " (Try with MCSchedModel.CompleteModel set to false)";
+ llvm_unreachable("incomplete machine model");
+ }
+#endif
+ // FIXME: Automatically giving all implicit defs defaultDefLatency is
+ // undesirable. We should only do it for defs that are known to the MC
+ // desc like flags. Truly implicit defs should get 1 cycle latency.
+ return DefMI->isTransient() ? 0 : TII->defaultDefLatency(SchedModel, *DefMI);
+}
+
+unsigned
+TargetSchedModel::computeInstrLatency(const MCSchedClassDesc &SCDesc) const {
+ unsigned Latency = 0;
+ for (unsigned DefIdx = 0, DefEnd = SCDesc.NumWriteLatencyEntries;
+ DefIdx != DefEnd; ++DefIdx) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(&SCDesc, DefIdx);
+ Latency = std::max(Latency, capLatency(WLEntry->Cycles));
+ }
+ return Latency;
+}
+
+unsigned TargetSchedModel::computeInstrLatency(unsigned Opcode) const {
+ assert(hasInstrSchedModel() && "Only call this function with a SchedModel");
+
+ unsigned SCIdx = TII->get(Opcode).getSchedClass();
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SCIdx);
+
+ if (SCDesc->isValid() && !SCDesc->isVariant())
+ return computeInstrLatency(*SCDesc);
+
+ llvm_unreachable("No MI sched latency");
+}
+
+unsigned
+TargetSchedModel::computeInstrLatency(const MachineInstr *MI,
+ bool UseDefaultDefLatency) const {
+ // For the itinerary model, fall back to the old subtarget hook.
+ // Allow subtargets to compute Bundle latencies outside the machine model.
+ if (hasInstrItineraries() || MI->isBundle() ||
+ (!hasInstrSchedModel() && !UseDefaultDefLatency))
+ return TII->getInstrLatency(&InstrItins, *MI);
+
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
+ if (SCDesc->isValid())
+ return computeInstrLatency(*SCDesc);
+ }
+ return TII->defaultDefLatency(SchedModel, *MI);
+}
+
+unsigned TargetSchedModel::
+computeOutputLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *DepMI) const {
+ if (!SchedModel.isOutOfOrder())
+ return 1;
+
+ // Out-of-order processor can dispatch WAW dependencies in the same cycle.
+
+ // Treat predication as a data dependency for out-of-order cpus. In-order
+ // cpus do not need to treat predicated writes specially.
+ //
+ // TODO: The following hack exists because predication passes do not
+ // correctly append imp-use operands, and readsReg() strangely returns false
+ // for predicated defs.
+ unsigned Reg = DefMI->getOperand(DefOperIdx).getReg();
+ const MachineFunction &MF = *DefMI->getParent()->getParent();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ if (!DepMI->readsRegister(Reg, TRI) && TII->isPredicated(*DepMI))
+ return computeInstrLatency(DefMI);
+
+  // If we have a per-operand scheduling model, check if this def is writing
+  // an unbuffered resource. If so, it is treated like an in-order cpu.
+ if (hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ if (SCDesc->isValid()) {
+ for (const MCWriteProcResEntry *PRI = STI->getWriteProcResBegin(SCDesc),
+ *PRE = STI->getWriteProcResEnd(SCDesc); PRI != PRE; ++PRI) {
+ if (!SchedModel.getProcResource(PRI->ProcResourceIdx)->BufferSize)
+ return 1;
+ }
+ }
+ }
+ return 0;
+}
diff --git a/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
new file mode 100644
index 000000000000..3d9a51864b6c
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -0,0 +1,1806 @@
+//===-- TwoAddressInstructionPass.cpp - Two-Address instruction pass ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the TwoAddress instruction pass which is used
+// by most register allocators. Two-Address instructions are rewritten
+// from:
+//
+// A = B op C
+//
+// to:
+//
+// A = B
+// A op= C
+//
+// Note that if a register allocator chooses to use this pass, it has to be
+// capable of handling the non-SSA nature of these rewritten virtual
+// registers.
+//
+// It is also worth noting that the duplicate operand of the two
+// address instruction is removed.
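+//
+// For example (operand flags omitted for brevity), a two-address X86 add
+//
+//   %vreg2 = ADD32rr %vreg0, %vreg1
+//
+// becomes
+//
+//   %vreg2 = COPY %vreg0
+//   %vreg2 = ADD32rr %vreg2, %vreg1
+//
+// tying the destination to the first source operand.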
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "twoaddrinstr"
+
+STATISTIC(NumTwoAddressInstrs, "Number of two-address instructions");
+STATISTIC(NumCommuted , "Number of instructions commuted to coalesce");
+STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted");
+STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address");
+STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk");
+STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up");
+STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down");
+
+// Temporary flag to disable rescheduling.
+static cl::opt<bool>
+EnableRescheduling("twoaddr-reschedule",
+ cl::desc("Coalesce copies by rescheduling (default=true)"),
+ cl::init(true), cl::Hidden);
+
+namespace {
+class TwoAddressInstructionPass : public MachineFunctionPass {
+ MachineFunction *MF;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const InstrItineraryData *InstrItins;
+ MachineRegisterInfo *MRI;
+ LiveVariables *LV;
+ LiveIntervals *LIS;
+ AliasAnalysis *AA;
+ CodeGenOpt::Level OptLevel;
+
+ // The current basic block being processed.
+ MachineBasicBlock *MBB;
+
+  // Keep track of the distance of an MI from the start of the current
+  // basic block.
+ DenseMap<MachineInstr*, unsigned> DistanceMap;
+
+ // Set of already processed instructions in the current block.
+ SmallPtrSet<MachineInstr*, 8> Processed;
+
+  // A map from virtual registers to physical registers which are likely
+  // targets to be coalesced to, due to copies from physical registers to
+  // virtual registers. e.g. v1024 = move r0.
+ DenseMap<unsigned, unsigned> SrcRegMap;
+
+  // A map from virtual registers to physical registers which are likely
+  // targets to be coalesced to, due to copies to physical registers from
+  // virtual registers. e.g. r1 = move v1024.
+ DenseMap<unsigned, unsigned> DstRegMap;
+
+ bool sink3AddrInstruction(MachineInstr *MI, unsigned Reg,
+ MachineBasicBlock::iterator OldPos);
+
+ bool isRevCopyChain(unsigned FromReg, unsigned ToReg, int Maxlen);
+
+ bool noUseAfterLastDef(unsigned Reg, unsigned Dist, unsigned &LastDef);
+
+ bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+ MachineInstr *MI, unsigned Dist);
+
+ bool commuteInstruction(MachineInstr *MI,
+ unsigned RegBIdx, unsigned RegCIdx, unsigned Dist);
+
+ bool isProfitableToConv3Addr(unsigned RegA, unsigned RegB);
+
+ bool convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned RegA, unsigned RegB, unsigned Dist);
+
+ bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI);
+
+ bool rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg);
+ bool rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg);
+
+ bool tryInstructionTransform(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned SrcIdx, unsigned DstIdx,
+ unsigned Dist, bool shouldOnlyCommute);
+
+ bool tryInstructionCommute(MachineInstr *MI,
+ unsigned DstOpIdx,
+ unsigned BaseOpIdx,
+ bool BaseOpKilled,
+ unsigned Dist);
+ void scanUses(unsigned DstReg);
+
+ void processCopy(MachineInstr *MI);
+
+ typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList;
+ typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap;
+ bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&);
+ void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist);
+ void eliminateRegSequence(MachineBasicBlock::iterator&);
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ TwoAddressInstructionPass() : MachineFunctionPass(ID) {
+ initializeTwoAddressInstructionPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addUsedIfAvailable<LiveVariables>();
+ AU.addPreserved<LiveVariables>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addPreserved<LiveIntervals>();
+ AU.addPreservedID(MachineLoopInfoID);
+ AU.addPreservedID(MachineDominatorsID);
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ /// Pass entry point.
+ bool runOnMachineFunction(MachineFunction&) override;
+};
+} // end anonymous namespace
+
+char TwoAddressInstructionPass::ID = 0;
+INITIALIZE_PASS_BEGIN(TwoAddressInstructionPass, "twoaddressinstruction",
+ "Two-Address instruction pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_END(TwoAddressInstructionPass, "twoaddressinstruction",
+ "Two-Address instruction pass", false, false)
+
+char &llvm::TwoAddressInstructionPassID = TwoAddressInstructionPass::ID;
+
+static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg, LiveIntervals *LIS);
+
+/// A two-address instruction has been converted to a three-address instruction
+/// to avoid clobbering a register. Try to sink it past the instruction that
+/// would kill the above mentioned register to reduce register pressure.
+bool TwoAddressInstructionPass::
+sink3AddrInstruction(MachineInstr *MI, unsigned SavedReg,
+ MachineBasicBlock::iterator OldPos) {
+ // FIXME: Shouldn't we be trying to do this before we three-addressify the
+ // instruction? After this transformation is done, we no longer need
+ // the instruction to be in three-address form.
+
+ // Check if it's safe to move this instruction.
+ bool SeenStore = true; // Be conservative.
+ if (!MI->isSafeToMove(AA, SeenStore))
+ return false;
+
+ unsigned DefReg = 0;
+ SmallSet<unsigned, 4> UseRegs;
+
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isUse() && MOReg != SavedReg)
+ UseRegs.insert(MO.getReg());
+ if (!MO.isDef())
+ continue;
+ if (MO.isImplicit())
+ // Don't try to move it if it implicitly defines a register.
+ return false;
+ if (DefReg)
+ // For now, don't move any instructions that define multiple registers.
+ return false;
+ DefReg = MO.getReg();
+ }
+
+ // Find the instruction that kills SavedReg.
+ MachineInstr *KillMI = nullptr;
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(SavedReg);
+ assert(LI.end() != LI.begin() &&
+ "Reg should not have empty live interval.");
+
+ SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+ LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+ if (I != LI.end() && I->start < MBBEndIdx)
+ return false;
+
+ --I;
+ KillMI = LIS->getInstructionFromIndex(I->end);
+ }
+ if (!KillMI) {
+ for (MachineOperand &UseMO : MRI->use_nodbg_operands(SavedReg)) {
+ if (!UseMO.isKill())
+ continue;
+ KillMI = UseMO.getParent();
+ break;
+ }
+ }
+
+ // If we find the instruction that kills SavedReg, and it is in an
+ // appropriate location, we can try to sink the current instruction
+ // past it.
+ if (!KillMI || KillMI->getParent() != MBB || KillMI == MI ||
+ MachineBasicBlock::iterator(KillMI) == OldPos || KillMI->isTerminator())
+ return false;
+
+ // If any of the definitions are used by another instruction between the
+ // position and the kill use, then it's not safe to sink it.
+ //
+ // FIXME: This can be sped up if there is an easy way to query whether an
+ // instruction is before or after another instruction. Then we can use
+ // MachineRegisterInfo def / use instead.
+ MachineOperand *KillMO = nullptr;
+ MachineBasicBlock::iterator KillPos = KillMI;
+ ++KillPos;
+
+ unsigned NumVisited = 0;
+ for (MachineInstr &OtherMI : llvm::make_range(std::next(OldPos), KillPos)) {
+ // DBG_VALUE cannot be counted against the limit.
+ if (OtherMI.isDebugValue())
+ continue;
+ if (NumVisited > 30) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ for (unsigned i = 0, e = OtherMI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = OtherMI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (DefReg == MOReg)
+ return false;
+
+ if (MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS))) {
+ if (&OtherMI == KillMI && MOReg == SavedReg)
+ // Save the operand that kills the register. We want to unset the kill
+ // marker if we can sink MI past it.
+ KillMO = &MO;
+ else if (UseRegs.count(MOReg))
+ // One of the uses is killed before the destination.
+ return false;
+ }
+ }
+ }
+ assert(KillMO && "Didn't find kill");
+
+ if (!LIS) {
+ // Update kill and LV information.
+ KillMO->setIsKill(false);
+ KillMO = MI->findRegisterUseOperand(SavedReg, false, TRI);
+ KillMO->setIsKill(true);
+
+ if (LV)
+ LV->replaceKillInstruction(SavedReg, *KillMI, *MI);
+ }
+
+ // Move instruction to its destination.
+ MBB->remove(MI);
+ MBB->insert(KillPos, MI);
+
+ if (LIS)
+ LIS->handleMove(*MI);
+
+ ++Num3AddrSunk;
+ return true;
+}
+
+/// Return the MachineInstr* if it is the single def of the Reg in current BB.
+static MachineInstr *getSingleDef(unsigned Reg, MachineBasicBlock *BB,
+ const MachineRegisterInfo *MRI) {
+ MachineInstr *Ret = nullptr;
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+ if (DefMI.getParent() != BB || DefMI.isDebugValue())
+ continue;
+ if (!Ret)
+ Ret = &DefMI;
+ else if (Ret != &DefMI)
+ return nullptr;
+ }
+ return Ret;
+}
+
+/// Check if there is a reversed copy chain from FromReg to ToReg:
+/// %Tmp1 = copy %Tmp2;
+/// %FromReg = copy %Tmp1;
+/// %ToReg = add %FromReg ...
+/// %Tmp2 = copy %ToReg;
+/// MaxLen specifies the maximum length of the copy chain the function
+/// can walk through.
+bool TwoAddressInstructionPass::isRevCopyChain(unsigned FromReg, unsigned ToReg,
+ int Maxlen) {
+ unsigned TmpReg = FromReg;
+ for (int i = 0; i < Maxlen; i++) {
+ MachineInstr *Def = getSingleDef(TmpReg, MBB, MRI);
+ if (!Def || !Def->isCopy())
+ return false;
+
+ TmpReg = Def->getOperand(1).getReg();
+
+ if (TmpReg == ToReg)
+ return true;
+ }
+ return false;
+}
+
+/// Return true if there are no intervening uses between the last instruction
+/// in the MBB that defines the specified register and the two-address
+/// instruction which is being processed. It also returns the last def location
+/// by reference.
+bool TwoAddressInstructionPass::noUseAfterLastDef(unsigned Reg, unsigned Dist,
+ unsigned &LastDef) {
+ LastDef = 0;
+ unsigned LastUse = Dist;
+ for (MachineOperand &MO : MRI->reg_operands(Reg)) {
+ MachineInstr *MI = MO.getParent();
+ if (MI->getParent() != MBB || MI->isDebugValue())
+ continue;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ continue;
+ if (MO.isUse() && DI->second < LastUse)
+ LastUse = DI->second;
+ if (MO.isDef() && DI->second > LastDef)
+ LastDef = DI->second;
+ }
+
+ return !(LastUse > LastDef && LastUse < Dist);
+}
+
+/// Return true if the specified MI is a copy instruction or an insert_subreg
+/// or subreg_to_reg instruction. It also returns the source and destination
+/// registers and whether they are physical registers by reference.
+static bool isCopyToReg(MachineInstr &MI, const TargetInstrInfo *TII,
+ unsigned &SrcReg, unsigned &DstReg,
+ bool &IsSrcPhys, bool &IsDstPhys) {
+ SrcReg = 0;
+ DstReg = 0;
+ if (MI.isCopy()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(1).getReg();
+ } else if (MI.isInsertSubreg() || MI.isSubregToReg()) {
+ DstReg = MI.getOperand(0).getReg();
+ SrcReg = MI.getOperand(2).getReg();
+ } else
+ return false;
+
+ IsSrcPhys = TargetRegisterInfo::isPhysicalRegister(SrcReg);
+ IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ return true;
+}
+
+/// Test if the given register value, which is used by the
+/// given instruction, is killed by the given instruction.
+static bool isPlainlyKilled(MachineInstr *MI, unsigned Reg,
+ LiveIntervals *LIS) {
+ if (LIS && TargetRegisterInfo::isVirtualRegister(Reg) &&
+ !LIS->isNotInMIMap(*MI)) {
+ // FIXME: Sometimes tryInstructionTransform() will add instructions and
+ // test whether they can be folded before keeping them. In this case it
+ // sets a kill before recursively calling tryInstructionTransform() again.
+ // If there is no interval available, we assume that this instruction is
+ // one of those. A kill flag is manually inserted on the operand so the
+ // check below will handle it.
+ LiveInterval &LI = LIS->getInterval(Reg);
+ // This is to match the kill flag version where undefs don't have kill
+ // flags.
+ if (!LI.hasAtLeastOneValue())
+ return false;
+
+ SlotIndex useIdx = LIS->getInstructionIndex(*MI);
+ LiveInterval::const_iterator I = LI.find(useIdx);
+ assert(I != LI.end() && "Reg must be live-in to use.");
+ return !I->end.isBlock() && SlotIndex::isSameInstr(I->end, useIdx);
+ }
+
+ return MI->killsRegister(Reg);
+}
+
+/// Test if the given register value, which is used by the given
+/// instruction, is killed by the given instruction. This looks through
+/// coalescable copies to see if the original value is potentially not killed.
+///
+/// For example, in this code:
+///
+/// %reg1034 = copy %reg1024
+/// %reg1035 = copy %reg1025<kill>
+/// %reg1036 = add %reg1034<kill>, %reg1035<kill>
+///
+/// %reg1034 is not considered to be killed, since it is copied from a
+/// register which is not killed. Treating it as not killed lets the
+/// normal heuristics commute the (two-address) add, which lets
+/// coalescing eliminate the extra copy.
+///
+/// If allowFalsePositives is true then likely kills are treated as kills even
+/// if it can't be proven that they are kills.
+static bool isKilled(MachineInstr &MI, unsigned Reg,
+ const MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII,
+ LiveIntervals *LIS,
+ bool allowFalsePositives) {
+ MachineInstr *DefMI = &MI;
+ for (;;) {
+ // All uses of physical registers are likely to be kills.
+ if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ (allowFalsePositives || MRI->hasOneUse(Reg)))
+ return true;
+ if (!isPlainlyKilled(DefMI, Reg, LIS))
+ return false;
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return true;
+ MachineRegisterInfo::def_iterator Begin = MRI->def_begin(Reg);
+ // If there are multiple defs, we can't do a simple analysis, so just
+ // go with what the kill flag says.
+ if (std::next(Begin) != MRI->def_end())
+ return true;
+ DefMI = Begin->getParent();
+ bool IsSrcPhys, IsDstPhys;
+ unsigned SrcReg, DstReg;
+ // If the def is something other than a copy, then it isn't going to
+ // be coalesced, so follow the kill flag.
+ if (!isCopyToReg(*DefMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return true;
+ Reg = SrcReg;
+ }
+}
+
+/// Return true if the specified MI uses the specified register as a two-address
+/// use. If so, return the destination register by reference.
+static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) {
+ for (unsigned i = 0, NumOps = MI.getNumOperands(); i != NumOps; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg)
+ continue;
+ unsigned ti;
+ if (MI.isRegTiedToDefOperand(i, &ti)) {
+ DstReg = MI.getOperand(ti).getReg();
+ return true;
+ }
+ }
+ return false;
+}
+
+/// Given a register, if it has a single in-basic-block use, return the use
+/// instruction if it's a copy or a two-address use.
+static
+MachineInstr *findOnlyInterestingUse(unsigned Reg, MachineBasicBlock *MBB,
+ MachineRegisterInfo *MRI,
+ const TargetInstrInfo *TII,
+ bool &IsCopy,
+ unsigned &DstReg, bool &IsDstPhys) {
+ if (!MRI->hasOneNonDBGUse(Reg))
+ // None or more than one use.
+ return nullptr;
+ MachineInstr &UseMI = *MRI->use_instr_nodbg_begin(Reg);
+ if (UseMI.getParent() != MBB)
+ return nullptr;
+ unsigned SrcReg;
+ bool IsSrcPhys;
+ if (isCopyToReg(UseMI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys)) {
+ IsCopy = true;
+ return &UseMI;
+ }
+ IsDstPhys = false;
+ if (isTwoAddrUse(UseMI, Reg, DstReg)) {
+ IsDstPhys = TargetRegisterInfo::isPhysicalRegister(DstReg);
+ return &UseMI;
+ }
+ return nullptr;
+}
+
+/// Return the physical register the specified virtual register might be mapped
+/// to.
+static unsigned
+getMappedReg(unsigned Reg, DenseMap<unsigned, unsigned> &RegMap) {
+ while (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ DenseMap<unsigned, unsigned>::iterator SI = RegMap.find(Reg);
+ if (SI == RegMap.end())
+ return 0;
+ Reg = SI->second;
+ }
+ if (TargetRegisterInfo::isPhysicalRegister(Reg))
+ return Reg;
+ return 0;
+}
+
+/// Return true if the two registers are equal or aliased.
+static bool
+regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) {
+ if (RegA == RegB)
+ return true;
+ if (!RegA || !RegB)
+ return false;
+ return TRI->regsOverlap(RegA, RegB);
+}
+
+/// Return true if it's potentially profitable to commute the two-address
+/// instruction that's being processed.
+bool
+TwoAddressInstructionPass::
+isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC,
+ MachineInstr *MI, unsigned Dist) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ // Determine if it's profitable to commute this two address instruction. In
+ // general, we want no uses between this instruction and the definition of
+ // the two-address register.
+ // e.g.
+ // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+ // %reg1029<def> = MOV8rr %reg1028
+ // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+ // insert => %reg1030<def> = MOV8rr %reg1028
+ // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+ // In this case, it might not be possible to coalesce the second MOV8rr
+ // instruction if the first one is coalesced. So it would be profitable to
+ // commute it:
+ // %reg1028<def> = EXTRACT_SUBREG %reg1027<kill>, 1
+ // %reg1029<def> = MOV8rr %reg1028
+ // %reg1029<def> = SHR8ri %reg1029, 7, %EFLAGS<imp-def,dead>
+ // insert => %reg1030<def> = MOV8rr %reg1029
+ // %reg1030<def> = ADD8rr %reg1029<kill>, %reg1028<kill>, %EFLAGS<imp-def,dead>
+
+ if (!isPlainlyKilled(MI, regC, LIS))
+ return false;
+
+ // Ok, we have something like:
+ // %reg1030<def> = ADD8rr %reg1028<kill>, %reg1029<kill>, %EFLAGS<imp-def,dead>
+ // let's see if it's worth commuting it.
+
+ // Look for situations like this:
+ // %reg1024<def> = MOV r1
+ // %reg1025<def> = MOV r0
+ // %reg1026<def> = ADD %reg1024, %reg1025
+ // r0 = MOV %reg1026
+ // Commute the ADD to hopefully eliminate an otherwise unavoidable copy.
+ unsigned ToRegA = getMappedReg(regA, DstRegMap);
+ if (ToRegA) {
+ unsigned FromRegB = getMappedReg(regB, SrcRegMap);
+ unsigned FromRegC = getMappedReg(regC, SrcRegMap);
+ bool CompB = FromRegB && regsAreCompatible(FromRegB, ToRegA, TRI);
+ bool CompC = FromRegC && regsAreCompatible(FromRegC, ToRegA, TRI);
+
+    // Commute if any of the following are true:
+ // -RegB is not tied to a register and RegC is compatible with RegA.
+ // -RegB is tied to the wrong physical register, but RegC is.
+ // -RegB is tied to the wrong physical register, and RegC isn't tied.
+ if ((!FromRegB && CompC) || (FromRegB && !CompB && (!FromRegC || CompC)))
+ return true;
+    // Don't commute if any of the following are true:
+ // -RegC is not tied to a register and RegB is compatible with RegA.
+ // -RegC is tied to the wrong physical register, but RegB is.
+ // -RegC is tied to the wrong physical register, and RegB isn't tied.
+ if ((!FromRegC && CompB) || (FromRegC && !CompC && (!FromRegB || CompB)))
+ return false;
+ }
+
+ // If there is a use of regC between its last def (could be livein) and this
+ // instruction, then bail.
+ unsigned LastDefC = 0;
+ if (!noUseAfterLastDef(regC, Dist, LastDefC))
+ return false;
+
+ // If there is a use of regB between its last def (could be livein) and this
+ // instruction, then go ahead and make this transformation.
+ unsigned LastDefB = 0;
+ if (!noUseAfterLastDef(regB, Dist, LastDefB))
+ return true;
+
+ // Look for situation like this:
+ // %reg101 = MOV %reg100
+ // %reg102 = ...
+ // %reg103 = ADD %reg102, %reg101
+ // ... = %reg103 ...
+ // %reg100 = MOV %reg103
+ // If there is a reversed copy chain from reg101 to reg103, commute the ADD
+ // to eliminate an otherwise unavoidable copy.
+ // FIXME:
+  // We can extend the logic further: If a pair of operands in an insn has
+ // been merged, the insn could be regarded as a virtual copy, and the virtual
+ // copy could also be used to construct a copy chain.
+ // To more generally minimize register copies, ideally the logic of two addr
+ // instruction pass should be integrated with register allocation pass where
+ // interference graph is available.
+ if (isRevCopyChain(regC, regA, 3))
+ return true;
+
+ if (isRevCopyChain(regB, regA, 3))
+ return false;
+
+  // Since there are no intervening uses of either register, commute if the
+  // def of regC is closer: its live interval is shorter.
+ return LastDefB && LastDefC && LastDefC > LastDefB;
+}
+
+/// Commute a two-address instruction and update the basic block, distance map,
+/// and live variables if needed. Return true if it is successful.
+bool TwoAddressInstructionPass::commuteInstruction(MachineInstr *MI,
+ unsigned RegBIdx,
+ unsigned RegCIdx,
+ unsigned Dist) {
+ unsigned RegC = MI->getOperand(RegCIdx).getReg();
+ DEBUG(dbgs() << "2addr: COMMUTING : " << *MI);
+ MachineInstr *NewMI = TII->commuteInstruction(*MI, false, RegBIdx, RegCIdx);
+
+ if (NewMI == nullptr) {
+ DEBUG(dbgs() << "2addr: COMMUTING FAILED!\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "2addr: COMMUTED TO: " << *NewMI);
+ assert(NewMI == MI &&
+ "TargetInstrInfo::commuteInstruction() should not return a new "
+ "instruction unless it was requested.");
+
+ // Update source register map.
+ unsigned FromRegC = getMappedReg(RegC, SrcRegMap);
+ if (FromRegC) {
+ unsigned RegA = MI->getOperand(0).getReg();
+ SrcRegMap[RegA] = FromRegC;
+ }
+
+ return true;
+}
+
+/// Return true if it is profitable to convert the given 2-address instruction
+/// to a 3-address one.
+bool
+TwoAddressInstructionPass::isProfitableToConv3Addr(unsigned RegA,unsigned RegB){
+ // Look for situations like this:
+ // %reg1024<def> = MOV r1
+ // %reg1025<def> = MOV r0
+ // %reg1026<def> = ADD %reg1024, %reg1025
+ // r2 = MOV %reg1026
+ // Turn ADD into a 3-address instruction to avoid a copy.
+ unsigned FromRegB = getMappedReg(RegB, SrcRegMap);
+ if (!FromRegB)
+ return false;
+ unsigned ToRegA = getMappedReg(RegA, DstRegMap);
+ return (ToRegA && !regsAreCompatible(FromRegB, ToRegA, TRI));
+}
+
+/// Convert the specified two-address instruction into a three address one.
+/// Return true if this transformation was successful.
+bool
+TwoAddressInstructionPass::convertInstTo3Addr(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned RegA, unsigned RegB,
+ unsigned Dist) {
+ // FIXME: Why does convertToThreeAddress() need an iterator reference?
+ MachineFunction::iterator MFI = MBB->getIterator();
+ MachineInstr *NewMI = TII->convertToThreeAddress(MFI, *mi, LV);
+ assert(MBB->getIterator() == MFI &&
+ "convertToThreeAddress changed iterator reference");
+ if (!NewMI)
+ return false;
+
+ DEBUG(dbgs() << "2addr: CONVERTING 2-ADDR: " << *mi);
+ DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI);
+ bool Sunk = false;
+
+ if (LIS)
+ LIS->ReplaceMachineInstrInMaps(*mi, *NewMI);
+
+ if (NewMI->findRegisterUseOperand(RegB, false, TRI))
+    // FIXME: Temporary workaround. If the new instruction doesn't
+    // use RegB, convertToThreeAddress must have created more
+    // than one instruction.
+ Sunk = sink3AddrInstruction(NewMI, RegB, mi);
+
+ MBB->erase(mi); // Nuke the old inst.
+
+ if (!Sunk) {
+ DistanceMap.insert(std::make_pair(NewMI, Dist));
+ mi = NewMI;
+ nmi = std::next(mi);
+ }
+
+ // Update source and destination register maps.
+ SrcRegMap.erase(RegA);
+ DstRegMap.erase(RegB);
+ return true;
+}
+
+/// Scan forward through the chain of single uses, updating the maps whenever
+/// the use is a copy or a two-address instruction.
+void
+TwoAddressInstructionPass::scanUses(unsigned DstReg) {
+ SmallVector<unsigned, 4> VirtRegPairs;
+ bool IsDstPhys;
+ bool IsCopy = false;
+ unsigned NewReg = 0;
+ unsigned Reg = DstReg;
+ while (MachineInstr *UseMI = findOnlyInterestingUse(Reg, MBB, MRI, TII,IsCopy,
+ NewReg, IsDstPhys)) {
+ if (IsCopy && !Processed.insert(UseMI).second)
+ break;
+
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI);
+ if (DI != DistanceMap.end())
+      // Earlier in the same MBB. Reached via a back edge.
+ break;
+
+ if (IsDstPhys) {
+ VirtRegPairs.push_back(NewReg);
+ break;
+ }
+ bool isNew = SrcRegMap.insert(std::make_pair(NewReg, Reg)).second;
+ if (!isNew)
+ assert(SrcRegMap[NewReg] == Reg && "Can't map to two src registers!");
+ VirtRegPairs.push_back(NewReg);
+ Reg = NewReg;
+ }
+
+ if (!VirtRegPairs.empty()) {
+ unsigned ToReg = VirtRegPairs.back();
+ VirtRegPairs.pop_back();
+ while (!VirtRegPairs.empty()) {
+ unsigned FromReg = VirtRegPairs.back();
+ VirtRegPairs.pop_back();
+ bool isNew = DstRegMap.insert(std::make_pair(FromReg, ToReg)).second;
+ if (!isNew)
+ assert(DstRegMap[FromReg] == ToReg &&"Can't map to two dst registers!");
+ ToReg = FromReg;
+ }
+ bool isNew = DstRegMap.insert(std::make_pair(DstReg, ToReg)).second;
+ if (!isNew)
+ assert(DstRegMap[DstReg] == ToReg && "Can't map to two dst registers!");
+ }
+}
+
+/// If the specified instruction is not yet processed, process it if it's a
+/// copy. For a copy instruction, we find the physical registers the
+/// source and destination registers might be mapped to. These are kept in
+/// point-to maps used to determine future optimizations. e.g.
+/// v1024 = mov r0
+/// v1025 = mov r1
+/// v1026 = add v1024, v1025
+/// r1 = mov v1026
+/// If 'add' is a two-address instruction, v1024, v1026 are both potentially
+/// coalesced to r0 (from the input side). v1025 is mapped to r1. v1026 is
+/// potentially joined with r1 on the output side. It's worthwhile to commute
+/// 'add' to eliminate a copy.
+void TwoAddressInstructionPass::processCopy(MachineInstr *MI) {
+ if (Processed.count(MI))
+ return;
+
+ bool IsSrcPhys, IsDstPhys;
+ unsigned SrcReg, DstReg;
+ if (!isCopyToReg(*MI, TII, SrcReg, DstReg, IsSrcPhys, IsDstPhys))
+ return;
+
+ if (IsDstPhys && !IsSrcPhys)
+ DstRegMap.insert(std::make_pair(SrcReg, DstReg));
+ else if (!IsDstPhys && IsSrcPhys) {
+ bool isNew = SrcRegMap.insert(std::make_pair(DstReg, SrcReg)).second;
+ if (!isNew)
+ assert(SrcRegMap[DstReg] == SrcReg &&
+ "Can't map to two src physical registers!");
+
+ scanUses(DstReg);
+ }
+
+ Processed.insert(MI);
+}
+
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg',
+/// consider moving the instruction below the kill instruction in order to
+/// eliminate the need for the copy.
+bool TwoAddressInstructionPass::
+rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg) {
+ // Bail immediately if we don't have LV or LIS available. We use them to find
+ // kills efficiently.
+ if (!LV && !LIS)
+ return false;
+
+ MachineInstr *MI = &*mi;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ // Must be created from unfolded load. Don't waste time trying this.
+ return false;
+
+ MachineInstr *KillMI = nullptr;
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ assert(LI.end() != LI.begin() &&
+ "Reg should not have empty live interval.");
+
+ SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+ LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+ if (I != LI.end() && I->start < MBBEndIdx)
+ return false;
+
+ --I;
+ KillMI = LIS->getInstructionFromIndex(I->end);
+ } else {
+ KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ }
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ // Don't mess with copies, they may be coalesced later.
+ return false;
+
+ if (KillMI->hasUnmodeledSideEffects() || KillMI->isCall() ||
+ KillMI->isBranch() || KillMI->isTerminator())
+    // Don't move past calls, etc.
+ return false;
+
+ unsigned DstReg;
+ if (isTwoAddrUse(*KillMI, Reg, DstReg))
+ return false;
+
+ bool SeenStore = true;
+ if (!MI->isSafeToMove(AA, SeenStore))
+ return false;
+
+ if (TII->getInstrLatency(InstrItins, *MI) > 1)
+ // FIXME: Needs more sophisticated heuristics.
+ return false;
+
+ SmallSet<unsigned, 2> Uses;
+ SmallSet<unsigned, 2> Kills;
+ SmallSet<unsigned, 2> Defs;
+ for (const MachineOperand &MO : MI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isDef())
+ Defs.insert(MOReg);
+ else {
+ Uses.insert(MOReg);
+ if (MOReg != Reg && (MO.isKill() ||
+ (LIS && isPlainlyKilled(MI, MOReg, LIS))))
+ Kills.insert(MOReg);
+ }
+ }
+
+ // Move the copies connected to MI down as well.
+ MachineBasicBlock::iterator Begin = MI;
+ MachineBasicBlock::iterator AfterMI = std::next(Begin);
+
+ MachineBasicBlock::iterator End = AfterMI;
+ while (End->isCopy() && Defs.count(End->getOperand(1).getReg())) {
+ Defs.insert(End->getOperand(0).getReg());
+ ++End;
+ }
+
+  // Check that the reschedule will not break dependencies.
+ unsigned NumVisited = 0;
+ MachineBasicBlock::iterator KillPos = KillMI;
+ ++KillPos;
+ for (MachineInstr &OtherMI : llvm::make_range(End, KillPos)) {
+ // DBG_VALUE cannot be counted against the limit.
+ if (OtherMI.isDebugValue())
+ continue;
+ if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
+      // Don't move past calls, etc.
+ return false;
+ for (const MachineOperand &MO : OtherMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isDef()) {
+ if (Uses.count(MOReg))
+ // Physical register use would be clobbered.
+ return false;
+ if (!MO.isDead() && Defs.count(MOReg))
+ // May clobber a physical register def.
+          // FIXME: This may be too conservative. It's ok if the instruction
+          // is sunk completely below the use.
+ return false;
+ } else {
+ if (Defs.count(MOReg))
+ return false;
+ bool isKill =
+ MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS));
+ if (MOReg != Reg &&
+ ((isKill && Uses.count(MOReg)) || Kills.count(MOReg)))
+ // Don't want to extend other live ranges and update kills.
+ return false;
+ if (MOReg == Reg && !isKill)
+ // We can't schedule across a use of the register in question.
+ return false;
+        // Ensure that if this is the register in question, it's the kill we
+        // expect.
+ assert((MOReg != Reg || &OtherMI == KillMI) &&
+ "Found multiple kills of a register in a basic block");
+ }
+ }
+ }
+
+ // Move debug info as well.
+ while (Begin != MBB->begin() && std::prev(Begin)->isDebugValue())
+ --Begin;
+
+ nmi = End;
+ MachineBasicBlock::iterator InsertPos = KillPos;
+ if (LIS) {
+ // We have to move the copies first so that the MBB is still well-formed
+ // when calling handleMove().
+ for (MachineBasicBlock::iterator MBBI = AfterMI; MBBI != End;) {
+ auto CopyMI = MBBI++;
+ MBB->splice(InsertPos, MBB, CopyMI);
+ LIS->handleMove(*CopyMI);
+ InsertPos = CopyMI;
+ }
+ End = std::next(MachineBasicBlock::iterator(MI));
+ }
+
+ // Copies following MI may have been moved as well.
+ MBB->splice(InsertPos, MBB, Begin, End);
+ DistanceMap.erase(DI);
+
+ // Update live variables
+ if (LIS) {
+ LIS->handleMove(*MI);
+ } else {
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
+ }
+
+ DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI);
+ return true;
+}
+
+/// Return true if the re-scheduling will put the given instruction too close
+/// to the defs of its register dependencies.
+bool TwoAddressInstructionPass::isDefTooClose(unsigned Reg, unsigned Dist,
+ MachineInstr *MI) {
+ for (MachineInstr &DefMI : MRI->def_instructions(Reg)) {
+ if (DefMI.getParent() != MBB || DefMI.isCopy() || DefMI.isCopyLike())
+ continue;
+ if (&DefMI == MI)
+ return true; // MI is defining something KillMI uses
+ DenseMap<MachineInstr*, unsigned>::iterator DDI = DistanceMap.find(&DefMI);
+ if (DDI == DistanceMap.end())
+ return true; // Below MI
+ unsigned DefDist = DDI->second;
+ assert(Dist > DefDist && "Visited def already?");
+ if (TII->getInstrLatency(InstrItins, DefMI) > (Dist - DefDist))
+ return true;
+ }
+ return false;
+}
+
+/// If there is one more local instruction that reads 'Reg' and it kills 'Reg',
+/// consider moving the kill instruction above the current two-address
+/// instruction in order to eliminate the need for the copy.
+bool TwoAddressInstructionPass::
+rescheduleKillAboveMI(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned Reg) {
+ // Bail immediately if we don't have LV or LIS available. We use them to find
+ // kills efficiently.
+ if (!LV && !LIS)
+ return false;
+
+ MachineInstr *MI = &*mi;
+ DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI);
+ if (DI == DistanceMap.end())
+ // Must be created from unfolded load. Don't waste time trying this.
+ return false;
+
+ MachineInstr *KillMI = nullptr;
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(Reg);
+ assert(LI.end() != LI.begin() &&
+ "Reg should not have empty live interval.");
+
+ SlotIndex MBBEndIdx = LIS->getMBBEndIdx(MBB).getPrevSlot();
+ LiveInterval::const_iterator I = LI.find(MBBEndIdx);
+ if (I != LI.end() && I->start < MBBEndIdx)
+ return false;
+
+ --I;
+ KillMI = LIS->getInstructionFromIndex(I->end);
+ } else {
+ KillMI = LV->getVarInfo(Reg).findKill(MBB);
+ }
+ if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike())
+ // Don't mess with copies, they may be coalesced later.
+ return false;
+
+ unsigned DstReg;
+ if (isTwoAddrUse(*KillMI, Reg, DstReg))
+ return false;
+
+ bool SeenStore = true;
+ if (!KillMI->isSafeToMove(AA, SeenStore))
+ return false;
+
+ SmallSet<unsigned, 2> Uses;
+ SmallSet<unsigned, 2> Kills;
+ SmallSet<unsigned, 2> Defs;
+ SmallSet<unsigned, 2> LiveDefs;
+ for (const MachineOperand &MO : KillMI->operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (MO.isUse()) {
+ if (!MOReg)
+ continue;
+ if (isDefTooClose(MOReg, DI->second, MI))
+ return false;
+ bool isKill = MO.isKill() || (LIS && isPlainlyKilled(KillMI, MOReg, LIS));
+ if (MOReg == Reg && !isKill)
+ return false;
+ Uses.insert(MOReg);
+ if (isKill && MOReg != Reg)
+ Kills.insert(MOReg);
+ } else if (TargetRegisterInfo::isPhysicalRegister(MOReg)) {
+ Defs.insert(MOReg);
+ if (!MO.isDead())
+ LiveDefs.insert(MOReg);
+ }
+ }
+
+  // Check that rescheduling will not break dependencies.
+ unsigned NumVisited = 0;
+ for (MachineInstr &OtherMI :
+ llvm::make_range(mi, MachineBasicBlock::iterator(KillMI))) {
+ // DBG_VALUE cannot be counted against the limit.
+ if (OtherMI.isDebugValue())
+ continue;
+ if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
+ return false;
+ ++NumVisited;
+ if (OtherMI.hasUnmodeledSideEffects() || OtherMI.isCall() ||
+ OtherMI.isBranch() || OtherMI.isTerminator())
+      // Don't move past calls, etc.
+ return false;
+ SmallVector<unsigned, 2> OtherDefs;
+ for (const MachineOperand &MO : OtherMI.operands()) {
+ if (!MO.isReg())
+ continue;
+ unsigned MOReg = MO.getReg();
+ if (!MOReg)
+ continue;
+ if (MO.isUse()) {
+ if (Defs.count(MOReg))
+ // Moving KillMI can clobber the physical register if the def has
+ // not been seen.
+ return false;
+ if (Kills.count(MOReg))
+ // Don't want to extend other live ranges and update kills.
+ return false;
+ if (&OtherMI != MI && MOReg == Reg &&
+ !(MO.isKill() || (LIS && isPlainlyKilled(&OtherMI, MOReg, LIS))))
+ // We can't schedule across a use of the register in question.
+ return false;
+ } else {
+ OtherDefs.push_back(MOReg);
+ }
+ }
+
+ for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) {
+ unsigned MOReg = OtherDefs[i];
+ if (Uses.count(MOReg))
+ return false;
+ if (TargetRegisterInfo::isPhysicalRegister(MOReg) &&
+ LiveDefs.count(MOReg))
+ return false;
+ // Physical register def is seen.
+ Defs.erase(MOReg);
+ }
+ }
+
+  // Move the old kill above MI; don't forget to move debug info as well.
+ MachineBasicBlock::iterator InsertPos = mi;
+ while (InsertPos != MBB->begin() && std::prev(InsertPos)->isDebugValue())
+ --InsertPos;
+ MachineBasicBlock::iterator From = KillMI;
+ MachineBasicBlock::iterator To = std::next(From);
+ while (std::prev(From)->isDebugValue())
+ --From;
+ MBB->splice(InsertPos, MBB, From, To);
+
+ nmi = std::prev(InsertPos); // Backtrack so we process the moved instr.
+ DistanceMap.erase(DI);
+
+ // Update live variables
+ if (LIS) {
+ LIS->handleMove(*KillMI);
+ } else {
+ LV->removeVirtualRegisterKilled(Reg, *KillMI);
+ LV->addVirtualRegisterKilled(Reg, *MI);
+ }
+
+ DEBUG(dbgs() << "\trescheduled kill: " << *KillMI);
+ return true;
+}
+
+/// Tries to commute the operand 'BaseOpIdx' and some other operand in the
+/// given machine instruction to improve opportunities for coalescing and
+/// elimination of a register to register copy.
+///
+/// 'DstOpIdx' specifies the index of MI def operand.
+/// 'BaseOpKilled' specifies if the register associated with 'BaseOpIdx'
+/// operand is killed by the given instruction.
+/// The 'Dist' argument provides the distance of MI from the start of the
+/// current basic block, and it is used to determine whether it is profitable
+/// to commute operands in the instruction.
+///
+/// Returns true if the transformation happened. Otherwise, returns false.
+bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
+ unsigned DstOpIdx,
+ unsigned BaseOpIdx,
+ bool BaseOpKilled,
+ unsigned Dist) {
+ unsigned DstOpReg = MI->getOperand(DstOpIdx).getReg();
+ unsigned BaseOpReg = MI->getOperand(BaseOpIdx).getReg();
+ unsigned OpsNum = MI->getDesc().getNumOperands();
+ unsigned OtherOpIdx = MI->getDesc().getNumDefs();
+ for (; OtherOpIdx < OpsNum; OtherOpIdx++) {
+ // The call of findCommutedOpIndices below only checks if BaseOpIdx
+ // and OtherOpIdx are commutable, it does not really search for
+ // other commutable operands and does not change the values of passed
+ // variables.
+ if (OtherOpIdx == BaseOpIdx ||
+ !TII->findCommutedOpIndices(*MI, BaseOpIdx, OtherOpIdx))
+ continue;
+
+ unsigned OtherOpReg = MI->getOperand(OtherOpIdx).getReg();
+ bool AggressiveCommute = false;
+
+ // If OtherOp dies but BaseOp does not, swap the OtherOp and BaseOp
+ // operands. This makes the live ranges of DstOp and OtherOp joinable.
+ bool DoCommute =
+ !BaseOpKilled && isKilled(*MI, OtherOpReg, MRI, TII, LIS, false);
+
+ if (!DoCommute &&
+ isProfitableToCommute(DstOpReg, BaseOpReg, OtherOpReg, MI, Dist)) {
+ DoCommute = true;
+ AggressiveCommute = true;
+ }
+
+ // If it's profitable to commute, try to do so.
+ if (DoCommute && commuteInstruction(MI, BaseOpIdx, OtherOpIdx, Dist)) {
+ ++NumCommuted;
+ if (AggressiveCommute)
+ ++NumAggrCommuted;
+ return true;
+ }
+ }
+ return false;
+}
+
+/// For the case where an instruction has a single pair of tied register
+/// operands, attempt some transformations that may either eliminate the tied
+/// operands or improve the opportunities for coalescing away the register copy.
+/// Returns true if no copy needs to be inserted to untie mi's operands
+/// (either because they were untied, or because mi was rescheduled, and will
+/// be visited again later). If the shouldOnlyCommute flag is true, only
+/// instruction commutation is attempted.
+bool TwoAddressInstructionPass::
+tryInstructionTransform(MachineBasicBlock::iterator &mi,
+ MachineBasicBlock::iterator &nmi,
+ unsigned SrcIdx, unsigned DstIdx,
+ unsigned Dist, bool shouldOnlyCommute) {
+ if (OptLevel == CodeGenOpt::None)
+ return false;
+
+ MachineInstr &MI = *mi;
+ unsigned regA = MI.getOperand(DstIdx).getReg();
+ unsigned regB = MI.getOperand(SrcIdx).getReg();
+
+ assert(TargetRegisterInfo::isVirtualRegister(regB) &&
+ "cannot make instruction into two-address form");
+ bool regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
+
+ if (TargetRegisterInfo::isVirtualRegister(regA))
+ scanUses(regA);
+
+ bool Commuted = tryInstructionCommute(&MI, DstIdx, SrcIdx, regBKilled, Dist);
+
+  // If the instruction is convertible to 3-address form, do not return yet;
+  // instead, try the 3-address transformation aggressively and use this
+  // variable to check later, because the result might be better.
+ // For example, we can just use `leal (%rsi,%rdi), %eax` and `ret`
+ // instead of the following code.
+ // addl %esi, %edi
+ // movl %edi, %eax
+ // ret
+ if (Commuted && !MI.isConvertibleTo3Addr())
+ return false;
+
+ if (shouldOnlyCommute)
+ return false;
+
+ // If there is one more use of regB later in the same MBB, consider
+  // re-scheduling this MI below it.
+ if (!Commuted && EnableRescheduling && rescheduleMIBelowKill(mi, nmi, regB)) {
+ ++NumReSchedDowns;
+ return true;
+ }
+
+ // If we commuted, regB may have changed so we should re-sample it to avoid
+ // confusing the three address conversion below.
+ if (Commuted) {
+ regB = MI.getOperand(SrcIdx).getReg();
+ regBKilled = isKilled(MI, regB, MRI, TII, LIS, true);
+ }
+
+ if (MI.isConvertibleTo3Addr()) {
+ // This instruction is potentially convertible to a true
+ // three-address instruction. Check if it is profitable.
+ if (!regBKilled || isProfitableToConv3Addr(regA, regB)) {
+ // Try to convert it.
+ if (convertInstTo3Addr(mi, nmi, regA, regB, Dist)) {
+ ++NumConvertedTo3Addr;
+ return true; // Done with this instruction.
+ }
+ }
+ }
+
+  // Return if the instruction was commuted but 3-address conversion failed.
+ if (Commuted)
+ return false;
+
+ // If there is one more use of regB later in the same MBB, consider
+  // re-scheduling it before this MI if it's legal.
+ if (EnableRescheduling && rescheduleKillAboveMI(mi, nmi, regB)) {
+ ++NumReSchedUps;
+ return true;
+ }
+
+ // If this is an instruction with a load folded into it, try unfolding
+ // the load, e.g. avoid this:
+ // movq %rdx, %rcx
+ // addq (%rax), %rcx
+ // in favor of this:
+ // movq (%rax), %rcx
+ // addq %rdx, %rcx
+  // because it's preferable to schedule a load rather than a register copy.
+ if (MI.mayLoad() && !regBKilled) {
+ // Determine if a load can be unfolded.
+ unsigned LoadRegIndex;
+ unsigned NewOpc =
+ TII->getOpcodeAfterMemoryUnfold(MI.getOpcode(),
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false,
+ &LoadRegIndex);
+ if (NewOpc != 0) {
+ const MCInstrDesc &UnfoldMCID = TII->get(NewOpc);
+ if (UnfoldMCID.getNumDefs() == 1) {
+ // Unfold the load.
+ DEBUG(dbgs() << "2addr: UNFOLDING: " << MI);
+ const TargetRegisterClass *RC =
+ TRI->getAllocatableClass(
+ TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF));
+ unsigned Reg = MRI->createVirtualRegister(RC);
+ SmallVector<MachineInstr *, 2> NewMIs;
+ if (!TII->unfoldMemoryOperand(*MF, MI, Reg,
+ /*UnfoldLoad=*/true,
+ /*UnfoldStore=*/false, NewMIs)) {
+ DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ return false;
+ }
+ assert(NewMIs.size() == 2 &&
+ "Unfolded a load into multiple instructions!");
+ // The load was previously folded, so this is the only use.
+ NewMIs[1]->addRegisterKilled(Reg, TRI);
+
+ // Tentatively insert the instructions into the block so that they
+ // look "normal" to the transformation logic.
+ MBB->insert(mi, NewMIs[0]);
+ MBB->insert(mi, NewMIs[1]);
+
+ DEBUG(dbgs() << "2addr: NEW LOAD: " << *NewMIs[0]
+ << "2addr: NEW INST: " << *NewMIs[1]);
+
+ // Transform the instruction, now that it no longer has a load.
+ unsigned NewDstIdx = NewMIs[1]->findRegisterDefOperandIdx(regA);
+ unsigned NewSrcIdx = NewMIs[1]->findRegisterUseOperandIdx(regB);
+ MachineBasicBlock::iterator NewMI = NewMIs[1];
+ bool TransformResult =
+ tryInstructionTransform(NewMI, mi, NewSrcIdx, NewDstIdx, Dist, true);
+ (void)TransformResult;
+ assert(!TransformResult &&
+ "tryInstructionTransform() should return false.");
+ if (NewMIs[1]->getOperand(NewSrcIdx).isKill()) {
+ // Success, or at least we made an improvement. Keep the unfolded
+ // instructions and discard the original.
+ if (LV) {
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() &&
+ TargetRegisterInfo::isVirtualRegister(MO.getReg())) {
+ if (MO.isUse()) {
+ if (MO.isKill()) {
+ if (NewMIs[0]->killsRegister(MO.getReg()))
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[0]);
+ else {
+ assert(NewMIs[1]->killsRegister(MO.getReg()) &&
+ "Kill missing after load unfold!");
+ LV->replaceKillInstruction(MO.getReg(), MI, *NewMIs[1]);
+ }
+ }
+ } else if (LV->removeVirtualRegisterDead(MO.getReg(), MI)) {
+ if (NewMIs[1]->registerDefIsDead(MO.getReg()))
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[1]);
+ else {
+ assert(NewMIs[0]->registerDefIsDead(MO.getReg()) &&
+ "Dead flag missing after load unfold!");
+ LV->addVirtualRegisterDead(MO.getReg(), *NewMIs[0]);
+ }
+ }
+ }
+ }
+ LV->addVirtualRegisterKilled(Reg, *NewMIs[1]);
+ }
+
+ SmallVector<unsigned, 4> OrigRegs;
+ if (LIS) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg())
+ OrigRegs.push_back(MO.getReg());
+ }
+ }
+
+ MI.eraseFromParent();
+
+ // Update LiveIntervals.
+ if (LIS) {
+ MachineBasicBlock::iterator Begin(NewMIs[0]);
+ MachineBasicBlock::iterator End(NewMIs[1]);
+ LIS->repairIntervalsInRange(MBB, Begin, End, OrigRegs);
+ }
+
+ mi = NewMIs[1];
+ } else {
+ // Transforming didn't eliminate the tie and didn't lead to an
+ // improvement. Clean up the unfolded instructions and keep the
+ // original.
+ DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n");
+ NewMIs[0]->eraseFromParent();
+ NewMIs[1]->eraseFromParent();
+ }
+ }
+ }
+ }
+
+ return false;
+}
+
+// Collect tied operands of MI that need to be handled.
+// Rewrite trivial cases immediately.
+// Return true if any tied operands were found, including the trivial ones.
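+// For example (illustrative), an x86 ADD32rr of the form
+//   %dst = ADD32rr %src1<tied>, %src2
+// ties operand 1 to operand 0, so the pair (1, 0) is recorded for %src1.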
+bool TwoAddressInstructionPass::
+collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) {
+ const MCInstrDesc &MCID = MI->getDesc();
+ bool AnyOps = false;
+ unsigned NumOps = MI->getNumOperands();
+
+ for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) {
+ unsigned DstIdx = 0;
+ if (!MI->isRegTiedToDefOperand(SrcIdx, &DstIdx))
+ continue;
+ AnyOps = true;
+ MachineOperand &SrcMO = MI->getOperand(SrcIdx);
+ MachineOperand &DstMO = MI->getOperand(DstIdx);
+ unsigned SrcReg = SrcMO.getReg();
+ unsigned DstReg = DstMO.getReg();
+ // Tied constraint already satisfied?
+ if (SrcReg == DstReg)
+ continue;
+
+ assert(SrcReg && SrcMO.isUse() && "two address instruction invalid");
+
+ // Deal with <undef> uses immediately - simply rewrite the src operand.
+ if (SrcMO.isUndef() && !DstMO.getSubReg()) {
+ // Constrain the DstReg register class if required.
+ if (TargetRegisterInfo::isVirtualRegister(DstReg))
+ if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx,
+ TRI, *MF))
+ MRI->constrainRegClass(DstReg, RC);
+ SrcMO.setReg(DstReg);
+ SrcMO.setSubReg(0);
+ DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI);
+ continue;
+ }
+ TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx));
+ }
+ return AnyOps;
+}
+
+// Process a list of tied MI operands that all use the same source register.
+// The tied pairs are of the form (SrcIdx, DstIdx).
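+// Schematically (illustrative), a non-trivial pair rewrites
+//   %a = op %b<tied>, ...
+// into
+//   %a = COPY %b
+//   %a = op %a<tied>, ...
+// while transferring kill flags and updating DistanceMap and live ranges.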
+void
+TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI,
+ TiedPairList &TiedPairs,
+ unsigned &Dist) {
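+  // Note whether any tied def is early-clobber: the value defined by each
+  // inserted copy must then be live up to MI's early-clobber slot rather than
+  // its normal register slot.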
+ bool IsEarlyClobber = false;
+ for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+ const MachineOperand &DstMO = MI->getOperand(TiedPairs[tpi].second);
+ IsEarlyClobber |= DstMO.isEarlyClobber();
+ }
+
+ bool RemovedKillFlag = false;
+ bool AllUsesCopied = true;
+ unsigned LastCopiedReg = 0;
+ SlotIndex LastCopyIdx;
+ unsigned RegB = 0;
+ unsigned SubRegB = 0;
+ for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) {
+ unsigned SrcIdx = TiedPairs[tpi].first;
+ unsigned DstIdx = TiedPairs[tpi].second;
+
+ const MachineOperand &DstMO = MI->getOperand(DstIdx);
+ unsigned RegA = DstMO.getReg();
+
+ // Grab RegB from the instruction because it may have changed if the
+ // instruction was commuted.
+ RegB = MI->getOperand(SrcIdx).getReg();
+ SubRegB = MI->getOperand(SrcIdx).getSubReg();
+
+ if (RegA == RegB) {
+ // The register is tied to multiple destinations (or else we would
+ // not have continued this far), but this use of the register
+ // already matches the tied destination. Leave it.
+ AllUsesCopied = false;
+ continue;
+ }
+ LastCopiedReg = RegA;
+
+ assert(TargetRegisterInfo::isVirtualRegister(RegB) &&
+ "cannot make instruction into two-address form");
+
+#ifndef NDEBUG
+ // First, verify that we don't have a use of "a" in the instruction
+ // (a = b + a for example) because our transformation will not
+ // work. This should never occur because we are in SSA form.
+ for (unsigned i = 0; i != MI->getNumOperands(); ++i)
+ assert(i == DstIdx ||
+ !MI->getOperand(i).isReg() ||
+ MI->getOperand(i).getReg() != RegA);
+#endif
+
+ // Emit a copy.
+ MachineInstrBuilder MIB = BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY), RegA);
+ // If this operand is folding a truncation, the truncation now moves to the
+ // copy so that the register classes remain valid for the operands.
+ MIB.addReg(RegB, 0, SubRegB);
+ const TargetRegisterClass *RC = MRI->getRegClass(RegB);
+ if (SubRegB) {
+ if (TargetRegisterInfo::isVirtualRegister(RegA)) {
+ assert(TRI->getMatchingSuperRegClass(RC, MRI->getRegClass(RegA),
+ SubRegB) &&
+ "tied subregister must be a truncation");
+ // The superreg class will not be used to constrain the subreg class.
+ RC = nullptr;
+ }
+ else {
+ assert(TRI->getMatchingSuperReg(RegA, SubRegB, MRI->getRegClass(RegB))
+ && "tied subregister must be a truncation");
+ }
+ }
+
+ // Update DistanceMap.
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ DistanceMap.insert(std::make_pair(&*PrevMI, Dist));
+ DistanceMap[MI] = ++Dist;
+
+ if (LIS) {
+ LastCopyIdx = LIS->InsertMachineInstrInMaps(*PrevMI).getRegSlot();
+
+ if (TargetRegisterInfo::isVirtualRegister(RegA)) {
+ LiveInterval &LI = LIS->getInterval(RegA);
+ VNInfo *VNI = LI.getNextValue(LastCopyIdx, LIS->getVNInfoAllocator());
+ SlotIndex endIdx =
+ LIS->getInstructionIndex(*MI).getRegSlot(IsEarlyClobber);
+ LI.addSegment(LiveInterval::Segment(LastCopyIdx, endIdx, VNI));
+ }
+ }
+
+ DEBUG(dbgs() << "\t\tprepend:\t" << *MIB);
+
+ MachineOperand &MO = MI->getOperand(SrcIdx);
+ assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() &&
+ "inconsistent operand info for 2-reg pass");
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+
+    // Make sure regA is in a legal regclass for the SrcIdx operand.
+ if (TargetRegisterInfo::isVirtualRegister(RegA) &&
+ TargetRegisterInfo::isVirtualRegister(RegB))
+ MRI->constrainRegClass(RegA, RC);
+ MO.setReg(RegA);
+ // The getMatchingSuper asserts guarantee that the register class projected
+ // by SubRegB is compatible with RegA with no subregister. So regardless of
+    // whether the dest operand writes a subreg, the source operand should not.
+ MO.setSubReg(0);
+
+ // Propagate SrcRegMap.
+ SrcRegMap[RegA] = RegB;
+ }
+
+ if (AllUsesCopied) {
+ if (!IsEarlyClobber) {
+ // Replace other (un-tied) uses of regB with LastCopiedReg.
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.getReg() == RegB && MO.getSubReg() == SubRegB &&
+ MO.isUse()) {
+ if (MO.isKill()) {
+ MO.setIsKill(false);
+ RemovedKillFlag = true;
+ }
+ MO.setReg(LastCopiedReg);
+ MO.setSubReg(0);
+ }
+ }
+ }
+
+ // Update live variables for regB.
+ if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(*MI)) {
+ MachineBasicBlock::iterator PrevMI = MI;
+ --PrevMI;
+ LV->addVirtualRegisterKilled(RegB, *PrevMI);
+ }
+
+ // Update LiveIntervals.
+ if (LIS) {
+ LiveInterval &LI = LIS->getInterval(RegB);
+ SlotIndex MIIdx = LIS->getInstructionIndex(*MI);
+ LiveInterval::const_iterator I = LI.find(MIIdx);
+ assert(I != LI.end() && "RegB must be live-in to use.");
+
+ SlotIndex UseIdx = MIIdx.getRegSlot(IsEarlyClobber);
+ if (I->end == UseIdx)
+ LI.removeSegment(LastCopyIdx, UseIdx);
+ }
+
+ } else if (RemovedKillFlag) {
+ // Some tied uses of regB matched their destination registers, so
+ // regB is still used in this instruction, but a kill flag was
+ // removed from a different tied use of regB, so now we need to add
+ // a kill flag to one of the remaining uses of regB.
+ for (MachineOperand &MO : MI->operands()) {
+ if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) {
+ MO.setIsKill(true);
+ break;
+ }
+ }
+ }
+}
+
+/// Reduce two-address instructions to two operands.
+bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
+ MF = &Func;
+ const TargetMachine &TM = MF->getTarget();
+ MRI = &MF->getRegInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ InstrItins = MF->getSubtarget().getInstrItineraryData();
+ LV = getAnalysisIfAvailable<LiveVariables>();
+ LIS = getAnalysisIfAvailable<LiveIntervals>();
+ AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+ OptLevel = TM.getOptLevel();
+
+ bool MadeChange = false;
+
+ DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n");
+ DEBUG(dbgs() << "********** Function: "
+ << MF->getName() << '\n');
+
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
+ TiedOperandMap TiedOperands;
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ MBB = &*MBBI;
+ unsigned Dist = 0;
+ DistanceMap.clear();
+ SrcRegMap.clear();
+ DstRegMap.clear();
+ Processed.clear();
+ for (MachineBasicBlock::iterator mi = MBB->begin(), me = MBB->end();
+ mi != me; ) {
+ MachineBasicBlock::iterator nmi = std::next(mi);
+ if (mi->isDebugValue()) {
+ mi = nmi;
+ continue;
+ }
+
+ // Expand REG_SEQUENCE instructions. This will position mi at the first
+ // expanded instruction.
+ if (mi->isRegSequence())
+ eliminateRegSequence(mi);
+
+ DistanceMap.insert(std::make_pair(&*mi, ++Dist));
+
+ processCopy(&*mi);
+
+ // First scan through all the tied register uses in this instruction
+ // and record a list of pairs of tied operands for each register.
+ if (!collectTiedOperands(&*mi, TiedOperands)) {
+ mi = nmi;
+ continue;
+ }
+
+ ++NumTwoAddressInstrs;
+ MadeChange = true;
+ DEBUG(dbgs() << '\t' << *mi);
+
+ // If the instruction has a single pair of tied operands, try some
+ // transformations that may either eliminate the tied operands or
+ // improve the opportunities for coalescing away the register copy.
+ if (TiedOperands.size() == 1) {
+ SmallVectorImpl<std::pair<unsigned, unsigned> > &TiedPairs
+ = TiedOperands.begin()->second;
+ if (TiedPairs.size() == 1) {
+ unsigned SrcIdx = TiedPairs[0].first;
+ unsigned DstIdx = TiedPairs[0].second;
+ unsigned SrcReg = mi->getOperand(SrcIdx).getReg();
+ unsigned DstReg = mi->getOperand(DstIdx).getReg();
+ if (SrcReg != DstReg &&
+ tryInstructionTransform(mi, nmi, SrcIdx, DstIdx, Dist, false)) {
+ // The tied operands have been eliminated or shifted further down
+ // the block to ease elimination. Continue processing with 'nmi'.
+ TiedOperands.clear();
+ mi = nmi;
+ continue;
+ }
+ }
+ }
+
+ // Now iterate over the information collected above.
+ for (auto &TO : TiedOperands) {
+ processTiedPairs(&*mi, TO.second, Dist);
+ DEBUG(dbgs() << "\t\trewrite to:\t" << *mi);
+ }
+
+ // Rewrite INSERT_SUBREG as COPY now that we no longer need SSA form.
+ if (mi->isInsertSubreg()) {
+ // From %reg = INSERT_SUBREG %reg, %subreg, subidx
+ // To %reg:subidx = COPY %subreg
+ unsigned SubIdx = mi->getOperand(3).getImm();
+ mi->RemoveOperand(3);
+ assert(mi->getOperand(0).getSubReg() == 0 && "Unexpected subreg idx");
+ mi->getOperand(0).setSubReg(SubIdx);
+ mi->getOperand(0).setIsUndef(mi->getOperand(1).isUndef());
+ mi->RemoveOperand(1);
+ mi->setDesc(TII->get(TargetOpcode::COPY));
+ DEBUG(dbgs() << "\t\tconvert to:\t" << *mi);
+ }
+
+ // Clear TiedOperands here instead of at the top of the loop
+ // since most instructions do not have tied operands.
+ TiedOperands.clear();
+ mi = nmi;
+ }
+ }
+
+ if (LIS)
+ MF->verify(this, "After two-address instruction pass");
+
+ return MadeChange;
+}
+
+/// Eliminate a REG_SEQUENCE instruction as part of the de-ssa process.
+///
+/// The instruction is turned into a sequence of sub-register copies:
+///
+/// %dst = REG_SEQUENCE %v1, ssub0, %v2, ssub1
+///
+/// Becomes:
+///
+/// %dst:ssub0<def,undef> = COPY %v1
+/// %dst:ssub1<def> = COPY %v2
+///
+void TwoAddressInstructionPass::
+eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
+ MachineInstr &MI = *MBBI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+ if (MI.getOperand(0).getSubReg() ||
+ TargetRegisterInfo::isPhysicalRegister(DstReg) ||
+ !(MI.getNumOperands() & 1)) {
+ DEBUG(dbgs() << "Illegal REG_SEQUENCE instruction:" << MI);
+ llvm_unreachable(nullptr);
+ }
+
+ SmallVector<unsigned, 4> OrigRegs;
+ if (LIS) {
+ OrigRegs.push_back(MI.getOperand(0).getReg());
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
+ OrigRegs.push_back(MI.getOperand(i).getReg());
+ }
+
+ bool DefEmitted = false;
+ for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2) {
+ MachineOperand &UseMO = MI.getOperand(i);
+ unsigned SrcReg = UseMO.getReg();
+ unsigned SubIdx = MI.getOperand(i+1).getImm();
+ // Nothing needs to be inserted for <undef> operands.
+ if (UseMO.isUndef())
+ continue;
+
+ // Defer any kill flag to the last operand using SrcReg. Otherwise, we
+    // might insert a COPY that uses SrcReg after it was killed.
+ bool isKill = UseMO.isKill();
+ if (isKill)
+ for (unsigned j = i + 2; j < e; j += 2)
+ if (MI.getOperand(j).getReg() == SrcReg) {
+ MI.getOperand(j).setIsKill();
+ UseMO.setIsKill(false);
+ isKill = false;
+ break;
+ }
+
+ // Insert the sub-register copy.
+ MachineInstr *CopyMI = BuildMI(*MI.getParent(), MI, MI.getDebugLoc(),
+ TII->get(TargetOpcode::COPY))
+ .addReg(DstReg, RegState::Define, SubIdx)
+ .addOperand(UseMO);
+
+ // The first def needs an <undef> flag because there is no live register
+ // before it.
+ if (!DefEmitted) {
+ CopyMI->getOperand(0).setIsUndef(true);
+ // Return an iterator pointing to the first inserted instr.
+ MBBI = CopyMI;
+ }
+ DefEmitted = true;
+
+ // Update LiveVariables' kill info.
+ if (LV && isKill && !TargetRegisterInfo::isPhysicalRegister(SrcReg))
+ LV->replaceKillInstruction(SrcReg, MI, *CopyMI);
+
+ DEBUG(dbgs() << "Inserted: " << *CopyMI);
+ }
+
+ MachineBasicBlock::iterator EndMBBI =
+ std::next(MachineBasicBlock::iterator(MI));
+
+ if (!DefEmitted) {
+ DEBUG(dbgs() << "Turned: " << MI << " into an IMPLICIT_DEF");
+ MI.setDesc(TII->get(TargetOpcode::IMPLICIT_DEF));
+ for (int j = MI.getNumOperands() - 1, ee = 0; j > ee; --j)
+ MI.RemoveOperand(j);
+ } else {
+ DEBUG(dbgs() << "Eliminated: " << MI);
+ MI.eraseFromParent();
+ }
+
+  // Update LiveIntervals.
+ if (LIS)
+ LIS->repairIntervalsInRange(MBB, MBBI, EndMBBI, OrigRegs);
+}
diff --git a/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
new file mode 100644
index 000000000000..501e01c45a8b
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -0,0 +1,220 @@
+//===-- UnreachableBlockElim.cpp - Remove unreachable blocks for codegen --===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is an extremely simple version of the SimplifyCFG pass. Its sole
+// job is to delete LLVM basic blocks that are not reachable from the entry
+// node. To do this, it performs a simple depth first traversal of the CFG,
+// then deletes any unvisited nodes.
+//
+// Note that this pass is really a hack. In particular, the instruction
+// selectors for various targets should just not generate code for unreachable
+// blocks. Until LLVM has a more systematic way of defining instruction
+// selectors, however, we cannot really expect them to handle additional
+// complexity.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/UnreachableBlockElim.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetInstrInfo.h"
+using namespace llvm;
+
+static bool eliminateUnreachableBlock(Function &F) {
+ SmallPtrSet<BasicBlock*, 8> Reachable;
+
+ // Mark all reachable blocks.
+ for (BasicBlock *BB : depth_first_ext(&F, Reachable))
+ (void)BB/* Mark all reachable blocks */;
+
+ // Loop over all dead blocks, remembering them and deleting all instructions
+ // in them.
+ std::vector<BasicBlock*> DeadBlocks;
+ for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+ if (!Reachable.count(&*I)) {
+ BasicBlock *BB = &*I;
+ DeadBlocks.push_back(BB);
+ while (PHINode *PN = dyn_cast<PHINode>(BB->begin())) {
+ PN->replaceAllUsesWith(Constant::getNullValue(PN->getType()));
+ BB->getInstList().pop_front();
+ }
+ for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+ (*SI)->removePredecessor(BB);
+ BB->dropAllReferences();
+ }
+
+ // Actually remove the blocks now.
+ for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
+ DeadBlocks[i]->eraseFromParent();
+ }
+
+ return !DeadBlocks.empty();
+}
+
+namespace {
+class UnreachableBlockElimLegacyPass : public FunctionPass {
+ bool runOnFunction(Function &F) override {
+ return eliminateUnreachableBlock(F);
+ }
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableBlockElimLegacyPass() : FunctionPass(ID) {
+ initializeUnreachableBlockElimLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+};
+}
+char UnreachableBlockElimLegacyPass::ID = 0;
+INITIALIZE_PASS(UnreachableBlockElimLegacyPass, "unreachableblockelim",
+ "Remove unreachable blocks from the CFG", false, false)
+
+FunctionPass *llvm::createUnreachableBlockEliminationPass() {
+ return new UnreachableBlockElimLegacyPass();
+}
+
+PreservedAnalyses UnreachableBlockElimPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ bool Changed = eliminateUnreachableBlock(F);
+ if (!Changed)
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserve<DominatorTreeAnalysis>();
+ return PA;
+}
+
+namespace {
+ class UnreachableMachineBlockElim : public MachineFunctionPass {
+ bool runOnMachineFunction(MachineFunction &F) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ MachineModuleInfo *MMI;
+ public:
+ static char ID; // Pass identification, replacement for typeid
+ UnreachableMachineBlockElim() : MachineFunctionPass(ID) {}
+ };
+}
+char UnreachableMachineBlockElim::ID = 0;
+
+INITIALIZE_PASS(UnreachableMachineBlockElim, "unreachable-mbb-elimination",
+ "Remove unreachable machine basic blocks", false, false)
+
+char &llvm::UnreachableMachineBlockElimID = UnreachableMachineBlockElim::ID;
+
+void UnreachableMachineBlockElim::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreserved<MachineLoopInfo>();
+ AU.addPreserved<MachineDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
+ SmallPtrSet<MachineBasicBlock*, 8> Reachable;
+ bool ModifiedPHI = false;
+
+ MMI = getAnalysisIfAvailable<MachineModuleInfo>();
+ MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+
+ // Mark all reachable blocks.
+ for (MachineBasicBlock *BB : depth_first_ext(&F, Reachable))
+ (void)BB/* Mark all reachable blocks */;
+
+ // Loop over all dead blocks, remembering them and deleting all instructions
+ // in them.
+ std::vector<MachineBasicBlock*> DeadBlocks;
+ for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ MachineBasicBlock *BB = &*I;
+
+ // Test for deadness.
+ if (!Reachable.count(BB)) {
+ DeadBlocks.push_back(BB);
+
+ // Update dominator and loop info.
+ if (MLI) MLI->removeBlock(BB);
+ if (MDT && MDT->getNode(BB)) MDT->eraseNode(BB);
+
+ while (BB->succ_begin() != BB->succ_end()) {
+ MachineBasicBlock* succ = *BB->succ_begin();
+
+ MachineBasicBlock::iterator start = succ->begin();
+ while (start != succ->end() && start->isPHI()) {
+ for (unsigned i = start->getNumOperands() - 1; i >= 2; i-=2)
+ if (start->getOperand(i).isMBB() &&
+ start->getOperand(i).getMBB() == BB) {
+ start->RemoveOperand(i);
+ start->RemoveOperand(i-1);
+ }
+
+ start++;
+ }
+
+ BB->removeSuccessor(BB->succ_begin());
+ }
+ }
+ }
+
+ // Actually remove the blocks now.
+ for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i)
+ DeadBlocks[i]->eraseFromParent();
+
+ // Cleanup PHI nodes.
+ for (MachineFunction::iterator I = F.begin(), E = F.end(); I != E; ++I) {
+ MachineBasicBlock *BB = &*I;
+ // Prune unneeded PHI entries.
+ SmallPtrSet<MachineBasicBlock*, 8> preds(BB->pred_begin(),
+ BB->pred_end());
+ MachineBasicBlock::iterator phi = BB->begin();
+ while (phi != BB->end() && phi->isPHI()) {
+ for (unsigned i = phi->getNumOperands() - 1; i >= 2; i-=2)
+ if (!preds.count(phi->getOperand(i).getMBB())) {
+ phi->RemoveOperand(i);
+ phi->RemoveOperand(i-1);
+ ModifiedPHI = true;
+ }
+
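+      // A PHI reduced to a single (value, block) pair is just a copy; replace
+      // the output register with its single input and delete the PHI.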
+ if (phi->getNumOperands() == 3) {
+ unsigned Input = phi->getOperand(1).getReg();
+ unsigned Output = phi->getOperand(0).getReg();
+
+ phi++->eraseFromParent();
+ ModifiedPHI = true;
+
+ if (Input != Output) {
+ MachineRegisterInfo &MRI = F.getRegInfo();
+ MRI.constrainRegClass(Input, MRI.getRegClass(Output));
+ MRI.replaceRegWith(Output, Input);
+ }
+
+ continue;
+ }
+
+ ++phi;
+ }
+ }
+
+ F.RenumberBlocks();
+
+ return (!DeadBlocks.empty() || ModifiedPHI);
+}
diff --git a/contrib/llvm/lib/CodeGen/VirtRegMap.cpp b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
new file mode 100644
index 000000000000..8a3a0328870d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -0,0 +1,467 @@
+//===-- llvm/CodeGen/VirtRegMap.cpp - Virtual Register Map ----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VirtRegMap class.
+//
+// It also contains implementations of the Spiller interface, which, given a
+// virtual register map and a machine function, eliminates all virtual
+// references by replacing them with physical register references - adding spill
+// code as necessary.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "LiveDebugVariables.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+STATISTIC(NumSpillSlots, "Number of spill slots allocated");
+STATISTIC(NumIdCopies, "Number of identity moves eliminated after rewriting");
+
+//===----------------------------------------------------------------------===//
+// VirtRegMap implementation
+//===----------------------------------------------------------------------===//
+
+char VirtRegMap::ID = 0;
+
+INITIALIZE_PASS(VirtRegMap, "virtregmap", "Virtual Register Map", false, false)
+
+bool VirtRegMap::runOnMachineFunction(MachineFunction &mf) {
+ MRI = &mf.getRegInfo();
+ TII = mf.getSubtarget().getInstrInfo();
+ TRI = mf.getSubtarget().getRegisterInfo();
+ MF = &mf;
+
+ Virt2PhysMap.clear();
+ Virt2StackSlotMap.clear();
+ Virt2SplitMap.clear();
+
+ grow();
+ return false;
+}
+
+void VirtRegMap::grow() {
+ unsigned NumRegs = MF->getRegInfo().getNumVirtRegs();
+ Virt2PhysMap.resize(NumRegs);
+ Virt2StackSlotMap.resize(NumRegs);
+ Virt2SplitMap.resize(NumRegs);
+}
+
+unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
+ int SS = MF->getFrameInfo()->CreateSpillStackObject(RC->getSize(),
+ RC->getAlignment());
+ ++NumSpillSlots;
+ return SS;
+}
+
+bool VirtRegMap::hasPreferredPhys(unsigned VirtReg) {
+ unsigned Hint = MRI->getSimpleHint(VirtReg);
+ if (!Hint)
+ return false;
+ if (TargetRegisterInfo::isVirtualRegister(Hint))
+ Hint = getPhys(Hint);
+ return getPhys(VirtReg) == Hint;
+}
+
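+// Returns true if VirtReg's allocation hint resolves to a concrete register:
+// either a physical register hint, or a virtual register hint that already
+// has a physical assignment.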
+bool VirtRegMap::hasKnownPreference(unsigned VirtReg) {
+ std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(VirtReg);
+ if (TargetRegisterInfo::isPhysicalRegister(Hint.second))
+ return true;
+ if (TargetRegisterInfo::isVirtualRegister(Hint.second))
+ return hasPhys(Hint.second);
+ return false;
+}
+
+int VirtRegMap::assignVirt2StackSlot(unsigned virtReg) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ const TargetRegisterClass* RC = MF->getRegInfo().getRegClass(virtReg);
+ return Virt2StackSlotMap[virtReg] = createSpillSlot(RC);
+}
+
+void VirtRegMap::assignVirt2StackSlot(unsigned virtReg, int SS) {
+ assert(TargetRegisterInfo::isVirtualRegister(virtReg));
+ assert(Virt2StackSlotMap[virtReg] == NO_STACK_SLOT &&
+ "attempt to assign stack slot to already spilled register");
+ assert((SS >= 0 ||
+ (SS >= MF->getFrameInfo()->getObjectIndexBegin())) &&
+ "illegal fixed frame index");
+ Virt2StackSlotMap[virtReg] = SS;
+}
+
+void VirtRegMap::print(raw_ostream &OS, const Module*) const {
+ OS << "********** REGISTER MAP **********\n";
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (Virt2PhysMap[Reg] != (unsigned)VirtRegMap::NO_PHYS_REG) {
+ OS << '[' << PrintReg(Reg, TRI) << " -> "
+ << PrintReg(Virt2PhysMap[Reg], TRI) << "] "
+ << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
+ }
+ }
+
+ for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
+ if (Virt2StackSlotMap[Reg] != VirtRegMap::NO_STACK_SLOT) {
+ OS << '[' << PrintReg(Reg, TRI) << " -> fi#" << Virt2StackSlotMap[Reg]
+ << "] " << TRI->getRegClassName(MRI->getRegClass(Reg)) << "\n";
+ }
+ }
+ OS << '\n';
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void VirtRegMap::dump() const {
+ print(dbgs());
+}
+#endif
+
+//===----------------------------------------------------------------------===//
+// VirtRegRewriter
+//===----------------------------------------------------------------------===//
+//
+// The VirtRegRewriter is the last of the register allocator passes.
+// It rewrites virtual registers to physical registers as specified in the
+// VirtRegMap analysis. It also updates live-in information on basic blocks
+// according to LiveIntervals.
+//
+namespace {
+class VirtRegRewriter : public MachineFunctionPass {
+ MachineFunction *MF;
+ const TargetMachine *TM;
+ const TargetRegisterInfo *TRI;
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+ SlotIndexes *Indexes;
+ LiveIntervals *LIS;
+ VirtRegMap *VRM;
+
+ void rewrite();
+ void addMBBLiveIns();
+ bool readsUndefSubreg(const MachineOperand &MO) const;
+ void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
+ void handleIdentityCopy(MachineInstr &MI) const;
+
+public:
+ static char ID;
+ VirtRegRewriter() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ bool runOnMachineFunction(MachineFunction&) override;
+ MachineFunctionProperties getSetProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::AllVRegsAllocated);
+ }
+};
+} // end anonymous namespace
+
+char &llvm::VirtRegRewriterID = VirtRegRewriter::ID;
+
+INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
+ "Virtual Register Rewriter", false, false)
+
+char VirtRegRewriter::ID = 0;
+
+void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ AU.addRequired<LiveIntervals>();
+ AU.addRequired<SlotIndexes>();
+ AU.addPreserved<SlotIndexes>();
+ AU.addRequired<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
+ AU.addRequired<VirtRegMap>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool VirtRegRewriter::runOnMachineFunction(MachineFunction &fn) {
+ MF = &fn;
+ TM = &MF->getTarget();
+ TRI = MF->getSubtarget().getRegisterInfo();
+ TII = MF->getSubtarget().getInstrInfo();
+ MRI = &MF->getRegInfo();
+ Indexes = &getAnalysis<SlotIndexes>();
+ LIS = &getAnalysis<LiveIntervals>();
+ VRM = &getAnalysis<VirtRegMap>();
+ DEBUG(dbgs() << "********** REWRITE VIRTUAL REGISTERS **********\n"
+ << "********** Function: "
+ << MF->getName() << '\n');
+ DEBUG(VRM->dump());
+
+ // Add kill flags while we still have virtual registers.
+ LIS->addKillFlags(VRM);
+
+ // Live-in lists on basic blocks are required for physregs.
+ addMBBLiveIns();
+
+ // Rewrite virtual registers.
+ rewrite();
+
+ // Write out new DBG_VALUE instructions.
+ getAnalysis<LiveDebugVariables>().emitDebugValues(VRM);
+
+ // All machine operands and other references to virtual registers have been
+ // replaced. Remove the virtual registers and release all the transient data.
+ VRM->clearAllVirt();
+ MRI->clearVirtRegs();
+ return true;
+}
+
+void VirtRegRewriter::addLiveInsForSubRanges(const LiveInterval &LI,
+ unsigned PhysReg) const {
+ assert(!LI.empty());
+ assert(LI.hasSubRanges());
+
+ typedef std::pair<const LiveInterval::SubRange *,
+ LiveInterval::const_iterator> SubRangeIteratorPair;
+ SmallVector<SubRangeIteratorPair, 4> SubRanges;
+ SlotIndex First;
+ SlotIndex Last;
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ SubRanges.push_back(std::make_pair(&SR, SR.begin()));
+ if (!First.isValid() || SR.segments.front().start < First)
+ First = SR.segments.front().start;
+ if (!Last.isValid() || SR.segments.back().end > Last)
+ Last = SR.segments.back().end;
+ }
+
+  // Check all MBB start positions between First and Last while
+  // simultaneously advancing an iterator for each subrange.
+ for (SlotIndexes::MBBIndexIterator MBBI = Indexes->findMBBIndex(First);
+ MBBI != Indexes->MBBIndexEnd() && MBBI->first <= Last; ++MBBI) {
+ SlotIndex MBBBegin = MBBI->first;
+ // Advance all subrange iterators so that their end position is just
+ // behind MBBBegin (or the iterator is at the end).
+ LaneBitmask LaneMask = 0;
+ for (auto &RangeIterPair : SubRanges) {
+ const LiveInterval::SubRange *SR = RangeIterPair.first;
+ LiveInterval::const_iterator &SRI = RangeIterPair.second;
+ while (SRI != SR->end() && SRI->end <= MBBBegin)
+ ++SRI;
+ if (SRI == SR->end())
+ continue;
+ if (SRI->start <= MBBBegin)
+ LaneMask |= SR->LaneMask;
+ }
+ if (LaneMask == 0)
+ continue;
+ MachineBasicBlock *MBB = MBBI->second;
+ MBB->addLiveIn(PhysReg, LaneMask);
+ }
+}
+
+// Compute MBB live-in lists from virtual register live ranges and their
+// assignments.
+void VirtRegRewriter::addMBBLiveIns() {
+ for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+ unsigned VirtReg = TargetRegisterInfo::index2VirtReg(Idx);
+ if (MRI->reg_nodbg_empty(VirtReg))
+ continue;
+ LiveInterval &LI = LIS->getInterval(VirtReg);
+ if (LI.empty() || LIS->intervalIsInOneMBB(LI))
+ continue;
+ // This is a virtual register that is live across basic blocks. Its
+ // assigned PhysReg must be marked as live-in to those blocks.
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG && "Unmapped virtual register.");
+
+ if (LI.hasSubRanges()) {
+ addLiveInsForSubRanges(LI, PhysReg);
+ } else {
+ // Go over MBB begin positions and see if we have segments covering them.
+ // The following works because segments and the MBBIndex list are both
+ // sorted by slot indexes.
+ SlotIndexes::MBBIndexIterator I = Indexes->MBBIndexBegin();
+ for (const auto &Seg : LI) {
+ I = Indexes->advanceMBBIndex(I, Seg.start);
+ for (; I != Indexes->MBBIndexEnd() && I->first < Seg.end; ++I) {
+ MachineBasicBlock *MBB = I->second;
+ MBB->addLiveIn(PhysReg);
+ }
+ }
+ }
+ }
+
+ // Sort and unique MBB LiveIns as we've not checked if SubReg/PhysReg were in
+ // each MBB's LiveIns set before calling addLiveIn on them.
+ for (MachineBasicBlock &MBB : *MF)
+ MBB.sortUniqueLiveIns();
+}
+
+/// Returns true if the given machine operand \p MO only reads undefined lanes.
+/// The function only works for use operands with a subregister set.
+bool VirtRegRewriter::readsUndefSubreg(const MachineOperand &MO) const {
+ // Shortcut if the operand is already marked undef.
+ if (MO.isUndef())
+ return true;
+
+ unsigned Reg = MO.getReg();
+ const LiveInterval &LI = LIS->getInterval(Reg);
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex BaseIndex = LIS->getInstructionIndex(MI);
+ // This code is only meant to handle reading undefined subregisters which
+ // we couldn't properly detect before.
+ assert(LI.liveAt(BaseIndex) &&
+ "Reads of completely dead register should be marked undef already");
+ unsigned SubRegIdx = MO.getSubReg();
+ LaneBitmask UseMask = TRI->getSubRegIndexLaneMask(SubRegIdx);
+  // See if any of the relevant subregister live ranges is defined at this point.
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ if ((SR.LaneMask & UseMask) != 0 && SR.liveAt(BaseIndex))
+ return false;
+ }
+ return true;
+}
+
+void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
+ if (!MI.isIdentityCopy())
+ return;
+ DEBUG(dbgs() << "Identity copy: " << MI);
+ ++NumIdCopies;
+
+ // Copies like:
+ // %R0 = COPY %R0<undef>
+ // %AL = COPY %AL, %EAX<imp-def>
+ // give us additional liveness information: The target (super-)register
+ // must not be valid before this point. Replace the COPY with a KILL
+ // instruction to maintain this information.
+ if (MI.getOperand(0).isUndef() || MI.getNumOperands() > 2) {
+ MI.setDesc(TII->get(TargetOpcode::KILL));
+ DEBUG(dbgs() << " replace by: " << MI);
+ return;
+ }
+
+ if (Indexes)
+ Indexes->removeMachineInstrFromMaps(MI);
+ MI.eraseFromParent();
+ DEBUG(dbgs() << " deleted.\n");
+}
+
+void VirtRegRewriter::rewrite() {
+ bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
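+  // Physical super-registers for which implicit kill/dead/def operands must be
+  // added once all operands of the current instruction have been rewritten.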
+ SmallVector<unsigned, 8> SuperDeads;
+ SmallVector<unsigned, 8> SuperDefs;
+ SmallVector<unsigned, 8> SuperKills;
+
+ for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
+ MBBI != MBBE; ++MBBI) {
+ DEBUG(MBBI->print(dbgs(), Indexes));
+ for (MachineBasicBlock::instr_iterator
+ MII = MBBI->instr_begin(), MIE = MBBI->instr_end(); MII != MIE;) {
+ MachineInstr *MI = &*MII;
+ ++MII;
+
+ for (MachineInstr::mop_iterator MOI = MI->operands_begin(),
+ MOE = MI->operands_end(); MOI != MOE; ++MOI) {
+ MachineOperand &MO = *MOI;
+
+ // Make sure MRI knows about registers clobbered by regmasks.
+ if (MO.isRegMask())
+ MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+
+ if (!MO.isReg() || !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+ continue;
+ unsigned VirtReg = MO.getReg();
+ unsigned PhysReg = VRM->getPhys(VirtReg);
+ assert(PhysReg != VirtRegMap::NO_PHYS_REG &&
+ "Instruction uses unmapped VirtReg");
+ assert(!MRI->isReserved(PhysReg) && "Reserved register assignment");
+
+ // Preserve semantics of sub-register operands.
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ if (NoSubRegLiveness) {
+ // A virtual register kill refers to the whole register, so we may
+ // have to add <imp-use,kill> operands for the super-register. A
+ // partial redef always kills and redefines the super-register.
+ if (MO.readsReg() && (MO.isDef() || MO.isKill()))
+ SuperKills.push_back(PhysReg);
+
+ if (MO.isDef()) {
+ // Also add implicit defs for the super-register.
+ if (MO.isDead())
+ SuperDeads.push_back(PhysReg);
+ else
+ SuperDefs.push_back(PhysReg);
+ }
+ } else {
+ if (MO.isUse()) {
+ if (readsUndefSubreg(MO))
+ // We need to add an <undef> flag if the subregister is
+ // completely undefined (and we are not adding super-register
+ // defs).
+ MO.setIsUndef(true);
+ } else if (!MO.isDead()) {
+ assert(MO.isDef());
+ }
+ }
+
+ // The <def,undef> flag only makes sense for sub-register defs, and
+ // we are substituting a full physreg. An <imp-use,kill> operand
+ // from the SuperKills list will represent the partial read of the
+ // super-register.
+ if (MO.isDef())
+ MO.setIsUndef(false);
+
+ // PhysReg operands cannot have subregister indexes.
+ PhysReg = TRI->getSubReg(PhysReg, SubReg);
+ assert(PhysReg && "Invalid SubReg for physical register");
+ MO.setSubReg(0);
+ }
+ // Rewrite. Note we could have used MachineOperand::substPhysReg(), but
+ // we need the inlining here.
+ MO.setReg(PhysReg);
+ }
+
+ // Add any missing super-register kills after rewriting the whole
+ // instruction.
+ while (!SuperKills.empty())
+ MI->addRegisterKilled(SuperKills.pop_back_val(), TRI, true);
+
+ while (!SuperDeads.empty())
+ MI->addRegisterDead(SuperDeads.pop_back_val(), TRI, true);
+
+ while (!SuperDefs.empty())
+ MI->addRegisterDefined(SuperDefs.pop_back_val(), TRI);
+
+ DEBUG(dbgs() << "> " << *MI);
+
+ // We can remove identity copies right now.
+ handleIdentityCopy(*MI);
+ }
+ }
+}
diff --git a/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
new file mode 100644
index 000000000000..041fb7b912bf
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -0,0 +1,1235 @@
+//===-- WinEHPrepare - Prepare exception handling for code generation ---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass lowers LLVM IR exception handling into something closer to what the
+// backend wants for functions using a personality function from a runtime
+// provided by MSVC. Functions with other personality functions are left alone
+// and may be prepared by other passes. In particular, all supported MSVC
+// personality functions require cleanup code to be outlined, and the C++
+// personality requires catch handler code to be outlined.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "winehprepare"
+
+static cl::opt<bool> DisableDemotion(
+ "disable-demotion", cl::Hidden,
+ cl::desc(
+ "Clone multicolor basic blocks but do not demote cross funclet values"),
+ cl::init(false));
+
+static cl::opt<bool> DisableCleanups(
+ "disable-cleanups", cl::Hidden,
+ cl::desc("Do not remove implausible terminators or other similar cleanups"),
+ cl::init(false));
+
+namespace {
+
+class WinEHPrepare : public FunctionPass {
+public:
+ static char ID; // Pass identification, replacement for typeid.
+ WinEHPrepare(const TargetMachine *TM = nullptr) : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &Fn) override;
+
+ bool doFinalization(Module &M) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ const char *getPassName() const override {
+ return "Windows exception handling preparation";
+ }
+
+private:
+ void insertPHIStores(PHINode *OriginalPHI, AllocaInst *SpillSlot);
+ void
+ insertPHIStore(BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist);
+ AllocaInst *insertPHILoads(PHINode *PN, Function &F);
+ void replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads, Function &F);
+ bool prepareExplicitEH(Function &F);
+ void colorFunclets(Function &F);
+
+ void demotePHIsOnFunclets(Function &F);
+ void cloneCommonBlocks(Function &F);
+ void removeImplausibleInstructions(Function &F);
+ void cleanupPreparedFunclets(Function &F);
+ void verifyPreparedFunclets(Function &F);
+
+ // All fields are reset by runOnFunction.
+ EHPersonality Personality = EHPersonality::Unknown;
+
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;
+};
+
+} // end anonymous namespace
+
+char WinEHPrepare::ID = 0;
+INITIALIZE_TM_PASS(WinEHPrepare, "winehprepare", "Prepare Windows exceptions",
+ false, false)
+
+FunctionPass *llvm::createWinEHPass(const TargetMachine *TM) {
+ return new WinEHPrepare(TM);
+}
+
+bool WinEHPrepare::runOnFunction(Function &Fn) {
+ if (!Fn.hasPersonalityFn())
+ return false;
+
+ // Classify the personality to see what kind of preparation we need.
+ Personality = classifyEHPersonality(Fn.getPersonalityFn());
+
+ // Do nothing if this is not a funclet-based personality.
+ if (!isFuncletEHPersonality(Personality))
+ return false;
+
+ return prepareExplicitEH(Fn);
+}
+
+bool WinEHPrepare::doFinalization(Module &M) { return false; }
+
+void WinEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {}
+
+static int addUnwindMapEntry(WinEHFuncInfo &FuncInfo, int ToState,
+ const BasicBlock *BB) {
+ CxxUnwindMapEntry UME;
+ UME.ToState = ToState;
+ UME.Cleanup = BB;
+ FuncInfo.CxxUnwindMap.push_back(UME);
+ return FuncInfo.getLastStateNumber();
+}
+
+static void addTryBlockMapEntry(WinEHFuncInfo &FuncInfo, int TryLow,
+ int TryHigh, int CatchHigh,
+ ArrayRef<const CatchPadInst *> Handlers) {
+ WinEHTryBlockMapEntry TBME;
+ TBME.TryLow = TryLow;
+ TBME.TryHigh = TryHigh;
+ TBME.CatchHigh = CatchHigh;
+ assert(TBME.TryLow <= TBME.TryHigh);
+ for (const CatchPadInst *CPI : Handlers) {
+ WinEHHandlerType HT;
+ Constant *TypeInfo = cast<Constant>(CPI->getArgOperand(0));
+ if (TypeInfo->isNullValue())
+ HT.TypeDescriptor = nullptr;
+ else
+ HT.TypeDescriptor = cast<GlobalVariable>(TypeInfo->stripPointerCasts());
+ HT.Adjectives = cast<ConstantInt>(CPI->getArgOperand(1))->getZExtValue();
+ HT.Handler = CPI->getParent();
+ if (auto *AI =
+ dyn_cast<AllocaInst>(CPI->getArgOperand(2)->stripPointerCasts()))
+ HT.CatchObj.Alloca = AI;
+ else
+ HT.CatchObj.Alloca = nullptr;
+ TBME.HandlerArray.push_back(HT);
+ }
+ FuncInfo.TryBlockMap.push_back(TBME);
+}
+
+static BasicBlock *getCleanupRetUnwindDest(const CleanupPadInst *CleanupPad) {
+ for (const User *U : CleanupPad->users())
+ if (const auto *CRI = dyn_cast<CleanupReturnInst>(U))
+ return CRI->getUnwindDest();
+ return nullptr;
+}
+
+static void calculateStateNumbersForInvokes(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ auto *F = const_cast<Function *>(Fn);
+ DenseMap<BasicBlock *, ColorVector> BlockColors = colorEHFunclets(*F);
+ for (BasicBlock &BB : *F) {
+ auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
+ if (!II)
+ continue;
+
+ auto &BBColors = BlockColors[&BB];
+ assert(BBColors.size() == 1 && "multi-color BB not removed by preparation");
+ BasicBlock *FuncletEntryBB = BBColors.front();
+
+ BasicBlock *FuncletUnwindDest;
+ auto *FuncletPad =
+ dyn_cast<FuncletPadInst>(FuncletEntryBB->getFirstNonPHI());
+ assert(FuncletPad || FuncletEntryBB == &Fn->getEntryBlock());
+ if (!FuncletPad)
+ FuncletUnwindDest = nullptr;
+ else if (auto *CatchPad = dyn_cast<CatchPadInst>(FuncletPad))
+ FuncletUnwindDest = CatchPad->getCatchSwitch()->getUnwindDest();
+ else if (auto *CleanupPad = dyn_cast<CleanupPadInst>(FuncletPad))
+ FuncletUnwindDest = getCleanupRetUnwindDest(CleanupPad);
+ else
+ llvm_unreachable("unexpected funclet pad!");
+
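+    // If the invoke unwinds to the same destination as its enclosing funclet,
+    // reuse the funclet's base state (when one has been recorded) instead of
+    // the state of the EH pad it unwinds to.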
+ BasicBlock *InvokeUnwindDest = II->getUnwindDest();
+ int BaseState = -1;
+ if (FuncletUnwindDest == InvokeUnwindDest) {
+ auto BaseStateI = FuncInfo.FuncletBaseStateMap.find(FuncletPad);
+ if (BaseStateI != FuncInfo.FuncletBaseStateMap.end())
+ BaseState = BaseStateI->second;
+ }
+
+ if (BaseState != -1) {
+ FuncInfo.InvokeStateMap[II] = BaseState;
+ } else {
+ Instruction *PadInst = InvokeUnwindDest->getFirstNonPHI();
+ assert(FuncInfo.EHPadStateMap.count(PadInst) && "EH Pad has no state!");
+ FuncInfo.InvokeStateMap[II] = FuncInfo.EHPadStateMap[PadInst];
+ }
+ }
+}
+
+// Given BB which ends in an unwind edge, return the EHPad that this BB belongs
+// to. If the unwind edge came from an invoke, return null.
+static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB,
+ Value *ParentPad) {
+ const TerminatorInst *TI = BB->getTerminator();
+ if (isa<InvokeInst>(TI))
+ return nullptr;
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(TI)) {
+ if (CatchSwitch->getParentPad() != ParentPad)
+ return nullptr;
+ return BB;
+ }
+ assert(!TI->isEHPad() && "unexpected EHPad!");
+ auto *CleanupPad = cast<CleanupReturnInst>(TI)->getCleanupPad();
+ if (CleanupPad->getParentPad() != ParentPad)
+ return nullptr;
+ return CleanupPad->getParent();
+}
+
+static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
+ const Instruction *FirstNonPHI,
+ int ParentState) {
+ const BasicBlock *BB = FirstNonPHI->getParent();
+ assert(BB->isEHPad() && "not a funclet!");
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) {
+ assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 &&
+           "shouldn't revisit catch funclets!");
+
+ SmallVector<const CatchPadInst *, 2> Handlers;
+ for (const BasicBlock *CatchPadBB : CatchSwitch->handlers()) {
+ auto *CatchPad = cast<CatchPadInst>(CatchPadBB->getFirstNonPHI());
+ Handlers.push_back(CatchPad);
+ }
+ int TryLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr);
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryLow;
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CatchSwitch->getParentPad())))
+ calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ TryLow);
+ int CatchLow = addUnwindMapEntry(FuncInfo, ParentState, nullptr);
+
+ // catchpads are separate funclets in C++ EH due to the way rethrow works.
+ int TryHigh = CatchLow - 1;
+ for (const auto *CatchPad : Handlers) {
+ FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow;
+ for (const User *U : CatchPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateCXXStateNumbers(FuncInfo, UserI, CatchLow);
+ }
+ if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
+ BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
+ // If a nested cleanup pad reports a null unwind destination and the
+ // enclosing catch pad doesn't, it must be post-dominated by an
+ // unreachable instruction.
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateCXXStateNumbers(FuncInfo, UserI, CatchLow);
+ }
+ }
+ }
+ int CatchHigh = FuncInfo.getLastStateNumber();
+ addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers);
+ DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n');
+ DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh << '\n');
+ DEBUG(dbgs() << "CatchHigh[" << BB->getName() << "]: " << CatchHigh
+ << '\n');
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI);
+
+ // It's possible for a cleanup to be visited twice: it might have multiple
+ // cleanupret instructions.
+ if (FuncInfo.EHPadStateMap.count(CleanupPad))
+ return;
+
+ int CleanupState = addUnwindMapEntry(FuncInfo, ParentState, BB);
+ FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
+ DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB)) {
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CleanupPad->getParentPad()))) {
+ calculateCXXStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ CleanupState);
+ }
+ }
+ for (const User *U : CleanupPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (UserI->isEHPad())
+ report_fatal_error("Cleanup funclets for the MSVC++ personality cannot "
+ "contain exceptional actions");
+ }
+ }
+}
+
+static int addSEHExcept(WinEHFuncInfo &FuncInfo, int ParentState,
+ const Function *Filter, const BasicBlock *Handler) {
+ SEHUnwindMapEntry Entry;
+ Entry.ToState = ParentState;
+ Entry.IsFinally = false;
+ Entry.Filter = Filter;
+ Entry.Handler = Handler;
+ FuncInfo.SEHUnwindMap.push_back(Entry);
+ return FuncInfo.SEHUnwindMap.size() - 1;
+}
+
+static int addSEHFinally(WinEHFuncInfo &FuncInfo, int ParentState,
+ const BasicBlock *Handler) {
+ SEHUnwindMapEntry Entry;
+ Entry.ToState = ParentState;
+ Entry.IsFinally = true;
+ Entry.Filter = nullptr;
+ Entry.Handler = Handler;
+ FuncInfo.SEHUnwindMap.push_back(Entry);
+ return FuncInfo.SEHUnwindMap.size() - 1;
+}
+
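+// Assign __C_specific_handler state numbers, mirroring the C++ walk above:
+// each __try/__except catchswitch becomes a filter state and each cleanuppad
+// becomes a __finally state, with pads that unwind to this one visited
+// recursively.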
+static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
+ const Instruction *FirstNonPHI,
+ int ParentState) {
+ const BasicBlock *BB = FirstNonPHI->getParent();
+ assert(BB->isEHPad() && "not a funclet!");
+
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(FirstNonPHI)) {
+ assert(FuncInfo.EHPadStateMap.count(CatchSwitch) == 0 &&
+ "shouldn't revisit catch funclets!");
+
+ // Extract the filter function and the __except basic block and create a
+ // state for them.
+ assert(CatchSwitch->getNumHandlers() == 1 &&
+ "SEH doesn't have multiple handlers per __try");
+ const auto *CatchPad =
+ cast<CatchPadInst>((*CatchSwitch->handler_begin())->getFirstNonPHI());
+ const BasicBlock *CatchPadBB = CatchPad->getParent();
+ const Constant *FilterOrNull =
+ cast<Constant>(CatchPad->getArgOperand(0)->stripPointerCasts());
+ const Function *Filter = dyn_cast<Function>(FilterOrNull);
+ assert((Filter || FilterOrNull->isNullValue()) &&
+ "unexpected filter value");
+ int TryState = addSEHExcept(FuncInfo, ParentState, Filter, CatchPadBB);
+
+ // Everything in the __try block uses TryState as its parent state.
+ FuncInfo.EHPadStateMap[CatchSwitch] = TryState;
+ DEBUG(dbgs() << "Assigning state #" << TryState << " to BB "
+ << CatchPadBB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock = getEHPadFromPredecessor(PredBlock,
+ CatchSwitch->getParentPad())))
+ calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ TryState);
+
+ // Everything in the __except block unwinds to ParentState, just like code
+ // outside the __try.
+ for (const User *U : CatchPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (auto *InnerCatchSwitch = dyn_cast<CatchSwitchInst>(UserI)) {
+ BasicBlock *UnwindDest = InnerCatchSwitch->getUnwindDest();
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateSEHStateNumbers(FuncInfo, UserI, ParentState);
+ }
+ if (auto *InnerCleanupPad = dyn_cast<CleanupPadInst>(UserI)) {
+ BasicBlock *UnwindDest = getCleanupRetUnwindDest(InnerCleanupPad);
+ // If a nested cleanup pad reports a null unwind destination and the
+ // enclosing catch pad doesn't, it must be post-dominated by an
+ // unreachable instruction.
+ if (!UnwindDest || UnwindDest == CatchSwitch->getUnwindDest())
+ calculateSEHStateNumbers(FuncInfo, UserI, ParentState);
+ }
+ }
+ } else {
+ auto *CleanupPad = cast<CleanupPadInst>(FirstNonPHI);
+
+ // It's possible for a cleanup to be visited twice: it might have multiple
+ // cleanupret instructions.
+ if (FuncInfo.EHPadStateMap.count(CleanupPad))
+ return;
+
+ int CleanupState = addSEHFinally(FuncInfo, ParentState, BB);
+ FuncInfo.EHPadStateMap[CleanupPad] = CleanupState;
+ DEBUG(dbgs() << "Assigning state #" << CleanupState << " to BB "
+ << BB->getName() << '\n');
+ for (const BasicBlock *PredBlock : predecessors(BB))
+ if ((PredBlock =
+ getEHPadFromPredecessor(PredBlock, CleanupPad->getParentPad())))
+ calculateSEHStateNumbers(FuncInfo, PredBlock->getFirstNonPHI(),
+ CleanupState);
+ for (const User *U : CleanupPad->users()) {
+ const auto *UserI = cast<Instruction>(U);
+ if (UserI->isEHPad())
+ report_fatal_error("Cleanup funclets for the SEH personality cannot "
+ "contain exceptional actions");
+ }
+ }
+}
+
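+// A pad is top level if its parent is the function itself (a 'none' token) and
+// it unwinds to the caller; these pads are the roots from which state
+// numbering starts.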
+static bool isTopLevelPadForMSVC(const Instruction *EHPad) {
+ if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(EHPad))
+ return isa<ConstantTokenNone>(CatchSwitch->getParentPad()) &&
+ CatchSwitch->unwindsToCaller();
+ if (auto *CleanupPad = dyn_cast<CleanupPadInst>(EHPad))
+ return isa<ConstantTokenNone>(CleanupPad->getParentPad()) &&
+ getCleanupRetUnwindDest(CleanupPad) == nullptr;
+ if (isa<CatchPadInst>(EHPad))
+ return false;
+ llvm_unreachable("unexpected EHPad!");
+}
+
+void llvm::calculateSEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Don't compute state numbers twice.
+ if (!FuncInfo.SEHUnwindMap.empty())
+ return;
+
+ for (const BasicBlock &BB : *Fn) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ if (!isTopLevelPadForMSVC(FirstNonPHI))
+ continue;
+ ::calculateSEHStateNumbers(FuncInfo, FirstNonPHI, -1);
+ }
+
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+}
+
+void llvm::calculateWinCXXEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Return if it's already been done.
+ if (!FuncInfo.EHPadStateMap.empty())
+ return;
+
+ for (const BasicBlock &BB : *Fn) {
+ if (!BB.isEHPad())
+ continue;
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ if (!isTopLevelPadForMSVC(FirstNonPHI))
+ continue;
+ calculateCXXStateNumbers(FuncInfo, FirstNonPHI, -1);
+ }
+
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+}
+
+static int addClrEHHandler(WinEHFuncInfo &FuncInfo, int HandlerParentState,
+ int TryParentState, ClrHandlerType HandlerType,
+ uint32_t TypeToken, const BasicBlock *Handler) {
+ ClrEHUnwindMapEntry Entry;
+ Entry.HandlerParentState = HandlerParentState;
+ Entry.TryParentState = TryParentState;
+ Entry.Handler = Handler;
+ Entry.HandlerType = HandlerType;
+ Entry.TypeToken = TypeToken;
+ FuncInfo.ClrEHUnwindMap.push_back(Entry);
+ return FuncInfo.ClrEHUnwindMap.size() - 1;
+}
+
+void llvm::calculateClrEHStateNumbers(const Function *Fn,
+ WinEHFuncInfo &FuncInfo) {
+ // Return if it's already been done.
+ if (!FuncInfo.EHPadStateMap.empty())
+ return;
+
+ // This numbering assigns one state number to each catchpad and cleanuppad.
+ // It also computes two tree-like relations over states:
+ // 1) Each state has a "HandlerParentState", which is the state of the next
+ // outer handler enclosing this state's handler (same as nearest ancestor
+ // per the ParentPad linkage on EH pads, but skipping over catchswitches).
+ // 2) Each state has a "TryParentState", which:
+ // a) for a catchpad that's not the last handler on its catchswitch, is
+ // the state of the next catchpad on that catchswitch
+ // b) for all other pads, is the state of the pad whose try region is the
+ //        next outer try region enclosing this state's try region. The "try
+ //        regions" are not present as such in the IR, but will be inferred
+ //        based on the placement of invokes and pads that reach each other
+ //        by exceptional exits.
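+ //     For example, with "try { ... } catch (A) {} catch (B) {}", the
+ //     catchpad for A (not the last handler) gets the state of the catchpad
+ //     for B as its TryParentState per (a), while the catchpad for B gets
+ //     the state of whatever pad encloses the whole try, or -1 if none,
+ //     per (b).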
+ // Catchswitches do not get their own states, but each gets mapped to the
+ // state of its first catchpad.
+
+ // Step one: walk down from outermost to innermost funclets, assigning each
+ // catchpad and cleanuppad a state number. Add an entry to the
+ // ClrEHUnwindMap for each state, recording its HandlerParentState and
+ // handler attributes. Record the TryParentState as well for each catchpad
+ // that's not the last on its catchswitch, but initialize all other entries'
+ // TryParentStates to a sentinel -1 value that the next pass will update.
+
+ // Seed a worklist with pads that have no parent.
+ SmallVector<std::pair<const Instruction *, int>, 8> Worklist;
+ for (const BasicBlock &BB : *Fn) {
+ const Instruction *FirstNonPHI = BB.getFirstNonPHI();
+ const Value *ParentPad;
+ if (const auto *CPI = dyn_cast<CleanupPadInst>(FirstNonPHI))
+ ParentPad = CPI->getParentPad();
+ else if (const auto *CSI = dyn_cast<CatchSwitchInst>(FirstNonPHI))
+ ParentPad = CSI->getParentPad();
+ else
+ continue;
+ if (isa<ConstantTokenNone>(ParentPad))
+ Worklist.emplace_back(FirstNonPHI, -1);
+ }
+
+ // Use the worklist to visit all pads, from outer to inner. Record
+ // HandlerParentState for all pads. Record TryParentState only for catchpads
+ // that aren't the last on their catchswitch (setting all other entries'
+ // TryParentStates to an initial value of -1). This loop is also responsible
+ // for setting the EHPadStateMap entry for all catchpads, cleanuppads, and
+ // catchswitches.
+ while (!Worklist.empty()) {
+ const Instruction *Pad;
+ int HandlerParentState;
+ std::tie(Pad, HandlerParentState) = Worklist.pop_back_val();
+
+ if (const auto *Cleanup = dyn_cast<CleanupPadInst>(Pad)) {
+ // Create the entry for this cleanup with the appropriate handler
+ // properties. Finally and fault handlers are distinguished by arity.
+ ClrHandlerType HandlerType =
+ (Cleanup->getNumArgOperands() ? ClrHandlerType::Fault
+ : ClrHandlerType::Finally);
+ int CleanupState = addClrEHHandler(FuncInfo, HandlerParentState, -1,
+ HandlerType, 0, Pad->getParent());
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Cleanup->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CleanupState);
+ // Remember this pad's state.
+ FuncInfo.EHPadStateMap[Cleanup] = CleanupState;
+ } else {
+ // Walk the handlers of this catchswitch in reverse order since all but
+ // the last need to set the following one as its TryParentState.
+ const auto *CatchSwitch = cast<CatchSwitchInst>(Pad);
+ int CatchState = -1, FollowerState = -1;
+ SmallVector<const BasicBlock *, 4> CatchBlocks(CatchSwitch->handlers());
+ for (auto CBI = CatchBlocks.rbegin(), CBE = CatchBlocks.rend();
+ CBI != CBE; ++CBI, FollowerState = CatchState) {
+ const BasicBlock *CatchBlock = *CBI;
+ // Create the entry for this catch with the appropriate handler
+ // properties.
+ const auto *Catch = cast<CatchPadInst>(CatchBlock->getFirstNonPHI());
+ uint32_t TypeToken = static_cast<uint32_t>(
+ cast<ConstantInt>(Catch->getArgOperand(0))->getZExtValue());
+ CatchState =
+ addClrEHHandler(FuncInfo, HandlerParentState, FollowerState,
+ ClrHandlerType::Catch, TypeToken, CatchBlock);
+ // Queue any child EH pads on the worklist.
+ for (const User *U : Catch->users())
+ if (const auto *I = dyn_cast<Instruction>(U))
+ if (I->isEHPad())
+ Worklist.emplace_back(I, CatchState);
+ // Remember this catch's state.
+ FuncInfo.EHPadStateMap[Catch] = CatchState;
+ }
+ // Associate the catchswitch with the state of its first catch.
+ assert(CatchSwitch->getNumHandlers());
+ FuncInfo.EHPadStateMap[CatchSwitch] = CatchState;
+ }
+ }
+
+ // Step two: record the TryParentState of each state. For cleanuppads that
+ // don't have cleanuprets, we may need to infer this from their child pads,
+ // so visit pads in descendant-most to ancestor-most order.
+ for (auto Entry = FuncInfo.ClrEHUnwindMap.rbegin(),
+ End = FuncInfo.ClrEHUnwindMap.rend();
+ Entry != End; ++Entry) {
+ const Instruction *Pad =
+ Entry->Handler.get<const BasicBlock *>()->getFirstNonPHI();
+ // For most pads, the TryParentState is the state associated with the
+ // unwind dest of exceptional exits from it.
+ const BasicBlock *UnwindDest;
+ if (const auto *Catch = dyn_cast<CatchPadInst>(Pad)) {
+ // If a catch is not the last in its catchswitch, its TryParentState is
+ // the state associated with the next catch in the switch, even though
+ // that's not the unwind dest of exceptions escaping the catch. Those
+ // cases were already assigned a TryParentState in the first pass, so
+ // skip them.
+ if (Entry->TryParentState != -1)
+ continue;
+ // Otherwise, get the unwind dest from the catchswitch.
+ UnwindDest = Catch->getCatchSwitch()->getUnwindDest();
+ } else {
+ const auto *Cleanup = cast<CleanupPadInst>(Pad);
+ UnwindDest = nullptr;
+ for (const User *U : Cleanup->users()) {
+ if (auto *CleanupRet = dyn_cast<CleanupReturnInst>(U)) {
+ // Common and unambiguous case -- cleanupret indicates cleanup's
+ // unwind dest.
+ UnwindDest = CleanupRet->getUnwindDest();
+ break;
+ }
+
+ // Get an unwind dest for the user
+ const BasicBlock *UserUnwindDest = nullptr;
+ if (auto *Invoke = dyn_cast<InvokeInst>(U)) {
+ UserUnwindDest = Invoke->getUnwindDest();
+ } else if (auto *CatchSwitch = dyn_cast<CatchSwitchInst>(U)) {
+ UserUnwindDest = CatchSwitch->getUnwindDest();
+ } else if (auto *ChildCleanup = dyn_cast<CleanupPadInst>(U)) {
+ int UserState = FuncInfo.EHPadStateMap[ChildCleanup];
+ int UserUnwindState =
+ FuncInfo.ClrEHUnwindMap[UserState].TryParentState;
+ if (UserUnwindState != -1)
+ UserUnwindDest = FuncInfo.ClrEHUnwindMap[UserUnwindState]
+ .Handler.get<const BasicBlock *>();
+ }
+
+ // Not having an unwind dest for this user might indicate that it
+ // doesn't unwind, so can't be taken as proof that the cleanup itself
+ // may unwind to caller (see e.g. SimplifyUnreachable and
+ // RemoveUnwindEdge).
+ if (!UserUnwindDest)
+ continue;
+
+ // Now we have an unwind dest for the user, but we need to see if it
+ // unwinds all the way out of the cleanup or if it stays within it.
+ const Instruction *UserUnwindPad = UserUnwindDest->getFirstNonPHI();
+ const Value *UserUnwindParent;
+ if (auto *CSI = dyn_cast<CatchSwitchInst>(UserUnwindPad))
+ UserUnwindParent = CSI->getParentPad();
+ else
+ UserUnwindParent =
+ cast<CleanupPadInst>(UserUnwindPad)->getParentPad();
+
+ // The unwind stays within the cleanup iff it targets a child of the
+ // cleanup.
+ if (UserUnwindParent == Cleanup)
+ continue;
+
+ // This unwind exits the cleanup, so its dest is the cleanup's dest.
+ UnwindDest = UserUnwindDest;
+ break;
+ }
+ }
+
+ // Record the state of the unwind dest as the TryParentState.
+ int UnwindDestState;
+
+ // If UnwindDest is null at this point, either the pad in question can
+ // be exited by unwind to caller, or it cannot be exited by unwind. In
+ // either case, reporting such cases as unwinding to caller is correct.
+ // This can lead to EH tables that "look strange" -- if this pad is in
+ // a parent funclet which has other children that do unwind to an enclosing
+ // pad, the try region for this pad will be missing the "duplicate" EH
+ // clause entries that you'd expect to see covering the whole parent. That
+ // should be benign, since the unwind never actually happens. If it were
+ // an issue, we could add a subsequent pass that pushes unwind dests down
+ // from parents that have them to children that appear to unwind to caller.
+ if (!UnwindDest) {
+ UnwindDestState = -1;
+ } else {
+ UnwindDestState = FuncInfo.EHPadStateMap[UnwindDest->getFirstNonPHI()];
+ }
+
+ Entry->TryParentState = UnwindDestState;
+ }
+
+ // Step three: transfer information from pads to invokes.
+ calculateStateNumbersForInvokes(Fn, FuncInfo);
+}
+
+void WinEHPrepare::colorFunclets(Function &F) {
+ BlockColors = colorEHFunclets(F);
+
+ // Invert the map from BB to colors to color to BBs.
+ for (BasicBlock &BB : F) {
+ ColorVector &Colors = BlockColors[&BB];
+ for (BasicBlock *Color : Colors)
+ FuncletBlocks[Color].push_back(&BB);
+ }
+}
+
+void WinEHPrepare::demotePHIsOnFunclets(Function &F) {
+ // Strip PHI nodes off of EH pads.
+ SmallVector<PHINode *, 16> PHINodes;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+ if (!BB->isEHPad())
+ continue;
+ for (BasicBlock::iterator BI = BB->begin(), BE = BB->end(); BI != BE;) {
+ Instruction *I = &*BI++;
+ auto *PN = dyn_cast<PHINode>(I);
+ // Stop at the first non-PHI.
+ if (!PN)
+ break;
+
+ AllocaInst *SpillSlot = insertPHILoads(PN, F);
+ if (SpillSlot)
+ insertPHIStores(PN, SpillSlot);
+
+ PHINodes.push_back(PN);
+ }
+ }
+
+ for (auto *PN : PHINodes) {
+ // There may be lingering uses on other EH PHIs being removed
+ PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+ PN->eraseFromParent();
+ }
+}
+
+void WinEHPrepare::cloneCommonBlocks(Function &F) {
+ // We need to clone all blocks that belong to multiple funclets. Values are
+ // remapped throughout the funclet to propagate both the new instructions
+ // *and* the new basic blocks themselves.
+ for (auto &Funclets : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclets.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclets.second;
+ Value *FuncletToken;
+ if (FuncletPadBB == &F.getEntryBlock())
+ FuncletToken = ConstantTokenNone::get(F.getContext());
+ else
+ FuncletToken = FuncletPadBB->getFirstNonPHI();
+
+ std::vector<std::pair<BasicBlock *, BasicBlock *>> Orig2Clone;
+ ValueToValueMapTy VMap;
+ for (BasicBlock *BB : BlocksInFunclet) {
+ ColorVector &ColorsForBB = BlockColors[BB];
+ // We don't need to do anything if the block is monochromatic.
+ size_t NumColorsForBB = ColorsForBB.size();
+ if (NumColorsForBB == 1)
+ continue;
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Cloning block \'" << BB->getName()
+ << "\' for funclet \'" << FuncletPadBB->getName()
+ << "\'.\n");
+
+ // Create a new basic block and copy instructions into it!
+ BasicBlock *CBB =
+ CloneBasicBlock(BB, VMap, Twine(".for.", FuncletPadBB->getName()));
+ // Insert the clone immediately after the original to ensure determinism
+ // and to keep the same relative ordering of any funclet's blocks.
+ CBB->insertInto(&F, BB->getNextNode());
+
+ // Add basic block mapping.
+ VMap[BB] = CBB;
+
+ // Record delta operations that we need to perform to our color mappings.
+ Orig2Clone.emplace_back(BB, CBB);
+ }
+
+ // If nothing was cloned, we're done cloning in this funclet.
+ if (Orig2Clone.empty())
+ continue;
+
+ // Update our color mappings to reflect that one block has lost a color and
+ // another has gained a color.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+
+ BlocksInFunclet.push_back(NewBlock);
+ ColorVector &NewColors = BlockColors[NewBlock];
+ assert(NewColors.empty() && "A new block should only have one color!");
+ NewColors.push_back(FuncletPadBB);
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Assigned color \'" << FuncletPadBB->getName()
+ << "\' to block \'" << NewBlock->getName()
+ << "\'.\n");
+
+ BlocksInFunclet.erase(
+ std::remove(BlocksInFunclet.begin(), BlocksInFunclet.end(), OldBlock),
+ BlocksInFunclet.end());
+ ColorVector &OldColors = BlockColors[OldBlock];
+ OldColors.erase(
+ std::remove(OldColors.begin(), OldColors.end(), FuncletPadBB),
+ OldColors.end());
+
+ DEBUG_WITH_TYPE("winehprepare-coloring",
+ dbgs() << " Removed color \'" << FuncletPadBB->getName()
+ << "\' from block \'" << OldBlock->getName()
+ << "\'.\n");
+ }
+
+ // Loop over all of the instructions in this funclet, fixing up operand
+ // references as we go. This uses VMap to do all the hard work.
+ for (BasicBlock *BB : BlocksInFunclet)
+ // Loop over all instructions, fixing each one as we find it...
+ for (Instruction &I : *BB)
+ RemapInstruction(&I, VMap,
+ RF_IgnoreMissingLocals | RF_NoModuleLevelChanges);
+
+ // Catchrets targeting cloned blocks need to be updated separately from
+ // the loop above because they are not in the current funclet.
+ SmallVector<CatchReturnInst *, 2> FixupCatchrets;
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+
+ FixupCatchrets.clear();
+ for (BasicBlock *Pred : predecessors(OldBlock))
+ if (auto *CatchRet = dyn_cast<CatchReturnInst>(Pred->getTerminator()))
+ if (CatchRet->getCatchSwitchParentPad() == FuncletToken)
+ FixupCatchrets.push_back(CatchRet);
+
+ for (CatchReturnInst *CatchRet : FixupCatchrets)
+ CatchRet->setSuccessor(NewBlock);
+ }
+
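+ // Drop PHI incoming values for edges that now belong to the other copy of
+ // a cloned block: the original block keeps only edges coming from outside
+ // this funclet, and the clone keeps only edges coming from within it.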
+ auto UpdatePHIOnClonedBlock = [&](PHINode *PN, bool IsForOldBlock) {
+ unsigned NumPreds = PN->getNumIncomingValues();
+ for (unsigned PredIdx = 0, PredEnd = NumPreds; PredIdx != PredEnd;
+ ++PredIdx) {
+ BasicBlock *IncomingBlock = PN->getIncomingBlock(PredIdx);
+ bool EdgeTargetsFunclet;
+ if (auto *CRI =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ EdgeTargetsFunclet = (CRI->getCatchSwitchParentPad() == FuncletToken);
+ } else {
+ ColorVector &IncomingColors = BlockColors[IncomingBlock];
+ assert(!IncomingColors.empty() && "Block not colored!");
+ assert((IncomingColors.size() == 1 ||
+ llvm::all_of(IncomingColors,
+ [&](BasicBlock *Color) {
+ return Color != FuncletPadBB;
+ })) &&
+ "Cloning should leave this funclet's blocks monochromatic");
+ EdgeTargetsFunclet = (IncomingColors.front() == FuncletPadBB);
+ }
+ if (IsForOldBlock != EdgeTargetsFunclet)
+ continue;
+ PN->removeIncomingValue(IncomingBlock, /*DeletePHIIfEmpty=*/false);
+ // Revisit the next entry.
+ --PredIdx;
+ --PredEnd;
+ }
+ };
+
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (Instruction &OldI : *OldBlock) {
+ auto *OldPN = dyn_cast<PHINode>(&OldI);
+ if (!OldPN)
+ break;
+ UpdatePHIOnClonedBlock(OldPN, /*IsForOldBlock=*/true);
+ }
+ for (Instruction &NewI : *NewBlock) {
+ auto *NewPN = dyn_cast<PHINode>(&NewI);
+ if (!NewPN)
+ break;
+ UpdatePHIOnClonedBlock(NewPN, /*IsForOldBlock=*/false);
+ }
+ }
+
+ // Check to see if SuccBB has PHI nodes. If so, we need to add entries to
+ // the PHI nodes for NewBlock now.
+ for (auto &BBMapping : Orig2Clone) {
+ BasicBlock *OldBlock = BBMapping.first;
+ BasicBlock *NewBlock = BBMapping.second;
+ for (BasicBlock *SuccBB : successors(NewBlock)) {
+ for (Instruction &SuccI : *SuccBB) {
+ auto *SuccPN = dyn_cast<PHINode>(&SuccI);
+ if (!SuccPN)
+ break;
+
+ // Ok, we have a PHI node. Figure out what the incoming value was for
+ // the OldBlock.
+ int OldBlockIdx = SuccPN->getBasicBlockIndex(OldBlock);
+ if (OldBlockIdx == -1)
+ break;
+ Value *IV = SuccPN->getIncomingValue(OldBlockIdx);
+
+ // Remap the value if necessary.
+ if (auto *Inst = dyn_cast<Instruction>(IV)) {
+ ValueToValueMapTy::iterator I = VMap.find(Inst);
+ if (I != VMap.end())
+ IV = I->second;
+ }
+
+ SuccPN->addIncoming(IV, NewBlock);
+ }
+ }
+ }
+
+ for (ValueToValueMapTy::value_type VT : VMap) {
+ // If there were values defined in the original block that are used outside
+ // the funclet, we now have to update all uses of the value to use either
+ // the original value, the cloned value, or some PHI-derived value. This can
+ // require arbitrary PHI insertion, which we are prepared to do; clean these
+ // up now.
+ SmallVector<Use *, 16> UsesToRename;
+
+ auto *OldI = dyn_cast<Instruction>(const_cast<Value *>(VT.first));
+ if (!OldI)
+ continue;
+ auto *NewI = cast<Instruction>(VT.second);
+ // Scan all uses of this instruction to see if it is used outside of its
+ // funclet, and if so, record them in UsesToRename.
+ for (Use &U : OldI->uses()) {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ BasicBlock *UserBB = UserI->getParent();
+ ColorVector &ColorsForUserBB = BlockColors[UserBB];
+ assert(!ColorsForUserBB.empty());
+ if (ColorsForUserBB.size() > 1 ||
+ *ColorsForUserBB.begin() != FuncletPadBB)
+ UsesToRename.push_back(&U);
+ }
+
+ // If there are no uses outside the funclet, we're done with this
+ // instruction.
+ if (UsesToRename.empty())
+ continue;
+
+ // We found a use of OldI outside of the funclet. Rename all uses of OldI
+ // that are outside its funclet to be uses of the appropriate PHI node
+ // etc.
+ SSAUpdater SSAUpdate;
+ SSAUpdate.Initialize(OldI->getType(), OldI->getName());
+ SSAUpdate.AddAvailableValue(OldI->getParent(), OldI);
+ SSAUpdate.AddAvailableValue(NewI->getParent(), NewI);
+
+ while (!UsesToRename.empty())
+ SSAUpdate.RewriteUseAfterInsertions(*UsesToRename.pop_back_val());
+ }
+ }
+}
+
+void WinEHPrepare::removeImplausibleInstructions(Function &F) {
+ // Remove implausible terminators and replace them with UnreachableInst.
+ for (auto &Funclet : FuncletBlocks) {
+ BasicBlock *FuncletPadBB = Funclet.first;
+ std::vector<BasicBlock *> &BlocksInFunclet = Funclet.second;
+ Instruction *FirstNonPHI = FuncletPadBB->getFirstNonPHI();
+ auto *FuncletPad = dyn_cast<FuncletPadInst>(FirstNonPHI);
+ auto *CatchPad = dyn_cast_or_null<CatchPadInst>(FuncletPad);
+ auto *CleanupPad = dyn_cast_or_null<CleanupPadInst>(FuncletPad);
+
+ for (BasicBlock *BB : BlocksInFunclet) {
+ for (Instruction &I : *BB) {
+ CallSite CS(&I);
+ if (!CS)
+ continue;
+
+ Value *FuncletBundleOperand = nullptr;
+ if (auto BU = CS.getOperandBundle(LLVMContext::OB_funclet))
+ FuncletBundleOperand = BU->Inputs.front();
+
+ if (FuncletBundleOperand == FuncletPad)
+ continue;
+
+ // Skip call sites which are nounwind intrinsics or inline asm.
+ auto *CalledFn =
+ dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
+ if (CalledFn && ((CalledFn->isIntrinsic() && CS.doesNotThrow()) ||
+ CS.isInlineAsm()))
+ continue;
+
+ // This call site was not part of this funclet, remove it.
+ if (CS.isInvoke()) {
+ // Remove the unwind edge if it was an invoke.
+ removeUnwindEdge(BB);
+ // Get a pointer to the new call.
+ BasicBlock::iterator CallI =
+ std::prev(BB->getTerminator()->getIterator());
+ auto *CI = cast<CallInst>(&*CallI);
+ changeToUnreachable(CI, /*UseLLVMTrap=*/false);
+ } else {
+ changeToUnreachable(&I, /*UseLLVMTrap=*/false);
+ }
+
+ // There are no more instructions in the block (except for unreachable),
+ // so we are done.
+ break;
+ }
+
+ TerminatorInst *TI = BB->getTerminator();
+ // CatchPadInst and CleanupPadInst can't transfer control to a ReturnInst.
+ bool IsUnreachableRet = isa<ReturnInst>(TI) && FuncletPad;
+ // The token consumed by a CatchReturnInst must match the funclet token.
+ bool IsUnreachableCatchret = false;
+ if (auto *CRI = dyn_cast<CatchReturnInst>(TI))
+ IsUnreachableCatchret = CRI->getCatchPad() != CatchPad;
+ // The token consumed by a CleanupReturnInst must match the funclet token.
+ bool IsUnreachableCleanupret = false;
+ if (auto *CRI = dyn_cast<CleanupReturnInst>(TI))
+ IsUnreachableCleanupret = CRI->getCleanupPad() != CleanupPad;
+ if (IsUnreachableRet || IsUnreachableCatchret ||
+ IsUnreachableCleanupret) {
+ changeToUnreachable(TI, /*UseLLVMTrap=*/false);
+ } else if (isa<InvokeInst>(TI)) {
+ if (Personality == EHPersonality::MSVC_CXX && CleanupPad) {
+ // Invokes within a cleanuppad for the MSVC++ personality never
+ // transfer control to their unwind edge: the personality will
+ // terminate the program.
+ removeUnwindEdge(BB);
+ }
+ }
+ }
+ }
+}
+
+void WinEHPrepare::cleanupPreparedFunclets(Function &F) {
+ // Clean up some of the mess we made by removing useless PHI nodes, trivial
+ // branches, etc.
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
+ BasicBlock *BB = &*FI++;
+ SimplifyInstructionsInBlock(BB);
+ ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true);
+ MergeBlockIntoPredecessor(BB);
+ }
+
+ // We might have some unreachable blocks after cleaning up some impossible
+ // control flow.
+ removeUnreachableBlocks(F);
+}
+
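+// Check that preparation left every block with exactly one color and, unless
+// demotion was disabled, stripped all PHIs from EH pads.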
+void WinEHPrepare::verifyPreparedFunclets(Function &F) {
+ for (BasicBlock &BB : F) {
+ size_t NumColors = BlockColors[&BB].size();
+ assert(NumColors == 1 && "Expected monochromatic BB!");
+ if (NumColors == 0)
+ report_fatal_error("Uncolored BB!");
+ if (NumColors > 1)
+ report_fatal_error("Multicolor BB!");
+ assert((DisableDemotion || !(BB.isEHPad() && isa<PHINode>(BB.begin()))) &&
+ "EH Pad still has a PHI!");
+ }
+}
+
+bool WinEHPrepare::prepareExplicitEH(Function &F) {
+ // Remove unreachable blocks. It is not valuable to assign them a color and
+ // their existence can trick us into thinking values are alive when they are
+ // not.
+ removeUnreachableBlocks(F);
+
+ // Determine which blocks are reachable from which funclet entries.
+ colorFunclets(F);
+
+ cloneCommonBlocks(F);
+
+ if (!DisableDemotion)
+ demotePHIsOnFunclets(F);
+
+ if (!DisableCleanups) {
+ DEBUG(verifyFunction(F));
+ removeImplausibleInstructions(F);
+
+ DEBUG(verifyFunction(F));
+ cleanupPreparedFunclets(F);
+ }
+
+ DEBUG(verifyPreparedFunclets(F));
+ // Recolor the CFG to verify that all is well.
+ DEBUG(colorFunclets(F));
+ DEBUG(verifyPreparedFunclets(F));
+
+ BlockColors.clear();
+ FuncletBlocks.clear();
+
+ return true;
+}
+
+// TODO: Share loads when one use dominates another, or when a catchpad exit
+// dominates uses (needs dominators).
+AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {
+ BasicBlock *PHIBlock = PN->getParent();
+ AllocaInst *SpillSlot = nullptr;
+ Instruction *EHPad = PHIBlock->getFirstNonPHI();
+
+ if (!isa<TerminatorInst>(EHPad)) {
+ // If the EHPad isn't a terminator, then we can insert a load in this block
+ // that will dominate all uses.
+ SpillSlot = new AllocaInst(PN->getType(), nullptr,
+ Twine(PN->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+ Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"),
+ &*PHIBlock->getFirstInsertionPt());
+ PN->replaceAllUsesWith(V);
+ return SpillSlot;
+ }
+
+ // Otherwise, we have a PHI on a terminator EHPad, and we give up and insert
+ // loads of the slot before every use.
+ DenseMap<BasicBlock *, Value *> Loads;
+ for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
+ UI != UE;) {
+ Use &U = *UI++;
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (isa<PHINode>(UsingInst) && UsingInst->getParent()->isEHPad()) {
+ // Use is on an EH pad phi. Leave it alone; we'll insert loads and
+ // stores for it separately.
+ continue;
+ }
+ replaceUseWithLoad(PN, U, SpillSlot, Loads, F);
+ }
+ return SpillSlot;
+}
+
+// TODO: Improve store placement. Inserting at def is probably good, but we
+// need to be careful not to introduce interfering stores (needs liveness
+// analysis).
+// TODO: identify related phi nodes that can share spill slots, and share them
+// (also needs liveness).
+void WinEHPrepare::insertPHIStores(PHINode *OriginalPHI,
+ AllocaInst *SpillSlot) {
+ // Use a worklist of (Block, Value) pairs -- the given Value needs to be
+ // stored to the spill slot by the end of the given Block.
+ SmallVector<std::pair<BasicBlock *, Value *>, 4> Worklist;
+
+ Worklist.push_back({OriginalPHI->getParent(), OriginalPHI});
+
+ while (!Worklist.empty()) {
+ BasicBlock *EHBlock;
+ Value *InVal;
+ std::tie(EHBlock, InVal) = Worklist.pop_back_val();
+
+ PHINode *PN = dyn_cast<PHINode>(InVal);
+ if (PN && PN->getParent() == EHBlock) {
+ // The value is defined by another PHI we need to remove, with no room to
+ // insert a store after the PHI, so each predecessor needs to store its
+ // incoming value.
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i < e; ++i) {
+ Value *PredVal = PN->getIncomingValue(i);
+
+ // Undef can safely be skipped.
+ if (isa<UndefValue>(PredVal))
+ continue;
+
+ insertPHIStore(PN->getIncomingBlock(i), PredVal, SpillSlot, Worklist);
+ }
+ } else {
+ // We need to store InVal, which dominates EHBlock, but can't put a store
+ // in EHBlock, so need to put stores in each predecessor.
+ for (BasicBlock *PredBlock : predecessors(EHBlock)) {
+ insertPHIStore(PredBlock, InVal, SpillSlot, Worklist);
+ }
+ }
+ }
+}
+
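+// Store PredVal to SpillSlot at the end of PredBlock. If PredBlock is an
+// unsplittable EH pad (its pad instruction is a terminator), the store is
+// deferred to its predecessors via the worklist instead.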
+void WinEHPrepare::insertPHIStore(
+ BasicBlock *PredBlock, Value *PredVal, AllocaInst *SpillSlot,
+ SmallVectorImpl<std::pair<BasicBlock *, Value *>> &Worklist) {
+
+ if (PredBlock->isEHPad() &&
+ isa<TerminatorInst>(PredBlock->getFirstNonPHI())) {
+ // Pred is unsplittable, so we need to queue it on the worklist.
+ Worklist.push_back({PredBlock, PredVal});
+ return;
+ }
+
+ // Otherwise, insert the store at the end of the basic block.
+ new StoreInst(PredVal, SpillSlot, PredBlock->getTerminator());
+}
+
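+// Replace the use U of V with a load from SpillSlot, creating the slot lazily.
+// For a PHI use the load is placed in the incoming block, splitting a catchret
+// edge first when necessary so the load stays in the right funclet; otherwise
+// the load is inserted immediately before the using instruction.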
+void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
+ DenseMap<BasicBlock *, Value *> &Loads,
+ Function &F) {
+ // Lazily create the spill slot.
+ if (!SpillSlot)
+ SpillSlot = new AllocaInst(V->getType(), nullptr,
+ Twine(V->getName(), ".wineh.spillslot"),
+ &F.getEntryBlock().front());
+
+ auto *UsingInst = cast<Instruction>(U.getUser());
+ if (auto *UsingPHI = dyn_cast<PHINode>(UsingInst)) {
+ // If this is a PHI node, we can't insert a load of the value before
+ // the use. Instead insert the load in the predecessor block
+ // corresponding to the incoming value.
+ //
+ // Note that if there are multiple edges from a basic block to this
+ // PHI node, we cannot have multiple loads: the resulting PHI node
+ // would have multiple values (one from each load) coming in from the
+ // same block, which is illegal SSA form. For this reason, we keep
+ // track of and reuse loads we insert.
+ BasicBlock *IncomingBlock = UsingPHI->getIncomingBlock(U);
+ if (auto *CatchRet =
+ dyn_cast<CatchReturnInst>(IncomingBlock->getTerminator())) {
+ // Putting a load above a catchret and use on the phi would still leave
+ // a cross-funclet def/use. We need to split the edge, change the
+ // catchret to target the new block, and put the load there.
+ BasicBlock *PHIBlock = UsingInst->getParent();
+ BasicBlock *NewBlock = SplitEdge(IncomingBlock, PHIBlock);
+ // SplitEdge gives us:
+ // IncomingBlock:
+ // ...
+ // br label %NewBlock
+ // NewBlock:
+ // catchret label %PHIBlock
+ // But we need:
+ // IncomingBlock:
+ // ...
+ // catchret label %NewBlock
+ // NewBlock:
+ // br label %PHIBlock
+ // So move the terminators to each other's blocks and swap their
+ // successors.
+ BranchInst *Goto = cast<BranchInst>(IncomingBlock->getTerminator());
+ Goto->removeFromParent();
+ CatchRet->removeFromParent();
+ IncomingBlock->getInstList().push_back(CatchRet);
+ NewBlock->getInstList().push_back(Goto);
+ Goto->setSuccessor(0, PHIBlock);
+ CatchRet->setSuccessor(NewBlock);
+ // Update the color mapping for the newly split edge.
+ ColorVector &ColorsForPHIBlock = BlockColors[PHIBlock];
+ BlockColors[NewBlock] = ColorsForPHIBlock;
+ for (BasicBlock *FuncletPad : ColorsForPHIBlock)
+ FuncletBlocks[FuncletPad].push_back(NewBlock);
+ // Treat the new block as incoming for load insertion.
+ IncomingBlock = NewBlock;
+ }
+ Value *&Load = Loads[IncomingBlock];
+ // Insert the load into the predecessor block
+ if (!Load)
+ Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, IncomingBlock->getTerminator());
+
+ U.set(Load);
+ } else {
+ // Reload right before the old use.
+ auto *Load = new LoadInst(SpillSlot, Twine(V->getName(), ".wineh.reload"),
+ /*Volatile=*/false, UsingInst);
+ U.set(Load);
+ }
+}
+
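+// Record that the code between InvokeBegin and InvokeEnd executes in the
+// invoke's precomputed state, for use when the Windows EH tables are emitted
+// later.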
+void WinEHFuncInfo::addIPToStateRange(const InvokeInst *II,
+ MCSymbol *InvokeBegin,
+ MCSymbol *InvokeEnd) {
+ assert(InvokeStateMap.count(II) &&
+ "should get invoke with precomputed state");
+ LabelToStateMap[InvokeBegin] = std::make_pair(InvokeStateMap[II], InvokeEnd);
+}
+
+WinEHFuncInfo::WinEHFuncInfo() {}
diff --git a/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
new file mode 100644
index 000000000000..1f9570895f9d
--- /dev/null
+++ b/contrib/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -0,0 +1,96 @@
+//===-- XRayInstrumentation.cpp - Adds XRay instrumentation to functions. -===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a MachineFunctionPass that inserts the appropriate
+// XRay instrumentation instructions. We look for XRay-specific attributes
+// on the function to determine whether we should insert the replacement
+// operations.
+//
+//===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct XRayInstrumentation : public MachineFunctionPass {
+ static char ID;
+
+ XRayInstrumentation() : MachineFunctionPass(ID) {
+ initializeXRayInstrumentationPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
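+// Instrument only functions marked function-instrument="xray-always", or
+// functions carrying an xray-instruction-threshold attribute; note that, as
+// written below, the threshold is compared against the number of IR basic
+// blocks in the function.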
+bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
+ auto &F = *MF.getFunction();
+ auto InstrAttr = F.getFnAttribute("function-instrument");
+ bool AlwaysInstrument = !InstrAttr.hasAttribute(Attribute::None) &&
+ InstrAttr.isStringAttribute() &&
+ InstrAttr.getValueAsString() == "xray-always";
+ Attribute Attr = F.getFnAttribute("xray-instruction-threshold");
+ unsigned XRayThreshold = 0;
+ if (!AlwaysInstrument) {
+ if (Attr.hasAttribute(Attribute::None) || !Attr.isStringAttribute())
+ return false; // XRay threshold attribute not found.
+ if (Attr.getValueAsString().getAsInteger(10, XRayThreshold))
+ return false; // Invalid value for threshold.
+ if (F.size() < XRayThreshold)
+ return false; // Function is too small.
+ }
+
+ // FIXME: Do the loop triviality analysis here or in an earlier pass.
+
+ // First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the
+ // MachineFunction.
+ auto &FirstMBB = *MF.begin();
+ auto &FirstMI = *FirstMBB.begin();
+ auto *TII = MF.getSubtarget().getInstrInfo();
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
+
+ // Then we look for *all* terminators and returns and replace them with
+ // PATCHABLE_RET instructions.
+ SmallVector<MachineInstr *, 4> Terminators;
+ for (auto &MBB : MF) {
+ for (auto &T : MBB.terminators()) {
+ // FIXME: Handle tail calls here too?
+ if (T.isReturn() && T.getOpcode() == TII->getReturnOpcode()) {
+ // Replace return instructions with:
+ // PATCHABLE_RET <Opcode>, <Operand>...
+ auto MIB = BuildMI(MBB, T, T.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_RET))
+ .addImm(T.getOpcode());
+ for (auto &MO : T.operands())
+ MIB.addOperand(MO);
+ Terminators.push_back(&T);
+ break;
+ }
+ }
+ }
+
+ for (auto &I : Terminators)
+ I->eraseFromParent();
+
+ return true;
+}
+
+char XRayInstrumentation::ID = 0;
+char &llvm::XRayInstrumentationID = XRayInstrumentation::ID;
+INITIALIZE_PASS(XRayInstrumentation, "xray-instrumentation", "Insert XRay ops",
+ false, false)